import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# Load the data.
file_path = '/content/drive/My Drive/MSCI_Taiwan_30_data_with_OBV.csv'
data = pd.read_csv(file_path)
# Make sure the date column is a datetime type.
data['Date'] = pd.to_datetime(data['Date'])
# The rolling/EWM indicators computed from this frame and any positional
# train/test split treat row order as time order, so enforce ascending dates.
# mergesort is stable (rows sharing a date keep their file order); resetting
# the index keeps positional and label-based access in agreement.
data = data.sort_values('Date', kind='mergesort').reset_index(drop=True)
# Rename Close_MSCI to Close.
data = data.rename(columns={'Close_MSCI': 'Close'})
# Simple moving averages (SMA) of Close over 10 and 50 rows; the first
# window-1 rows of each column are NaN.
data['MA10'] = data['Close'].rolling(window=10).mean()
data['MA50'] = data['Close'].rolling(window=50).mean()
# Relative Strength Index (RSI).
def compute_rsi(data, window=14):
    """Return the simple-moving-average RSI of ``data['Close']``.

    Args:
        data: Object whose ``'Close'`` item is a pandas Series of prices
            (e.g. a DataFrame with a ``Close`` column).
        window: Number of consecutive price changes averaged per RSI value.

    Returns:
        A Series aligned with ``data['Close']``. The first ``window`` entries
        are NaN because fewer than ``window`` price changes exist there. A
        window with gains but no losses yields 100; a window with no price
        movement at all yields NaN (0 / 0). Any window containing a missing
        price change also yields NaN.
    """
    delta = data['Close'].diff(1)
    # clip() keeps the NaN that diff() produces for the first row, so that row
    # is not counted as a real "zero change" when averaging.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    # pandas division by zero gives inf (not an exception); rs = inf then
    # maps to RSI = 100 below.
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi
# Store the RSI (computed with compute_rsi's default window) as a new column.
rsi_series = compute_rsi(data)
data['RSI'] = rsi_series
# Moving Average Convergence Divergence (MACD).
def compute_macd(data, fast=12, slow=26, signal=9):
    """Return the MACD line, signal line and histogram of ``data['Close']``.

    Args:
        data: Object whose ``'Close'`` item is a pandas Series of prices
            (e.g. a DataFrame with a ``Close`` column).
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the exponential moving average applied to the MACD
            line to obtain the signal line.

    Returns:
        A tuple ``(macd, signal_line, macd_hist)`` of Series aligned with
        ``data['Close']``: fast EMA minus slow EMA, the EMA of that
        difference, and the difference between the two.
    """
    close = data['Close']
    # adjust=False selects the recursive EMA form, seeded with the first value.
    fast_ema = close.ewm(span=fast, adjust=False).mean()
    slow_ema = close.ewm(span=slow, adjust=False).mean()
    macd = fast_ema - slow_ema
    signal_line = macd.ewm(span=signal, adjust=False).mean()
    macd_hist = macd - signal_line
    return macd, signal_line, macd_hist
# Add the MACD line, its signal line and the histogram as columns.
data['MACD'], data['MACD_Signal'], data['MACD_Hist'] = compute_macd(data)
# Fill missing values in every numeric column with that column's mean.
# NOTE(review): the mean is taken over all rows, so filled rows (e.g. the
# indicator warm-up period) receive a value that depends on later data --
# confirm this is acceptable for the intended evaluation.
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())
# Numeric columns used for the correlation analysis.
numeric_data = data.select_dtypes(include=[np.number])
# Feature selection: keep columns whose absolute correlation with the target
# exceeds 0.5.
correlation_matrix = numeric_data.corr()
target_corr = correlation_matrix['Close_TAIEX'].abs().sort_values(ascending=False)
# The target correlates perfectly with itself, so it always passes the
# threshold; it must not be used as an input feature (target leakage).
# errors='ignore' covers the case where it did not pass (e.g. NaN correlation).
selected_features = target_corr[target_corr > 0.5].index.drop('Close_TAIEX', errors='ignore')
if selected_features.empty:
    raise ValueError("No feature has |correlation| > 0.5 with 'Close_TAIEX'")
# Split the data into training and test sets: the first 80% of rows train,
# the remaining rows test.
train_size = int(len(data) * 0.8)
# Feature scaling. The scaler is fitted on the training rows only, so the
# mean/variance of the test rows cannot leak into training; the fitted
# transform is then applied to every row.
feature_frame = numeric_data[selected_features]
scaler = StandardScaler().fit(feature_frame.iloc[:train_size])
scaled_data = pd.DataFrame(scaler.transform(feature_frame), columns=selected_features)
train_data = scaled_data[:train_size]
test_data = scaled_data[train_size:]
# Labels are the unscaled Close_TAIEX values.
train_labels = data['Close_TAIEX'][:train_size]
test_labels = data['Close_TAIEX'][train_size:]
# Random forest regressor: fit on the training split, then predict the test rows.
rf_model = RandomForestRegressor().fit(train_data, train_labels)
rf_predictions = rf_model.predict(test_data)
# Support vector regressor (SVR): same procedure with default hyperparameters.
svr_model = SVR().fit(train_data, train_labels)
svr_predictions = svr_model.predict(test_data)
# Plot actual values against both models' predictions over the test period.
fig, ax = plt.subplots(figsize=(12, 6))
# Dates of the test rows, shared by all three curves.
test_dates = data['Date'][train_size:]
# Actual values.
ax.plot(test_dates, test_labels, label='Actual', color='blue', linewidth=0.5)
# Random forest predictions.
ax.plot(test_dates, rf_predictions, label='Random Forest Predictions', color='red', linewidth=0.5, alpha=0.7)
# SVR predictions.
ax.plot(test_dates, svr_predictions, label='SVR Predictions', color='green', linewidth=0.5, alpha=0.7)
ax.set_title('Model Predictions vs Actual Values')
ax.set_xlabel('Date')
# The labels and predictions are in the original Close_TAIEX units (only the
# input features are standardized), so the axis is not labelled as scaled.
ax.set_ylabel('Close_TAIEX')
ax.legend()
# One major tick per year, formatted as a full date.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y/%m/%d'))
fig.autofmt_xdate()  # rotate the date labels automatically
plt.show()