Linear Regress.TAIEX_POC.0628
1st: 2024-06-28 最簡單的模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive
import datetime
# Authorize and mount Google Drive at /content/drive
drive.mount('/content/drive')
# Download price history and derive the indicator columns used as features
def download_data(ticker, start_date):
    """Download daily price history for ``ticker`` and add indicator columns.

    Args:
        ticker: Symbol passed to ``yf.download`` (for example ``"^TWII"``).
        start_date: First date to request; forwarded as ``start``.

    Returns:
        The downloaded DataFrame with these columns appended, in order:
        ``Return``, ``Trade Amount``, ``MA7``, ``MA21``, ``MA50``, ``MA100``,
        ``RSI14``, ``Beta_60`` and ``Beta_120``.
    """
    # Ask for unadjusted prices explicitly: newer yfinance releases default to
    # auto_adjust=True, which removes the 'Adj Close' column relied on below.
    data = yf.download(ticker, start=start_date, auto_adjust=False)

    # Some yfinance versions return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level so data['Adj Close'] is a
    # plain Series.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0)

    adj_close = data['Adj Close']
    data['Return'] = adj_close.pct_change()
    # Traded amount: volume multiplied by the adjusted close
    data['Trade Amount'] = data['Volume'] * adj_close

    # Simple moving averages of the adjusted close
    for window in (7, 21, 50, 100):
        data[f'MA{window}'] = adj_close.rolling(window=window).mean()

    data['RSI14'] = calculate_rsi(adj_close, 14)

    # NOTE(review): the "market" series passed to calculate_beta is the rolling
    # mean of this same return series, not a separate benchmark -- confirm
    # that this is the intended definition.
    for window in (60, 120):
        data[f'Beta_{window}'] = calculate_beta(
            data['Return'],
            data['Return'].rolling(window=window).mean(),
            window,
        )
    return data
def calculate_rsi(series, period):
    """Compute a simple-moving-average Relative Strength Index.

    Args:
        series: pandas Series of prices.
        period: Number of consecutive price changes averaged per RSI value.

    Returns:
        A Series aligned with ``series`` holding ``100 - 100 / (1 + RS)``,
        where RS is the rolling mean gain divided by the rolling mean loss.
        The first ``period`` entries are NaN because that many price changes
        are not yet available. A window with gains but no losses yields 100;
        a window with neither gains nor losses yields NaN (0 / 0).
    """
    delta = series.diff()
    # clip() keeps NaN for undefined differences (the first row and any gaps),
    # so they are not silently counted as a zero change in the averages.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))
def calculate_beta(stock_returns, market_returns, window):
    """Return the rolling beta of ``stock_returns`` against ``market_returns``.

    Beta is the rolling covariance of the two series divided by the rolling
    variance of ``market_returns``, both taken over ``window`` observations.
    """
    rolling_covariance = stock_returns.rolling(window=window).cov(market_returns)
    rolling_market_var = market_returns.rolling(window=window).var()
    return rolling_covariance / rolling_market_var
# Download the Taiwan index (and any other listed tickers) from 2020-01-01
# onward so the model has a longer history to work with
tickers = ["^TWII"]
data = {}
for ticker in tickers:
    data[ticker] = download_data(ticker, "2020-01-01")

# Save each downloaded frame as a CSV file on Google Drive
for ticker in tickers:
    csv_path = f"/content/drive/My Drive/{ticker}.csv"
    data[ticker].to_csv(csv_path)
# Build the modelling dataset from the index data
stock_data = data["^TWII"]
stock_data['Daily Change'] = stock_data['Adj Close'].pct_change()
feature_columns = [
    'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Daily Change',
    'Trade Amount', 'MA7', 'MA21', 'MA50', 'MA100', 'RSI14', 'Beta_60',
    'Beta_120',
]
features = stock_data[feature_columns]
# NOTE(review): 'Adj Close' is both a feature and the same-row target, so the
# regression can reproduce the target directly -- confirm this is intended.
target = stock_data[['Adj Close']]

# Drop rows whose target value is 0
mask = target['Adj Close'] != 0

# Fill missing values forward, then backward. The filtered frames are
# reassigned instead of calling fillna(method=..., inplace=True) on a slice,
# which is chained assignment and relies on the deprecated ``method`` keyword.
features = features[mask].ffill().bfill()
target = target[mask].ffill().bfill()
# Chronological split: the first 90% of rows train the model, the rest test it
train_size = int(len(features) * 0.9)
X_train = features.iloc[:train_size]
X_test = features.iloc[train_size:]
y_train = target.iloc[:train_size]
y_test = target.iloc[train_size:]

# Min-max normalisation; both scalers are fitted on the training rows only
# and then applied to the test rows
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)
y_train_scaled = target_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_scaled = target_scaler.transform(y_test.to_numpy().reshape(-1, 1))
# Fit an ordinary least-squares regression on the scaled training data
model = LinearRegression().fit(X_train_scaled, y_train_scaled)

# Predict on both splits, map the outputs back to the original price scale,
# and flatten them to one-dimensional arrays for plotting
y_train_pred = target_scaler.inverse_transform(
    model.predict(X_train_scaled)
).reshape(-1)
y_test_pred = target_scaler.inverse_transform(
    model.predict(X_test_scaled)
).reshape(-1)
# Take the plot dates from target's own index so every date stays aligned with
# its row even when the zero-value filter removed rows; truncating the
# unfiltered index by length would shift the dates in that case.
dates = target.index
target_dates = dates

# Date ranges matching the training and testing predictions
train_dates = target_dates[:train_size]
test_dates = target_dates[train_size:train_size + len(y_test_pred)]

# Plot
plt.figure(figsize=(14, 7))
# Actual values
plt.plot(target_dates, target.values, color='blue', label='Actual TAIEX')
# Predictions on the training split
plt.plot(train_dates, y_train_pred, color='orange', label='Training Prediction')
# Predictions on the testing split
plt.plot(test_dates, y_test_pred, color='green', label='Testing Prediction')
# Forecast the next 30 steps by feeding each prediction back into the model
future_steps = 30
future_predictions = []
if len(X_test_scaled) > 0:
    last_test_data = X_test_scaled[-1].reshape(1, -1)
    for _ in range(future_steps):
        next_scaled = np.asarray(model.predict(last_test_data)).reshape(-1, 1)
        next_price = float(target_scaler.inverse_transform(next_scaled)[0, 0])

        # Limit each step to within 10% of the previous predicted price. The
        # bound is applied in price units: on min-max-scaled values it is not
        # a 10% price move, and the bounds invert when the value is negative.
        if future_predictions:
            previous_price = future_predictions[-1]
            lower, upper = sorted((previous_price * 0.90, previous_price * 1.10))
            next_price = float(np.clip(next_price, lower, upper))
        future_predictions.append(next_price)

        # NOTE(review): the feature vector is rotated by one column and the
        # scaled prediction is written into the last column. The columns are
        # different indicators rather than time lags, so this update scheme
        # should be confirmed or replaced with recomputed features.
        last_test_data = np.roll(last_test_data, -1, axis=1)
        last_test_data[0, -1] = target_scaler.transform([[next_price]])[0, 0]

    future_predictions = np.asarray(future_predictions, dtype=float)

    # Make sure the future predictions start from the last point of actual data
    if len(target.values) > 0:
        future_predictions = np.insert(
            future_predictions, 0, np.ravel(target.values)[-1]
        )

    # Build the future date index, starting at the last actual date
    future_dates = pd.date_range(start=target_dates[-1], periods=future_steps + 1)
    plt.plot(future_dates, future_predictions.flatten(), color='red', label='Future Prediction')

    # Mark every 10th future day (days 10, 20 and 30 for a 30-step forecast)
    for day in range(10, future_steps + 1, 10):
        plt.axvline(future_dates[day], color='purple', linestyle='--')
        plt.text(future_dates[day], future_predictions[day], f'{future_dates[day].strftime("%m/%d")}: {future_predictions[day]:.2f}', color='purple')
# Title, axis labels and legend
plt.title('TAIEX Prediction')
plt.xlabel('Days')
plt.ylabel('TAIEX Index')
plt.legend()

# Date axis: one major tick per year, labelled as YYYY/MM/DD
date_axis = plt.gca().xaxis
date_tools = plt.matplotlib.dates
date_axis.set_major_formatter(date_tools.DateFormatter('%Y/%m/%d'))
date_axis.set_major_locator(date_tools.YearLocator())
plt.xticks(rotation=45)
plt.show()
![](https://thepearl.ghost.io/content/images/2024/06/data-src-image-9292c8cd-b71b-40e0-85ad-854c1c72b8a9.png)