import os
import gc
import time
import pickle
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from contextlib import contextmanager
from nehori import tilib
from nehori import protra
# Base Protra directory: handed to protra.PriceList when reading prices and
# used as the root for the generated "lib\LightGBM.pt" output file.
g_path = "C:\\Users\\XXXX\\stock\\Protra"
@contextmanager
def timer(title):
    """Context manager that reports how long the wrapped block took.

    When the ``with`` body finishes normally, prints ``title`` followed by
    the elapsed wall-clock time rounded to whole seconds. Nothing is printed
    if the body raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))
# Read a price CSV (currently unused)
def read_csv2(stock_id, skiprows, skipfooter):
    """Load ``tosho/<stock_id>.csv`` into a DataFrame.

    Args:
        stock_id: Stock code; converted with ``str`` to build the file name.
        skiprows: Forwarded to ``pandas.read_csv`` as ``skiprows``.
        skipfooter: Forwarded to ``pandas.read_csv`` as ``skipfooter``.

    Returns:
        ``(DataFrame, True)`` when the file exists, otherwise ``(None, False)``
        after printing an error message.
    """
    csv_path = "tosho/" + str(stock_id) + ".csv"
    if os.path.exists(csv_path):
        column_names = ("date", "open", "high", "low", "close", "volume")
        # Explicit float dtypes avoid
        # "ValueError: DataFrame.dtypes for data must be int, float or bool."
        price_dtypes = {'open': float, 'high': float, 'low': float, 'close': float, 'volume': float}
        frame = pd.read_csv(
            csv_path,
            skiprows=skiprows,
            skipfooter=skipfooter,
            engine="python",
            names=column_names,
            dtype=price_dtypes,
        )
        return frame, True
    print("[Error] " + csv_path + " does not exist.")
    return None, False
# Read prices directly from Protra
def read_protra_stock(stock_id, skiprows, skipfooter):
    """Read one stock's price list through ``protra.PriceList``.

    Args:
        stock_id: Code passed to ``PriceList.readPriceList``.
        skiprows: Number of leading rows to drop (0 keeps all).
        skipfooter: Number of trailing rows to drop (0 keeps all).

    Returns:
        ``(DataFrame, True)``; the frame has the columns date/open/high/low/
        close/volume with the five price columns cast to float. The flag is
        always ``True`` here.
    """
    rows = protra.PriceList(g_path).readPriceList(stock_id)
    if skiprows != 0:
        del rows[:skiprows]
    # The zero check matters: a "-0" slice start would delete every row.
    if skipfooter != 0:
        del rows[-skipfooter:]
    price_columns = ("open", "high", "low", "close", "volume")
    frame = pd.DataFrame(rows, columns=("date",) + price_columns)
    frame = frame.astype({name: float for name in price_columns})
    return frame, True
# Print an overview
def display_overview(df):
    """Print the shape, the column names and the first/last five rows of ``df``.

    Args:
        df: The DataFrame to summarise.

    Returns:
        None. Output goes to stdout.
    """
    # Size of the data
    print("The size of df is : "+str(df.shape))
    # Column names
    print(df.columns)
    # Head and tail of the table. DataFrame.append was removed in pandas 2.0;
    # pd.concat gives the same result on old and new versions.
    print(pd.concat([df.head(), df.tail()]))
# Prediction target (is close - open >= 0 on the following row?)
def get_target_value(df):
    """Add a binary ``target`` column to ``df`` in place and return ``df``.

    For each row the target is taken from the *next* row (``shift(-1)``):
    1 when that row's close minus open is >= 0, 0 when it is negative.
    Rows without a following row keep NaN.
    """
    next_move = df['close'].shift(-1) - df['open'].shift(-1)
    labelled = next_move.mask(next_move >= 0, 1).mask(next_move < 0, 0)
    df['target'] = labelled
    return df
# Data pre-processing
def pre_processing(df):
    """Add the target, weekday and derived feature columns, then forward-fill.

    Args:
        df: Price DataFrame with at least ``date``, ``open`` and ``close``.

    Returns:
        The processed DataFrame.
    """
    # Target variable (built by get_target_value)
    df = get_target_value(df)
    # Day of week (Monday=0) parsed from the date column
    df['day'] = pd.to_datetime(df['date']).dt.dayofweek
    # Additional features from the project helper
    df = tilib.add_new_features(df)
    # Fill missing values with the previous value in the same column.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df = df.ffill()
    return df
# Plot feature importances
def display_importances(feature_importance_df_):
    """Save a bar plot of the 40 features with the highest mean importance.

    Args:
        feature_importance_df_: DataFrame with ``feature`` and ``importance``
            columns (one row per feature per fold).

    The figure is written to ``lgbm_importances01.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=ordered)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')
# Plot the ROC curve
def display_roc(list_label, list_score):
    """Draw the ROC curve for the given labels and scores on the current axes.

    Args:
        list_label: True binary labels.
        list_score: Predicted scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(list_label, list_score)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %.2f)'%area
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    plt.legend()
    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid(True)
# Optuna objective (automatic hyper-parameter optimisation tool)
class Objective:
    """Callable Optuna objective: cross-validated LightGBM scored by AUC.

    Attributes are stored as given: ``x`` is the training frame, ``y`` the
    labels, ``excluded_feats`` the columns that must not be used as features,
    ``num_folds`` the number of CV splits and ``stratified`` selects
    StratifiedKFold instead of KFold.
    """

    def __init__(self, x, y, excluded_feats, num_folds = 4, stratified = False):
        self.x, self.y = x, y
        self.excluded_feats = excluded_feats
        self.num_folds = num_folds
        self.stratified = stratified

    def __call__(self, trial):
        """Run one trial and return ``1.0 - AUC`` of the out-of-fold predictions."""
        frame, labels = self.x, self.y
        # Cross validation model
        splitter_cls = StratifiedKFold if self.stratified else KFold
        splitter = splitter_cls(n_splits = self.num_folds, shuffle = True, random_state = 1001)
        out_of_fold = np.zeros(frame.shape[0])
        feature_cols = [col for col in frame.columns if col not in self.excluded_feats]
        features = frame[feature_cols]
        for fit_idx, hold_idx in splitter.split(features, labels):
            X_fit, y_fit = features.iloc[fit_idx], labels.iloc[fit_idx]
            X_hold, y_hold = features.iloc[hold_idx], labels.iloc[hold_idx]
            # Suggestions are requested in this fixed order on every fold.
            alpha = trial.suggest_loguniform('reg_alpha', 1e-4, 100.0)
            lam = trial.suggest_loguniform('reg_lambda', 1e-4, 100.0)
            leaves = trial.suggest_int('num_leaves', 10, 40)
            model = LGBMClassifier(objective = 'binary',
                                   reg_alpha = alpha,
                                   reg_lambda = lam,
                                   num_leaves = leaves,
                                   silent = True)
            # Train with both the training and the validation set as eval sets
            model.fit(X_fit, y_fit,
                      eval_set = [(X_fit, y_fit), (X_hold, y_hold)],
                      eval_metric = 'auc', verbose = 0, early_stopping_rounds = 200)
            fold_proba = model.predict_proba(X_hold, num_iteration = model.best_iteration_)
            out_of_fold[hold_idx] = fold_proba[:, 1]
        auc_score = roc_auc_score(labels, out_of_fold)
        return 1.0 - auc_score
import lightgbm as lgb
# Visualise a decision tree
def display_tree(clf):
    """Render the tree at index 1 of ``clf`` as a PNG via graphviz and open it."""
    print('Plotting tree with graphviz...')
    node_info = ['split_gain', 'internal_weight', 'leaf_weight', 'internal_value', 'leaf_count']
    digraph = lgb.create_tree_digraph(
        clf,
        tree_index=1,
        format='png',
        name='Tree',
        show_info=node_info,
    )
    digraph.render(view=True)
def load_model(num):
    """Load the pickled model ``model<num>.pickle`` from the working directory.

    Args:
        num: Suffix used to build the file name (converted with ``str``).

    Returns:
        The unpickled object, or ``None`` when the file does not exist.
    """
    model_path = "model" + str(num) + ".pickle"
    if not os.path.exists(model_path):
        return None
    # NOTE: unpickling runs arbitrary code; only load files this script wrote.
    with open(model_path, mode='rb') as fp:
        return pickle.load(fp)
def save_model(num, clf):
    """Pickle ``clf`` (protocol 2) to ``model<num>.pickle`` in the working directory."""
    target_path = "model" + str(num) + ".pickle"
    payload = pickle.dumps(clf, protocol=2)
    with open(target_path, mode='wb') as fp:
        fp.write(payload)
# Cross validation with KFold
def cross_validation(df_train, y, df_test, excluded_feats, num_folds = 4, stratified = False, debug = False):
    """Train one LightGBM model per fold and predict the test set.

    Each fold's model is saved with ``save_model(n_fold, clf)``, the overall
    out-of-fold AUC is printed and a feature-importance plot is written.

    Args:
        df_train: Training DataFrame (features plus excluded columns).
        y: Labels aligned with ``df_train``.
        df_test: DataFrame to predict; must contain the feature columns.
        excluded_feats: Column names that are not used as features.
        num_folds: Number of CV splits.
        stratified: Use StratifiedKFold instead of KFold.
        debug: Unused; kept for interface compatibility.

    Returns:
        Positive-class probabilities for ``df_test``, averaged over all folds.
    """
    print("Starting cross_validation. Train shape: {}, test shape: {}".format(df_train.shape, df_test.shape))
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 1001)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = 1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(df_train.shape[0])
    sub_preds = np.zeros(df_test.shape[0])
    fold_importances = []
    feats = [f for f in df_train.columns if f not in excluded_feats]
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train[feats], y)):
        X_train, y_train = df_train[feats].iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = df_train[feats].iloc[valid_idx], y.iloc[valid_idx]
        # LightGBM
        clf = LGBMClassifier(max_depth=6,
                             num_leaves = 29)
        # Train with both the training and the validation set as eval sets
        clf.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_valid, y_valid)],
                eval_metric = "auc", verbose = 0, early_stopping_rounds = 200)
        oof_preds[valid_idx] = clf.predict_proba(X_valid, num_iteration = clf.best_iteration_)[:, 1]
        # Accumulate the fold average. The previous plain assignment threw
        # away every fold but the last and left the zero init unused.
        sub_preds += clf.predict_proba(df_test[feats], num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits
        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))
        save_model(n_fold, clf)
        del clf, X_train, y_train, X_valid, y_valid
        gc.collect()
    df_feature_importance = pd.concat(fold_importances, axis=0)
    print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))
    #display_roc(y, oof_preds)
    display_importances(df_feature_importance)
    return sub_preds
# Build the Protra script fragment
def create_protra_dataset(code, value, value2, date, y_pred, flag):
    """Return a Protra ``if`` block that fires on the dates predicted >= ``flag``.

    Args:
        code: Stock code inserted into the ``Code ==`` test (string).
        value: Text returned by the generated block on a selected date.
        value2: Text returned by the generated block otherwise.
        date: Iterable of ``"YYYY/MM/DD"`` strings, positionally aligned with
            ``y_pred``.
        y_pred: Array-like of scores.
        flag: Threshold; scores ``>= flag`` are selected.

    Returns:
        The generated script fragment as a string.
    """
    # Keep only the rows judged to be profitable
    selected = np.asarray(y_pred) >= flag
    parts = [
        " if ((int)Code == " + code + ")\n",
        " if ( \\\n",
    ]
    # Pair dates and flags by position: indexing a Series with date[i] is a
    # label lookup and breaks when the index is not 0..n-1.
    for day_text, keep in zip(date, selected):
        if keep:
            year, month, day = day_text.split('/')
            parts.append("(Year == " + str(int(year)) + " && Month == " + str(int(month)) + " && Day == " + str(int(day)) + ") || \\\n")
    # Never-true sentinel that terminates the "||" chain
    parts.append(" (Year == 3000))\n")
    parts.append(" return " + value +"\n")
    parts.append(" else\n")
    parts.append(" return " + value2 +"\n")
    parts.append(" end\n")
    parts.append(" end\n")
    return "".join(parts)
def pred_load_model(clfs, df, stock_id, excluded_feats):
    """Average the positive-class predictions of ``clfs`` and emit Protra rules.

    Args:
        clfs: Fitted classifiers exposing ``predict_proba`` and
            ``best_iteration_``.
        df: DataFrame holding the feature columns and a ``date`` column.
        stock_id: Unused; the generated rules are hard-wired to the codes
            "1570" and "1357".
        excluded_feats: Column names that are not used as features.

    Returns:
        The two script fragments (threshold 0.6) concatenated.
    """
    model_count = len(clfs)
    feature_cols = [col for col in df.columns if col not in excluded_feats]
    features = df[feature_cols]
    averaged = np.zeros(df.shape[0])
    for model in clfs:
        proba = model.predict_proba(features, num_iteration = model.best_iteration_)[:, 1]
        averaged += proba / model_count
    dates = df["date"]
    rule_1570 = create_protra_dataset("1570", "1", "0", dates, averaged, 0.6)
    rule_1357 = create_protra_dataset("1357", "0", "1", dates, averaged, 0.6)
    return rule_1570 + rule_1357
# Stock codes that are trained on and back-tested.
# (Original note: "market-cap ranking top 20". NOTE(review): only "1001" is
# listed - confirm whether the label is stale.)
stock_names = [
"1001",
]
# Stock codes whose closing prices feed the advance/decline counts in
# get_up_down_ratio. (Original note said "Nikkei 255" - presumably the
# Nikkei 225 constituents; TODO confirm.)
updown_stock_names = [
"4151","4502","4503","4506","4507","4519","4523","4568","4578","3105","6479","6501",
"6503","6504","6506","6645","6674","6701","6702","6703","6724","6752","6758","6762",
"6770","6841","6857","6902","6952","6954","6971","6976","7735","7751","7752","8035",
"7201","7202","7203","7205","7211","7261","7267","7269","7270","7272","4543","4902",
"7731","7733","7762","9412","9432","9433","9437","9613","9984","8303","8304",
"8306","8308","8309","8316","8331","8354","8355","8411","8253","8601","8604","8628",
"8630","8725","8729","8750","8766","8795","1332","1333","2002","2269","2282","2501",
"2502","2503","2531","2801","2802","2871","2914","3086","3099","3382","8028","8233",
"8252","8267","9983","2413","2432","4324","4689","4704","4751","4755",
"9602","9735","9766","1605","3101","3103","3401","3402","3861","3863","3405","3407",
"4004","4005","4021","4042","4043","4061","4063","4183","4188","4208","4272","4452",
"4631","4901","4911","6988","5019","5020","5101","5108","5201","5202","5214","5232",
"5233","5301","5332","5333","5401","5406","5411","5541","3436","5703","5706","5707",
"5711","5713","5714","5801","5802","5803","5901","2768","8001","8002","8015","8031",
"8053","8058","1721","1801","1802","1803","1808","1812","1925","1928","1963","5631",
"6103","6113","6301","6302","6305","6326","6361","6367","6471","6472","6473","7004",
"7011","7013","7003","7012","7832","7911","7912","7951","3289","8801","8802","8804",
"8830","9001","9005","9007","9008","9009","9020","9021","9022","9062","9064","9101",
"9104","9107","9202","9301","9501","9502","9503","9531","9532",
]
# Build the advance/decline ratio
def get_up_down_ratio(skiprows, skipfooter):
    """Count advancing/declining stocks per date and derive their ratio.

    For every code in ``updown_stock_names`` the close is compared with the
    close 1, 5 and 25 rows earlier. ``up<n>`` counts stocks whose change is
    >= 0 and ``down<n>`` counts stocks whose change is < 0. Dates for which a
    stock has no comparable data are counted in neither column.

    Args:
        skiprows: Leading rows to drop from each counted stock's price list.
        skipfooter: Trailing rows to drop from each counted stock's price list.

    Returns:
        DataFrame with ``date``, ``up1/5/25``, ``down1/5/25`` and
        ``updown1/5/25`` (= up / down * 100), all numeric columns as float.
        The ratio is NaN where the decline count is zero.
    """
    periods = (1, 5, 25)
    # "1001" is read in full only to obtain the master list of dates.
    base, _ = read_protra_stock("1001", 0, 0)
    df_updown = pd.DataFrame({"date": base["date"]})
    for prefix in ("up", "down"):
        for n in periods:
            df_updown[prefix + str(n)] = 0  # initialise the counters
    dates = pd.Index(df_updown["date"])
    for stock_id in updown_stock_names:
        df, _ = read_protra_stock(stock_id, skiprows, skipfooter)
        # Align on the date rather than the row position: stocks can have
        # fewer rows than the master list (and skiprows shifts positions).
        close = df.drop_duplicates(subset="date").set_index("date")["close"]
        for n in periods:
            change = close.diff(n).reindex(dates)
            # Increment where the condition is true. The earlier
            # Series.where(cond, x + 1) incremented where it was false, which
            # swapped the up/down counts and counted missing data in both.
            df_updown["up" + str(n)] += (change >= 0).to_numpy().astype(int)
            df_updown["down" + str(n)] += (change < 0).to_numpy().astype(int)
    display_overview(df_updown)
    # Ratio = number of advancing issues / number of declining issues * 100
    for n in periods:
        # A zero decline count would divide by zero; yield NaN instead of inf.
        declines = df_updown["down" + str(n)].replace(0, np.nan)
        df_updown["updown" + str(n)] = df_updown["up" + str(n)] / declines * 100
    display_overview(df_updown)
    df_updown = df_updown.astype({'updown1': float, 'updown5': float, 'updown25': float, 'up1': float, 'up5': float, 'up25': float, 'down1': float, 'down5': float, 'down25': float})
    return df_updown
def main(df_train, df_test, stock_id):
    """Train on ``df_train`` and build a Protra fragment for ``df_test``.

    Args:
        df_train: Training DataFrame with a ``target`` column.
        df_test: Test DataFrame; its ``target`` column is dropped.
        stock_id: Code passed on to ``protra.create_protra_dataset``.

    Returns:
        The generated script text, or ``""`` when no usable training rows
        remain or when the hyper-parameter search branch is selected.
    """
    # Overview output
    #display_overview(df_train)
    # Model building
    df_test = df_test.drop("target", axis=1)
    df_train = df_train.dropna(subset=["target"])
    # Use only rows labelled as success (1) or failure (0)
    is_binary = (df_train['target'] == 1) | (df_train['target'] == 0)
    df_train = df_train[is_binary]
    excluded_feats = ['target', 'date']
    # Nothing to do without training data
    if not len(df_train):
        return ""
    run_cross_validation = True
    if run_cross_validation:
        # Cross validation
        y_pred = cross_validation(df_train, df_train['target'], df_test, excluded_feats, 2, True, True)
        print(y_pred)
        # NOTE(review): this is the protra module's function (4 arguments),
        # not the 6-argument create_protra_dataset in this file - confirm.
        return protra.create_protra_dataset(stock_id, df_test["date"], y_pred, 0.8)
    # Hyper-parameter search
    objective = Objective(x=df_train, y=df_train['target'],
                          excluded_feats=excluded_feats, num_folds = 5, stratified = True)
    study = optuna.create_study(sampler = optuna.samplers.RandomSampler(seed = 0))
    study.optimize(objective, n_trials = 50)
    return ""
# Combined run: build features, train, back-test and write the Protra library
if __name__ == '__main__':
    with timer("Up down ratio creation"):
        df_updown = get_up_down_ratio(0, 0)
    with timer("Data read"):
        df_train = pd.DataFrame()
        df_test = pd.DataFrame()
        for stock_id in stock_names:
            print(str(stock_id))
            # Training rows: the first and last 200 rows are left out
            df, val = read_protra_stock(stock_id, 200, 200)
            # Check the read result before touching the frame
            if not val:
                continue
            df = pd.merge(df, df_updown, on='date')
            df = pre_processing(df)
            df_train = pd.concat([df_train, df])
            #display_overview(df)
            # Test rows: only the last 200 rows are left out
            df_test, val = read_protra_stock(stock_id, 0, 200)
            df_test = pd.merge(df_test, df_updown, on='date')
            # The former extra merge with "df2" guarded by "val2" was removed:
            # neither name is defined anywhere, so it raised NameError.
            #df_test = pd.concat([df_test, df])
            #display_overview(df_test)
    # Data pre-processing and training
    with timer("Cross validation"):
        df_test = pre_processing(df_test)
        display_overview(df_train)
        # Drop rows whose close is missing
        df_train = df_train.dropna(subset=["close"])
        main(df_train, df_test, stock_id)
    s = ""
    with timer("start back test"):
        # Two models, matching the two folds used by main()
        clf = [load_model(i) for i in range(2)]
        excluded_feats = ['target', 'date']
        for stock_id in stock_names:
            df_test, val = read_protra_stock(stock_id, 0, 0)
            df_test = pd.merge(df_test, df_updown, on='date')
            df_test = pre_processing(df_test)
            #display_overview(df_test)
            s += pred_load_model(clf, df_test, stock_id, excluded_feats)
    with open(g_path + "\\lib\\LightGBM.pt", mode='w') as f:
        f.write(protra.merge_protra_dataset(s))