import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
DEBUG = False
# Outlier removal
def del_outlier(df):
    """Drop outlier rows using hand-tuned SalePrice thresholds.

    Expects the combined columns from del_multicollinearity
    ("TotalSF", "SF", "Garage", "Year") to already exist.
    """
    # Very large houses that sold cheaply.
    huge_but_cheap = (df["TotalSF"] > 10000) & (df["SalePrice"] < 300000)
    df = df.drop(df[huge_but_cheap].index)
    # For every non-object feature (except the target), drop rows with a
    # non-negative value and an extreme sale price.
    for column in df:
        if df[column].dtype == "object" or column == "SalePrice":
            continue
        df = df.drop(df[(df[column] >= 0) & (df["SalePrice"] > 600000)].index)
    # Low overall quality paired with a high sale price.
    for quality_cut, price_cut in ((10, 500000), (5, 200000)):
        bad = (df["OverallQual"] < quality_cut) & (df["SalePrice"] > price_cut)
        df = df.drop(df[bad].index)
    # Non-negative combined features paired with a very high sale price.
    for feature in ("SF", "Garage", "Year"):
        df = df.drop(df[(df[feature] >= 0) & (df["SalePrice"] > 500000)].index)
    return df
# Multicollinearity removal
def del_multicollinearity(df):
    """Merge strongly correlated columns into combined features and drop the originals.

    Adds: Garage, Year, SF, TotalSF (simple sums of related columns).
    Drops the ten source columns used to build them.
    """
    df["Garage"] = df["GarageCars"] + df["GarageArea"]
    df["Year"] = df["YearBuilt"] + df["YearRemodAdd"]
    df["SF"] = df["WoodDeckSF"] + df["OpenPorchSF"]
    df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"] + df["GrLivArea"]
    # Drop all source columns in a single call instead of one drop per column,
    # which rebuilt the frame ten times.
    cols = ["GarageCars", "GarageArea", "1stFlrSF", "2ndFlrSF", "TotalBsmtSF",
            "GrLivArea", "YearBuilt", "YearRemodAdd", "WoodDeckSF", "OpenPorchSF"]
    df = df.drop(columns=cols)
    return df
# Missing-value imputation
def do_imputation(df):
    """Fill missing values: LotFrontage by neighborhood median, then 0.0 for
    numeric columns and the string "NA" for object columns.

    Note: a neighborhood whose LotFrontage values are all NaN keeps NaN after
    the groupby step and is then filled with 0.0 by the generic loop.
    """
    # LotFrontage: use the median of the same neighborhood.
    df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    for col in df:
        # Assign the filled column back instead of `inplace=True` on a column
        # slice: that chained-assignment form is unreliable and deprecated in
        # modern pandas (copy-on-write).
        if df[col].dtype != "object":
            df[col] = df[col].fillna(0.0)
        else:
            df[col] = df[col].fillna("NA")
    return df
# Ordinal encoding ordered by mean SalePrice
def encode_mapping(df, i):
    """Replace column *i* with integer ranks: categories sorted by ascending
    mean SalePrice get codes 0, 1, 2, ...
    """
    ordered_categories = df.groupby(i)['SalePrice'].mean().sort_values().index
    mapping = {category: rank for rank, category in enumerate(ordered_categories)}
    df[i] = df[i].map(mapping)
    return df
def encode_categorical1(df):
    """Apply mean-SalePrice ordinal encoding (encode_mapping) to the
    quality/condition-style categorical columns. Requires SalePrice, so this
    is usable on training data only.
    """
    ordinal_cols = ("MSZoning", 'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                    'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                    'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir')
    for name in ordinal_cols:
        df = encode_mapping(df, name)
    return df
# Label encoder
def encode_label(df, i):
    """Replace column *i* with sklearn LabelEncoder integer codes
    (alphabetical order of the category values).
    """
    encoder = LabelEncoder()
    values = list(df[i].values)
    encoder.fit(values)
    df[i] = encoder.transform(values)
    return df
def encode_categorical2(df):
    """Label-encode the same categorical columns as encode_categorical1, but
    with plain LabelEncoder (no SalePrice needed) — used for the test set.
    """
    label_cols = ("MSZoning", 'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                  'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
                  'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
                  'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir')
    for name in label_cols:
        df = encode_label(df, name)
    return df
# Removal of unneeded variables
def del_variable(df):
    """Drop columns judged uninformative during EDA.

    Works on both train and test frames (none of these is the target).
    """
    cols = ("MSSubClass", "OverallCond", "YrSold", "MoSold", "Id", "LandContour", "Utilities", "LotConfig",
            "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
            "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "Electrical", "GarageType",
            "KitchenAbvGr", "MiscFeature", "SaleType", "SaleCondition")
    # One drop(columns=...) call instead of rebuilding the frame per column.
    df = df.drop(columns=list(cols))
    return df
# Select variables whose absolute skewness exceeds the threshold
def count_skew(df):
    """Compute skewness for every numeric column.

    Returns (skewed_feats, skewed_feats_over): all skew values sorted
    descending, and the index of columns with |skew| > 0.45.
    """
    # Numeric (non-object) column names.
    numeric_cols = df.dtypes[df.dtypes != "object"].index
    # Per-column skewness, largest first.
    skewed_feats = df[numeric_cols].apply(lambda s: s.skew()).sort_values(ascending=False)
    # Keep only columns whose absolute skewness exceeds 0.45.
    skewed_feats_over = skewed_feats[skewed_feats.abs() > 0.45].index
    if DEBUG:
        # Print each selected feature's minimum value as a sanity check.
        for name in skewed_feats_over:
            print(str(min(df[name])) + "\t" + str(name))
    return skewed_feats, skewed_feats_over
# Yeo-Johnson transform
def trans_yeo_johnson(df, skewed_feats_over):
    """Apply a Yeo-Johnson power transform in place to the given columns.

    PowerTransformer fits each column independently, so transforming the
    columns jointly gives the same per-column lambdas as fitting them
    one at a time.
    """
    pt = PowerTransformer()
    # Fit and replace in one step. (The original also recomputed the
    # post-transform skewness into an unused local; that dead code and the
    # commented-out fit call are removed.)
    df[skewed_feats_over] = pt.fit_transform(df[skewed_feats_over])
    return df
# Yeo-Johnson inverse transform
def inverse_trans_yeo_johnson(df, df2):
    """Invert a Yeo-Johnson transform of *df2* using parameters re-fitted on *df*.

    NOTE(review): this fits a fresh PowerTransformer on *df* and then
    inverse-transforms *df2* with it. That only recovers the original scale
    if *df* is exactly the data whose (per-column) fit produced *df2*'s
    transformed scale — confirm against the caller in main().
    """
    pt = PowerTransformer()
    pt.fit_transform(df)  # fit_transform used purely to fit; the transformed output is discarded
    return pt.inverse_transform(df2)
# Addition of a new feature
def add_new_feature(df):
    """Add FeetPerRoom: total floor area per above-ground room.

    Requires the TotalSF column created by del_multicollinearity.
    """
    area = df["TotalSF"]
    rooms = df["TotRmsAbvGrd"]
    df["FeetPerRoom"] = area / rooms
    return df
# [EDA] Random forest
def do_RandomForestRegressor(df, random_state=None):
    """Fit a RandomForestRegressor with SalePrice as the target and every
    other column as a feature; used for feature-importance EDA.

    Args:
        df: training frame that includes a SalePrice column.
        random_state: optional seed for a reproducible forest. Defaults to
            None, matching the original nondeterministic behaviour.

    Returns:
        The fitted RandomForestRegressor.
    """
    y_train = df['SalePrice']
    X_train = df.drop(['SalePrice'], axis=1)
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, y_train)
    return rf
# [EDA] Visualize the relation between features and the target
def visualize_target(df, rf):
    """Plot SalePrice against the 24 most important features.

    Importance order comes from rf.feature_importances_; the grid of
    regression plots is shown and also saved to figure3.png.
    """
    fig = plt.figure(figsize=(20, 20))
    importance_order = np.argsort(-rf.feature_importances_)
    target = df['SalePrice']
    features = df.drop(['SalePrice'], axis=1)
    features = features.iloc[:, importance_order[:24]]
    for idx in np.arange(24):
        # fig.add_subplot(rows, cols, position)
        fig.add_subplot(6, 4, idx + 1)
        sns.regplot(x=features.iloc[:, idx], y=target)
    plt.tight_layout()
    plt.show()
    fig.savefig("figure3.png")
# Candidate regressor class names; index-aligned with clf_params below and
# instantiated by sklearn_model() via eval("ClassName(params)").
clf_names = ["LinearRegression",
             "ElasticNet",
             "Lasso",
             "Ridge",
             "LGBMRegressor",
             "CatBoostRegressor",
             "XGBRegressor",
             ]
# Constructor-argument strings for the classes above ("" = library defaults).
clf_params = ["",
              "max_iter=1000, tol=0.0001",
              "max_iter=1000, tol=0.0001",
              "",
              "boosting_type='gbdt', num_leaves=31, learning_rate=0.1, n_estimators=100",
              "logging_level='Silent'",
              "max_depth=3, learning_rate=0.1, n_estimators=100",
              ]
def sklearn_model(x_train, y_train):
    """Fit every candidate regressor and pick the best one.

    Scores each model with clf.score on the *training* data (R^2, printed
    under the original 'Accuracy' label) and keeps the highest scorer.

    Returns:
        (all fitted models in clf_names order, the best-scoring model).
    """
    started = time.time()
    fitted = list()
    best_model = None   # highest-scoring model so far
    best_score = 0.0
    best_name = ""
    for cls_name, arg_str in zip(clf_names, clf_params):
        # NOTE: eval builds "ClassName(args)" from the module-level tables;
        # acceptable only because both tables are hard-coded constants.
        estimator = eval("%s(%s)" % (cls_name, arg_str))
        estimator.fit(x_train, y_train)
        score = estimator.score(x_train, y_train)
        print('%s Accuracy:' % cls_name, score)
        fitted.append(estimator)
        if best_score <= score:
            best_score = score
            best_model = estimator
            best_name = cls_name
    print(str(time.time() - started))
    print('%s was selected' % best_name)
    return fitted, best_model
### Hyper-parameter space to search
def param():
    """Return the LGBMRegressor grid searched by sklearn_model2()."""
    return {
        'num_leaves': [15, 20, 25, 31, 35],
        'n_estimators': [50, 100, 250, 500, 750],
        'boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
    }
def sklearn_model2(x_train, y_train):
    """Grid-search an LGBMRegressor over param() with 4-fold CV.

    Side effect: writes the full CV result table to gs_result.csv.

    Returns:
        (an empty list — kept for signature compatibility with
        sklearn_model — and the fitted GridSearchCV object).
    """
    t0 = time.time()
    models = list()  # intentionally left empty; callers unpack two values
    searcher = GridSearchCV(LGBMRegressor(), param(), cv=4, verbose=0)
    searcher.fit(x_train, y_train)
    # Best score / parameter combination found.
    print(searcher.best_score_)
    print(searcher.best_params_)
    # Dump every fold's scores for offline inspection.
    cv_table = pd.DataFrame.from_dict(searcher.cv_results_)
    cv_table.to_csv('gs_result.csv')
    print("time:" + str(time.time() - t0))
    return models, searcher
def main(df_train, df_test):
    """End-to-end pipeline: preprocess train/test, grid-search an LGBM model,
    and write predictions to result.csv.

    NOTE(review): train columns are encoded with encode_categorical1
    (mean-SalePrice ordering) but test columns with encode_categorical2
    (LabelEncoder), so the two encodings are not guaranteed to agree —
    confirm this is intentional.
    NOTE(review): trans_yeo_johnson is fitted separately on train and test,
    so test features are transformed with different lambdas than train.
    """
    df_result = pd.DataFrame()
    df_result['Id'] = df_test['Id']
    # Training data
    df_train = del_multicollinearity(df_train)
    df_train = del_outlier(df_train)
    df_train = do_imputation(df_train)
    df_train = del_variable(df_train)
    df_train = encode_categorical1(df_train)
    df_train = add_new_feature(df_train)
    # original data before trans_yeo_johnson (kept to invert the transform later)
    y_pre_train = df_train["SalePrice"]
    skewed_feats, skewed_feats_over = count_skew(df_train)
    df_train = trans_yeo_johnson(df_train, skewed_feats_over)
    rf = do_RandomForestRegressor(df_train)
    #visualize_target(df_train, rf)
    #-----------------------------------------
    # Test data (no outlier removal: test rows must all be predicted)
    df_test = del_multicollinearity(df_test)
    df_test = do_imputation(df_test)
    df_test = del_variable(df_test)
    df_test = encode_categorical2(df_test)
    df_test = add_new_feature(df_test)
    skewed_feats, skewed_feats_over = count_skew(df_test)
    df_test = trans_yeo_johnson(df_test, skewed_feats_over)
    # Split the training data into the target and the explanatory variables
    X_train = df_train.drop("SalePrice", axis=1)
    X_test = df_test
    y_train = df_train["SalePrice"]
    models, model = sklearn_model2(X_train, y_train)
    # Don't forget to convert the prediction back to non-log scale
    predictions = model.predict(X_test)
    #print(yy_train.values.reshape(-1,1))
    #print(predictions.reshape(-1, 1))
    # Invert Yeo-Johnson using parameters refit on the pre-transform SalePrice
    # (per-column fit, so the lambda should match the training transform).
    df_result['SalePrice'] = inverse_trans_yeo_johnson(y_pre_train.values.reshape(-1,1), predictions.reshape(-1, 1))
    df_result.to_csv("result.csv",index=False)
    # NOTE(review): display() is an IPython/Jupyter builtin — this raises
    # NameError when run as a plain script; confirm the execution environment.
    display(df_result)
# Script entry point: load the CSVs and run the pipeline. Guarded with
# __main__ so importing this module no longer triggers file I/O and training.
if __name__ == "__main__":
    sns.set()
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    main(df_train, df_test)