# utilities
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns

# models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import ExtraTreesRegressor as etr
from sklearn.linear_model import BayesianRidge as br
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import ElasticNet as en
from sklearn.kernel_ridge import KernelRidge as kr

import lightgbm as lgb
import xgboost as xgb

# metrics
from sklearn import metrics
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, mean_absolute_error, f1_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import logging
import warnings
warnings.filterwarnings('ignore')  # suppress warnings
# make 5 new features
# the csv contains negative sentinel values (-1, -2, -3, -8); treat them as
# indicators of problematic answers, but do not drop them
def getres1(row):
    return len([x for x in row.values if type(x) == int and x < 0])

def getres2(row):
    return len([x for x in row.values if type(x) == int and x == -8])

def getres3(row):
    return len([x for x in row.values if type(x) == int and x == -1])

def getres4(row):
    return len([x for x in row.values if type(x) == int and x == -2])

def getres5(row):
    return len([x for x in row.values if type(x) == int and x == -3])

# scan the data row by row
data['neg1'] = data[data.columns].apply(lambda row: getres1(row), axis=1)
data.loc[data['neg1'] > 20, 'neg1'] = 20  # smoothing: cap the count at 20
data['neg2'] = data[data.columns].apply(lambda row: getres2(row), axis=1)
data['neg3'] = data[data.columns].apply(lambda row: getres3(row), axis=1)
data['neg4'] = data[data.columns].apply(lambda row: getres4(row), axis=1)
data['neg5'] = data[data.columns].apply(lambda row: getres5(row), axis=1)
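The row-wise .apply() above is slow on a wide survey table. An equivalent vectorized version could look like the sketch below (assuming `data` is the DataFrame loaded earlier; note that, unlike the type(x) == int check above, it also counts negative floats):

import pandas as pd

def add_negative_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Vectorized alternative for the neg1..neg5 features above."""
    num = df.select_dtypes(include='number')           # restrict to numeric columns
    df['neg1'] = (num < 0).sum(axis=1).clip(upper=20)  # smoothed at 20, as above
    df['neg2'] = (num == -8).sum(axis=1)
    df['neg3'] = (num == -1).sum(axis=1)
    df['neg4'] = (num == -2).sum(axis=1)
    df['neg5'] = (num == -3).sum(axis=1)
    return df

# hypothetical usage:
# data = add_negative_counts(data)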
work_cols = [i for i in data.columns if 'work' in i.split('_')[0]]  # columns starting with work_
s_cols = [i for i in data.columns if 's' == i.split('_')[0]]        # columns starting with s_
edu_cols = [i for i in data.columns if 'edu' == i.split('_')[0]]    # columns starting with edu_
for col in work_cols:
    data[col] = data[col].fillna(0)
for col in s_cols:
    data[col] = data[col].fillna(0)

for col in ['edu_yr', 'edu_status', 'minor_child', 'marital_now', 'marital_1st',
            'social_neighbor', 'social_friend']:
    data[col] = data[col].fillna(0)
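A quick way to confirm that the fills above covered everything is to count the NaNs that remain per column group; a small hypothetical helper (not part of the original notebook):

import pandas as pd

def report_missing(df: pd.DataFrame, cols) -> pd.Series:
    """Remaining NaN counts, restricted to columns that still have any."""
    counts = df[cols].isnull().sum()
    return counts[counts > 0]

# usage, assuming data / work_cols / s_cols / edu_cols from above:
# print(report_missing(data, work_cols + s_cols + edu_cols))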
# stacking predictions from LGBM, XGBoost and RandomForest
# We need to transpose the results so that each row of the matrix holds the
# predictions from the different models on the same sample
train_stack2 = np.vstack([oof_lgb_263, oof_xgb_263, oof_rfr_263]).transpose()
# transpose() swaps the axes, turning the (model, sample) array into (sample, model)
test_stack2 = np.vstack([predictions_lgb_263, predictions_xgb_263, predictions_rfr_263]).transpose()

# cross-validation: 5 folds, repeated twice (10 folds in total)
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack2 = np.zeros(train_stack2.shape[0])
predictions_lr2 = np.zeros(test_stack2.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack2, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack2[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack2[val_idx], target.iloc[val_idx].values
    # using ElasticNet as the stage-2 model for prediction
    lr2 = en()
    lr2.fit(trn_data, trn_y)
    oof_stack2[val_idx] = lr2.predict(val_data)
    # average the test predictions over the 10 folds (5 splits x 2 repeats)
    predictions_lr2 += lr2.predict(test_stack2) / 10

mean_squared_error(target.values, oof_stack2)
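The stacking step consumes out-of-fold base-model predictions (`oof_lgb_263`, `predictions_lgb_263`, and so on) that are built in earlier cells not shown here. A minimal sketch of how one such pair of arrays is typically produced with LightGBM's scikit-learn API (a hypothetical helper; the hyperparameters and variable names in the usage comment are placeholders, not the values from the original run):

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold

def make_oof_predictions(X, y, X_test, n_splits=5, seed=2019):
    """Return (oof, test_pred): out-of-fold predictions on X and the
    test-set predictions averaged over the folds."""
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(X.shape[0])
    test_pred = np.zeros(X_test.shape[0])
    for trn_idx, val_idx in folds.split(X, y):
        model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05)
        model.fit(X[trn_idx], y[trn_idx])
        oof[val_idx] = model.predict(X[val_idx])
        test_pred += model.predict(X_test) / n_splits
    return oof, test_pred

# hypothetical usage, with X_train_263 / X_test_263 standing in for the feature matrices:
# oof_lgb_263, predictions_lgb_263 = make_oof_predictions(X_train_263, target.values, X_test_263)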