Code outline: data, training, prediction.

Kaggle is a competition platform for data analysis.
Titanic survival prediction
Data description:
pclass: passenger class (1st, 2nd, or 3rd).
sex: gender, encoded as 0 for female and 1 for male.
age: has some missing values. Options for filling them include a decision tree, a random forest, or assigning each missing age the mean age of passengers in the same class (see the sketch after this list).
fare: ticket price; directly tied to passenger class.
cabin: cabin number; missing for most passengers.
embarked: port of embarkation, with three values C, Q, and S plus missing values (Unknown), so embarked is expanded by hand into four one-hot columns (see the sketch after this list):
S → 1 0 0 0, C → 0 1 0 0, Q → 0 0 1 0 (and Unknown → 0 0 0 1)
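For the class-mean option mentioned above, a minimal pandas sketch (assuming the same file and column names used in the code below; the main code opts for a random forest instead):

import pandas as pd

data = pd.read_csv('Titanic.train.csv')  # same file name as in the code below
# Fill each missing Age with the mean Age of passengers in the same class
data['Age'] = data['Age'].fillna(data.groupby('Pclass')['Age'].transform('mean'))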
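The four-column expansion of embarked is one-hot encoding, which pd.get_dummies performs directly. A minimal sketch; the 'U' placeholder for a missing port is an illustration only, since the code below instead fills missing ports with 'S':

import pandas as pd

ports = pd.Series(['S', 'C', 'Q', None]).fillna('U')  # 'U' marks an unknown port
print(pd.get_dummies(ports).astype(int))
#    C  Q  S  U
# 0  0  0  1  0
# 1  1  0  0  0
# 2  0  1  0  0
# 3  0  0  0  1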

Data preprocessing:
Code:
Using logistic regression / random forest / XGBoost:
import csv

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def show_accuracy(a, b, tip):
    # Element-wise agreement between two label vectors, as a percentage
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    print('%s accuracy: %.3f%%' % (tip, acc_rate))
    return acc_rate


def load_data(file_name, is_train):
    data = pd.read_csv(file_name)  # path to the data file
    # print('data.describe() =\n', data.describe())

    # Gender: encode female as 0, male as 1
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing fares with the median fare of the same passenger class
    if len(data.Fare[data.Fare.isnull()]) > 0:
        fare = np.zeros(3)
        for f in range(0, 3):  # classes 1 to 3
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

    # Age: predict missing ages with a random forest regression
    # (simpler alternative: fill with the overall mean age)
    # mean_age = data['Age'].dropna().mean()
    # data.loc[(data.Age.isnull()), 'Age'] = mean_age
    print('Random forest prediction of missing ages: --start--')
    if is_train:
        # the training set may also use 'Survived' as a feature
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    else:
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    age_exist = data_for_age.loc[(data.Age.notnull())]  # rows whose age is known
    age_null = data_for_age.loc[(data.Age.isnull())]    # rows whose age is missing
    x = age_exist.values[:, 1:]
    y = age_exist.values[:, 0]
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(x, y)
    age_hat = rfr.predict(age_null.values[:, 1:])
    data.loc[(data.Age.isnull()), 'Age'] = age_hat
    print('Random forest prediction of missing ages: --over--')

    # Port of embarkation: fill missing values with 'S' (the most common port)
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    embarked_data = pd.get_dummies(data.Embarked)
    print(embarked_data)
    # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    data.to_csv('New_Data.csv')

    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    y = None
    if 'Survived' in data:
        y = data['Survived']

    x = np.array(x)
    y = np.array(y)

    # Think about it: what does this duplication actually do? (see the note after the code)
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5, ))
    if is_train:
        return x, y
    return x, data['PassengerId']


def write_result(c, c_type):
    file_name = 'Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)

    if c_type == 3:  # XGBoost expects a DMatrix
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1    # threshold XGBoost's probabilities; a no-op for the others
    y[~(y > 0.5)] = 0

    # x was tiled 5x in load_data, so y is 5x longer than passenger_id;
    # zip() truncates to the original passenger list, which preserves the order
    with open('Prediction_%d.csv' % c_type, 'w', newline='') as predictions_file:
        open_file_object = csv.writer(predictions_file)
        open_file_object.writerow(['PassengerId', 'Survived'])
        open_file_object.writerows(zip(passenger_id, y))


if __name__ == '__main__':
    x, y = load_data('Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

    # Logistic regression
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_acc = accuracy_score(y_test, y_hat)
    # write_result(lr, 1)

    # Random forest
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_acc = accuracy_score(y_test, y_hat)
    # write_result(rfc, 2)

    # XGBoost
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
    # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
    y_hat = bst.predict(data_test)
    # write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1     # turn predicted probabilities into 0/1 labels
    y_hat[~(y_hat > 0.5)] = 0
    xgb_acc = accuracy_score(y_test, y_hat)

    # accuracy_score returns a fraction, so scale to a percentage
    print('Logistic regression: %.3f%%' % (100 * lr_acc))
    print('Random forest: %.3f%%' % (100 * rfc_acc))
    print('XGBoost: %.3f%%' % (100 * xgb_acc))
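
On the question raised in load_data: np.tile duplicates every row five times before train_test_split, so copies of the same passenger land in both the training and the test split. Any model that can memorize rows then looks far better than it really is, which inflates the accuracies printed above. A minimal sketch of the leak:

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(10).reshape(-1, 1)
x5 = np.tile(x, (5, 1))  # every row now appears 5 times
x_train, x_test = train_test_split(x5, test_size=0.25, random_state=1)
# Nearly every test row also sits in the training set (typically all of them)
print(np.isin(x_test, x_train).mean())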