Import external packages

```python
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
```
Import and inspect the dataset with pandas

```python
titanic_df = pd.read_csv(r"F:\数据集\Titanic数据分析\train.csv")
test_df = pd.read_csv(r"F:\数据集\Titanic数据分析\test.csv")

titanic_df.head()
```
Use DataFrame.info() and describe() to inspect column types, missing values, and summary statistics.
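The original post does not show these calls, so the following is a minimal sketch of the inspection step, assuming the usual info()/describe() usage:

```python
# column types and non-null counts for both datasets
titanic_df.info()
print("-" * 40)
test_df.info()

# summary statistics of the numeric columns
titanic_df.describe()
```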
Ticket number, passenger ID, and name are irrelevant to the prediction and can be dropped:

```python
titanic_df = titanic_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test_df = test_df.drop(['Name', 'Ticket'], axis=1)
```
Examine the effect of Embarked (port of embarkation) on survival. Fill the missing values with S, the most frequent port, then use factorplot (a categorical plot built on FacetGrid) to show the survival rate per port, and finally plot the number of passengers boarding at each port and the survived vs. non-survived counts for each port.
```python
titanic_df['Embarked'] = titanic_df['Embarked'].fillna("S")
sns.factorplot('Embarked', 'Survived', data=titanic_df, size=4, aspect=3)

fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue='Embarked', data=titanic_df, order=[0, 1], ax=axis2)

embark_perc = titanic_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
sns.barplot(x='Embarked', y='Survived', data=embark_perc, order=['S', 'C', 'Q'], ax=axis3)

embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked'])
embark_dummies_titanic.drop(['S'], axis=1, inplace=True)

embark_dummies_test = pd.get_dummies(test_df['Embarked'])
embark_dummies_test.drop(['S'], axis=1, inplace=True)

titanic_df = titanic_df.join(embark_dummies_titanic)
test_df = test_df.join(embark_dummies_test)

titanic_df.drop(['Embarked'], axis=1, inplace=True)
test_df.drop(['Embarked'], axis=1, inplace=True)
```
Analyze Fare. First fill the missing fare value with the median, then take the fares of survivors and non-survivors and compute their mean and standard deviation. Finally plot a histogram of fares and a bar chart of the mean fare (with standard deviation error bars) for survivors and non-survivors.

```python
# fill the missing fare in the test set with the training median
test_df['Fare'].fillna(titanic_df['Fare'].median(), inplace=True)

# convert fares from float to int
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

# fares of non-survivors and survivors
fare_not_survived = titanic_df['Fare'][titanic_df['Survived'] == 0]
fare_survived = titanic_df['Fare'][titanic_df['Survived'] == 1]

# mean and standard deviation of the fare for each group
average_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
std_fare = DataFrame([fare_not_survived.std(), fare_survived.std()])

# histogram of fares
titanic_df['Fare'].plot(kind='hist', figsize=(5, 3), bins=100, xlim=(0, 50))

# mean fare per group with standard deviation error bars
average_fare.index.names = std_fare.index.names = ["Survived"]
average_fare.plot(yerr=std_fare, kind='bar', legend=False)
```
Analyze the Age column. Plot the Age distribution before and after filling the missing values with random integers drawn between mean - std and mean + std.

```python
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# mean, std and number of missing ages in the training set
average_age_titanic = titanic_df["Age"].mean()
std_age_titanic = titanic_df["Age"].std()
count_nan_age_titanic = titanic_df["Age"].isnull().sum()

# mean, std and number of missing ages in the test set
average_age_test = test_df["Age"].mean()
std_age_test = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# random integers between mean - std and mean + std
rand_1 = np.random.randint(average_age_titanic - std_age_titanic,
                           average_age_titanic + std_age_titanic,
                           size=count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test,
                           average_age_test + std_age_test,
                           size=count_nan_age_test)

# original distribution (missing values dropped)
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

# fill the missing ages with the random values
titanic_df.loc[titanic_df["Age"].isnull(), "Age"] = rand_1
test_df.loc[test_df["Age"].isnull(), "Age"] = rand_2

titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# distribution after filling
titanic_df['Age'].hist(bins=70, ax=axis2)
```
Use a kernel density estimate plot of Age split by survival, then look at survival by age (group by Age and compute the mean).
```python
facet = sns.FacetGrid(titanic_df, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, titanic_df['Age'].max()))
facet.add_legend()

fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
average_age = titanic_df[["Age", "Survived"]].groupby(['Age'], as_index=False).mean()
sns.barplot(x='Age', y='Survived', data=average_age)
```
Remaining work:
The Sex, Family (whether the passenger had relatives aboard), and Pclass (passenger class) columns have not yet been analyzed for their effect on survival.
Use logistic regression, random forest, and other algorithms from the Python machine learning library sklearn to predict survival for the different groups of passengers.
Cabin (cabin number): because it has many missing values it contributes little to the prediction and can be dropped.
```python
titanic_df.drop("Cabin", axis=1, inplace=True)
test_df.drop("Cabin", axis=1, inplace=True)
```
Analyze the effect of family members on survival

```python
# Family = 1 if the passenger had any siblings/spouses or parents/children aboard, else 0
titanic_df['Family'] = titanic_df["Parch"] + titanic_df["SibSp"]
titanic_df.loc[titanic_df['Family'] > 0, 'Family'] = 1
titanic_df.loc[titanic_df['Family'] == 0, 'Family'] = 0

test_df['Family'] = test_df["Parch"] + test_df["SibSp"]
test_df.loc[test_df['Family'] > 0, 'Family'] = 1
test_df.loc[test_df['Family'] == 0, 'Family'] = 0

# drop the original columns
titanic_df = titanic_df.drop(['SibSp', 'Parch'], axis=1)
test_df = test_df.drop(['SibSp', 'Parch'], axis=1)

# passenger counts with/without family and their survival rates
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))
sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)

family_perc = titanic_df[["Family", "Survived"]].groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2)

axis1.set_xticklabels(["With Family", "Alone"], rotation=0)
```
The plots show that passengers with family aboard had a higher survival rate.
Analyze the Sex factor. Passengers younger than 16 are treated as children, so passengers can be split into male, female, and child.
```python
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex

titanic_df['Person'] = titanic_df[['Age', 'Sex']].apply(get_person, axis=1)
test_df['Person'] = test_df[['Age', 'Sex']].apply(get_person, axis=1)

# Sex is replaced by Person, so drop it
titanic_df.drop(['Sex'], axis=1, inplace=True)
test_df.drop(['Sex'], axis=1, inplace=True)

# dummy variables for Person, dropping Male as the reference category
person_dummies_titanic = pd.get_dummies(titanic_df['Person'])
person_dummies_titanic.columns = ['Child', 'Female', 'Male']
person_dummies_titanic.drop(['Male'], axis=1, inplace=True)

person_dummies_test = pd.get_dummies(test_df['Person'])
person_dummies_test.columns = ['Child', 'Female', 'Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

titanic_df = titanic_df.join(person_dummies_titanic)
test_df = test_df.join(person_dummies_test)

# counts and survival rate for male / female / child
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=titanic_df, ax=axis1)

person_perc = titanic_df[["Person", "Survived"]].groupby(['Person'], as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male', 'female', 'child'])

titanic_df.drop(['Person'], axis=1, inplace=True)
test_df.drop(['Person'], axis=1, inplace=True)
```
The charts show that children (under 16) had a higher chance of survival.
Analyze the passenger class (Pclass) factor with sns.factorplot

```python
sns.factorplot('Pclass', 'Survived', order=[1, 2, 3], data=titanic_df, size=5)

# dummy variables for Pclass, dropping Class_3 as the reference category
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.columns = ['Class_1', 'Class_2', 'Class_3']
pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)

pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class_1', 'Class_2', 'Class_3']
pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)

titanic_df.drop(['Pclass'], axis=1, inplace=True)
test_df.drop(['Pclass'], axis=1, inplace=True)

titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)
```
Predict survival with sklearn: define the training and test sets

```python
X_train = titanic_df.drop("Survived", axis=1)
Y_train = titanic_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
```
Predict with logistic regression
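The code for this step is not shown above, so here is a minimal sketch following the standard sklearn fit/predict/score pattern; the names `logreg` and `Y_pred` are chosen to match what the later cells reference:

```python
# logistic regression: fit on the training set, predict on the test set
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

# accuracy on the training set
logreg.score(X_train, Y_train)
```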
Predict with a random forest
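This code block is also missing; a minimal sketch with sklearn's RandomForestClassifier (the choice of 100 trees and the variable name `random_forest` are assumptions):

```python
# random forest: fit on the training set, predict on the test set
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# accuracy on the training set
random_forest.score(X_train, Y_train)
```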
The random forest gives better prediction results than logistic regression.
Use the logistic regression model to obtain the coefficient of each feature

```python
coeff_df = DataFrame(titanic_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

coeff_df
```
Save the prediction results locally

```python
# Name was dropped from test_df earlier, so the submission keeps only PassengerId and Survived
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
})
submission.to_csv('titanic.csv', index=False)
```