These are example solutions to the third Python homework, which involves performing data analysis on the Titanic data set. Other correct solutions and implementations are possible.
################################
# IMPORTS
################################
# Needed to create pie charts and to show the box plots
import matplotlib.pyplot as plt
# Needed to import and format the data and to create
# box plots, the correlation matrix, and contingency tables
import pandas as pd
# Needed for the chi-squared test
import scipy
from scipy import stats
################################
# IMPORT THE DATA FILE
################################
# Load the Titanic training set into a DataFrame. The path is relative
# to the directory this script/notebook is run from.
titanic_csv_path = "../DataSets/titanic_train.csv"
titanic_data = pd.read_csv(titanic_csv_path)
################################
# DESCRIBE
################################
# Statistical summary (count, mean, std, min, quartiles, max) of the
# numeric columns. In a notebook the result is displayed because it is
# the last expression of the cell.
titanic_data.describe()
|       | PassengerId | Survived   | Pclass     | Age        | SibSp      | Parch      | Fare       |
|-------|-------------|------------|------------|------------|------------|------------|------------|
| count | 891.000000  | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean  | 446.000000  | 0.383838   | 2.308642   | 29.699118  | 0.523008   | 0.381594   | 32.204208  |
| std   | 257.353842  | 0.486592   | 0.836071   | 14.526497  | 1.102743   | 0.806057   | 49.693429  |
| min   | 1.000000    | 0.000000   | 1.000000   | 0.420000   | 0.000000   | 0.000000   | 0.000000   |
| 25%   | 223.500000  | 0.000000   | 2.000000   | 20.125000  | 0.000000   | 0.000000   | 7.910400   |
| 50%   | 446.000000  | 0.000000   | 3.000000   | 28.000000  | 0.000000   | 0.000000   | 14.454200  |
| 75%   | 668.500000  | 1.000000   | 3.000000   | 38.000000  | 1.000000   | 0.000000   | 31.000000  |
| max   | 891.000000  | 1.000000   | 3.000000   | 80.000000  | 8.000000   | 6.000000   | 512.329200 |
################################
# BOX PLOT: SURVIVED
################################
# Draw the box plot for the "Survived" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="Survived")
plt.show()
################################
# BOX PLOT: PCLASS
################################
# Draw the box plot for the "Pclass" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="Pclass")
plt.show()
################################
# BOX PLOT: AGE
################################
# Draw the box plot for the "Age" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="Age")
plt.show()
################################
# BOX PLOT: SIBSP
################################
# Draw the box plot for the "SibSp" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="SibSp")
plt.show()
################################
# BOX PLOT: PARCH
################################
# Draw the box plot for the "Parch" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="Parch")
plt.show()
################################
# BOX PLOT: FARE
################################
# Draw the box plot for the "Fare" column with the pandas
# DataFrame.boxplot method, then render the figure.
titanic_data.boxplot(column="Fare")
plt.show()
################################
# INFO
################################
# Print the column names, non-null counts, and dtypes. Columns with the
# object dtype are the categorical ones. Of those, Name and Cabin have
# too many distinct values to make a good pie chart.
titanic_data.info()
################################
# PIE CHART: SEX
################################
# Count how many passengers fall into each category of the Sex column.
sex_counts = titanic_data["Sex"].value_counts()

# One slice per category, labelled with the category name taken from
# the index of the counts; then add a title and show the chart.
slice_labels = sex_counts.index
plt.pie(sex_counts, labels=slice_labels)
plt.title("Sex of Titanic Passengers")
plt.show()
################################
# PIE CHART: EMBARKED
################################
# Count how many passengers boarded at each port. value_counts() orders
# the result by frequency, so the order of the port codes is determined
# by the data and must not be assumed.
embarked_counts = titanic_data["Embarked"].value_counts()

# Translate each port code found in the index into its full name so
# every slice is labelled from its own code rather than by position.
# Unknown codes fall back to the raw code.
port_names = {"S": "Southampton", "C": "Cherbourg", "Q": "Queenstown"}
embarked_labels = [port_names.get(code, code) for code in embarked_counts.index]

# Create the pie chart with labels for the different slices, add a
# title, and show the graph.
plt.pie(embarked_counts, labels=embarked_labels)
plt.title("Port of Embarkation for Passengers on the Titanic")
plt.show()
##############################################
# CORRELATION MATRIX
##############################################
# Compute the pairwise correlations using only the numeric columns,
# then display the resulting matrix (last expression of the cell).
corr_matrix = titanic_data.corr(numeric_only=True)
corr_matrix

# Largest negative correlation is -0.55, between Pclass and Fare.
# Largest positive correlation is 0.41, between SibSp and Parch.
|             | PassengerId | Survived  | Pclass    | Age       | SibSp     | Parch     | Fare      |
|-------------|-------------|-----------|-----------|-----------|-----------|-----------|-----------|
| PassengerId | 1.000000    | -0.005007 | -0.035144 | 0.036847  | -0.057527 | -0.001652 | 0.012658  |
| Survived    | -0.005007   | 1.000000  | -0.338481 | -0.077221 | -0.035322 | 0.081629  | 0.257307  |
| Pclass      | -0.035144   | -0.338481 | 1.000000  | -0.369226 | 0.083081  | 0.018443  | -0.549500 |
| Age         | 0.036847    | -0.077221 | -0.369226 | 1.000000  | -0.308247 | -0.189119 | 0.096067  |
| SibSp       | -0.057527   | -0.035322 | 0.083081  | -0.308247 | 1.000000  | 0.414838  | 0.159651  |
| Parch       | -0.001652   | 0.081629  | 0.018443  | -0.189119 | 0.414838  | 1.000000  | 0.216225  |
| Fare        | 0.012658    | 0.257307  | -0.549500 | 0.096067  | 0.159651  | 0.216225  | 1.000000  |
##############################################
# CONTINGENCY TABLE AND CHI SQUARED TEST
##############################################
# Create the contingency table with the two relevant columns of
# categorical data.
cont_table = pd.crosstab(titanic_data["Sex"], titanic_data["Embarked"])

# Perform the chi-squared test and print the p value. The stats
# submodule is imported explicitly (from scipy import stats) rather
# than reached through the bare scipy package.
p = stats.chi2_contingency(cont_table).pvalue
print(p)

# Since p < 0.05, we reject the hypothesis that the sex of the Titanic
# passengers and the port of embarkation are independent variables.