{ "cells": [ { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('titanic4.csv')" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54.0001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale2.03134990921.0750NaNS
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female27.00234774211.1333NaNS
91012Nasser, Mrs. Nicholas (Adele Achem)female14.01023773630.0708NaNC
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "5 6 0 3 \n", "6 7 0 1 \n", "7 8 0 3 \n", "8 9 1 3 \n", "9 10 1 2 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "5 Moran, Mr. James male NaN 0 \n", "6 McCarthy, Mr. Timothy J male 54.0 0 \n", "7 Palsson, Master. Gosta Leonard male 2.0 3 \n", "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", "9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S \n", "5 0 330877 8.4583 NaN Q \n", "6 0 17463 51.8625 E46 S \n", "7 1 349909 21.0750 NaN S \n", "8 2 347742 11.1333 NaN S \n", "9 0 237736 30.0708 NaN C " ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head(10)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchFareSurvivedTicketCabinEmbarked
03male22.0107.25000A/5 21171NaNS
11female38.01071.28331PC 17599C85C
23female26.0007.92501STON/O2. 3101282NaNS
31female35.01053.10001113803C123S
43male35.0008.05000373450NaNS
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Fare Survived Ticket \\\n", "0 3 male 22.0 1 0 7.2500 0 A/5 21171 \n", "1 1 female 38.0 1 0 71.2833 1 PC 17599 \n", "2 3 female 26.0 0 0 7.9250 1 STON/O2. 3101282 \n", "3 1 female 35.0 1 0 53.1000 1 113803 \n", "4 3 male 35.0 0 0 8.0500 0 373450 \n", "\n", " Cabin Embarked \n", "0 NaN S \n", "1 C85 C \n", "2 NaN S \n", "3 C123 S \n", "4 NaN S " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived', 'Ticket', 'Cabin', 'Embarked']]\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchFareSurvivedCabin
03male22.0107.25000NaN
11female38.01071.28331C85
23female26.0007.92501NaN
31female35.01053.10001C123
43male35.0008.05000NaN
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Fare Survived Cabin\n", "0 3 male 22.0 1 0 7.2500 0 NaN\n", "1 1 female 38.0 1 0 71.2833 1 C85\n", "2 3 female 26.0 0 0 7.9250 1 NaN\n", "3 1 female 35.0 1 0 53.1000 1 C123\n", "4 3 male 35.0 0 0 8.0500 0 NaN" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived', 'Cabin']]\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 891 entries, 0 to 890\n", "Data columns (total 8 columns):\n", "Pclass 891 non-null int64\n", "Sex 891 non-null object\n", "Age 714 non-null float64\n", "SibSp 891 non-null int64\n", "Parch 891 non-null int64\n", "Fare 891 non-null float64\n", "Survived 891 non-null int64\n", "Cabin 204 non-null object\n", "dtypes: float64(2), int64(4), object(2)\n", "memory usage: 55.8+ KB\n" ] } ], "source": [ "data.info()\n", "# Cabin column has only 204 entries and the rest are Nan. We can exclude the column as it has too many missing (nan) values\n" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchSurvived
03male22.0100
11female38.0101
23female26.0001
31female35.0101
43male35.0000
53maleNaN000
61male54.0000
73male2.0310
83female27.0021
92female14.0101
103female4.0111
111female58.0001
123male20.0000
133male39.0150
143female14.0000
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Survived\n", "0 3 male 22.0 1 0 0\n", "1 1 female 38.0 1 0 1\n", "2 3 female 26.0 0 0 1\n", "3 1 female 35.0 1 0 1\n", "4 3 male 35.0 0 0 0\n", "5 3 male NaN 0 0 0\n", "6 1 male 54.0 0 0 0\n", "7 3 male 2.0 3 1 0\n", "8 3 female 27.0 0 2 1\n", "9 2 female 14.0 1 0 1\n", "10 3 female 4.0 1 1 1\n", "11 1 female 58.0 0 0 1\n", "12 3 male 20.0 0 0 0\n", "13 3 male 39.0 1 5 0\n", "14 3 female 14.0 0 0 0" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]\n", "data.head(15)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchSurvived
03male22.000000100
11female38.000000101
23female26.000000001
31female35.000000101
43male35.000000000
53male29.699118000
61male54.000000000
73male2.000000310
83female27.000000021
92female14.000000101
103female4.000000111
111female58.000000001
123male20.000000000
133male39.000000150
143female14.000000000
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Survived\n", "0 3 male 22.000000 1 0 0\n", "1 1 female 38.000000 1 0 1\n", "2 3 female 26.000000 0 0 1\n", "3 1 female 35.000000 1 0 1\n", "4 3 male 35.000000 0 0 0\n", "5 3 male 29.699118 0 0 0\n", "6 1 male 54.000000 0 0 0\n", "7 3 male 2.000000 3 1 0\n", "8 3 female 27.000000 0 2 1\n", "9 2 female 14.000000 1 0 1\n", "10 3 female 4.000000 1 1 1\n", "11 1 female 58.000000 0 0 1\n", "12 3 male 20.000000 0 0 0\n", "13 3 male 39.000000 1 5 0\n", "14 3 female 14.000000 0 0 0" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replaces all NAN values in Age column with the mean values of the col.\n", "data['Age'].fillna(data['Age'].mean(), inplace=True)\n", "#dataframe.Column_Name.fillna(dataframe.Column_Name.mean(),inplace=True)\n", "data.head(15)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchSurvived
03022.000000100
11138.000000101
23126.000000001
31135.000000101
43035.000000000
53029.699118000
61054.000000000
7302.000000310
83127.000000021
92114.000000101
10314.000000111
111158.000000001
123020.000000000
133039.000000150
143114.000000000
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Survived\n", "0 3 0 22.000000 1 0 0\n", "1 1 1 38.000000 1 0 1\n", "2 3 1 26.000000 0 0 1\n", "3 1 1 35.000000 1 0 1\n", "4 3 0 35.000000 0 0 0\n", "5 3 0 29.699118 0 0 0\n", "6 1 0 54.000000 0 0 0\n", "7 3 0 2.000000 3 1 0\n", "8 3 1 27.000000 0 2 1\n", "9 2 1 14.000000 1 0 1\n", "10 3 1 4.000000 1 1 1\n", "11 1 1 58.000000 0 0 1\n", "12 3 0 20.000000 0 0 0\n", "13 3 0 39.000000 1 5 0\n", "14 3 1 14.000000 0 0 0" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lets convert the Sex column by mapping male to 0 and female to 1 in the dataset\n", "data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})\n", "data.head(15)\n" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "# write out the clean dataset to a new csv file\n", "data.to_csv('cleansed.csv', sep=',')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }