From d4ec8ddb9957e101c29a9639ce8b689aac46a585 Mon Sep 17 00:00:00 2001 From: yx-xyc <60683403+yx-xyc@users.noreply.github.com> Date: Wed, 13 Jan 2021 17:34:20 +0800 Subject: [PATCH] Update Day 1_Data_Preprocessing.ipynb --- Code/Day 1_Data_Preprocessing.ipynb | 50 +++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/Code/Day 1_Data_Preprocessing.ipynb b/Code/Day 1_Data_Preprocessing.ipynb index d19db6a..7ebf597 100644 --- a/Code/Day 1_Data_Preprocessing.ipynb +++ b/Code/Day 1_Data_Preprocessing.ipynb @@ -19,7 +19,11 @@ }, { "cell_type": "code", +<<<<<<< Updated upstream "execution_count": 4, +======= + "execution_count": 2, +>>>>>>> Stashed changes "metadata": {}, "outputs": [], "source": [ @@ -37,15 +41,44 @@ }, { "cell_type": "code", +<<<<<<< Updated upstream "execution_count": 7, +======= + "execution_count": 6, +>>>>>>> Stashed changes "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ +<<<<<<< Updated upstream "Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n" +======= + "Step 2: Importing dataset\nX:\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY:\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n" +>>>>>>> Stashed changes ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Country Age Salary Purchased\n", + "0 France 44.0 72000.0 No\n", + "1 Spain 27.0 48000.0 Yes\n", + "2 Germany 30.0 54000.0 No\n", + "3 Spain 38.0 61000.0 No\n", + "4 Germany 40.0 NaN Yes\n", + "5 France 35.0 58000.0 Yes\n", + "6 Spain NaN 52000.0 No\n", + "7 France 48.0 79000.0 Yes\n", + "8 Germany 50.0 83000.0 No\n", + "9 France 37.0 67000.0 Yes" + ], + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CountryAgeSalaryPurchased
0France44.072000.0No
1Spain27.048000.0Yes
2Germany30.054000.0No
3Spain38.061000.0No
4Germany40.0NaNYes
5France35.058000.0Yes
6SpainNaN52000.0No
7France48.079000.0Yes
8Germany50.083000.0No
9France37.067000.0Yes
\n
" + }, + "metadata": {}, + "execution_count": 6 } ], "source": [ @@ -55,10 +88,11 @@ "#取最后一列\n", "Y = dataset.iloc[ : , 3].values\n", "print(\"Step 2: Importing dataset\")\n", - "print(\"X\")\n", + "print(\"X:\")\n", "print(X)\n", - "print(\"Y\")\n", - "print(Y)" + "print(\"Y:\")\n", + "print(Y)\n", + "dataset.head(100)" ] }, { @@ -98,6 +132,7 @@ ], "source": [ "# If you use the newest version of sklearn, use the lines of code commented out\n", +<<<<<<< Updated upstream "from sklearn.impute import SimpleImputer\n", "imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n", "#from sklearn.preprocessing import Imputer\n", @@ -105,6 +140,15 @@ "#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n", "print(X[ 0:4 , 1:3])\n", "imputer = imputer.fit(X[ 0:4 , 1:3])\n", +======= + "# from sklearn.impute import SimpleImputer\n", + "# imputer = SimpleImputer(missing_values=\"NaN\", strategy=\"mean\")\n", + "from sklearn.preprocessing import Imputer\n", + "# axis=0表示按列进行\n", + "imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n", + "print(imputer)\n", + "imputer = imputer.fit(X[ : , 1:3])\n", +>>>>>>> Stashed changes "X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n", "print(\"---------------------\")\n", "print(\"Step 3: Handling the missing data\")\n",