Update Day 1_Data_Preprocessing.ipynb

2021-01-13 17:34:20 +08:00
parent 9b231e4166
commit d4ec8ddb99
1 changed files with 47 additions and 3 deletions
--- a/1_Data_Preprocessing.ipynb
+++ b/1_Data_Preprocessing.ipynb
@ -19,7 +19,11 @@
  },
  {
   "cell_type": "code",
+<<<<<<< Updated upstream
   "execution_count": 4,
+=======
+   "execution_count": 2,
+>>>>>>> Stashed changes
   "metadata": {},
   "outputs": [],
   "source": [
@ -37,15 +41,44 @@
  },
  {
   "cell_type": "code",
+<<<<<<< Updated upstream
   "execution_count": 7,
+=======
+   "execution_count": 6,
+>>>>>>> Stashed changes
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
+<<<<<<< Updated upstream
      "Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
+=======
+      "Step 2: Importing dataset\nX:\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY:\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
+>>>>>>> Stashed changes
     ]
+    },
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "   Country   Age   Salary Purchased\n",
+       "0   France  44.0  72000.0        No\n",
+       "1    Spain  27.0  48000.0       Yes\n",
+       "2  Germany  30.0  54000.0        No\n",
+       "3    Spain  38.0  61000.0        No\n",
+       "4  Germany  40.0      NaN       Yes\n",
+       "5   France  35.0  58000.0       Yes\n",
+       "6    Spain   NaN  52000.0        No\n",
+       "7   France  48.0  79000.0       Yes\n",
+       "8  Germany  50.0  83000.0        No\n",
+       "9   France  37.0  67000.0       Yes"
+      ],
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Country</th>\n      <th>Age</th>\n      <th>Salary</th>\n      <th>Purchased</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>France</td>\n      <td>44.0</td>\n      <td>72000.0</td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Spain</td>\n      <td>27.0</td>\n      <td>48000.0</td>\n      <td>Yes</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Germany</td>\n      <td>30.0</td>\n      <td>54000.0</td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Spain</td>\n      <td>38.0</td>\n      <td>61000.0</td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Germany</td>\n      <td>40.0</td>\n      <td>NaN</td>\n      <td>Yes</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>France</td>\n      <td>35.0</td>\n      <td>58000.0</td>\n      <td>Yes</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Spain</td>\n      <td>NaN</td>\n      <td>52000.0</td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>France</td>\n      <td>48.0</td>\n      <td>79000.0</td>\n      <td>Yes</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>Germany</td>\n      <td>50.0</td>\n      <td>83000.0</td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>France</td>\n      <td>37.0</td>\n      <td>67000.0</td>\n      <td>Yes</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 6
    }
   ],
   "source": [
@ -55,10 +88,11 @@
    "#取最后一列\n",
    "Y = dataset.iloc[ : , 3].values\n",
    "print(\"Step 2: Importing dataset\")\n",
-    "print(\"X\")\n",
+    "print(\"X:\")\n",
    "print(X)\n",
-    "print(\"Y\")\n",
-    "print(Y)"
+    "print(\"Y:\")\n",
+    "print(Y)\n",
+    "dataset.head(100)"
   ]
  },
  {
@ -98,6 +132,7 @@
   ],
   "source": [
    "# If you use the newest version of sklearn, use the lines of code commented out\n",
+<<<<<<< Updated upstream
    "from sklearn.impute import SimpleImputer\n",
    "imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
    "#from sklearn.preprocessing import Imputer\n",
@ -105,6 +140,15 @@
    "#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
    "print(X[ 0:4 , 1:3])\n",
    "imputer = imputer.fit(X[ 0:4 , 1:3])\n",
+=======
+    "# from sklearn.impute import SimpleImputer\n",
+    "# imputer = SimpleImputer(missing_values=\"NaN\", strategy=\"mean\")\n",
+    "from sklearn.preprocessing import Imputer\n",
+    "# axis=0表示按列进行\n",
+    "imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
+    "print(imputer)\n",
+    "imputer = imputer.fit(X[ : , 1:3])\n",
+>>>>>>> Stashed changes
    "X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
    "print(\"---------------------\")\n",
    "print(\"Step 3: Handling the missing data\")\n",