Update Day 1_Data_Preprocessing.ipynb

This commit is contained in:
yx-xyc
2021-01-13 17:34:20 +08:00
parent 9b231e4166
commit d4ec8ddb99

View File

@ -19,7 +19,11 @@
},
{
"cell_type": "code",
<<<<<<< Updated upstream
"execution_count": 4,
=======
"execution_count": 2,
>>>>>>> Stashed changes
"metadata": {},
"outputs": [],
"source": [
@ -37,15 +41,44 @@
},
{
"cell_type": "code",
<<<<<<< Updated upstream
"execution_count": 7,
=======
"execution_count": 6,
>>>>>>> Stashed changes
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
<<<<<<< Updated upstream
"Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
=======
"Step 2: Importing dataset\nX:\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY:\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
>>>>>>> Stashed changes
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Country Age Salary Purchased\n",
"0 France 44.0 72000.0 No\n",
"1 Spain 27.0 48000.0 Yes\n",
"2 Germany 30.0 54000.0 No\n",
"3 Spain 38.0 61000.0 No\n",
"4 Germany 40.0 NaN Yes\n",
"5 France 35.0 58000.0 Yes\n",
"6 Spain NaN 52000.0 No\n",
"7 France 48.0 79000.0 Yes\n",
"8 Germany 50.0 83000.0 No\n",
"9 France 37.0 67000.0 Yes"
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Age</th>\n <th>Salary</th>\n <th>Purchased</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>France</td>\n <td>44.0</td>\n <td>72000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Spain</td>\n <td>27.0</td>\n <td>48000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Germany</td>\n <td>30.0</td>\n <td>54000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Spain</td>\n <td>38.0</td>\n <td>61000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Germany</td>\n <td>40.0</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>5</th>\n <td>France</td>\n <td>35.0</td>\n <td>58000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Spain</td>\n <td>NaN</td>\n <td>52000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>7</th>\n <td>France</td>\n <td>48.0</td>\n <td>79000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>8</th>\n <td>Germany</td>\n <td>50.0</td>\n <td>83000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>9</th>\n <td>France</td>\n <td>37.0</td>\n <td>67000.0</td>\n <td>Yes</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 6
}
],
"source": [
@ -55,10 +88,11 @@
"#取最后一列\n",
"Y = dataset.iloc[ : , 3].values\n",
"print(\"Step 2: Importing dataset\")\n",
"print(\"X\")\n",
"print(\"X:\")\n",
"print(X)\n",
"print(\"Y\")\n",
"print(Y)"
"print(\"Y:\")\n",
"print(Y)\n",
"dataset.head(100)"
]
},
{
@ -98,6 +132,7 @@
],
"source": [
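"# SimpleImputer (sklearn.impute) is the current API; the legacy Imputer (sklearn.preprocessing) was removed in scikit-learn 0.22, so use whichever import matches your installed version\n",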
<<<<<<< Updated upstream
"from sklearn.impute import SimpleImputer\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
"#from sklearn.preprocessing import Imputer\n",
@ -105,6 +140,15 @@
"#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
"print(X[ 0:4 , 1:3])\n",
"imputer = imputer.fit(X[ 0:4 , 1:3])\n",
=======
"# from sklearn.impute import SimpleImputer\n",
"# imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
"from sklearn.preprocessing import Imputer\n",
"# axis=0表示按列进行\n",
"imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
"print(imputer)\n",
"imputer = imputer.fit(X[ : , 1:3])\n",
>>>>>>> Stashed changes
"X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
"print(\"---------------------\")\n",
"print(\"Step 3: Handling the missing data\")\n",