Update Day 1_Data_Preprocessing.ipynb
This commit is contained in:
@ -19,7 +19,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
<<<<<<< Updated upstream
|
||||
"execution_count": 4,
|
||||
=======
|
||||
"execution_count": 2,
|
||||
>>>>>>> Stashed changes
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -37,15 +41,44 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
<<<<<<< Updated upstream
|
||||
"execution_count": 7,
|
||||
=======
|
||||
"execution_count": 6,
|
||||
>>>>>>> Stashed changes
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
<<<<<<< Updated upstream
|
||||
"Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
|
||||
=======
|
||||
"Step 2: Importing dataset\nX:\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY:\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
|
||||
>>>>>>> Stashed changes
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
" Country Age Salary Purchased\n",
|
||||
"0 France 44.0 72000.0 No\n",
|
||||
"1 Spain 27.0 48000.0 Yes\n",
|
||||
"2 Germany 30.0 54000.0 No\n",
|
||||
"3 Spain 38.0 61000.0 No\n",
|
||||
"4 Germany 40.0 NaN Yes\n",
|
||||
"5 France 35.0 58000.0 Yes\n",
|
||||
"6 Spain NaN 52000.0 No\n",
|
||||
"7 France 48.0 79000.0 Yes\n",
|
||||
"8 Germany 50.0 83000.0 No\n",
|
||||
"9 France 37.0 67000.0 Yes"
|
||||
],
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Country</th>\n <th>Age</th>\n <th>Salary</th>\n <th>Purchased</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>France</td>\n <td>44.0</td>\n <td>72000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Spain</td>\n <td>27.0</td>\n <td>48000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Germany</td>\n <td>30.0</td>\n <td>54000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Spain</td>\n <td>38.0</td>\n <td>61000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Germany</td>\n <td>40.0</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>5</th>\n <td>France</td>\n <td>35.0</td>\n <td>58000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>6</th>\n <td>Spain</td>\n <td>NaN</td>\n <td>52000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>7</th>\n <td>France</td>\n <td>48.0</td>\n <td>79000.0</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>8</th>\n <td>Germany</td>\n <td>50.0</td>\n <td>83000.0</td>\n <td>No</td>\n </tr>\n <tr>\n <th>9</th>\n <td>France</td>\n <td>37.0</td>\n <td>67000.0</td>\n <td>Yes</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 6
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@ -55,10 +88,11 @@
|
||||
"#取最后一列\n",
|
||||
"Y = dataset.iloc[ : , 3].values\n",
|
||||
"print(\"Step 2: Importing dataset\")\n",
|
||||
"print(\"X\")\n",
|
||||
"print(\"X:\")\n",
|
||||
"print(X)\n",
|
||||
"print(\"Y\")\n",
|
||||
"print(Y)"
|
||||
"print(\"Y:\")\n",
|
||||
"print(Y)\n",
|
||||
"dataset.head(100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -98,6 +132,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# If you use the newest version of sklearn, use the lines of code commented out\n",
|
||||
<<<<<<< Updated upstream
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
|
||||
"#from sklearn.preprocessing import Imputer\n",
|
||||
@ -105,6 +140,15 @@
|
||||
"#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
|
||||
"print(X[ 0:4 , 1:3])\n",
|
||||
"imputer = imputer.fit(X[ 0:4 , 1:3])\n",
|
||||
=======
|
||||
"# from sklearn.impute import SimpleImputer\n",
|
||||
"# imputer = SimpleImputer(missing_values=\"NaN\", strategy=\"mean\")\n",
|
||||
"from sklearn.preprocessing import Imputer\n",
|
||||
"# axis=0表示按列进行\n",
|
||||
"imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
|
||||
"print(imputer)\n",
|
||||
"imputer = imputer.fit(X[ : , 1:3])\n",
|
||||
>>>>>>> Stashed changes
|
||||
"X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
|
||||
"print(\"---------------------\")\n",
|
||||
"print(\"Step 3: Handling the missing data\")\n",
|
||||
|
||||
Reference in New Issue
Block a user