For better comprehension
All the Operations I did to code for my own understanding.
This commit is contained in:
@ -19,7 +19,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -37,27 +37,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Step 2: Importing dataset\n",
|
||||
"X\n",
|
||||
"[['France' 44.0 72000.0]\n",
|
||||
" ['Spain' 27.0 48000.0]\n",
|
||||
" ['Germany' 30.0 54000.0]\n",
|
||||
" ['Spain' 38.0 61000.0]\n",
|
||||
" ['Germany' 40.0 nan]\n",
|
||||
" ['France' 35.0 58000.0]\n",
|
||||
" ['Spain' nan 52000.0]\n",
|
||||
" ['France' 48.0 79000.0]\n",
|
||||
" ['Germany' 50.0 83000.0]\n",
|
||||
" ['France' 37.0 67000.0]]\n",
|
||||
"Y\n",
|
||||
"['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
|
||||
"Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -84,38 +71,40 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"---------------------\n",
|
||||
"Step 3: Handling the missing data\n",
|
||||
"step2\n",
|
||||
"X\n",
|
||||
"[['France' 44.0 72000.0]\n",
|
||||
" ['Spain' 27.0 48000.0]\n",
|
||||
" ['Germany' 30.0 54000.0]\n",
|
||||
" ['Spain' 38.0 61000.0]\n",
|
||||
" ['Germany' 40.0 63777.77777777778]\n",
|
||||
" ['France' 35.0 58000.0]\n",
|
||||
" ['Spain' 38.77777777777778 52000.0]\n",
|
||||
" ['France' 48.0 79000.0]\n",
|
||||
" ['Germany' 50.0 83000.0]\n",
|
||||
" ['France' 37.0 67000.0]]\n"
|
||||
"[[44.0 72000.0]\n [27.0 48000.0]\n [30.0 54000.0]\n [38.0 61000.0]]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "error",
|
||||
"ename": "ValueError",
|
||||
"evalue": "'X' and 'missing_values' types are expected to be both numerical. Got X.dtype=float64 and type(missing_values)=<class 'str'>.",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32m<ipython-input-17-ee12d1366c46>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m4\u001b[0m \u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[0mimputer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimputer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m4\u001b[0m \u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mimputer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m[\u001b[0m \u001b[1;33m:\u001b[0m \u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"---------------------\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\impute\\_base.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[0mself\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mSimpleImputer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 276\u001b[0m \"\"\"\n\u001b[1;32m--> 277\u001b[1;33m \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_input\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0min_fit\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 278\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_fit_indicator\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 279\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\impute\\_base.py\u001b[0m in \u001b[0;36m_validate_input\u001b[1;34m(self, X, in_fit)\u001b[0m\n\u001b[0;32m 251\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mve\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 252\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 253\u001b[1;33m \u001b[0m_check_inputs_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmissing_values\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 254\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mX\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkind\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;34m\"i\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"u\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"f\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"O\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 255\u001b[0m raise ValueError(\"SimpleImputer does not support data with dtype \"\n",
|
||||
"\u001b[1;32m~\\anaconda3\\lib\\site-packages\\sklearn\\impute\\_base.py\u001b[0m in \u001b[0;36m_check_inputs_dtype\u001b[1;34m(X, missing_values)\u001b[0m\n\u001b[0;32m 23\u001b[0m if (X.dtype.kind in (\"f\", \"i\", \"u\") and\n\u001b[0;32m 24\u001b[0m not isinstance(missing_values, numbers.Real)):\n\u001b[1;32m---> 25\u001b[1;33m raise ValueError(\"'X' and 'missing_values' types are expected to be\"\n\u001b[0m\u001b[0;32m 26\u001b[0m \u001b[1;34m\" both numerical. Got X.dtype={} and \"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[1;34m\" type(missing_values)={}.\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;31mValueError\u001b[0m: 'X' and 'missing_values' types are expected to be both numerical. Got X.dtype=float64 and type(missing_values)=<class 'str'>."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# If you use the newest version of sklearn, use the lines of code commented out",
|
||||
"# from sklearn.impute import SimpleImputer",
|
||||
"# imputer = SimpleImputer(missing_values=\"NaN\", strategy=\"mean\")",
|
||||
"from sklearn.preprocessing import Imputer\n",
|
||||
"# If you use the newest version of sklearn, use the lines of code commented out\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
|
||||
"#from sklearn.preprocessing import Imputer\n",
|
||||
"# axis=0表示按列进行\n",
|
||||
"imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
|
||||
"imputer = imputer.fit(X[ : , 1:3])\n",
|
||||
"#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
|
||||
"print(X[ 0:4 , 1:3])\n",
|
||||
"imputer = imputer.fit(X[ 0:4 , 1:3])\n",
|
||||
"X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
|
||||
"print(\"---------------------\")\n",
|
||||
"print(\"Step 3: Handling the missing data\")\n",
|
||||
@ -315,9 +304,13 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "python3",
|
||||
"display_name": "Python 3.8.3 64-bit (conda)",
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "1b78ff499ec469310b6a6795c4effbbfc85eb20a6ba0cf828a15721670711b2c"
|
||||
}
|
||||
}
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@ -329,9 +322,9 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
"version": "3.8.3-final"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user