Update Day 1_Data_Preprocessing.ipynb
This commit is contained in:
@ -19,7 +19,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 42,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -37,7 +37,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 52,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -71,27 +71,14 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 53,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
"text": [
|
"text": [
|
||||||
"---------------------\n",
|
"---------------------\nStep 3: Handling the missing data\nstep2\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 63777.77777777778]\n ['France' 35.0 58000.0]\n ['Spain' 38.77777777777778 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\n"
|
||||||
"Step 3: Handling the missing data\n",
|
|
||||||
"step2\n",
|
|
||||||
"X\n",
|
|
||||||
"[['France' 44.0 72000.0]\n",
|
|
||||||
" ['Spain' 27.0 48000.0]\n",
|
|
||||||
" ['Germany' 30.0 54000.0]\n",
|
|
||||||
" ['Spain' 38.0 61000.0]\n",
|
|
||||||
" ['Germany' 40.0 63777.77777777778]\n",
|
|
||||||
" ['France' 35.0 58000.0]\n",
|
|
||||||
" ['Spain' 38.77777777777778 52000.0]\n",
|
|
||||||
" ['France' 48.0 79000.0]\n",
|
|
||||||
" ['Germany' 50.0 83000.0]\n",
|
|
||||||
" ['France' 37.0 67000.0]]\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -121,48 +108,28 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 54,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
"text": [
|
"text": [
|
||||||
"---------------------\n",
|
"---------------------\nStep 4: Encoding categorical data\nX\n[[1.0 0.0 0.0 44.0 72000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 1.0 0.0 30.0 54000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 35.0 58000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 1.0 0.0 50.0 83000.0]\n [1.0 0.0 0.0 37.0 67000.0]]\nY\n[0 1 0 0 1 1 0 1 0 1]\n"
|
||||||
"Step 4: Encoding categorical data\n",
|
|
||||||
"X\n",
|
|
||||||
"[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
|
|
||||||
" 7.20000000e+04]\n",
|
|
||||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
|
|
||||||
" 4.80000000e+04]\n",
|
|
||||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
|
|
||||||
" 5.40000000e+04]\n",
|
|
||||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
|
|
||||||
" 6.10000000e+04]\n",
|
|
||||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
|
|
||||||
" 6.37777778e+04]\n",
|
|
||||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
|
|
||||||
" 5.80000000e+04]\n",
|
|
||||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
|
|
||||||
" 5.20000000e+04]\n",
|
|
||||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
|
|
||||||
" 7.90000000e+04]\n",
|
|
||||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
|
|
||||||
" 8.30000000e+04]\n",
|
|
||||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
|
|
||||||
" 6.70000000e+04]]\n",
|
|
||||||
"Y\n",
|
|
||||||
"[0 1 0 0 1 1 0 1 0 1]\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
|
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
|
||||||
"labelencoder_X = LabelEncoder()\n",
|
"from sklearn.compose import ColumnTransformer \n",
|
||||||
"X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
|
"#labelencoder_X = LabelEncoder()\n",
|
||||||
|
"#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
|
||||||
"#Creating a dummy variable\n",
|
"#Creating a dummy variable\n",
|
||||||
"onehotencoder = OneHotEncoder(categorical_features = [0])\n",
|
"#print(X)\n",
|
||||||
"X = onehotencoder.fit_transform(X).toarray()\n",
|
"ct = ColumnTransformer([(\"\", OneHotEncoder(), [0])], remainder = 'passthrough')\n",
|
||||||
|
"X = ct.fit_transform(X)\n",
|
||||||
|
"#onehotencoder = OneHotEncoder(categorical_features = [0])\n",
|
||||||
|
"#X = onehotencoder.fit_transform(X).toarray()\n",
|
||||||
"labelencoder_Y = LabelEncoder()\n",
|
"labelencoder_Y = LabelEncoder()\n",
|
||||||
"Y = labelencoder_Y.fit_transform(Y)\n",
|
"Y = labelencoder_Y.fit_transform(Y)\n",
|
||||||
"print(\"---------------------\")\n",
|
"print(\"---------------------\")\n",
|
||||||
@ -183,41 +150,14 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 55,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
"text": [
|
"text": [
|
||||||
"---------------------\n",
|
"---------------------\nStep 5: Splitting the datasets into training sets and Test sets\nX_train\n[[0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 37.0 67000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [1.0 0.0 0.0 44.0 72000.0]\n [1.0 0.0 0.0 35.0 58000.0]]\nX_test\n[[0.0 1.0 0.0 30.0 54000.0]\n [0.0 1.0 0.0 50.0 83000.0]]\nY_train\n[1 1 1 0 1 0 0 1]\nY_test\n[0 0]\n"
|
||||||
"Step 5: Splitting the datasets into training sets and Test sets\n",
|
|
||||||
"X_train\n",
|
|
||||||
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
|
|
||||||
" 6.37777778e+04]\n",
|
|
||||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
|
|
||||||
" 6.70000000e+04]\n",
|
|
||||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
|
|
||||||
" 4.80000000e+04]\n",
|
|
||||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
|
|
||||||
" 5.20000000e+04]\n",
|
|
||||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
|
|
||||||
" 7.90000000e+04]\n",
|
|
||||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
|
|
||||||
" 6.10000000e+04]\n",
|
|
||||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
|
|
||||||
" 7.20000000e+04]\n",
|
|
||||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
|
|
||||||
" 5.80000000e+04]]\n",
|
|
||||||
"X_test\n",
|
|
||||||
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
|
|
||||||
" 5.40000000e+04]\n",
|
|
||||||
" [ 0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
|
|
||||||
" 8.30000000e+04]]\n",
|
|
||||||
"Y_train\n",
|
|
||||||
"[1 1 1 0 1 0 0 1]\n",
|
|
||||||
"Y_test\n",
|
|
||||||
"[0 0]\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -246,27 +186,14 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
"text": [
|
"text": [
|
||||||
"---------------------\n",
|
"---------------------\nStep 6: Feature Scaling\nX_train\n[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\nX_test\n[[-1. 2.64575131 -0.77459667 -1.45882927 -0.90166297]\n [-1. 2.64575131 -0.77459667 1.98496442 2.13981082]]\n"
|
||||||
"Step 6: Feature Scaling\n",
|
|
||||||
"X_train\n",
|
|
||||||
"[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n",
|
|
||||||
" [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n",
|
|
||||||
" [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n",
|
|
||||||
" [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n",
|
|
||||||
" [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n",
|
|
||||||
" [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n",
|
|
||||||
" [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n",
|
|
||||||
" [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\n",
|
|
||||||
"X_test\n",
|
|
||||||
"[[ 0. 0. 0. -1. -1.]\n",
|
|
||||||
" [ 0. 0. 0. 1. 1.]]\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -274,7 +201,7 @@
|
|||||||
"from sklearn.preprocessing import StandardScaler\n",
|
"from sklearn.preprocessing import StandardScaler\n",
|
||||||
"sc_X = StandardScaler()\n",
|
"sc_X = StandardScaler()\n",
|
||||||
"X_train = sc_X.fit_transform(X_train)\n",
|
"X_train = sc_X.fit_transform(X_train)\n",
|
||||||
"X_test = sc_X.transform(X_test)\n",
|
"X_test = sc_X.transform(X_test)  # do not call fit_transform here: the mean and standard deviation must be the ones learned from X_train\n",
|
||||||
"print(\"---------------------\")\n",
|
"print(\"---------------------\")\n",
|
||||||
"print(\"Step 6: Feature Scaling\")\n",
|
"print(\"Step 6: Feature Scaling\")\n",
|
||||||
"print(\"X_train\")\n",
|
"print(\"X_train\")\n",
|
||||||
@ -289,15 +216,6 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"<b>完整的项目请前往Github项目<a href=\"https://github.com/MachineLearning100/100-Days-Of-ML-Code\">100-Days-Of-ML-Code</a>查看。有任何的建议或者意见欢迎在issue中提出~</b>"
|
"<b>完整的项目请前往Github项目<a href=\"https://github.com/MachineLearning100/100-Days-Of-ML-Code\">100-Days-Of-ML-Code</a>查看。有任何的建议或者意见欢迎在issue中提出~</b>"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
Reference in New Issue
Block a user