Update Day 1_Data_Preprocessing.ipynb
This commit is contained in:
@ -19,7 +19,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -37,7 +37,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -71,27 +71,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 53,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"---------------------\n",
|
||||
"Step 3: Handling the missing data\n",
|
||||
"step2\n",
|
||||
"X\n",
|
||||
"[['France' 44.0 72000.0]\n",
|
||||
" ['Spain' 27.0 48000.0]\n",
|
||||
" ['Germany' 30.0 54000.0]\n",
|
||||
" ['Spain' 38.0 61000.0]\n",
|
||||
" ['Germany' 40.0 63777.77777777778]\n",
|
||||
" ['France' 35.0 58000.0]\n",
|
||||
" ['Spain' 38.77777777777778 52000.0]\n",
|
||||
" ['France' 48.0 79000.0]\n",
|
||||
" ['Germany' 50.0 83000.0]\n",
|
||||
" ['France' 37.0 67000.0]]\n"
|
||||
"---------------------\nStep 3: Handling the missing data\nstep2\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 63777.77777777778]\n ['France' 35.0 58000.0]\n ['Spain' 38.77777777777778 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -121,48 +108,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"---------------------\n",
|
||||
"Step 4: Encoding categorical data\n",
|
||||
"X\n",
|
||||
"[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
|
||||
" 7.20000000e+04]\n",
|
||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
|
||||
" 4.80000000e+04]\n",
|
||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
|
||||
" 5.40000000e+04]\n",
|
||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
|
||||
" 6.10000000e+04]\n",
|
||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
|
||||
" 6.37777778e+04]\n",
|
||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
|
||||
" 5.80000000e+04]\n",
|
||||
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
|
||||
" 5.20000000e+04]\n",
|
||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
|
||||
" 7.90000000e+04]\n",
|
||||
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
|
||||
" 8.30000000e+04]\n",
|
||||
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
|
||||
" 6.70000000e+04]]\n",
|
||||
"Y\n",
|
||||
"[0 1 0 0 1 1 0 1 0 1]\n"
|
||||
"---------------------\nStep 4: Encoding categorical data\nX\n[[1.0 0.0 0.0 44.0 72000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 1.0 0.0 30.0 54000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 35.0 58000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 1.0 0.0 50.0 83000.0]\n [1.0 0.0 0.0 37.0 67000.0]]\nY\n[0 1 0 0 1 1 0 1 0 1]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
|
||||
"labelencoder_X = LabelEncoder()\n",
|
||||
"X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
|
||||
"from sklearn.compose import ColumnTransformer \n",
|
||||
"#labelencoder_X = LabelEncoder()\n",
|
||||
"#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
|
||||
"#Creating a dummy variable\n",
|
||||
"onehotencoder = OneHotEncoder(categorical_features = [0])\n",
|
||||
"X = onehotencoder.fit_transform(X).toarray()\n",
|
||||
"#print(X)\n",
|
||||
"ct = ColumnTransformer([(\"\", OneHotEncoder(), [0])], remainder = 'passthrough')\n",
|
||||
"X = ct.fit_transform(X)\n",
|
||||
"#onehotencoder = OneHotEncoder(categorical_features = [0])\n",
|
||||
"#X = onehotencoder.fit_transform(X).toarray()\n",
|
||||
"labelencoder_Y = LabelEncoder()\n",
|
||||
"Y = labelencoder_Y.fit_transform(Y)\n",
|
||||
"print(\"---------------------\")\n",
|
||||
@ -183,41 +150,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 55,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"---------------------\n",
|
||||
"Step 5: Splitting the datasets into training sets and Test sets\n",
|
||||
"X_train\n",
|
||||
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
|
||||
" 6.37777778e+04]\n",
|
||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
|
||||
" 6.70000000e+04]\n",
|
||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
|
||||
" 4.80000000e+04]\n",
|
||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
|
||||
" 5.20000000e+04]\n",
|
||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
|
||||
" 7.90000000e+04]\n",
|
||||
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
|
||||
" 6.10000000e+04]\n",
|
||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
|
||||
" 7.20000000e+04]\n",
|
||||
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
|
||||
" 5.80000000e+04]]\n",
|
||||
"X_test\n",
|
||||
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
|
||||
" 5.40000000e+04]\n",
|
||||
" [ 0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
|
||||
" 8.30000000e+04]]\n",
|
||||
"Y_train\n",
|
||||
"[1 1 1 0 1 0 0 1]\n",
|
||||
"Y_test\n",
|
||||
"[0 0]\n"
|
||||
"---------------------\nStep 5: Splitting the datasets into training sets and Test sets\nX_train\n[[0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 37.0 67000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [1.0 0.0 0.0 44.0 72000.0]\n [1.0 0.0 0.0 35.0 58000.0]]\nX_test\n[[0.0 1.0 0.0 30.0 54000.0]\n [0.0 1.0 0.0 50.0 83000.0]]\nY_train\n[1 1 1 0 1 0 0 1]\nY_test\n[0 0]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -246,27 +186,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"---------------------\n",
|
||||
"Step 6: Feature Scaling\n",
|
||||
"X_train\n",
|
||||
"[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n",
|
||||
" [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n",
|
||||
" [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n",
|
||||
" [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n",
|
||||
" [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n",
|
||||
" [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n",
|
||||
" [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n",
|
||||
" [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\n",
|
||||
"X_test\n",
|
||||
"[[ 0. 0. 0. -1. -1.]\n",
|
||||
" [ 0. 0. 0. 1. 1.]]\n"
|
||||
"---------------------\nStep 6: Feature Scaling\nX_train\n[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\nX_test\n[[-1. 2.64575131 -0.77459667 -1.45882927 -0.90166297]\n [-1. 2.64575131 -0.77459667 1.98496442 2.13981082]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -274,7 +201,7 @@
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"sc_X = StandardScaler()\n",
|
||||
"X_train = sc_X.fit_transform(X_train)\n",
|
||||
"X_test = sc_X.transform(X_test)\n",
|
||||
"X_test = sc_X.transform(X_test) #we should not use fit_transfer cause the u and z is determined from x_train\n",
|
||||
"print(\"---------------------\")\n",
|
||||
"print(\"Step 6: Feature Scaling\")\n",
|
||||
"print(\"X_train\")\n",
|
||||
@ -289,15 +216,6 @@
|
||||
"source": [
|
||||
"<b>完整的项目请前往Github项目<a href=\"https://github.com/MachineLearning100/100-Days-Of-ML-Code\">100-Days-Of-ML-Code</a>查看。有任何的建议或者意见欢迎在issue中提出~</b>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user