diff --git a/Code/Day 1_Data_Preprocessing.ipynb b/Code/Day 1_Data_Preprocessing.ipynb index 7d250ac..eb2a746 100644 --- a/Code/Day 1_Data_Preprocessing.ipynb +++ b/Code/Day 1_Data_Preprocessing.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -71,27 +71,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 53, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "---------------------\n", - "Step 3: Handling the missing data\n", - "step2\n", - "X\n", - "[['France' 44.0 72000.0]\n", - " ['Spain' 27.0 48000.0]\n", - " ['Germany' 30.0 54000.0]\n", - " ['Spain' 38.0 61000.0]\n", - " ['Germany' 40.0 63777.77777777778]\n", - " ['France' 35.0 58000.0]\n", - " ['Spain' 38.77777777777778 52000.0]\n", - " ['France' 48.0 79000.0]\n", - " ['Germany' 50.0 83000.0]\n", - " ['France' 37.0 67000.0]]\n" + "---------------------\nStep 3: Handling the missing data\nstep2\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 63777.77777777778]\n ['France' 35.0 58000.0]\n ['Spain' 38.77777777777778 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\n" ] } ], @@ -121,48 +108,28 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 54, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "---------------------\n", - "Step 4: Encoding categorical data\n", - "X\n", - "[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n", - " 7.20000000e+04]\n", - " [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n", - " 4.80000000e+04]\n", - " [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n", - 
" 5.40000000e+04]\n", - " [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n", - " 6.10000000e+04]\n", - " [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n", - " 6.37777778e+04]\n", - " [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n", - " 5.80000000e+04]\n", - " [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n", - " 5.20000000e+04]\n", - " [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n", - " 7.90000000e+04]\n", - " [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n", - " 8.30000000e+04]\n", - " [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n", - " 6.70000000e+04]]\n", - "Y\n", - "[0 1 0 0 1 1 0 1 0 1]\n" + "---------------------\nStep 4: Encoding categorical data\nX\n[[1.0 0.0 0.0 44.0 72000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 1.0 0.0 30.0 54000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 35.0 58000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 1.0 0.0 50.0 83000.0]\n [1.0 0.0 0.0 37.0 67000.0]]\nY\n[0 1 0 0 1 1 0 1 0 1]\n" ] } ], "source": [ "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", - "labelencoder_X = LabelEncoder()\n", - "X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n", + "from sklearn.compose import ColumnTransformer \n", + "#labelencoder_X = LabelEncoder()\n", + "#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n", "#Creating a dummy variable\n", - "onehotencoder = OneHotEncoder(categorical_features = [0])\n", - "X = onehotencoder.fit_transform(X).toarray()\n", + "#print(X)\n", + "ct = ColumnTransformer([(\"\", OneHotEncoder(), [0])], remainder = 'passthrough')\n", + "X = ct.fit_transform(X)\n", + "#onehotencoder = OneHotEncoder(categorical_features = [0])\n", + "#X = onehotencoder.fit_transform(X).toarray()\n", "labelencoder_Y = LabelEncoder()\n", "Y = labelencoder_Y.fit_transform(Y)\n", "print(\"---------------------\")\n", @@ -183,41 +150,14 @@ 
}, { "cell_type": "code", - "execution_count": 5, + "execution_count": 55, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "---------------------\n", - "Step 5: Splitting the datasets into training sets and Test sets\n", - "X_train\n", - "[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n", - " 6.37777778e+04]\n", - " [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n", - " 6.70000000e+04]\n", - " [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n", - " 4.80000000e+04]\n", - " [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n", - " 5.20000000e+04]\n", - " [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n", - " 7.90000000e+04]\n", - " [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n", - " 6.10000000e+04]\n", - " [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n", - " 7.20000000e+04]\n", - " [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n", - " 5.80000000e+04]]\n", - "X_test\n", - "[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n", - " 5.40000000e+04]\n", - " [ 0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n", - " 8.30000000e+04]]\n", - "Y_train\n", - "[1 1 1 0 1 0 0 1]\n", - "Y_test\n", - "[0 0]\n" + "---------------------\nStep 5: Splitting the datasets into training sets and Test sets\nX_train\n[[0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 37.0 67000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [1.0 0.0 0.0 44.0 72000.0]\n [1.0 0.0 0.0 35.0 58000.0]]\nX_test\n[[0.0 1.0 0.0 30.0 54000.0]\n [0.0 1.0 0.0 50.0 83000.0]]\nY_train\n[1 1 1 0 1 0 0 1]\nY_test\n[0 0]\n" ] } ], @@ -246,27 +186,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 57, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - 
"---------------------\n", - "Step 6: Feature Scaling\n", - "X_train\n", - "[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n", - " [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n", - " [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n", - " [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n", - " [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n", - " [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n", - " [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n", - " [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\n", - "X_test\n", - "[[ 0. 0. 0. -1. -1.]\n", - " [ 0. 0. 0. 1. 1.]]\n" + "---------------------\nStep 6: Feature Scaling\nX_train\n[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\nX_test\n[[-1. 2.64575131 -0.77459667 -1.45882927 -0.90166297]\n [-1. 2.64575131 -0.77459667 1.98496442 2.13981082]]\n" ] } ], @@ -274,7 +201,7 @@ "from sklearn.preprocessing import StandardScaler\n", "sc_X = StandardScaler()\n", "X_train = sc_X.fit_transform(X_train)\n", - "X_test = sc_X.transform(X_test)\n", + "X_test = sc_X.transform(X_test) # use transform, not fit_transform: the mean and standard deviation must come from X_train only\n", "print(\"---------------------\")\n", "print(\"Step 6: Feature Scaling\")\n", "print(\"X_train\")\n", @@ -289,15 +216,6 @@ "source": [ "完整的项目请前往Github项目100-Days-Of-ML-Code查看。有任何的建议或者意见欢迎在issue中提出~" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": {