diff --git a/Code/Day 1_Data_Preprocessing.ipynb b/Code/Day 1_Data_Preprocessing.ipynb
index eb2a746..6d7ae53 100644
--- a/Code/Day 1_Data_Preprocessing.ipynb
+++ b/Code/Day 1_Data_Preprocessing.ipynb
@@ -27,6 +27,63 @@
     "import pandas as pd"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "[[ 7. 2. 3. ]\n [ 4. 3.5 6. ]\n [10. 3.5 9. ]]\nSklearn version is 0.23.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sklearn\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "# This block is an example used to learn SimpleImputer\n",
+    "imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
+    "imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n",
+    "X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n",
+    "print(imp_mean.transform(X))\n",
+    "print(\"Sklearn version is {}\".format(sklearn.__version__))"
+   ]
+  },
+  {
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "# This block is an example used to learn OneHotEncoder\n",
+    "enc = OneHotEncoder(handle_unknown='ignore')\n",
+    "X = [['Male', 1], ['Female', 3], ['Female', 2]]\n",
+    "enc.fit(X)\n",
+    "# OneHotEncoder(handle_unknown='ignore')\n",
+    "enc.categories_\n",
+    "# [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n",
+    "enc.transform([['Female', 1], ['Male', 4]]).toarray()\n",
+    "# array([[1., 0., 1., 0., 0.],\n",
+    "#        [0., 1., 0., 0., 0.]])\n",
+    "enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n",
+    "# array([['Male', 1],\n",
+    "#        [None, 2]], dtype=object)\n",
+    "enc.get_feature_names(['gender', 'group'])\n",
+    "# array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],\n",
+    "#       dtype=object)"
+   ],
+   "cell_type": "code",
+   "metadata": {},
+   "execution_count": 4,
+   "outputs": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -44,7 +101,7 @@
      "output_type": "stream",
      "name": "stdout",
      "text": [
-      "Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
+      "Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n[[44.0 72000.0]\n [27.0 48000.0]\n [30.0 54000.0]\n [38.0 61000.0]\n [40.0 nan]\n [35.0 58000.0]\n [nan 52000.0]\n [48.0 79000.0]\n [50.0 83000.0]\n [37.0 67000.0]]\n"
      ]
     }
    ],
@@ -58,7 +115,8 @@
     "print(\"X\")\n",
     "print(X)\n",
     "print(\"Y\")\n",
-    "print(Y)"
+    "print(Y)\n",
+    "print(X[ : , 1:3])"
    ]
   },
   {
@@ -72,6 +130,7 @@
   {
    "cell_type": "code",
    "execution_count": 53,
+   "metadata": {},
 
    "outputs": [
     {
@@ -89,8 +148,12 @@
     "#from sklearn.preprocessing import Imputer\n",
     "# axis=0 means the mean is computed column-wise\n",
     "#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
-    "imputer = imputer.fit(X[ : , 1:3])\n",
-    "X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
+    "#print(imputer)\n",
+    "#\n",
+    "# print(X[ : , 1:3])\n",
+    "imputer = imputer.fit(X[ : , 1:3]) # fit the imputer on the columns we want to process\n",
+    "X[ : , 1:3] = imputer.transform(X[ : , 1:3]) # replace each np.nan with the column mean\n",
+    "#print(X[ : , 1:3])\n",
     "print(\"---------------------\")\n",
     "print(\"Step 3: Handling the missing data\")\n",
     "print(\"step2\")\n",
@@ -116,11 +179,13 @@
      "name": "stdout",
      "text": [
       "---------------------\nStep 4: Encoding categorical data\nX\n[[1.0 0.0 0.0 44.0 72000.0]\n [0.0 0.0 1.0 27.0 48000.0]\n [0.0 1.0 0.0 30.0 54000.0]\n [0.0 0.0 1.0 38.0 61000.0]\n [0.0 1.0 0.0 40.0 63777.77777777778]\n [1.0 0.0 0.0 35.0 58000.0]\n [0.0 0.0 1.0 38.77777777777778 52000.0]\n [1.0 0.0 0.0 48.0 79000.0]\n [0.0 1.0 0.0 50.0 83000.0]\n [1.0 0.0 0.0 37.0 67000.0]]\nY\n[0 1 0 0 1 1 0 1 0 1]\n"
+
      ]
     }
    ],
    "source": [
     "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
+    "from sklearn.compose import ColumnTransformer \n",
     "#labelencoder_X = LabelEncoder()\n",
     "#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
@@ -194,6 +259,7 @@
      "name": "stdout",
      "text": [
       "---------------------\nStep 6: Feature Scaling\nX_train\n[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\nX_test\n[[-1. 2.64575131 -0.77459667 -1.45882927 -0.90166297]\n [-1. 2.64575131 -0.77459667 1.98496442 2.13981082]]\n"
+
      ]
     }
    ],
@@ -207,7 +273,7 @@
     "print(\"X_train\")\n",
     "print(X_train)\n",
     "print(\"X_test\")\n",
-    "print(X_test)"
+    "print(X_test)\n"
    ]
   },
   {
@@ -220,9 +286,13 @@
  ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.8.3 64-bit (conda)",
+   "metadata": {
+    "interpreter": {
+     "hash": "1b78ff499ec469310b6a6795c4effbbfc85eb20a6ba0cf828a15721670711b2c"
+    }
+   }
  },
  "language_info": {
   "codemirror_mode": {
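The Step 4 hunk adds the ColumnTransformer import but the call site sits in unchanged lines the diff does not show. Since OneHotEncoder's categorical_features argument was removed before the scikit-learn 0.23.1 used here, the encoding is typically written as in the sketch below. This is a hedged illustration only: the ct name, the 'country' label, and the three sample rows copied from the Step 2 printout are assumptions, not lines from this notebook.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Three sample rows taken from the Step 2 printout (illustration only, not the full dataset)
X = np.array([['France', 44.0, 72000.0],
              ['Spain', 27.0, 48000.0],
              ['Germany', 30.0, 54000.0]], dtype=object)
Y = np.array(['No', 'Yes', 'No'])

# One-hot encode the country column (index 0); pass age and salary through unchanged
ct = ColumnTransformer([('country', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)

# Encode the 'No'/'Yes' labels as 0/1, matching the Y printed under Step 4
Y = LabelEncoder().fit_transform(Y)

print(X)  # e.g. [[1.0 0.0 0.0 44.0 72000.0] ...]
print(Y)  # [0 1 0]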