Update Day 1_Data_Preprocessing.ipynb

This commit is contained in:
yx-xyc
2021-01-13 17:47:06 +08:00
parent acd61766d7
commit 85b143e2d6


@@ -19,7 +19,55 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[[ 7. 2. 3. ]\n [ 4. 3.5 6. ]\n [10. 3.5 9. ]]\nSklearn verion is 0.23.1\n"
]
}
],
"source": [
"import sklearn\n",
"from sklearn.impute import SimpleImputer\n",
"#This block is an example used to learn SimpleImputer\n",
"imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
"imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n",
"X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n",
"print(imp_mean.transform(X))\n",
"print(\"Sklearn verion is {}\".format(sklearn.__version__))"
]
},
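{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's side note (not part of the original commit):* besides `mean`, `SimpleImputer` also accepts `median`, `most_frequent`, and `constant` strategies. A minimal sketch on the same toy matrix:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.impute import SimpleImputer\n",
"#Hedged sketch: median imputation on the same toy data as above\n",
"imp_median = SimpleImputer(missing_values=np.nan, strategy='median')\n",
"imp_median.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n",
"print(imp_median.transform([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]))"
]
},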
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"enc = OneHotEncoder(handle_unknown='ignore')\n",
"X = [['Male', 1], ['Female', 3], ['Female', 2]]\n",
">>> enc.fit(X)\n",
"OneHotEncoder(handle_unknown='ignore')\n",
">>> enc.categories_\n",
"[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n",
">>> enc.transform([['Female', 1], ['Male', 4]]).toarray()\n",
"array([[1., 0., 1., 0., 0.],\n",
" [0., 1., 0., 0., 0.]])\n",
">>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n",
"array([['Male', 1],\n",
" [None, 2]], dtype=object)\n",
">>> enc.get_feature_names(['gender', 'group'])\n",
"array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'],\n",
" dtype=object)"
]
},
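{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's side note (not part of the original commit):* `drop='first'` removes the first dummy column of each feature to avoid the dummy-variable trap. A minimal sketch, assuming sklearn >= 0.21:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"#Hedged sketch: the first category of each feature becomes the all-zeros baseline\n",
"enc_drop = OneHotEncoder(drop='first')\n",
"X_demo = [['Male', 1], ['Female', 3], ['Female', 2]]\n",
"print(enc_drop.fit_transform(X_demo).toarray())"
]
},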
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -37,27 +85,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"Step 2: Importing dataset\n",
"X\n",
"[['France' 44.0 72000.0]\n",
" ['Spain' 27.0 48000.0]\n",
" ['Germany' 30.0 54000.0]\n",
" ['Spain' 38.0 61000.0]\n",
" ['Germany' 40.0 nan]\n",
" ['France' 35.0 58000.0]\n",
" ['Spain' nan 52000.0]\n",
" ['France' 48.0 79000.0]\n",
" ['Germany' 50.0 83000.0]\n",
" ['France' 37.0 67000.0]]\n",
"Y\n",
"['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n"
"Step 2: Importing dataset\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 nan]\n ['France' 35.0 58000.0]\n ['Spain' nan 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\nY\n['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']\n[[44.0 72000.0]\n [27.0 48000.0]\n [30.0 54000.0]\n [38.0 61000.0]\n [40.0 nan]\n [35.0 58000.0]\n [nan 52000.0]\n [48.0 79000.0]\n [50.0 83000.0]\n [37.0 67000.0]]\n"
]
}
],
@@ -71,7 +106,8 @@
"print(\"X\")\n",
"print(X)\n",
"print(\"Y\")\n",
"print(Y)"
"print(Y)\n",
"print(X[ : , 1:3])"
]
},
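{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's side note (not part of the original commit):* the slice `X[ : , 1:3]` keeps only the numeric columns (age, salary). A minimal pandas sketch of the same selection; `'Data.csv'` below is a placeholder, since the real dataset path is elided from this diff:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"#'Data.csv' is hypothetical; substitute the path actually read in Step 2\n",
"dataset = pd.read_csv('Data.csv')\n",
"X = dataset.iloc[:, :-1].values #every column except the label\n",
"Y = dataset.iloc[:, -1].values #the label column\n",
"print(X[:, 1:3]) #numeric columns only"
]
},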
{
@@ -84,39 +120,30 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"---------------------\n",
"Step 3: Handling the missing data\n",
"step2\n",
"X\n",
"[['France' 44.0 72000.0]\n",
" ['Spain' 27.0 48000.0]\n",
" ['Germany' 30.0 54000.0]\n",
" ['Spain' 38.0 61000.0]\n",
" ['Germany' 40.0 63777.77777777778]\n",
" ['France' 35.0 58000.0]\n",
" ['Spain' 38.77777777777778 52000.0]\n",
" ['France' 48.0 79000.0]\n",
" ['Germany' 50.0 83000.0]\n",
" ['France' 37.0 67000.0]]\n"
"---------------------\nStep 3: Handling the missing data\nstep2\nX\n[['France' 44.0 72000.0]\n ['Spain' 27.0 48000.0]\n ['Germany' 30.0 54000.0]\n ['Spain' 38.0 61000.0]\n ['Germany' 40.0 63777.77777777778]\n ['France' 35.0 58000.0]\n ['Spain' 38.77777777777778 52000.0]\n ['France' 48.0 79000.0]\n ['Germany' 50.0 83000.0]\n ['France' 37.0 67000.0]]\n"
]
}
],
"source": [
"# If you use the newest version of sklearn, use the lines of code commented out",
"# from sklearn.impute import SimpleImputer",
"# imputer = SimpleImputer(missing_values=\"NaN\", strategy=\"mean\")",
"from sklearn.preprocessing import Imputer\n",
"# If you use the newest version of sklearn, use the lines of code commented out\n",
"from sklearn.impute import SimpleImputer\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy=\"mean\")\n",
"#from sklearn.preprocessing import Imputer\n",
"# axis=0表示按列进行\n",
"imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
"imputer = imputer.fit(X[ : , 1:3])\n",
"X[ : , 1:3] = imputer.transform(X[ : , 1:3])\n",
"#imputer = Imputer(missing_values = \"NaN\", strategy = \"mean\", axis = 0)\n",
"#print(imputer)\n",
"#\n",
"# print(X[ : , 1:3])\n",
"imputer = imputer.fit(X[ : , 1:3]) #put the data we want to process in to this imputer\n",
"X[ : , 1:3] = imputer.transform(X[ : , 1:3]) #replace the np.nan with mean\n",
"#print(X[ : , 1:3])\n",
"print(\"---------------------\")\n",
"print(\"Step 3: Handling the missing data\")\n",
"print(\"step2\")\n",
@@ -134,47 +161,26 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"---------------------\n",
"Step 4: Encoding categorical data\n",
"X\n",
"[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
" 7.20000000e+04]\n",
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
" 4.80000000e+04]\n",
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
" 5.40000000e+04]\n",
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
" 6.10000000e+04]\n",
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
" 6.37777778e+04]\n",
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
" 5.80000000e+04]\n",
" [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
" 5.20000000e+04]\n",
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
" 7.90000000e+04]\n",
" [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
" 8.30000000e+04]\n",
" [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
" 6.70000000e+04]]\n",
"Y\n",
"[0 1 0 0 1 1 0 1 0 1]\n"
"['France' 'Spain' 'Germany' 'Spain' 'Germany' 'France' 'Spain' 'France'\n 'Germany' 'France']\n[0 2 1 2 1 0 2 0 1 0]\n[[0 44.0 72000.0]\n [2 27.0 48000.0]\n [1 30.0 54000.0]\n [2 38.0 61000.0]\n [1 40.0 63777.77777777778]\n [0 35.0 58000.0]\n [2 38.77777777777778 52000.0]\n [0 48.0 79000.0]\n [1 50.0 83000.0]\n [0 37.0 67000.0]]\n---------------------\nStep 4: Encoding categorical data\nX\n[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n [0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n [0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n [0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]\n [1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]\nY\n[0 1 0 0 1 1 0 1 0 1]\n"
]
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
"labelencoder_X = LabelEncoder()\n",
"print(X[ : , 0])\n",
"X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])\n",
"print(X[ : , 0])\n",
"#Creating a dummy variable\n",
"onehotencoder = OneHotEncoder(categorical_features = [0])\n",
"onehotencoder = OneHotEncoder('auto')\n",
"print(X)\n",
"X = onehotencoder.fit_transform(X).toarray()\n",
"labelencoder_Y = LabelEncoder()\n",
"Y = labelencoder_Y.fit_transform(Y)\n",
@@ -196,41 +202,14 @@
},
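{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's side note (not part of the original commit):* `train_test_split` shuffles before splitting, so fixing `random_state` keeps the split reproducible. A minimal sketch, assuming the usual 80/20 split:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"#Hedged sketch: 80/20 split with a fixed seed for reproducibility\n",
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)"
]
},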
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"---------------------\n",
"Step 5: Splitting the datasets into training sets and Test sets\n",
"X_train\n",
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01\n",
" 6.37777778e+04]\n",
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01\n",
" 6.70000000e+04]\n",
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01\n",
" 4.80000000e+04]\n",
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01\n",
" 5.20000000e+04]\n",
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01\n",
" 7.90000000e+04]\n",
" [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01\n",
" 6.10000000e+04]\n",
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01\n",
" 7.20000000e+04]\n",
" [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01\n",
" 5.80000000e+04]]\n",
"X_test\n",
"[[ 0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01\n",
" 5.40000000e+04]\n",
" [ 0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01\n",
" 8.30000000e+04]]\n",
"Y_train\n",
"[1 1 1 0 1 0 0 1]\n",
"Y_test\n",
"[0 0]\n"
"---------------------\nStep 5: Splitting the datasets into training sets and Test sets\nX_train\n[[0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n [1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n [0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]\n [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n [0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]\nX_test\n[[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]\n [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]\nY_train\n[1 1 1 0 1 0 0 1]\nY_test\n[0 0]\n"
]
}
],
@@ -259,27 +238,14 @@
},
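{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's side note (not part of the original commit):* a scaler should be fit on the training set only and then reused on the test set, so no test-set statistics leak into preprocessing. A minimal sketch (the X_test output below, with its exact 1/0/-1 entries, suggests the test set was scaled with its own statistics instead):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"#Hedged sketch: fit on X_train, reuse the same mean/std for X_test\n",
"sc_X = StandardScaler()\n",
"X_train_scaled = sc_X.fit_transform(X_train)\n",
"X_test_scaled = sc_X.transform(X_test)"
]
},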
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"name": "stdout",
"text": [
"---------------------\n",
"Step 6: Feature Scaling\n",
"X_train\n",
"[[-1. 2.64575131 -0.77459667 0.26306757 0.12381479]\n",
" [ 1. -0.37796447 -0.77459667 -0.25350148 0.46175632]\n",
" [-1. -0.37796447 1.29099445 -1.97539832 -1.53093341]\n",
" [-1. -0.37796447 1.29099445 0.05261351 -1.11141978]\n",
" [ 1. -0.37796447 -0.77459667 1.64058505 1.7202972 ]\n",
" [-1. -0.37796447 1.29099445 -0.0813118 -0.16751412]\n",
" [ 1. -0.37796447 -0.77459667 0.95182631 0.98614835]\n",
" [ 1. -0.37796447 -0.77459667 -0.59788085 -0.48214934]]\n",
"X_test\n",
"[[ 0. 0. 0. -1. -1.]\n",
" [ 0. 0. 0. 1. 1.]]\n"
"---------------------\nStep 6: Feature Scaling\nX_train\n[[-1. 2.64575131 -0.77459667 -0.37796447 0. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 2.64575131 -0.37796447 -0.37796447\n 0. -0.37796447 -0.37796447 0. -0.37796447 -0.37796447\n 2.64575131 -0.37796447 -0.37796447 -0.37796447 0. ]\n [ 1. -0.37796447 -0.77459667 -0.37796447 0. -0.37796447\n 2.64575131 -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 0. -0.37796447 -0.37796447 0. -0.37796447 -0.37796447\n -0.37796447 2.64575131 -0.37796447 -0.37796447 0. ]\n [-1. -0.37796447 1.29099445 2.64575131 0. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 0. 2.64575131 -0.37796447 0. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 0. ]\n [-1. -0.37796447 1.29099445 -0.37796447 0. -0.37796447\n -0.37796447 -0.37796447 2.64575131 -0.37796447 -0.37796447 -0.37796447\n 0. -0.37796447 2.64575131 0. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 0. ]\n [ 1. -0.37796447 -0.77459667 -0.37796447 0. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447 2.64575131\n 0. -0.37796447 -0.37796447 0. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 2.64575131 0. ]\n [-1. -0.37796447 1.29099445 -0.37796447 0. -0.37796447\n -0.37796447 2.64575131 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 0. -0.37796447 -0.37796447 0. -0.37796447 2.64575131\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 0. ]\n [ 1. -0.37796447 -0.77459667 -0.37796447 0. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 2.64575131 -0.37796447\n 0. -0.37796447 -0.37796447 0. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 2.64575131 -0.37796447 0. ]\n [ 1. -0.37796447 -0.77459667 -0.37796447 0. 2.64575131\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 0. -0.37796447 -0.37796447 0. 2.64575131 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 0. ]]\nX_test\n[[-1. 2.64575131 -0.77459667 -0.37796447 1. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 0. -0.37796447 -0.37796447 1. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 0. ]\n [-1. 2.64575131 -0.77459667 -0.37796447 0. -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447 -0.37796447\n 1. -0.37796447 -0.37796447 0. -0.37796447 -0.37796447\n -0.37796447 -0.37796447 -0.37796447 -0.37796447 1. ]]\n"
]
}
],
@@ -315,9 +281,13 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
"name": "python3",
"display_name": "Python 3.8.3 64-bit (conda)",
"metadata": {
"interpreter": {
"hash": "1b78ff499ec469310b6a6795c4effbbfc85eb20a6ba0cf828a15721670711b2c"
}
}
},
"language_info": {
"codemirror_mode": {
@@ -329,9 +299,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.8.3-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}