# Day 1: Data Preprocessing
#
# Recovered from the patch "add Day 1.py" (zhang yongquan, 2018-08-06), which
# created "Code/Day 1_Data Prepocessing.py".
#
# The script loads 'Data.csv', fills missing numeric values, one-hot encodes
# the first column, label-encodes the target, splits the rows into train/test
# sets and standardises the features, printing the result of every step.

# Step 1: Importing the libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Step 2: Importing dataset
dataset = pd.read_csv('Data.csv')
# Features are every column except the last; the target is the last column.
# Using -1 for both keeps the two selections consistent with each other.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values
print("Step 2: Importing dataset")
print("X")
print(X)
print("Y")
print(Y)

# Step 3: Handling the missing data
# Replace NaN in feature columns 1 and 2 with the mean of the respective
# column. SimpleImputer always works column-wise, which matches the old
# Imputer(axis=0) behaviour.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
print("---------------------")
print("Step 3: Handling the missing data")
print("step2")
print("X")
print(X)

# Step 4: Encoding categorical data
# Creating a dummy variable: one-hot encode feature column 0 and pass the
# remaining columns through unchanged. The encoded columns come first,
# followed by the passthrough columns. sparse_threshold=0 forces a dense
# result, and the cast turns the mixed object array into plain floats.
onehotencoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = onehotencoder.fit_transform(X).astype(float)
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
print("---------------------")
print("Step 4: Encoding categorical data")
print("X")
print(X)
print("Y")
print(Y)

# Step 5: Splitting the datasets into training sets and Test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
print("---------------------")
print("Step 5: Splitting the datasets into training sets and Test sets")
print("X_train")
print(X_train)
print("X_test")
print(X_test)
print("Y_train")
print(Y_train)
print("Y_test")
print(Y_test)

# Step 6: Feature Scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# Reuse the statistics learned from the training set. Refitting on the test
# set would scale the two sets differently and leak test information.
X_test = sc_X.transform(X_test)
print("---------------------")
print("Step 6: Feature Scaling")
print("X_train")
print(X_train)
print("X_test")
print(X_test)