diff --git a/Code/Day 1_Data_Preprocessing.py b/Code/Day 1_Data_Preprocessing.py
index 13f7c3e..5c7684a 100644
--- a/Code/Day 1_Data_Preprocessing.py
+++ b/Code/Day 1_Data_Preprocessing.py
@@ -1,76 +1,75 @@
-#Day 1: Data Prepocessing
-
-#Step 1: Importing the libraries
-import numpy as np
-import pandas as pd
-
-#Step 2: Importing dataset
-dataset = pd.read_csv('../datasets/Data.csv')
-X = dataset.iloc[ : , :-1].values
-Y = dataset.iloc[ : , 3].values
-print("Step 2: Importing dataset")
-print("X")
-print(X)
-print("Y")
-print(Y)
-
-#Step 3: Handling the missing data
-# If you use the newest version of sklearn, use the lines of code commented out
-from sklearn.impute import SimpleImputer
-imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
-#from sklearn.preprocessing import Imputer
-# axis=0表示按列进行
-#imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
-imputer = imputer.fit(X[ : , 1:3])
-X[ : , 1:3] = imputer.transform(X[ : , 1:3])
-print("---------------------")
-print("Step 3: Handling the missing data")
-print("step2")
-print("X")
-print(X)
-
-#Step 4: Encoding categorical data
-from sklearn.preprocessing import LabelEncoder, OneHotEncoder
-from sklearn.compose import ColumnTransformer
-#labelencoder_X = LabelEncoder()
-#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])
-#Creating a dummy variable
-#print(X)
-ct = ColumnTransformer([("", OneHotEncoder(), [0])], remainder = 'passthrough')
-X = ct.fit_transform(X)
-#onehotencoder = OneHotEncoder(categorical_features = [0])
-#X = onehotencoder.fit_transform(X).toarray()
-labelencoder_Y = LabelEncoder()
-Y = labelencoder_Y.fit_transform(Y)
-print("---------------------")
-print("Step 4: Encoding categorical data")
-print("X")
-print(X)
-print("Y")
-print(Y)
-
-#Step 5: Splitting the datasets into training sets and Test sets
-from sklearn.model_selection import train_test_split
-X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)
-print("---------------------")
-print("Step 5: Splitting the datasets into training sets and Test sets")
-print("X_train")
-print(X_train)
-print("X_test")
-print(X_test)
-print("Y_train")
-print(Y_train)
-print("Y_test")
-print(Y_test)
-
-#Step 6: Feature Scaling
-from sklearn.preprocessing import StandardScaler
-sc_X = StandardScaler()
-X_train = sc_X.fit_transform(X_train)
-X_test = sc_X.transform(X_test)
-print("---------------------")
-print("Step 6: Feature Scaling")
-print("X_train")
-print(X_train)
-print("X_test")
-print(X_test)
+#Day 1: Data Preprocessing
+
+#Step 1: Importing the libraries
+import numpy as np
+import pandas as pd
+
+#Step 2: Importing dataset
+dataset = pd.read_csv('../datasets/Data.csv')
+X = dataset.iloc[ : , :-1].values
+Y = dataset.iloc[ : , 3].values
+print("Step 2: Importing dataset")
+print("X")
+print(X)
+print("Y")
+print(Y)
+
+#Step 3: Handling the missing data
+# If you use an older version of sklearn, use the commented-out Imputer lines below instead
+from sklearn.impute import SimpleImputer
+imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
+#from sklearn.preprocessing import Imputer
+# axis=0 meant "impute column-wise" in the deprecated Imputer API
+#imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
+imputer = imputer.fit(X[ : , 1:3])
+X[ : , 1:3] = imputer.transform(X[ : , 1:3])
+print("---------------------")
+print("Step 3: Handling the missing data")
+print("X")
+print(X)
+
+#Step 4: Encoding categorical data
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+#labelencoder_X = LabelEncoder()
+#X[ : , 0] = labelencoder_X.fit_transform(X[ : , 0])
+#Creating dummy variables for the categorical column
+#print(X)
+ct = ColumnTransformer([("onehot", OneHotEncoder(), [0])], remainder = 'passthrough')
+X = ct.fit_transform(X)
+#onehotencoder = OneHotEncoder(categorical_features = [0])
+#X = onehotencoder.fit_transform(X).toarray()
+labelencoder_Y = LabelEncoder()
+Y = labelencoder_Y.fit_transform(Y)
+print("---------------------")
+print("Step 4: Encoding categorical data")
+print("X")
+print(X)
+print("Y")
+print(Y)
+
+#Step 5: Splitting the dataset into training and test sets
+from sklearn.model_selection import train_test_split
+X_train, X_test, Y_train, Y_test = train_test_split( X , Y , test_size = 0.2, random_state = 0)
+print("---------------------")
+print("Step 5: Splitting the dataset into training and test sets")
+print("X_train")
+print(X_train)
+print("X_test")
+print(X_test)
+print("Y_train")
+print(Y_train)
+print("Y_test")
+print(Y_test)
+
+#Step 6: Feature Scaling
+from sklearn.preprocessing import StandardScaler
+sc_X = StandardScaler()
+X_train = sc_X.fit_transform(X_train)
+X_test = sc_X.transform(X_test)
+print("---------------------")
+print("Step 6: Feature Scaling")
+print("X_train")
+print(X_train)
+print("X_test")
+print(X_test)
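
Reviewer note (not part of the patch): the script above fits the imputer on all of X before the train/test split, and Step 6 scales the one-hot dummy columns along with the numeric ones. Below is a minimal sketch of the same six steps using sklearn's composed Pipeline/ColumnTransformer API, which keeps imputation and scaling per-column and fits everything on the training split only. It assumes the Data.csv layout this repo appears to use (categorical column at index 0, numeric columns at indices 1-2, label in the last column); adjust the indices if your copy differs.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

dataset = pd.read_csv('../datasets/Data.csv')
X = dataset.iloc[:, :-1]
Y = LabelEncoder().fit_transform(dataset.iloc[:, -1])

# Impute and scale only the numeric columns; one-hot encode only the
# categorical column. remainder defaults to 'drop', which is fine here
# because every column is listed explicitly.
preprocess = ColumnTransformer([
    ("num", Pipeline([("impute", SimpleImputer(strategy="mean")),
                      ("scale", StandardScaler())]), [1, 2]),
    ("cat", OneHotEncoder(), [0]),
])

# Split first, then fit the preprocessing on the training split only,
# so no test-set statistics leak into the imputer or the scaler.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

The result matches the script's output up to column order (ColumnTransformer emits its blocks in the order listed, numeric first here), and the dummy columns stay unscaled, which is usually the intended behavior.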