77 lines
2.2 KiB
Python
77 lines
2.2 KiB
Python
#Day 1: Data Preprocessing
|
|
|
|
#Step 1: Importing the libraries
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
#Step 2: Importing dataset
# Load the CSV into a DataFrame, then split it into a feature matrix X
# (every column except the last) and a target vector Y (the last column).
dataset = pd.read_csv('../datasets/Data.csv')
X = dataset.iloc[:, :-1].values
# Select the target with -1 rather than a fixed position so it always matches
# the ":-1" feature slice above, whatever the number of columns in the file.
Y = dataset.iloc[:, -1].values
print("Step 2: Importing dataset")
print("X")
print(X)
print("Y")
print(Y)
|
|
|
|
#Step 3: Handling the missing data
# SimpleImputer is the imputation API of current scikit-learn releases. It
# supersedes the legacy sklearn.preprocessing.Imputer class, which was called
# as Imputer(missing_values="NaN", strategy="mean", axis=0), where axis=0
# meant "impute column by column".
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Fit on columns 1 and 2 and write the imputed values back in place: every
# NaN is replaced by the mean of its own column.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
print("---------------------")
print("Step 3: Handling the missing data")
print("X")
print(X)
|
|
|
|
#Step 4: Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 into dummy variables and pass every other column
# through untouched. ColumnTransformer takes the place of the removed
# OneHotEncoder(categorical_features=[0]) form.
# sparse_threshold=0 forces a dense ndarray result (the old code obtained this
# with .toarray()), so the later printing and scaling steps never receive a
# SciPy sparse matrix.
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)
# Convert the target labels into integer codes.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
print("---------------------")
print("Step 4: Encoding categorical data")
print("X")
print(X)
print("Y")
print(Y)
|
|
|
|
#Step 5: Splitting the datasets into training sets and Test sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

print("---------------------")
print("Step 5: Splitting the datasets into training sets and Test sets")
# Each call prints the label on one line and the array on the following lines.
print("X_train", X_train, sep="\n")
print("X_test", X_test, sep="\n")
print("Y_train", Y_train, sep="\n")
print("Y_test", Y_test, sep="\n")
|
|
|
|
#Step 6: Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

print("---------------------")
print("Step 6: Feature Scaling")
# Each call prints the label on one line and the array on the following lines.
print("X_train", X_train, sep="\n")
print("X_test", X_test, sep="\n")
|