From 2dcdd4bd72b0a26148f91186713252f428ccfc1e Mon Sep 17 00:00:00 2001 From: jacksu <371387455@qq.com> Date: Thu, 6 Dec 2018 14:16:51 +0800 Subject: [PATCH] add some description --- Code/Day 3_Multiple_Linear_Regression.ipynb | 207 ++++++++++++++++---- 1 file changed, 168 insertions(+), 39 deletions(-) diff --git a/Code/Day 3_Multiple_Linear_Regression.ipynb b/Code/Day 3_Multiple_Linear_Regression.ipynb index 979fe09..68f5ec4 100644 --- a/Code/Day 3_Multiple_Linear_Regression.ipynb +++ b/Code/Day 3_Multiple_Linear_Regression.ipynb @@ -24,9 +24,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -42,15 +40,40 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[165349.2 136897.8 471784.1 'New York']\n", + " [162597.7 151377.59 443898.53 'California']\n", + " [153441.51 101145.55 407934.54 'Florida']\n", + " [144372.41 118671.85 383199.62 'New York']\n", + " [142107.34 91391.77 366168.42 'Florida']\n", + " [131876.9 99814.71 362861.36 'New York']\n", + " [134615.46 147198.87 127716.82 'California']\n", + " [130298.13 145530.06 323876.68 'Florida']\n", + " [120542.52 148718.95 311613.29 'New York']\n", + " [123334.88 108679.17 304981.62 'California']]\n", + "[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51\n", + " 155752.6 152211.77 149759.96 146121.95 144259.4 141585.52 134307.35\n", + " 132602.65 129917.04 126992.93 125370.37 124266.9 122776.86 118474.03\n", + " 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31\n", + " 103282.38 101004.64 99937.59 97483.56 97427.84 96778.92 96712.8\n", + " 96479.51 90708.19 89949.14 81229.06 81005.76 78239.91 77798.83\n", + " 71498.49 69758.98 65200.33 64926.08 49490.75 42559.73 35673.41\n", + " 
14681.4 ]\n" + ] + } + ], "source": [ "dataset = pd.read_csv('../datasets/50_Startups.csv')\n", "X = dataset.iloc[ : , :-1].values\n", - "Y = dataset.iloc[ : , 4 ].values" + "Y = dataset.iloc[ : , 4 ].values\n", + "print(X[:10])\n", + "print(Y)" ] }, { @@ -62,35 +85,82 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "labelencoder:\n", + "[[165349.2 136897.8 471784.1 2]\n", + " [162597.7 151377.59 443898.53 0]\n", + " [153441.51 101145.55 407934.54 1]\n", + " [144372.41 118671.85 383199.62 2]\n", + " [142107.34 91391.77 366168.42 1]\n", + " [131876.9 99814.71 362861.36 2]\n", + " [134615.46 147198.87 127716.82 0]\n", + " [130298.13 145530.06 323876.68 1]\n", + " [120542.52 148718.95 311613.29 2]\n", + " [123334.88 108679.17 304981.62 0]]\n", + "onehot:\n", + "[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05\n", + " 4.7178410e+05]\n", + " [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05\n", + " 4.4389853e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05\n", + " 4.0793454e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05\n", + " 3.8319962e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04\n", + " 3.6616842e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.3187690e+05 9.9814710e+04\n", + " 3.6286136e+05]\n", + " [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.3461546e+05 1.4719887e+05\n", + " 1.2771682e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.3029813e+05 1.4553006e+05\n", + " 3.2387668e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.2054252e+05 1.4871895e+05\n", + " 3.1161329e+05]\n", + " [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.2333488e+05 1.0867917e+05\n", + " 3.0498162e+05]]\n" + ] + } + ], "source": [ 
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n", "labelencoder = LabelEncoder()\n", "X[: , 3] = labelencoder.fit_transform(X[ : , 3])\n", + "print(\"labelencoder:\")\n", + "print(X[:10])\n", "onehotencoder = OneHotEncoder(categorical_features = [3])\n", - "X = onehotencoder.fit_transform(X).toarray()" + "X = onehotencoder.fit_transform(X).toarray()\n", + "print(\"onehot:\")\n", + "print(X[:10])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**躲避虚拟变量陷阱**" + "**躲避虚拟变量陷阱**\n", + "\n", + "在回归预测中我们需要所有的数据都是numeric的,但是会有一些非numeric的数据,比如国家,省,部门,性别。这时候我们需要设置虚拟变量(Dummy variable)。做法是将此变量中的每一个值,衍生成为新的变量,是设为1,否设为0。举个例子,“性别”这个变量,我们可以虚拟出“男”和“女”两个虚拟变量,男性的话“男”值为1,“女”值为0;女性的话“男”值为0,“女”值为1。\n", + "\n", + "但是要注意,这时候虚拟变量陷阱就出现了。就拿性别来说,其实一个虚拟变量就够了,比如 1 的时候是“男”, 0 的时候是“非男”,即为女。如果设置两个虚拟变量“男”和“女”,语义上来说没有问题,可以理解,但是在回归预测中会多出一个变量,多出的这个变量将会对回归预测结果产生影响。一般来说,虚拟变量的个数要比实际变量的种类少一个。 \n", + "\n", + "在多重线性回归中,变量不是越多越好,而是选择适合的变量。这样才会对结果准确预测。如果category类的特征都放进去,拟合的时候,所有权重的计算,都可以有两种方法实现,一种是提高某个category的w,一种是降低其他category的w,这两种效果是等效的,也就是发生了共线性:每个样本的各虚拟变量取值相加恒为1(与截距项完全共线),出现完全共线陷阱。\n", + "\n", + "**但是下面测试竟然和想法不一致。。。**(说明:这其实是预期结果——sklearn 的 LinearRegression 用最小二乘求解,完全共线只会让系数不唯一,而拟合值和预测值不受影响,所以保留全部虚拟变量的 X 与去掉一列的 X1 预测结果相同;虚拟变量陷阱影响的是系数的唯一性和可解释性。)" ] }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, + "execution_count": 32, + "metadata": {}, "outputs": [], "source": [ - "X = X[: , 1:]" + "X1 = X[: , 1:]" ] }, { @@ -102,11 +172,58 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 39, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.0000000e+00 1.0000000e+00 0.0000000e+00 6.6051520e+04 1.8264556e+05\n", + " 1.1814820e+05]\n", + " [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.0067196e+05 9.1790610e+04\n", + " 2.4974455e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.0191308e+05 1.1059411e+05\n", + " 2.2916095e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 2.7892920e+04 8.4710770e+04\n", + " 1.6447071e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05\n", + " 
4.0793454e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 7.2107600e+04 1.2786455e+05\n", + " 3.5318381e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 2.0229590e+04 6.5947930e+04\n", + " 1.8526510e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0000000e+00 6.1136380e+04 1.5270192e+05\n", + " 8.8218230e+04]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 7.3994560e+04 1.2278275e+05\n", + " 3.0331926e+05]\n", + " [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04\n", + " 3.6616842e+05]]\n", + "[103282.38 144259.4 146121.95 77798.83 191050.39 105008.31 81229.06\n", + " 97483.56 110352.25 166187.94]\n", + "[[1.0000000e+00 0.0000000e+00 6.6051520e+04 1.8264556e+05 1.1814820e+05]\n", + " [0.0000000e+00 0.0000000e+00 1.0067196e+05 9.1790610e+04 2.4974455e+05]\n", + " [1.0000000e+00 0.0000000e+00 1.0191308e+05 1.1059411e+05 2.2916095e+05]\n", + " [1.0000000e+00 0.0000000e+00 2.7892920e+04 8.4710770e+04 1.6447071e+05]\n", + " [1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05]\n", + " [0.0000000e+00 1.0000000e+00 7.2107600e+04 1.2786455e+05 3.5318381e+05]\n", + " [0.0000000e+00 1.0000000e+00 2.0229590e+04 6.5947930e+04 1.8526510e+05]\n", + " [0.0000000e+00 1.0000000e+00 6.1136380e+04 1.5270192e+05 8.8218230e+04]\n", + " [1.0000000e+00 0.0000000e+00 7.3994560e+04 1.2278275e+05 3.0331926e+05]\n", + " [1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04 3.6616842e+05]]\n", + "[103282.38 144259.4 146121.95 77798.83 191050.39 105008.31 81229.06\n", + " 97483.56 110352.25 166187.94]\n" + ] + } + ], "source": [ "from sklearn.model_selection import train_test_split\n", - "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)" + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)\n", + "X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y, test_size = 0.2, random_state = 0)\n", + "print(X_test)\n", + "print(Y_test)\n", + 
"print(X1_test)\n", + "print(Y1_test)" ] }, { @@ -118,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -127,7 +244,7 @@ "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" ] }, - "execution_count": 6, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -135,52 +252,64 @@ "source": [ "from sklearn.linear_model import LinearRegression\n", "regressor = LinearRegression()\n", - "regressor.fit(X_train, Y_train)" + "regressor.fit(X_train, Y_train)\n", + "regressor1 = LinearRegression()\n", + "regressor1.fit(X1_train, Y1_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 第3步:在测试集上预测结果" + "## 第3步:在测试集上预测结果" ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": true - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ - "y_pred = regressor.predict(X_test)" + "y_pred = regressor.predict(X_test)\n", + "y1_pred = regressor1.predict(X1_test)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[ 103015.20159796 132582.27760815 132447.73845175 71976.09851258\n", - " 178537.48221056 116161.24230166 67851.69209676 98791.73374687\n", - " 113969.43533013 167921.06569551]\n" + "[103015.20159796 132582.27760815 132447.73845173 71976.09851258\n", + " 178537.48221051 116161.24230163 67851.69209676 98791.73374689\n", + " 113969.43533011 167921.06569547]\n", + "[103015.20159795 132582.27760817 132447.73845176 71976.09851257\n", + " 178537.48221058 116161.24230165 67851.69209675 98791.73374686\n", + " 113969.43533013 167921.06569553]\n" ] } ], "source": [ - "print(y_pred)" + "print(y_pred)\n", + "print(y1_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "完整的项目请前往Github项目100-Days-Of-ML-Code查看。有任何的建议或者意见欢迎在issue中提出~" + 
"**完整的项目请前往Github项目100-Days-Of-ML-Code查看。有任何的建议或者意见欢迎在issue中提出~**" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -199,7 +328,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.5" } }, "nbformat": 4,