diff --git a/Schrick-Noah_Learning-Practice-5.ipynb b/Schrick-Noah_Learning-Practice-5.ipynb index 0e57b77..a124100 100644 --- a/Schrick-Noah_Learning-Practice-5.ipynb +++ b/Schrick-Noah_Learning-Practice-5.ipynb @@ -638,22 +638,18 @@ "metadata": {}, "source": [ "# Linear? \n", - "There does not appear to be a linear relationship between spending and frequency or spending and last update days ago.\n", - "An argument could be made for Frequency and Spending as spending gets larger, but both scatter plots do not seem to indicate a linear relationship." + "There does not appear to be a linear relationship between Spending and the number of days since the last update (`last_update_days_ago`).\n", + "An argument could be made for a linear trend between Frequency and Spending as Spending gets larger, but neither scatter plot clearly indicates a linear relationship. A linear fit of Spending on Frequency would have a low R-squared value." ] }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "# c. 
i\n", - "predictors = ['sequence_number', 'US', 'source_a', 'source_c', 'source_b', 'source_d', 'source_e',\n", - " 'source_m', 'source_o', 'source_h', 'source_r', 'source_s', 'source_t',\n", - " 'source_u', 'source_p', 'source_x', 'source_w', 'Freq', 'last_update_days_ago',\n", - " '1st_update_days_ago', 'Web order', 'Gender=male', 'Address_is_res', 'Purchase',\n", - " 'Spending']\n", + "predictors = ['US','Freq', 'last_update_days_ago', 'Web order', 'Gender=male', 'Address_is_res']\n", "\n", "outcome = 'Spending'\n", "X = pd.get_dummies(tayko_df[predictors], drop_first=True)\n", @@ -663,11 +659,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "intercept 10.17629741458822\n", + " Predictor coefficient\n", + "0 US -4.620293\n", + "1 Freq 91.274450\n", + "2 last_update_days_ago -0.010374\n", + "3 Web order 18.628731\n", + "4 Gender=male -9.111366\n", + "5 Address_is_res -75.815354\n" + ] + }, + { + "ename": "ValueError", + "evalue": "operands could not be broadcast together with shapes (800,) (1200,) ", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[111], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[39mprint\u001b[39m(pd\u001b[39m.\u001b[39mDataFrame({\u001b[39m'\u001b[39m\u001b[39mPredictor\u001b[39m\u001b[39m'\u001b[39m: X\u001b[39m.\u001b[39mcolumns, \u001b[39m'\u001b[39m\u001b[39mcoefficient\u001b[39m\u001b[39m'\u001b[39m: tayko_lm\u001b[39m.\u001b[39mcoef_}))\n\u001b[1;32m 9\u001b[0m \u001b[39m# print performance measures\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m regressionSummary(valid_y, tayko_lm\u001b[39m.\u001b[39;49mpredict(train_X))\n", + "File 
\u001b[0;32m~/.local/lib/python3.10/site-packages/dmba/metric.py:71\u001b[0m, in \u001b[0;36mregressionSummary\u001b[0;34m(y_true, y_pred)\u001b[0m\n\u001b[1;32m 69\u001b[0m y_true \u001b[39m=\u001b[39m _toArray(y_true)\n\u001b[1;32m 70\u001b[0m y_pred \u001b[39m=\u001b[39m _toArray(y_pred)\n\u001b[0;32m---> 71\u001b[0m y_res \u001b[39m=\u001b[39m y_true \u001b[39m-\u001b[39;49m y_pred\n\u001b[1;32m 72\u001b[0m metrics \u001b[39m=\u001b[39m [\n\u001b[1;32m 73\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Error (ME)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(y_res) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 74\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mRoot Mean Squared Error (RMSE)\u001b[39m\u001b[39m'\u001b[39m, math\u001b[39m.\u001b[39msqrt(mean_squared_error(y_true, y_pred))),\n\u001b[1;32m 75\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Absolute Error (MAE)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(\u001b[39mabs\u001b[39m(y_res)) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 76\u001b[0m ]\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mall\u001b[39m(yt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mfor\u001b[39;00m yt \u001b[39min\u001b[39;00m y_true):\n", + "\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (800,) (1200,) " + ] + } + ], "source": [ - "# c. ii" + "# c. ii\n", + "tayko_lm = LinearRegression()\n", + "tayko_lm.fit(train_X, train_y)\n", + "\n", + "# print coefficients\n", + "print('intercept ', tayko_lm.intercept_)\n", + "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': tayko_lm.coef_}))\n", + "\n", + "# print performance measures\n", + "regressionSummary(valid_y, tayko_lm.predict(valid_X))" ] } ],