Fitting LM

This commit is contained in:
Noah L. Schrick 2023-02-27 17:10:53 -06:00
parent 7fc3a898dc
commit 4e7e4fc474

View File

@ -638,22 +638,18 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Linear? \n", "# Linear? \n",
"There does not appear to be a linear relationship between spending and frequency or spending and last update days ago.\n", "There does not appear to be a linear relationship between spending and last update days ago.\n",
"An argument could be made for Frequency and Spending as spending gets larger, but neither scatter plot seems to indicate a linear relationship." "An argument could be made for Frequency and Spending as spending gets larger, but neither scatter plot seems to indicate a linear relationship. A linear fit of Spending on Frequency would have a low R-squared value."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 107, "execution_count": 109,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# c. i\n", "# c. i\n",
"predictors = ['sequence_number', 'US', 'source_a', 'source_c', 'source_b', 'source_d', 'source_e',\n", "predictors = ['US','Freq', 'last_update_days_ago', 'Web order', 'Gender=male', 'Address_is_res']\n",
" 'source_m', 'source_o', 'source_h', 'source_r', 'source_s', 'source_t',\n",
" 'source_u', 'source_p', 'source_x', 'source_w', 'Freq', 'last_update_days_ago',\n",
" '1st_update_days_ago', 'Web order', 'Gender=male', 'Address_is_res', 'Purchase',\n",
" 'Spending']\n",
"\n", "\n",
"outcome = 'Spending'\n", "outcome = 'Spending'\n",
"X = pd.get_dummies(tayko_df[predictors], drop_first=True)\n", "X = pd.get_dummies(tayko_df[predictors], drop_first=True)\n",
@ -663,11 +659,47 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 111,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"intercept 10.17629741458822\n",
" Predictor coefficient\n",
"0 US -4.620293\n",
"1 Freq 91.274450\n",
"2 last_update_days_ago -0.010374\n",
"3 Web order 18.628731\n",
"4 Gender=male -9.111366\n",
"5 Address_is_res -75.815354\n"
]
},
{
"ename": "ValueError",
"evalue": "operands could not be broadcast together with shapes (800,) (1200,) ",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[111], line 10\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[39mprint\u001b[39m(pd\u001b[39m.\u001b[39mDataFrame({\u001b[39m'\u001b[39m\u001b[39mPredictor\u001b[39m\u001b[39m'\u001b[39m: X\u001b[39m.\u001b[39mcolumns, \u001b[39m'\u001b[39m\u001b[39mcoefficient\u001b[39m\u001b[39m'\u001b[39m: tayko_lm\u001b[39m.\u001b[39mcoef_}))\n\u001b[1;32m 9\u001b[0m \u001b[39m# print performance measures\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m regressionSummary(valid_y, tayko_lm\u001b[39m.\u001b[39;49mpredict(train_X))\n",
"File \u001b[0;32m~/.local/lib/python3.10/site-packages/dmba/metric.py:71\u001b[0m, in \u001b[0;36mregressionSummary\u001b[0;34m(y_true, y_pred)\u001b[0m\n\u001b[1;32m 69\u001b[0m y_true \u001b[39m=\u001b[39m _toArray(y_true)\n\u001b[1;32m 70\u001b[0m y_pred \u001b[39m=\u001b[39m _toArray(y_pred)\n\u001b[0;32m---> 71\u001b[0m y_res \u001b[39m=\u001b[39m y_true \u001b[39m-\u001b[39;49m y_pred\n\u001b[1;32m 72\u001b[0m metrics \u001b[39m=\u001b[39m [\n\u001b[1;32m 73\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Error (ME)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(y_res) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 74\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mRoot Mean Squared Error (RMSE)\u001b[39m\u001b[39m'\u001b[39m, math\u001b[39m.\u001b[39msqrt(mean_squared_error(y_true, y_pred))),\n\u001b[1;32m 75\u001b[0m (\u001b[39m'\u001b[39m\u001b[39mMean Absolute Error (MAE)\u001b[39m\u001b[39m'\u001b[39m, \u001b[39msum\u001b[39m(\u001b[39mabs\u001b[39m(y_res)) \u001b[39m/\u001b[39m \u001b[39mlen\u001b[39m(y_res)),\n\u001b[1;32m 76\u001b[0m ]\n\u001b[1;32m 77\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mall\u001b[39m(yt \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m \u001b[39mfor\u001b[39;00m yt \u001b[39min\u001b[39;00m y_true):\n",
"\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (800,) (1200,) "
]
}
],
"source": [ "source": [
"# c. ii" "# c. ii\n",
"tayko_lm = LinearRegression()\n",
"tayko_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', tayko_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': tayko_lm.coef_}))\n",
"\n",
"# print performance measures\n",
"regressionSummary(valid_y, tayko_lm.predict(valid_X))"
] ]
} }
], ],