Predictor exploration, subset selection, and determining models

This commit is contained in:
Noah L. Schrick 2023-02-27 15:33:04 -06:00
parent b5e5908f6e
commit 0f2d8787ca

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@ -19,7 +19,10 @@
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
"import statsmodels.formula.api as sm\n",
"import matplotlib.pylab as plt\n",
"import seaborn as sns\n"
"import seaborn as sns\n",
"from dmba import regressionSummary, exhaustive_search\n",
"from dmba import backward_elimination, forward_selection, stepwise_selection\n",
"from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
]
},
{
@ -74,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 31,
"metadata": {},
"outputs": [
{
@ -85,7 +88,15 @@
" Predictor coefficient\n",
"0 CRIM -0.240062\n",
"1 CHAS 3.266817\n",
"2 RM 8.325175\n"
"2 RM 8.325175\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
@ -108,6 +119,9 @@
"print('intercept ', housing_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': housing_lm.coef_}))\n",
"\n",
"# print performance measures\n",
"regressionSummary(train_y, housing_lm.predict(train_X))\n",
"\n",
"# Equation:\n",
"# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n"
]
@ -190,22 +204,304 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: >"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# d.\n",
"# ii."
"# ii.\n",
"# Create the figure in this cell so the heatmap is rendered together with it;\n",
"# an `ax` left over from an earlier cell may belong to a figure that has\n",
"# already been displayed.\n",
"# NOTE(review): `corr` is assumed to be the correlation matrix computed in an\n",
"# earlier cell -- confirm it exists after Restart & Run All.\n",
"fig, ax = plt.subplots()\n",
"# Trailing semicolon suppresses the Axes repr in the cell output.\n",
"sns.heatmap(corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax);"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Highly Correlated Pairs\n",
"- ZN and DIS\n",
"- RAD and TAX\n",
"- PTRATIO and RAD\n",
"- PTRATIO and TAX"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Backward\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=1952.30\n",
"Step: score=1952.30, remove None\n",
"['CRIM', 'CHAS', 'RM']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
"\n"
]
}
],
"source": [
"# d.\n",
"# iii."
"# iii.\n",
"def train_model(variables):\n",
"    \"\"\"Fit a linear regression on the training data restricted to `variables`.\n",
"\n",
"    Returns None when `variables` is empty; otherwise a LinearRegression\n",
"    fitted on train_X[variables] against train_y.\n",
"    \"\"\"\n",
"    # len() rather than truthiness: callers pass train_X.columns, not only lists.\n",
"    if not len(variables):\n",
"        return None\n",
"    return LinearRegression().fit(train_X[variables], train_y)\n",
"\n",
"def score_model(model, variables):\n",
"    \"\"\"Score a candidate model with AIC_score on the training data.\n",
"\n",
"    With no variables, the prediction is the constant mean of train_y and\n",
"    df=1 is passed to AIC_score; otherwise the model's own predictions on\n",
"    train_X[variables] are scored.\n",
"    \"\"\"\n",
"    if len(variables) > 0:\n",
"        return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
"    mean_baseline = [train_y.mean()] * len(train_y)\n",
"    return AIC_score(train_y, mean_baseline, model, df=1)\n",
"\n",
"# Backward elimination starting from every column of train_X, scored by score_model.\n",
"print(\"Backward\")\n",
"best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_back_variables)\n",
"# Predict with exactly the columns the selected model was fitted on; passing\n",
"# all of train_X is only correct when elimination removes nothing.\n",
"regressionSummary(train_y, best_back_model.predict(train_X[best_back_variables]))\n",
"print()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Forward\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=2191.75, constant\n",
"Step: score=1989.28, add RM\n",
"Step: score=1956.79, add CRIM\n",
"Step: score=1952.30, add CHAS\n",
"Step: score=1952.30, add None\n",
"['RM', 'CRIM', 'CHAS']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
"source": [
"# Forward selection over the columns of train_X, scored by score_model.\n",
"print(\"Forward\")\n",
"best_forw_model, best_forw_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_forw_variables)\n",
"# Use the variables the search returned (in the order the model was fitted on)\n",
"# instead of a hard-coded column list, so the cell stays correct if the\n",
"# selection changes.\n",
"forw_train_X = train_X.loc[:, best_forw_variables]\n",
"regressionSummary(train_y, best_forw_model.predict(forw_train_X))\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Stepwise\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=2191.75, constant\n",
"Step: score=1989.28, add RM\n",
"Step: score=1956.79, add CRIM\n",
"Step: score=1952.30, add CHAS\n",
"Step: score=1952.30, add None\n",
"['RM', 'CRIM', 'CHAS']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
"source": [
"# d iii. continued\n",
"# Stepwise search via dmba's stepwise_selection (imported in the first cell),\n",
"# called with the same train_model / score_model callbacks as the other searches.\n",
"print(\"Stepwise\")\n",
"best_step_model, best_step_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_step_variables)\n",
"# Predict on the columns the search returned, in the order the model was fitted on.\n",
"step_train_X = train_X.loc[:, best_step_variables]\n",
"# regressionSummary prints its report, so it is called once and not assigned.\n",
"regressionSummary(train_y, best_step_model.predict(step_train_X))\n"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LASSO\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.2627\n",
" Root Mean Squared Error (RMSE) : 6.7153\n",
" Mean Absolute Error (MAE) : 4.7355\n",
" Mean Percentage Error (MPE) : -8.5983\n",
"Mean Absolute Percentage Error (MAPE) : 23.9824\n",
"\n",
"\n",
"LASSO CV\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1124\n",
" Root Mean Squared Error (RMSE) : 6.4186\n",
" Mean Absolute Error (MAE) : 4.4592\n",
" Mean Percentage Error (MPE) : -7.7091\n",
"Mean Absolute Percentage Error (MAPE) : 23.1854\n",
"Lasso-CV chosen regularization: 0.033515828458353755\n",
"[-0.24201538 2.81692528 8.25934245]\n",
"\n",
"\n",
"RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1201\n",
" Root Mean Squared Error (RMSE) : 6.4138\n",
" Mean Absolute Error (MAE) : 4.4590\n",
" Mean Percentage Error (MPE) : -7.6484\n",
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
"\n",
"\n",
"BAYESIAN RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1211\n",
" Root Mean Squared Error (RMSE) : 6.4144\n",
" Mean Absolute Error (MAE) : 4.4603\n",
" Mean Percentage Error (MPE) : -7.6595\n",
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
"\n",
"\n"
]
}
],
"source": [
"# d iii model\n",
"# Lasso with a fixed penalty (alpha=1): fit on train_X / train_y, then report\n",
"# the error measures against valid_X / valid_y.\n",
"print(\"LASSO\")\n",
"lasso = Lasso(alpha=1).fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso.predict(valid_X))\n",
"print(\"\\n\")\n",
"\n",
"# LassoCV with cv=5: same fit/report steps, then print the alpha it chose\n",
"# and the fitted coefficients.\n",
"print(\"LASSO CV\")\n",
"lasso_cv = LassoCV(cv=5).fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
"print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
"print(lasso_cv.coef_)\n",
"print(\"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1201\n",
" Root Mean Squared Error (RMSE) : 6.4138\n",
" Mean Absolute Error (MAE) : 4.4590\n",
" Mean Percentage Error (MPE) : -7.6484\n",
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
"\n",
"\n",
"BAYESIAN RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1211\n",
" Root Mean Squared Error (RMSE) : 6.4144\n",
" Mean Absolute Error (MAE) : 4.4603\n",
" Mean Percentage Error (MPE) : -7.6595\n",
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
"\n",
"\n"
]
}
],
"source": [
"# Ridge with a fixed penalty (alpha=1): fit on train_X / train_y, then report\n",
"# the error measures against valid_X / valid_y.\n",
"print(\"RIDGE\")\n",
"ridge = Ridge(alpha=1).fit(train_X, train_y)\n",
"regressionSummary(valid_y, ridge.predict(valid_X))\n",
"print(\"\\n\")\n",
"\n",
"# Bayesian ridge with default settings; the regularization printed below is\n",
"# the ratio lambda_ / alpha_ of the fitted estimator.\n",
"print(\"BAYESIAN RIDGE\")\n",
"bayesianRidge = BayesianRidge().fit(train_X, train_y)\n",
"regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
"print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)\n",
"print(\"\\n\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Best model\n",
"- Bayesian Ridge: validation metrics nearly identical to Ridge (MAPE 23.1747 vs. 23.1724)\n",
"- Ridge: Lowest RMSE, lowest MAE, lowest MAPE\n",
"\n",
"Ridge or Bayesian Ridge should be used. Further parameter tuning can assist in selecting which of the two models to use."
]
},
{