Predictor exploration, subset selection, and determining models

Noah L. Schrick 2023-02-27 15:33:04 -06:00
parent b5e5908f6e
commit 0f2d8787ca


@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@@ -19,7 +19,10 @@
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
"import statsmodels.formula.api as sm\n",
"import matplotlib.pylab as plt\n",
"import seaborn as sns\n",
"from dmba import regressionSummary, exhaustive_search\n",
"from dmba import backward_elimination, forward_selection, stepwise_selection\n",
"from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
]
},
{
@@ -74,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
@@ -85,7 +88,15 @@
" Predictor coefficient\n",
"0 CRIM -0.240062\n",
"1 CHAS 3.266817\n",
"2 RM 8.325175\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
@@ -108,6 +119,9 @@
"print('intercept ', housing_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': housing_lm.coef_}))\n",
"\n",
"# print performance measures\n",
"regressionSummary(train_y, housing_lm.predict(train_X))\n",
"\n",
"# Equation:\n", "# Equation:\n",
"# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n" "# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n"
] ]
@ -190,22 +204,304 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: >"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# d.\n", "# d.\n",
"# ii." "# ii.\n",
"sns.heatmap(corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Highly Correlated Pairs\n",
"ZN and DIS\n",
"RAD and TAX\n",
"PTRATIO and RAD\n",
"PTRATIO and TAX"
]
},
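{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of how the pairs above can be pulled programmatically, assuming `corr` is the correlation matrix already computed for the heatmap; the 0.7 cutoff is an illustrative choice, not one fixed by the exercise."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: list predictor pairs with |correlation| above a threshold.\n",
"# Assumes `corr` is the DataFrame correlation matrix used for the heatmap above.\n",
"import numpy as np\n",
"\n",
"threshold = 0.7  # illustrative cutoff\n",
"upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # strict upper triangle\n",
"pairs = upper.stack()  # MultiIndex Series: (var1, var2) -> correlation\n",
"print(pairs[pairs.abs() > threshold].sort_values(ascending=False))"
]
},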
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Backward\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=1952.30\n",
"Step: score=1952.30, remove None\n",
"['CRIM', 'CHAS', 'RM']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
"\n"
]
}
],
"source": [ "source": [
"# d.\n", "# d.\n",
"# iii." "# iii.\n",
"def train_model(variables):\n",
" if len(variables) == 0:\n",
" return None\n",
" model = LinearRegression()\n",
" model.fit(train_X[variables], train_y)\n",
" return model\n",
"\n",
"def score_model(model, variables):\n",
" if len(variables) == 0:\n",
" return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n",
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
"\n",
"print(\"Backward\")\n",
"best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_back_variables)\n",
"regressionSummary(train_y, best_back_model.predict(train_X))\n",
"print()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Forward\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=2191.75, constant\n",
"Step: score=1989.28, add RM\n",
"Step: score=1956.79, add CRIM\n",
"Step: score=1952.30, add CHAS\n",
"Step: score=1952.30, add None\n",
"['RM', 'CRIM', 'CHAS']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
"source": [
"print(\"Forward\")\n",
"best_forw_model, best_forw_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_forw_variables)\n",
"forw_train_X = train_X.loc[:,['RM','CRIM','CHAS']]\n",
"regressionSummary(train_y, best_forw_model.predict(forw_train_X))\n"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Stepwise\n",
"Variables: CRIM, CHAS, RM\n",
"Start: score=2191.75, constant\n",
"Step: score=1989.28, add RM\n",
"Step: score=1956.79, add CRIM\n",
"Step: score=1952.30, add CHAS\n",
"Step: score=1952.30, add None\n",
"['RM', 'CRIM', 'CHAS']\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : -0.0000\n",
" Root Mean Squared Error (RMSE) : 5.9666\n",
" Mean Absolute Error (MAE) : 3.9668\n",
" Mean Percentage Error (MPE) : -7.2747\n",
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
]
}
],
"source": [
"# d iii. continued\n",
"print(\"Stepwise\")\n",
"best_step_model, best_step_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
"print(best_step_variables)\n",
"step_train_X = train_X.loc[:,['RM','CRIM','CHAS']]\n",
"regressionSummary(train_y, best_step_model.predict(step_train_X))\n",
"test=regressionSummary(train_y, best_step_model.predict(step_train_X))\n"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LASSO\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.2627\n",
" Root Mean Squared Error (RMSE) : 6.7153\n",
" Mean Absolute Error (MAE) : 4.7355\n",
" Mean Percentage Error (MPE) : -8.5983\n",
"Mean Absolute Percentage Error (MAPE) : 23.9824\n",
"\n",
"\n",
"LASSO CV\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1124\n",
" Root Mean Squared Error (RMSE) : 6.4186\n",
" Mean Absolute Error (MAE) : 4.4592\n",
" Mean Percentage Error (MPE) : -7.7091\n",
"Mean Absolute Percentage Error (MAPE) : 23.1854\n",
"Lasso-CV chosen regularization: 0.033515828458353755\n",
"[-0.24201538 2.81692528 8.25934245]\n",
"\n",
"\n",
"RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1201\n",
" Root Mean Squared Error (RMSE) : 6.4138\n",
" Mean Absolute Error (MAE) : 4.4590\n",
" Mean Percentage Error (MPE) : -7.6484\n",
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
"\n",
"\n",
"BAYESIAN RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1211\n",
" Root Mean Squared Error (RMSE) : 6.4144\n",
" Mean Absolute Error (MAE) : 4.4603\n",
" Mean Percentage Error (MPE) : -7.6595\n",
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
"\n",
"\n"
]
}
],
"source": [
"# d iii model\n",
"print(\"LASSO\")\n",
"lasso = Lasso(alpha=1)\n",
"lasso.fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso.predict(valid_X))\n",
"print(\"\\n\")\n",
"\n",
"print(\"LASSO CV\")\n",
"lasso_cv = LassoCV(cv=5)\n",
"lasso_cv.fit(train_X, train_y)\n",
"regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
"print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
"print(lasso_cv.coef_)\n",
"print(\"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1201\n",
" Root Mean Squared Error (RMSE) : 6.4138\n",
" Mean Absolute Error (MAE) : 4.4590\n",
" Mean Percentage Error (MPE) : -7.6484\n",
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
"\n",
"\n",
"BAYESIAN RIDGE\n",
"\n",
"Regression statistics\n",
"\n",
" Mean Error (ME) : 0.1211\n",
" Root Mean Squared Error (RMSE) : 6.4144\n",
" Mean Absolute Error (MAE) : 4.4603\n",
" Mean Percentage Error (MPE) : -7.6595\n",
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
"\n",
"\n"
]
}
],
"source": [
"print(\"RIDGE\")\n",
"ridge = Ridge(alpha=1)\n",
"ridge.fit(train_X, train_y)\n",
"regressionSummary(valid_y, ridge.predict(valid_X))\n",
"print(\"\\n\")\n",
"\n",
"print(\"BAYESIAN RIDGE\")\n",
"bayesianRidge = BayesianRidge()\n",
"bayesianRidge.fit(train_X, train_y)\n",
"regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
"print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)\n",
"print(\"\\n\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Best model\n",
"Bayesian Ridge: Lowest MAPE\n",
"Ridge: Lowest RMSE, lowest MAE\n",
"\n",
"Ridge or Bayesian Ridge should be used. Further parameter tuning can assist in selection which of the two models to use."
]
},
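{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the further tuning mentioned above, assuming the existing `train_X`/`train_y` and `valid_X`/`valid_y` splits: a cross-validated search over Ridge's `alpha`. The grid values and 5-fold CV are illustrative choices."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: cross-validated tuning of Ridge's regularization strength.\n",
"# Grid values and cv=5 are illustrative, not prescribed by the exercise.\n",
"from sklearn.model_selection import GridSearchCV\n",
"import numpy as np\n",
"\n",
"param_grid = {'alpha': np.logspace(-3, 3, 13)}\n",
"grid = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_root_mean_squared_error')\n",
"grid.fit(train_X, train_y)\n",
"print('Best alpha:', grid.best_params_['alpha'])\n",
"regressionSummary(valid_y, grid.best_estimator_.predict(valid_X))"
]
},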
{