Predictor exploration, subset selection, and determining models
This commit is contained in:
parent
b5e5908f6e
commit
0f2d8787ca
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 81,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -19,7 +19,10 @@
|
||||
"from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge\n",
|
||||
"import statsmodels.formula.api as sm\n",
|
||||
"import matplotlib.pylab as plt\n",
|
||||
"import seaborn as sns\n"
|
||||
"import seaborn as sns\n",
|
||||
"from dmba import regressionSummary, exhaustive_search\n",
|
||||
"from dmba import backward_elimination, forward_selection, stepwise_selection\n",
|
||||
"from dmba import adjusted_r2_score, AIC_score, BIC_score\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -74,7 +77,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -85,7 +88,15 @@
|
||||
" Predictor coefficient\n",
|
||||
"0 CRIM -0.240062\n",
|
||||
"1 CHAS 3.266817\n",
|
||||
"2 RM 8.325175\n"
|
||||
"2 RM 8.325175\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : -0.0000\n",
|
||||
" Root Mean Squared Error (RMSE) : 5.9666\n",
|
||||
" Mean Absolute Error (MAE) : 3.9668\n",
|
||||
" Mean Percentage Error (MPE) : -7.2747\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -108,6 +119,9 @@
|
||||
"print('intercept ', housing_lm.intercept_)\n",
|
||||
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': housing_lm.coef_}))\n",
|
||||
"\n",
|
||||
"# print performance measures\n",
|
||||
"regressionSummary(train_y, housing_lm.predict(train_X))\n",
|
||||
"\n",
|
||||
"# Equation:\n",
|
||||
"# MEDV = -29.19 -0.24*CRIM + 3.27*CHAS + 8.33*RM\n"
|
||||
]
|
||||
@ -190,22 +204,304 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<AxesSubplot: >"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# d.\n",
|
||||
"# ii."
|
||||
"# ii.\n",
|
||||
"sns.heatmap(corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Highly Correlated Pairs\n",
|
||||
"- ZN and DIS\n",
|
||||
"- RAD and TAX\n",
|
||||
"- PTRATIO and RAD\n",
|
||||
"- PTRATIO and TAX"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Backward\n",
|
||||
"Variables: CRIM, CHAS, RM\n",
|
||||
"Start: score=1952.30\n",
|
||||
"Step: score=1952.30, remove None\n",
|
||||
"['CRIM', 'CHAS', 'RM']\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : -0.0000\n",
|
||||
" Root Mean Squared Error (RMSE) : 5.9666\n",
|
||||
" Mean Absolute Error (MAE) : 3.9668\n",
|
||||
" Mean Percentage Error (MPE) : -7.2747\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# d.\n",
|
||||
"# iii."
|
||||
"# iii.\n",
|
||||
"def train_model(variables):\n",
|
||||
" if len(variables) == 0:\n",
|
||||
" return None\n",
|
||||
" model = LinearRegression()\n",
|
||||
" model.fit(train_X[variables], train_y)\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"def score_model(model, variables):\n",
|
||||
" if len(variables) == 0:\n",
|
||||
" return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)\n",
|
||||
" return AIC_score(train_y, model.predict(train_X[variables]), model)\n",
|
||||
"\n",
|
||||
"print(\"Backward\")\n",
|
||||
"best_back_model, best_back_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)\n",
|
||||
"print(best_back_variables)\n",
|
||||
"# Score the model on the variables returned by backward elimination; passing the full\n",
|
||||
"# train_X would break as soon as the procedure actually removes a predictor.\n",
|
||||
"regressionSummary(train_y, best_back_model.predict(train_X[best_back_variables]))\n",
|
||||
"print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Forward\n",
|
||||
"Variables: CRIM, CHAS, RM\n",
|
||||
"Start: score=2191.75, constant\n",
|
||||
"Step: score=1989.28, add RM\n",
|
||||
"Step: score=1956.79, add CRIM\n",
|
||||
"Step: score=1952.30, add CHAS\n",
|
||||
"Step: score=1952.30, add None\n",
|
||||
"['RM', 'CRIM', 'CHAS']\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : -0.0000\n",
|
||||
" Root Mean Squared Error (RMSE) : 5.9666\n",
|
||||
" Mean Absolute Error (MAE) : 3.9668\n",
|
||||
" Mean Percentage Error (MPE) : -7.2747\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Forward\")\n",
|
||||
"best_forw_model, best_forw_variables = forward_selection(train_X.columns, train_model, score_model, verbose=True)\n",
|
||||
"print(best_forw_variables)\n",
|
||||
"# Use the variables returned by forward selection instead of a hard-coded column list,\n",
|
||||
"# so the summary stays correct if the selected set or its order changes.\n",
|
||||
"forw_train_X = train_X.loc[:, best_forw_variables]\n",
|
||||
"regressionSummary(train_y, best_forw_model.predict(forw_train_X))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Stepwise\n",
|
||||
"Variables: CRIM, CHAS, RM\n",
|
||||
"Start: score=2191.75, constant\n",
|
||||
"Step: score=1989.28, add RM\n",
|
||||
"Step: score=1956.79, add CRIM\n",
|
||||
"Step: score=1952.30, add CHAS\n",
|
||||
"Step: score=1952.30, add None\n",
|
||||
"['RM', 'CRIM', 'CHAS']\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : -0.0000\n",
|
||||
" Root Mean Squared Error (RMSE) : 5.9666\n",
|
||||
" Mean Absolute Error (MAE) : 3.9668\n",
|
||||
" Mean Percentage Error (MPE) : -7.2747\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 22.5927\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : -0.0000\n",
|
||||
" Root Mean Squared Error (RMSE) : 5.9666\n",
|
||||
" Mean Absolute Error (MAE) : 3.9668\n",
|
||||
" Mean Percentage Error (MPE) : -7.2747\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 22.5927\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# d iii. continued\n",
|
||||
"print(\"Stepwise\")\n",
|
||||
"# Stepwise selection (imported from dmba above); this cell previously called\n",
|
||||
"# forward_selection, which only repeated the forward-selection result.\n",
|
||||
"best_step_model, best_step_variables = stepwise_selection(train_X.columns, train_model, score_model, verbose=True)\n",
|
||||
"print(best_step_variables)\n",
|
||||
"# Use the variables returned by the selection instead of a hard-coded column list.\n",
|
||||
"step_train_X = train_X.loc[:, best_step_variables]\n",
|
||||
"regressionSummary(train_y, best_step_model.predict(step_train_X))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"LASSO\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.2627\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.7153\n",
|
||||
" Mean Absolute Error (MAE) : 4.7355\n",
|
||||
" Mean Percentage Error (MPE) : -8.5983\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.9824\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"LASSO CV\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.1124\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.4186\n",
|
||||
" Mean Absolute Error (MAE) : 4.4592\n",
|
||||
" Mean Percentage Error (MPE) : -7.7091\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.1854\n",
|
||||
"Lasso-CV chosen regularization: 0.033515828458353755\n",
|
||||
"[-0.24201538 2.81692528 8.25934245]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"RIDGE\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.1201\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.4138\n",
|
||||
" Mean Absolute Error (MAE) : 4.4590\n",
|
||||
" Mean Percentage Error (MPE) : -7.6484\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"BAYESIAN RIDGE\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.1211\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.4144\n",
|
||||
" Mean Absolute Error (MAE) : 4.4603\n",
|
||||
" Mean Percentage Error (MPE) : -7.6595\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
|
||||
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# d iii model\n",
|
||||
"print(\"LASSO\")\n",
|
||||
"lasso = Lasso(alpha=1)\n",
|
||||
"lasso.fit(train_X, train_y)\n",
|
||||
"regressionSummary(valid_y, lasso.predict(valid_X))\n",
|
||||
"print(\"\\n\")\n",
|
||||
"\n",
|
||||
"print(\"LASSO CV\")\n",
|
||||
"lasso_cv = LassoCV(cv=5)\n",
|
||||
"lasso_cv.fit(train_X, train_y)\n",
|
||||
"regressionSummary(valid_y, lasso_cv.predict(valid_X))\n",
|
||||
"print('Lasso-CV chosen regularization: ', lasso_cv.alpha_)\n",
|
||||
"print(lasso_cv.coef_)\n",
|
||||
"print(\"\\n\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"RIDGE\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.1201\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.4138\n",
|
||||
" Mean Absolute Error (MAE) : 4.4590\n",
|
||||
" Mean Percentage Error (MPE) : -7.6484\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.1724\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"BAYESIAN RIDGE\n",
|
||||
"\n",
|
||||
"Regression statistics\n",
|
||||
"\n",
|
||||
" Mean Error (ME) : 0.1211\n",
|
||||
" Root Mean Squared Error (RMSE) : 6.4144\n",
|
||||
" Mean Absolute Error (MAE) : 4.4603\n",
|
||||
" Mean Percentage Error (MPE) : -7.6595\n",
|
||||
"Mean Absolute Percentage Error (MAPE) : 23.1747\n",
|
||||
"Bayesian ridge chosen regularization: 1.3591395967339095\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"RIDGE\")\n",
|
||||
"ridge = Ridge(alpha=1)\n",
|
||||
"ridge.fit(train_X, train_y)\n",
|
||||
"regressionSummary(valid_y, ridge.predict(valid_X))\n",
|
||||
"print(\"\\n\")\n",
|
||||
"\n",
|
||||
"print(\"BAYESIAN RIDGE\")\n",
|
||||
"bayesianRidge = BayesianRidge()\n",
|
||||
"bayesianRidge.fit(train_X, train_y)\n",
|
||||
"regressionSummary(valid_y, bayesianRidge.predict(valid_X))\n",
|
||||
"print('Bayesian ridge chosen regularization: ', bayesianRidge.lambda_ / bayesianRidge.alpha_)\n",
|
||||
"print(\"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Best model\n",
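|
||||
"- Bayesian Ridge: a close second on every validation metric (RMSE 6.4144, MAE 4.4603, MAPE 23.1747)\n",
|
||||
"- Ridge: lowest RMSE (6.4138), lowest MAE (4.4590), and lowest MAPE (23.1724) on the validation set\n",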
|
||||
"\n",
|
||||
"Ridge or Bayesian Ridge should be used. Further parameter tuning can assist in selecting which of the two models to use."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user