Correlation and Matrix Plots. Linear Regression

This commit is contained in:
Noah L. Schrick 2023-04-11 15:48:03 -05:00
commit cce3b9fecb

231
timing-analysis.ipynb Normal file
View File

@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LinearRegression\n",
"from dmba import regressionSummary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load TimingData.csv into a DataFrame\n",
"timing_csv = 'TimingData.csv'\n",
"timing_df = pd.read_csv(timing_csv)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute mean, median, min, max, and standard deviation for quantitative variables\n",
"# Restrict to numeric columns first: on pandas >= 2.0, mean()/median()/std() raise a\n",
"# TypeError when the frame contains non-numeric (e.g. string) columns.\n",
"timing_numeric = timing_df.select_dtypes(include='number')\n",
"\n",
"# One row per numeric column, one column per statistic\n",
"pd.DataFrame({'mean': timing_numeric.mean(),\n",
"              'median': timing_numeric.median(),\n",
"              'min': timing_numeric.min(),\n",
"              'max': timing_numeric.max(),\n",
"              'sd': timing_numeric.std()\n",
"              })"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: show the average timing for each step of every variable\n",
"# (e.g. average time for 12 nodes, 11, ..., 1; average time for 6 exploits, ..., 49152)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Correlation and Matrix Plot\n",
"# Correlate the numeric columns only: on pandas >= 2.0, DataFrame.corr() raises on\n",
"# non-numeric columns instead of silently dropping them.\n",
"timing_corr = timing_df.select_dtypes(include='number').corr().round(3)\n",
"print(timing_corr)\n",
"\n",
"# Heatmap annotated to one decimal place, diverging palette centred on zero\n",
"fig, ax = plt.subplots(figsize=(11, 7))\n",
"# trailing ';' keeps the Axes repr out of the cell output\n",
"sns.heatmap(timing_corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Overall\n",
"predictors = [\"nodes\", \"exploits\", \"appl\", \"load\"]\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"overall_outcome = \"runtime\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"overall_y = timing_df[overall_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, overall_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"runtime_lm = LinearRegression()\n",
"runtime_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', runtime_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': runtime_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, runtime_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 0\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t0_outcome = \"task0\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task0_y = timing_df[t0_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task0_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task0_lm = LinearRegression()\n",
"task0_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task0_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task0_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task0_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 1\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t1_outcome = \"task1\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task1_y = timing_df[t1_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task1_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task1_lm = LinearRegression()\n",
"task1_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task1_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task1_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task1_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 2\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t2_outcome = \"task2\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task2_y = timing_df[t2_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task2_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task2_lm = LinearRegression()\n",
"task2_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task2_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task2_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task2_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 3\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t3_outcome = \"task3\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task3_y = timing_df[t3_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task3_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task3_lm = LinearRegression()\n",
"task3_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task3_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task3_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task3_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 4\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t4_outcome = \"task4\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task4_y = timing_df[t4_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task4_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task4_lm = LinearRegression()\n",
"task4_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task4_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task4_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task4_lm.predict(train_X))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Linear Regression - Task 5\n",
"# The outcome is a column name (not a one-element list) so the target is a 1-D Series.\n",
"# Fitting on a single-column DataFrame yields a 2-D coef_ of shape (1, n_features),\n",
"# which pd.DataFrame() rejects when building the coefficient table below.\n",
"t5_outcome = \"task5\"\n",
"\n",
"# partition data (60% train / 40% validation, fixed random_state for repeatable splits)\n",
"X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
"task5_y = timing_df[t5_outcome]\n",
"train_X, valid_X, train_y, valid_y = train_test_split(X, task5_y, test_size=0.4, random_state=1)\n",
"\n",
"# fit the linear model on the training partition\n",
"task5_lm = LinearRegression()\n",
"task5_lm.fit(train_X, train_y)\n",
"\n",
"# print coefficients\n",
"print('intercept ', task5_lm.intercept_)\n",
"print(pd.DataFrame({'Predictor': X.columns, 'coefficient': task5_lm.coef_}))\n",
"\n",
"# print performance measures (training partition only; valid_X / valid_y are not scored here)\n",
"regressionSummary(train_y, task5_lm.predict(train_X))"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}