Correlation and Matrix Plots. Linear Regression

2023-04-11 15:48:03 -05:00 · 2023-04-11 15:48:03 -05:00 · cce3b9fecb
commit cce3b9fecb
1 changed files with 231 additions and 0 deletions
--- a/timing-analysis.ipynb
+++ b/timing-analysis.ipynb
@ -0,0 +1,231 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from dmba import regressionSummary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the timing measurements into a DataFrame\n",
+    "timing_df = pd.read_csv(filepath_or_buffer='TimingData.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute mean, median, min, max, and standard deviation for quantitative variables.\n",
+    "# Restrict to numeric columns first: pandas >= 2.0 raises a TypeError when\n",
+    "# mean/median/std encounter non-numeric columns instead of silently dropping them.\n",
+    "numeric_df = timing_df.select_dtypes(include='number')\n",
+    "pd.DataFrame({'mean': numeric_df.mean(),\n",
+    "'median': numeric_df.median(),\n",
+    "'min': numeric_df.min(),\n",
+    "'max': numeric_df.max(),\n",
+    "'sd': numeric_df.std()\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: show average timing data for each step of every variable (e.g., avg time for 12 nodes, 11, ..., 1; avg time for 6 exploits, ..., 49152)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correlation and Matrix Plot\n",
+    "# Correlate numeric columns only: pandas >= 2.0 raises on non-numeric columns\n",
+    "# rather than dropping them silently.\n",
+    "timing_corr = timing_df.select_dtypes(include='number').corr().round(3)\n",
+    "print(timing_corr)\n",
+    "fig, ax = plt.subplots(figsize=(11, 7))\n",
+    "ax.set_title('Correlation matrix of timing variables')\n",
+    "# the trailing semicolon suppresses the Axes repr in the cell output\n",
+    "sns.heatmap(timing_corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Overall\n",
+    "predictors = [\"nodes\", \"exploits\", \"appl\", \"load\"]\n",
+    "overall_outcome = [\"runtime\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "overall_y = timing_df[overall_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, overall_y, test_size=0.4, random_state=1)\n",
+    "overall_lm = LinearRegression()\n",
+    "overall_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', overall_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': overall_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, overall_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, overall_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 0\n",
+    "t0_outcome = [\"task0\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t0_y = timing_df[t0_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t0_y, test_size=0.4, random_state=1)\n",
+    "t0_lm = LinearRegression()\n",
+    "t0_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t0_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t0_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t0_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t0_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 1\n",
+    "t1_outcome = [\"task1\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t1_y = timing_df[t1_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t1_y, test_size=0.4, random_state=1)\n",
+    "t1_lm = LinearRegression()\n",
+    "t1_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t1_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t1_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t1_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t1_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 2\n",
+    "t2_outcome = [\"task2\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t2_y = timing_df[t2_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t2_y, test_size=0.4, random_state=1)\n",
+    "t2_lm = LinearRegression()\n",
+    "t2_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t2_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t2_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t2_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t2_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 3\n",
+    "t3_outcome = [\"task3\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t3_y = timing_df[t3_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t3_y, test_size=0.4, random_state=1)\n",
+    "t3_lm = LinearRegression()\n",
+    "t3_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t3_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t3_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t3_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t3_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 4\n",
+    "t4_outcome = [\"task4\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t4_y = timing_df[t4_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t4_y, test_size=0.4, random_state=1)\n",
+    "t4_lm = LinearRegression()\n",
+    "t4_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t4_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t4_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t4_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t4_lm.predict(valid_X))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 5\n",
+    "t5_outcome = [\"task5\"]\n",
+    "\n",
+    "# partition data\n",
+    "X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "# Use a 1-D Series as the target: a one-column DataFrame target makes coef_\n",
+    "# two-dimensional, which breaks the coefficient table below.\n",
+    "t5_y = timing_df[t5_outcome[0]]\n",
+    "train_X, valid_X, train_y, valid_y = train_test_split(X, t5_y, test_size=0.4, random_state=1)\n",
+    "t5_lm = LinearRegression()\n",
+    "t5_lm.fit(train_X, train_y)\n",
+    "# print coefficients\n",
+    "print('intercept ', t5_lm.intercept_)\n",
+    "print(pd.DataFrame({'Predictor': X.columns, 'coefficient': t5_lm.coef_}))\n",
+    "# print performance measures for the training partition ...\n",
+    "print('Training set')\n",
+    "regressionSummary(train_y, t5_lm.predict(train_X))\n",
+    "# ... and for the held-out validation partition\n",
+    "print('Validation set')\n",
+    "regressionSummary(valid_y, t5_lm.predict(valid_X))"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}