From cce3b9fecbebae74491aae95199bda20b08663b7 Mon Sep 17 00:00:00 2001
From: noah
Date: Tue, 11 Apr 2023 15:48:03 -0500
Subject: [PATCH] Correlation and Matrix Plots. Linear Regression

---
 timing-analysis.ipynb | 186 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 timing-analysis.ipynb

diff --git a/timing-analysis.ipynb b/timing-analysis.ipynb
new file mode 100644
index 0000000..4c987ba
--- /dev/null
+++ b/timing-analysis.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from dmba import regressionSummary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import data\n",
+    "timing_df = pd.read_csv('TimingData.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute mean, median, min, max, and standard deviation for quantitative variables\n",
+    "# (numeric columns only: mean/median/std are not defined for text columns)\n",
+    "numeric_df = timing_df.select_dtypes(include='number')\n",
+    "pd.DataFrame({'mean': numeric_df.mean(),\n",
+    "    'median': numeric_df.median(),\n",
+    "    'min': numeric_df.min(),\n",
+    "    'max': numeric_df.max(),\n",
+    "    'sd': numeric_df.std()\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Show avg timing data for each var step (EX: Avg time for 12 nodes, 11...1, Avg time for 6 exploits...49152)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correlation and Matrix Plot\n",
+    "# (numeric columns only: a correlation cannot be computed for text columns)\n",
+    "timing_corr = timing_df.select_dtypes(include='number').corr().round(3)\n",
+    "print(timing_corr)\n",
+    "fig, ax = plt.subplots()\n",
+    "fig.set_size_inches(11, 7)\n",
+    "sns.heatmap(timing_corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - shared setup\n",
+    "predictors = [\"nodes\", \"exploits\", \"appl\", \"load\"]\n",
+    "\n",
+    "\n",
+    "def fit_timing_model(outcome):\n",
+    "    \"\"\"Fit a linear regression of one timing_df column on `predictors`.\n",
+    "\n",
+    "    Prints the intercept, the per-predictor coefficients, and the\n",
+    "    regressionSummary output for the training and validation partitions.\n",
+    "\n",
+    "    outcome: name of the timing_df column to model. Pass a single string so\n",
+    "    the target is 1-D and coef_ lines up one-to-one with the predictor columns.\n",
+    "    Returns the fitted LinearRegression.\n",
+    "    \"\"\"\n",
+    "    # partition data (60% training / 40% validation, fixed seed)\n",
+    "    X = pd.get_dummies(timing_df[predictors], drop_first=True)\n",
+    "    y = timing_df[outcome]\n",
+    "    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)\n",
+    "    timing_lm = LinearRegression()\n",
+    "    timing_lm.fit(train_X, train_y)\n",
+    "    # print coefficients\n",
+    "    print('intercept ', timing_lm.intercept_)\n",
+    "    print(pd.DataFrame({'Predictor': X.columns, 'coefficient': timing_lm.coef_}))\n",
+    "    # print performance measures\n",
+    "    print('training set')\n",
+    "    regressionSummary(train_y, timing_lm.predict(train_X))\n",
+    "    print('validation set')\n",
+    "    regressionSummary(valid_y, timing_lm.predict(valid_X))\n",
+    "    return timing_lm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Overall\n",
+    "overall_lm = fit_timing_model(\"runtime\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 0\n",
+    "t0_lm = fit_timing_model(\"task0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 1\n",
+    "t1_lm = fit_timing_model(\"task1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 2\n",
+    "t2_lm = fit_timing_model(\"task2\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 3\n",
+    "t3_lm = fit_timing_model(\"task3\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 4\n",
+    "t4_lm = fit_timing_model(\"task4\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Linear Regression - Task 5\n",
+    "t5_lm = fit_timing_model(\"task5\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}