{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Timing Data Analysis\n",
    "\n",
    "Summary statistics, correlations, and linear regressions of `runtime` and `task0` through `task5` on the predictors `nodes`, `exploits`, `appl`, and `load`, using the data in `TimingData.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from dmba import regressionSummary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune lives here\n",
    "DATA_PATH = 'TimingData.csv'\n",
    "TEST_SIZE = 0.4     # fraction of rows held out as the validation partition\n",
    "RANDOM_STATE = 1    # fixed seed so the train/validation partition is reproducible\n",
    "predictors = [\"nodes\", \"exploits\", \"appl\", \"load\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import data\n",
    "timing_df = pd.read_csv(DATA_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Descriptive statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute mean, median, min, max, and standard deviation for quantitative variables.\n",
    "# Only numeric columns are summarised: mean/median/std raise a TypeError on\n",
    "# non-numeric columns in pandas >= 2.0 instead of silently skipping them.\n",
    "numeric_df = timing_df.select_dtypes(include='number')\n",
    "pd.DataFrame({'mean': numeric_df.mean(),\n",
    "              'median': numeric_df.median(),\n",
    "              'min': numeric_df.min(),\n",
    "              'max': numeric_df.max(),\n",
    "              'sd': numeric_df.std()\n",
    "              })"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO:** Show avg timing data for each var step (EX: Avg time for 12 nodes, 11...1, Avg time for 6 exploits...49152)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation and Matrix Plot\n",
    "# Correlation is only defined for numeric columns, so select them explicitly\n",
    "# (DataFrame.corr raises on non-numeric columns in pandas >= 2.0).\n",
    "timing_corr = timing_df.select_dtypes(include='number').corr().round(3)\n",
    "print(timing_corr)\n",
    "fig, ax = plt.subplots(figsize=(11, 7))\n",
    "sns.heatmap(timing_corr, annot=True, fmt=\".1f\", cmap=\"RdBu\", center=0, ax=ax)\n",
    "ax.set_title('Correlation matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear regression\n",
    "\n",
    "The same model specification is fitted for each outcome column, so the partition / fit / report steps live in one function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_linear_model(df: pd.DataFrame, predictors: list, outcome: str,\n",
    "                     test_size: float = TEST_SIZE, random_state: int = RANDOM_STATE) -> LinearRegression:\n",
    "    \"\"\"Fit a linear regression of one outcome column on the predictors and print a report.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame holding the predictor and outcome columns.\n",
    "    predictors : list of predictor column names; non-numeric ones are dummy-coded\n",
    "        with the first level dropped.\n",
    "    outcome : name of the single outcome column.\n",
    "    test_size : fraction of rows held out as the validation partition.\n",
    "    random_state : seed for the train/validation split.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The fitted LinearRegression model.\n",
    "    \"\"\"\n",
    "    X = pd.get_dummies(df[predictors], drop_first=True)\n",
    "    # Select the outcome as a Series (1-D). With a one-column DataFrame, coef_ has\n",
    "    # shape (1, n_features) and cannot be placed in the coefficient table below.\n",
    "    y = df[outcome]\n",
    "\n",
    "    # partition data\n",
    "    train_X, valid_X, train_y, valid_y = train_test_split(\n",
    "        X, y, test_size=test_size, random_state=random_state)\n",
    "\n",
    "    model = LinearRegression()\n",
    "    model.fit(train_X, train_y)\n",
    "\n",
    "    # print coefficients\n",
    "    print('intercept ', model.intercept_)\n",
    "    print(pd.DataFrame({'Predictor': X.columns, 'coefficient': model.coef_}))\n",
    "\n",
    "    # print performance measures on both partitions; the held-out rows show\n",
    "    # how well the fit generalises beyond the data it was trained on\n",
    "    print('Training set')\n",
    "    regressionSummary(train_y, model.predict(train_X))\n",
    "    print('Validation set')\n",
    "    regressionSummary(valid_y, model.predict(valid_X))\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Overall\n",
    "overall_lm = fit_linear_model(timing_df, predictors, \"runtime\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 0\n",
    "task0_lm = fit_linear_model(timing_df, predictors, \"task0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 1\n",
    "task1_lm = fit_linear_model(timing_df, predictors, \"task1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 2\n",
    "task2_lm = fit_linear_model(timing_df, predictors, \"task2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 3\n",
    "task3_lm = fit_linear_model(timing_df, predictors, \"task3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 4\n",
    "task4_lm = fit_linear_model(timing_df, predictors, \"task4\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression - Task 5\n",
    "task5_lm = fit_linear_model(timing_df, predictors, \"task5\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}