{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Learning Practice 9 for the University of Tulsa's QM-7063 Data Mining Course\n", "# Support Vector Machines\n", "# Professor: Dr. Abdulrashid, Spring 2023\n", "# Noah L. Schrick - 1492657" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# Imports\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.svm import SVC\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.metrics import classification_report,confusion_matrix\n", "\n", "%matplotlib inline" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# a. \n", "Numerisize the dataset" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# a\n", "accidents_df = pd.read_csv('accidentsFull.csv')\n", "accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n", "accidents_df = accidents_df.apply(pd.to_numeric) # convert all columns of DataFrame\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# b. \n", "Transform the data by either normalizing or standardizing it." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# b. and c.\n", "scaler = preprocessing.StandardScaler()\n", "\n", "accident_cols = accidents_df.columns.values.tolist()\n", "accident_cols.remove('Injury')\n", "\n", "# split into training and validation\n", "trainData, validData = train_test_split(accidents_df, test_size=0.40, random_state=20)\n", "\n", "scaler.fit(trainData[accident_cols]) # Note the use of an array of column names\n", "\n", "# Transform the full dataset\n", "accidentNorm = pd.concat([pd.DataFrame(scaler.transform(accidents_df[accident_cols]), \n", " columns=accident_cols),\n", " accidents_df[['Injury']]], axis=1)\n", "\n", "trainNorm = accidentNorm.iloc[trainData.index]\n", "validNorm = accidentNorm.iloc[validData.index]" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# c. \n", "Use train, test, and split function to split the data into training and testing sets." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# c. \n", "train_X = trainNorm[accident_cols]\n", "train_y = trainNorm['Injury']\n", "valid_X = validNorm[accident_cols]\n", "valid_y = validNorm['Injury']" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# d.\n", "Select your preferred kernel type and determine the kernel values by using either grid-search or v-fold cross validation." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 25 candidates, totalling 125 fits\n", "[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.772 total time= 33.9s\n", "[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.762 total time= 46.2s\n", "[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.769 total time= 42.3s\n", "[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.762 total time= 36.2s\n", "[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.761 total time= 34.8s\n", "[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.999 total time= 5.6s\n", "[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.996 total time= 5.8s\n", "[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.998 total time= 5.6s\n", "[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.995 total time= 5.6s\n", "[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.997 total time= 5.7s\n", "[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.1s\n", "[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.1s\n", "[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.2s\n", "[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.999 total time= 1.4s\n", "[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.0s\n", "[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.4s\n", "[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.3s\n", "[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.2s\n", "[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.3s\n", "[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.4s\n", "[CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 26.5s\n", "[CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 25.9s\n", "[CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 27.3s\n", "[CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 25.9s\n", "[CV 5/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 26.6s\n", "[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", "[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.913 total time= 1.3min\n", "[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.917 total time= 1.1min\n", "[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.916 total time= 1.1min\n", "[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.912 total time= 1.1min\n", "[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.4s\n", "[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.3s\n", "[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.4s\n", "[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.3s\n", "[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.4s\n", "[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.999 total time= 0.4s\n", "[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", "[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", "[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", "[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", "[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.7s\n", "[CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.6s\n", "[CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.9s\n", "[CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.4s\n", "[CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 5.0s\n", "[CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.9s\n", "[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.928 total time= 1.1min\n", "[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.918 total time= 1.2min\n", "[CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.923 total time= 1.3min\n", "[CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.920 total time= 1.0min\n", "[CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", "[CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", "[CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.1s\n", "[CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.0s\n", "[CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.1s\n", "[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 4.1s\n", "[CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", "[CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", "[CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", "[CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", "[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", "[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", "[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.8s\n", "[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.9s\n", "[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.928 total time= 1.1min\n", "[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", "[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", "[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.920 total time= 59.7s\n", "[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.918 total time= 1.0min\n", "[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", "[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.0s\n", "[CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.1s\n", "[CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.1s\n", "[CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", "[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.5s\n", "[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.928 total time= 57.9s\n", "[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", "[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", "[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.920 total time= 58.9s\n", "[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.918 total time= 59.8s\n", "[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.8s\n", "[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.999 total time= 2.7s\n", "[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.999 total time= 2.6s\n", "[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.7s\n", "[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.6s\n", "[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", "[CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.3s\n", "[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", "[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.6s\n" ] }, { "data": { "text/html": [ "
GridSearchCV(estimator=SVC(),\n",
       "             param_grid={'C': [0.1, 1, 10, 100, 1000],\n",
       "                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n",
       "                         'kernel': ['rbf']},\n",
       "             verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "GridSearchCV(estimator=SVC(),\n", " param_grid={'C': [0.1, 1, 10, 100, 1000],\n", " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n", " 'kernel': ['rbf']},\n", " verbose=3)" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# d.\n", "param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} \n", "grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)\n", "grid.fit(train_X,train_y)\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
SVC(C=0.1, gamma=0.001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "SVC(C=0.1, gamma=0.001)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Best options:\n", "grid.best_params_\n", "print()\n", "grid.best_estimator_" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# e.\n", "Run a SVM classifier using identified kernel values found in (d)." ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# e.\n", "grid_predictions = grid.predict(valid_X)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# f.\n", "Obtain the confusion matrix." ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[8270 0]\n", " [ 0 8604]]\n" ] } ], "source": [ "# f. \n", "print(confusion_matrix(valid_y,grid_predictions))" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# g.\n", "What is the overall error for the validation set?" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 8270\n", " 1 1.00 1.00 1.00 8604\n", "\n", " accuracy 1.00 16874\n", " macro avg 1.00 1.00 1.00 16874\n", "weighted avg 1.00 1.00 1.00 16874\n", "\n" ] } ], "source": [ "# g. \n", "print(classification_report(valid_y,grid_predictions))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }