diff --git a/Schrick-Noah_Learning-Practice-9.ipynb b/Schrick-Noah_Learning-Practice-9.ipynb index 03a1508..d617e76 100644 --- a/Schrick-Noah_Learning-Practice-9.ipynb +++ b/Schrick-Noah_Learning-Practice-9.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,9 @@ "import seaborn as sns\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split\n", - "\n", + "from sklearn.svm import SVC\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import classification_report,confusion_matrix\n", "\n", "%matplotlib inline" ] @@ -41,12 +43,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# a\n", "accidents_df = pd.read_csv('accidentsFull.csv')\n", + "accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n", "accidents_df = accidents_df.apply(pd.to_numeric) # convert all columns of DataFrame\n" ] }, @@ -61,21 +64,28 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "# b.\n", - "## Normalize\n", + "# b. and c.\n", "scaler = preprocessing.StandardScaler()\n", "\n", "accident_cols = accidents_df.columns.values.tolist()\n", + "accident_cols.remove('Injury')\n", "\n", - "scaler.fit(accidents_df[accident_cols]) # Note the use of an array of column names\n", + "# split into training and validation\n", + "trainData, validData = train_test_split(accidents_df, test_size=0.40, random_state=20)\n", + "\n", + "scaler.fit(trainData[accident_cols]) # Note the use of an array of column names\n", "\n", "# Transform the full dataset\n", - "accidentsNorm = pd.DataFrame(scaler.transform(accidents_df[accident_cols]), \n", - " columns=accident_cols)" + "accidentNorm = pd.concat([pd.DataFrame(scaler.transform(accidents_df[accident_cols]), \n", + " columns=accident_cols),\n", + " accidents_df[['Injury']]], axis=1)\n", + "\n", + "trainNorm = accidentNorm.iloc[trainData.index]\n", + "validNorm = accidentNorm.iloc[validData.index]" ] }, { @@ -89,12 +99,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "# c.\n", - "trainData, validData = train_test_split(accidentsNorm, test_size=0.4, random_state=26)" + "# c. \n", + "train_X = trainNorm[accident_cols]\n", + "train_y = trainNorm['Injury']\n", + "valid_X = validNorm[accident_cols]\n", + "valid_y = validNorm['Injury']" ] }, { @@ -108,11 +121,205 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 25 candidates, totalling 125 fits\n", + "[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.772 total time= 33.9s\n", + "[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.762 total time= 46.2s\n", + "[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.769 total time= 42.3s\n", + "[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.762 total time= 36.2s\n", + "[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.761 total time= 34.8s\n", + "[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.999 total time= 5.6s\n", + "[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.996 total time= 5.8s\n", + "[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.998 total time= 5.6s\n", + "[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.995 total time= 5.6s\n", + "[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.997 total time= 5.7s\n", + "[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.1s\n", + "[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.1s\n", + "[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.2s\n", + "[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.999 total time= 1.4s\n", + "[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=1.000 total time= 1.0s\n", + "[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.4s\n", + "[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.3s\n", + "[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.2s\n", + "[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.3s\n", + "[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=1.000 total time= 4.4s\n", + "[CV 1/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 26.5s\n", + "[CV 2/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 25.9s\n", + "[CV 3/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 27.3s\n", + "[CV 4/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 25.9s\n", + "[CV 5/5] END ...C=0.1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 26.6s\n", + "[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", + "[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.913 total time= 1.3min\n", + "[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.917 total time= 1.1min\n", + "[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.916 total time= 1.1min\n", + "[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.912 total time= 1.1min\n", + "[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.4s\n", + "[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.3s\n", + "[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.4s\n", + "[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.3s\n", + "[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.4s\n", + "[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.999 total time= 0.4s\n", + "[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", + "[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", + "[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", + "[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.6s\n", + "[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.7s\n", + "[CV 1/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.6s\n", + "[CV 2/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.9s\n", + "[CV 3/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.4s\n", + "[CV 4/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 5.0s\n", + "[CV 5/5] END .....C=1, gamma=0.0001, kernel=rbf;, score=1.000 total time= 4.9s\n", + "[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.928 total time= 1.1min\n", + "[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.918 total time= 1.2min\n", + "[CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.923 total time= 1.3min\n", + "[CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.920 total time= 1.0min\n", + "[CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", + "[CV 1/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", + "[CV 2/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.1s\n", + "[CV 3/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.0s\n", + "[CV 4/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.1s\n", + "[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=1.000 total time= 4.1s\n", + "[CV 1/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", + "[CV 2/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", + "[CV 3/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 4/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.5s\n", + "[CV 5/5] END ......C=10, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 1/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", + "[CV 2/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", + "[CV 3/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 1.0s\n", + "[CV 4/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.8s\n", + "[CV 5/5] END ....C=10, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.9s\n", + "[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.928 total time= 1.1min\n", + "[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", + "[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", + "[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.920 total time= 59.7s\n", + "[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.918 total time= 1.0min\n", + "[CV 1/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", + "[CV 2/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.0s\n", + "[CV 3/5] END ......C=100, gamma=0.1, kernel=rbf;, score=0.999 total time= 3.1s\n", + "[CV 4/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.1s\n", + "[CV 5/5] END ......C=100, gamma=0.1, kernel=rbf;, score=1.000 total time= 3.2s\n", + "[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 1/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 2/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 3/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 4/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 5/5] END ....C=100, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 1/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 2/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 3/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 4/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 5/5] END ...C=100, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.5s\n", + "[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.928 total time= 57.9s\n", + "[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.918 total time= 1.1min\n", + "[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.923 total time= 1.1min\n", + "[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.920 total time= 58.9s\n", + "[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.918 total time= 59.8s\n", + "[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.8s\n", + "[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.999 total time= 2.7s\n", + "[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.999 total time= 2.6s\n", + "[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.7s\n", + "[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=1.000 total time= 2.6s\n", + "[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.2s\n", + "[CV 5/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.3s\n", + "[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.4s\n", + "[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=1.000 total time= 0.6s\n" + ] + }, + { + "data": { + "text/html": [ + "
GridSearchCV(estimator=SVC(),\n",
+       "             param_grid={'C': [0.1, 1, 10, 100, 1000],\n",
+       "                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n",
+       "                         'kernel': ['rbf']},\n",
+       "             verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(estimator=SVC(),\n", + " param_grid={'C': [0.1, 1, 10, 100, 1000],\n", + " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n", + " 'kernel': ['rbf']},\n", + " verbose=3)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# d." + "# d.\n", + "param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} \n", + "grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)\n", + "grid.fit(train_X,train_y)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
SVC(C=0.1, gamma=0.001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "SVC(C=0.1, gamma=0.001)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Best options:\n", + "grid.best_params_\n", + "print()\n", + "grid.best_estimator_" ] }, { @@ -126,11 +333,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ - "# e." + "# e.\n", + "grid_predictions = grid.predict(valid_X)" ] }, { @@ -144,11 +352,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[8270 0]\n", + " [ 0 8604]]\n" + ] + } + ], "source": [ - "# f. " + "# f. \n", + "print(confusion_matrix(valid_y,grid_predictions))" ] }, { @@ -162,11 +380,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 8270\n", + " 1 1.00 1.00 1.00 8604\n", + "\n", + " accuracy 1.00 16874\n", + " macro avg 1.00 1.00 1.00 16874\n", + "weighted avg 1.00 1.00 1.00 16874\n", + "\n" + ] + } + ], "source": [ - "# g. " + "# g. \n", + "print(classification_report(valid_y,grid_predictions))" ] } ], diff --git a/Schrick-Noah_Learning-Practice-9.odt b/Schrick-Noah_Learning-Practice-9.odt new file mode 100644 index 0000000..aca6a8c Binary files /dev/null and b/Schrick-Noah_Learning-Practice-9.odt differ diff --git a/Schrick-Noah_Learning-Practice-9.pdf b/Schrick-Noah_Learning-Practice-9.pdf new file mode 100644 index 0000000..f4665ad Binary files /dev/null and b/Schrick-Noah_Learning-Practice-9.pdf differ