diff --git a/.~lock.accidentsFull.csv# b/.~lock.accidentsFull.csv# deleted file mode 100644 index 4f6a001..0000000 --- a/.~lock.accidentsFull.csv# +++ /dev/null @@ -1 +0,0 @@ -,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_Learning-Practice-8.ipynb b/Schrick-Noah_Learning-Practice-8.ipynb index 1f658ba..75b770e 100644 --- a/Schrick-Noah_Learning-Practice-8.ipynb +++ b/Schrick-Noah_Learning-Practice-8.ipynb @@ -578,11 +578,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 202, "metadata": {}, "outputs": [], "source": [ - "# c." + "# c.\n", + "trainData, validData = train_test_split(accidents_df, test_size=0.4, random_state=26)" ] }, { @@ -595,12 +596,23 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# i" + "# i. \n", + "HOUR_I_R \n", + "ALIGN_I \n", + "WRK_ZONE \n", + "WKDY_I_R \n", + "INT_HWY \n", + "LGTCON_I_R \n", + "PROFIL_I_R \n", + "SPD_LIM \n", + "SUR_COND \n", + "TRAF_CON_R \n", + "TRAF_WAY \n", + "WEATHER_R" ] }, { @@ -609,16 +621,53 @@ "metadata": {}, "source": [ "# ii. \n", - "Run a naive Bayes classifier on the complete training set with the relevant predictors (and INJURY as the response). Note that all predictors are categorical. Show the confusion matrix.\n" + "Run a naive Bayes classifier on the complete training set with the relevant predictors (and INJURY as the response). Note that all predictors are categorical. Show the confusion matrix." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 209, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training\n", + "Confusion Matrix (Accuracy 0.5291)\n", + "\n", + " Prediction\n", + "Actual 0 1\n", + " 0 4197 8195\n", + " 1 3724 9193\n" + ] + } + ], "source": [ - "# ii." 
+ "# ii.\n", + "predictors = ['HOUR_I_R', 'ALIGN_I', 'WRK_ZONE', 'WKDY_I_R', 'INT_HWY',\n", + " 'LGTCON_I_R', 'PROFIL_I_R', 'SPD_LIM', 'SUR_COND', \n", + " 'TRAF_CON_R', 'TRAF_WAY', 'WEATHER_R']\n", + "\n", + "X = pd.get_dummies(accidents_df[predictors])\n", + "y = accidents_df['Injury']\n", + "\n", + "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n", + "\n", + "# run naive Bayes\n", + "delays_nb = MultinomialNB(alpha=0.01)\n", + "delays_nb.fit(X_train, y_train)\n", + "\n", + "# predict probabilities\n", + "predProb_train = delays_nb.predict_proba(X_train)\n", + "predProb_valid = delays_nb.predict_proba(X_valid)\n", + "\n", + "# predict class membership\n", + "y_valid_pred = delays_nb.predict(X_valid)\n", + "y_train_pred = delays_nb.predict(X_train)\n", + "\n", + "print(\"Training\")\n", + "classificationSummary(y_train, y_train_pred) " ] }, { @@ -627,16 +676,32 @@ "metadata": {}, "source": [ "# iii. \n", - "What is the overall error for the validation set?\n" + "What is the overall error for the validation set?" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 210, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation\n", + "Confusion Matrix (Accuracy 0.5288)\n", + "\n", + " Prediction\n", + "Actual 0 1\n", + " 0 2838 5491\n", + " 1 2460 6085\n" + ] + } + ], "source": [ - "# iii." + "# iii.\n", + "print(\"Validation\")\n", + "classificationSummary(y_valid, y_valid_pred) " ] }, { @@ -650,11 +715,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 217, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.057\n" + ] + } + ], "source": [ - "# iv." 
+ "# iv.\n", + "pctg_inc = round(100* abs(0.5288 - 0.5291)/(0.5291), 3)\n", + "print(pctg_inc)\n" ] }, { @@ -667,12 +742,12 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# v." + "The probability is rounded to 0 due to the extremely low likelihood of sustaining an injury at such low speeds. \n", + "The pivot tables display values on the order of 1e-6 to 1e-9, which are treated as 0." ] } ],