From 2710b772edf98de251e4a75bd20377dd5f68e094 Mon Sep 17 00:00:00 2001
From: noah <noah@theschricks.com>
Date: Thu, 30 Mar 2023 15:49:01 -0500
Subject: [PATCH] Starting Naive Bayes

---
 .~lock.accidentsFull.csv#              |   1 +
 Schrick-Noah_Learning-Practice-8.ipynb | 242 ++++++++++++++++++++++---
 2 files changed, 219 insertions(+), 24 deletions(-)
 create mode 100644 .~lock.accidentsFull.csv#

diff --git a/.~lock.accidentsFull.csv# b/.~lock.accidentsFull.csv#
new file mode 100644
index 0000000..4f6a001
--- /dev/null
+++ b/.~lock.accidentsFull.csv#
@@ -0,0 +1 @@
+,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4;
\ No newline at end of file
diff --git a/Schrick-Noah_Learning-Practice-8.ipynb b/Schrick-Noah_Learning-Practice-8.ipynb
index 6f81ae4..1f658ba 100644
--- a/Schrick-Noah_Learning-Practice-8.ipynb
+++ b/Schrick-Noah_Learning-Practice-8.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -28,7 +28,7 @@
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.naive_bayes import MultinomialNB\n",
     "from sklearn.metrics import mean_squared_error\n",
-    "from math import sqrt\n",
+    "from math import isnan\n",
     "\n",
     "import matplotlib.pylab as plt\n",
     "from dmba import classificationSummary, gainsChart\n",
@@ -261,22 +261,56 @@
     "Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.5087831590925255"
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# a.\n",
+    "accidents_df = pd.read_csv('accidentsFull.csv')\n",
+    "accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n",
+    "\n",
+    "accidents_df.loc[:, 'Injury'].mean()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# a.\n",
+    "Across the available data, the mean of the Injury indicator is 0.5088, i.e. about 50.9% of reported accidents involved an injury (a mean of 1 would mean every report involved an injury, and a mean of 0 would mean none did). With no further information, the prediction should therefore be 'YES' for injury, since that is the majority class, though an accident with an injury is only slightly more likely than an accident without."
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# b. \n",
-    "Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.\n"
+    "Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# b."
+    "# b.\n",
+    "b_records = accidents_df[0:12]"
    ]
   },
   {
@@ -285,16 +319,39 @@
    "metadata": {},
    "source": [
     "# i. \n",
-    "Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.\n"
+    "Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 179,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WEATHER_R         1         2\n",
+      "Injury                       \n",
+      "0          0.333333  0.666667\n",
+      "1          0.666667  0.333333\n",
+      "TRAF_CON_R         0         1         2\n",
+      "Injury                                  \n",
+      "0           0.666667  0.222222  0.111111\n",
+      "1           1.000000  0.000000  0.000000\n"
+     ]
+    }
+   ],
    "source": [
-    "# i."
+    "# i.\n",
+    "weather_freq = b_records[['Injury', 'WEATHER_R']].pivot_table(index='Injury', columns='WEATHER_R', aggfunc=len, fill_value=0)\n",
+    "weather_propTable = weather_freq.apply(lambda x: x/sum(x), axis=1)\n",
+    "\n",
+    "traf_freq = b_records[['Injury', 'TRAF_CON_R']].pivot_table(index='Injury', columns='TRAF_CON_R', aggfunc=len, fill_value=0)\n",
+    "traf_propTable = traf_freq.apply(lambda x: x/sum(x), axis=1)\n",
+    "\n",
+    "print(weather_propTable)\n",
+    "print(traf_propTable)"
    ]
   },
   {
@@ -303,16 +360,46 @@
    "metadata": {},
    "source": [
     "# ii. \n",
-    "Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.\n"
+    "Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 189,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0): 0.75\n",
+      "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0): 0.4285714285714286\n",
+      "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1): 0.0\n",
+      "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1): 0.0\n",
+      "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2): 0.0\n",
+      "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2): 0.0\n"
+     ]
+    }
+   ],
    "source": [
-    "# ii."
+    "# ii.\n",
+    "def computeInjuryprob(wpred, tpred):\n",
+    "    p_w_inj = weather_propTable.iloc[1][wpred]\n",
+    "    p_t_inj = traf_propTable.iloc[1][tpred]\n",
+    "    p_inj = p_w_inj * p_t_inj\n",
+    "\n",
+    "    np_w_inj = weather_propTable.iloc[0][wpred]\n",
+    "    np_t_inj = traf_propTable.iloc[0][tpred]\n",
+    "    np_inj = np_w_inj * np_t_inj\n",
+    "\n",
+    "    return(p_inj/(p_inj+np_inj))\n",
+    "\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0):\", computeInjuryprob(1,0))\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0):\", computeInjuryprob(2,0))\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1):\", computeInjuryprob(1,1))\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1):\", computeInjuryprob(2,1))\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2):\", computeInjuryprob(1,2))\n",
+    "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2):\", computeInjuryprob(2,2))"
    ]
   },
   {
@@ -321,16 +408,79 @@
    "metadata": {},
    "source": [
     "# iii. \n",
-    "Classify the 12 accidents using these probabilities and a cutoff of 0.5.\n"
+    "Classify the 12 accidents using these probabilities and a cutoff of 0.5."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 200,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   actual  predicted         0         1\n",
+      "4       0          0  0.509515  0.490485\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "   actual  predicted         0         1\n",
+      "2       0          0  0.986395  0.013605\n",
+      "   actual  predicted         0         1\n",
+      "3       0          0  0.989368  0.010632\n",
+      "   actual  predicted         0         1\n",
+      "4       0          0  0.509515  0.490485\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "   actual  predicted         0         1\n",
+      "4       0          0  0.509515  0.490485\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "    actual  predicted         0         1\n",
+      "10       0          1  0.447309  0.552691\n",
+      "1        0          1  0.447309  0.552691\n",
+      "Empty DataFrame\n",
+      "Columns: [actual, predicted, 0, 1]\n",
+      "Index: []\n"
+     ]
+    }
+   ],
    "source": [
-    "# iii."
+    "# iii.\n",
+    "predictors = ['WEATHER_R', 'TRAF_CON_R']\n",
+    "X = pd.get_dummies(b_records[predictors])\n",
+    "y = b_records['Injury']\n",
+    "\n",
+    "# split into training and validation\n",
+    "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n",
+    "\n",
+    "# run naive Bayes\n",
+    "delays_nb = MultinomialNB(alpha=0.01)\n",
+    "delays_nb.fit(X_train, y_train)\n",
+    "\n",
+    "# predict probabilities\n",
+    "predProb_train = delays_nb.predict_proba(X_train)\n",
+    "predProb_valid = delays_nb.predict_proba(X_valid)\n",
+    "\n",
+    "# predict class membership\n",
+    "y_valid_pred = delays_nb.predict(X_valid)\n",
+    "y_train_pred = delays_nb.predict(X_train)\n",
+    "# Subset a specific set\n",
+    "df = pd.concat([pd.DataFrame({'actual': y_valid, 'predicted': y_valid_pred}),\n",
+    "                pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)\n",
+    "\n",
+    "for index, row in b_records.iterrows():\n",
+    "    mask = ((X_valid.WEATHER_R == row['WEATHER_R']) & (X_valid.TRAF_CON_R == row['TRAF_CON_R']))\n",
+    "    print(df[mask])\n"
    ]
   },
   {
@@ -339,16 +489,35 @@
    "metadata": {},
    "source": [
     "# iv. \n",
-    "Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1.\n"
+    "Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 197,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WEATHER_R  1  2\n",
+      "Injury         \n",
+      "0          3  6\n",
+      "1          2  1\n",
+      "TRAF_CON_R  0  1  2\n",
+      "Injury             \n",
+      "0           6  2  1\n",
+      "1           3  0  0\n",
+      "0.0\n"
+     ]
+    }
+   ],
    "source": [
-    "# iv."
+    "# iv.\n",
+    "print(weather_freq)\n",
+    "print(traf_freq)\n",
+    "print(((3 / 12) * ((2 / 3) * (0 / 3))) / (((3 / 12) * ((2 / 3) * (0 / 3))) + ((9 / 12) * ((3 / 9) * (2 / 9)))))\n"
    ]
   },
   {
@@ -366,11 +535,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 201,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Confusion Matrix (Accuracy 0.2857)\n",
+      "\n",
+      "       Prediction\n",
+      "Actual 0 1\n",
+      "     0 1 3\n",
+      "     1 2 1\n",
+      "\n",
+      "Confusion Matrix (Accuracy 0.6000)\n",
+      "\n",
+      "       Prediction\n",
+      "Actual 0 1\n",
+      "     0 3 2\n",
+      "     1 0 0\n"
+     ]
+    }
+   ],
    "source": [
-    "# v."
+    "# v.\n",
+    "classificationSummary(y_train, y_train_pred) \n",
+    "\n",
+    "print()\n",
+    "\n",
+    "classificationSummary(y_valid, y_valid_pred) "
    ]
   },
   {