From 2710b772edf98de251e4a75bd20377dd5f68e094 Mon Sep 17 00:00:00 2001 From: noah Date: Thu, 30 Mar 2023 15:49:01 -0500 Subject: [PATCH] Starting Naive Bayes --- .~lock.accidentsFull.csv# | 1 + Schrick-Noah_Learning-Practice-8.ipynb | 242 ++++++++++++++++++++++--- 2 files changed, 219 insertions(+), 24 deletions(-) create mode 100644 .~lock.accidentsFull.csv# diff --git a/.~lock.accidentsFull.csv# b/.~lock.accidentsFull.csv# new file mode 100644 index 0000000..4f6a001 --- /dev/null +++ b/.~lock.accidentsFull.csv# @@ -0,0 +1 @@ +,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_Learning-Practice-8.ipynb b/Schrick-Noah_Learning-Practice-8.ipynb index 6f81ae4..1f658ba 100644 --- a/Schrick-Noah_Learning-Practice-8.ipynb +++ b/Schrick-Noah_Learning-Practice-8.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -28,7 +28,7 @@ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import mean_squared_error\n", - "from math import sqrt\n", + "from math import isnan\n", "\n", "import matplotlib.pylab as plt\n", "from dmba import classificationSummary, gainsChart\n", @@ -261,22 +261,56 @@ "Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) 
Why?\n" ] }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5087831590925255" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a.\n", + "accidents_df = pd.read_csv('accidentsFull.csv')\n", + "accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n", + "\n", + "accidents_df.loc[:, 'Injury'].mean()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# a.\n", + "With no predictor information available, the best prediction is the majority class. The mean of the binary Injury indicator is 0.5088, i.e., about 50.9% of the recorded accidents involved an injury (a mean of 1 would mean every accident involved an injury, and a mean of 0 would mean none did). Because injury accidents are the slight majority, the prediction should be INJURY = Yes, although an accident with an injury is only marginally more likely than one without." + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# b. \n", - "Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.\n" + "Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ - "# b." + "# b.\n", + "b_records = accidents_df[0:12]" ] }, { @@ -285,16 +319,39 @@ "metadata": {}, "source": [ "# i. \n", - "Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.\n" + "Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 179, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WEATHER_R 1 2\n", + "Injury \n", + "0 0.333333 0.666667\n", + "1 0.666667 0.333333\n", + "TRAF_CON_R 0 1 2\n", + "Injury \n", + "0 0.666667 0.222222 0.111111\n", + "1 1.000000 0.000000 0.000000\n" + ] + } + ], "source": [ - "# i." + "# i.\n", + "weather_freq = b_records[['Injury', 'WEATHER_R']].pivot_table(index='Injury', columns='WEATHER_R', aggfunc=len, fill_value=0)\n", + "weather_propTable = weather_freq.apply(lambda x: x/sum(x), axis=1)\n", + "\n", + "traf_freq = b_records[['Injury', 'TRAF_CON_R']].pivot_table(index='Injury', columns='TRAF_CON_R', aggfunc=len, fill_value=0)\n", + "traf_propTable = traf_freq.apply(lambda x: x/sum(x), axis=1)\n", + "\n", + "print(weather_propTable)\n", + "print(traf_propTable)" ] }, { @@ -303,16 +360,46 @@ "metadata": {}, "source": [ "# ii. \n", - "Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.\n" + "Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 189, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0): 0.75\n", + "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0): 0.4285714285714286\n", + "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1): 0.0\n", + "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1): 0.0\n", + "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2): 0.0\n", + "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2): 0.0\n" + ] + } + ], "source": [ - "# ii." 
+ "# ii.\n", + "def computeInjuryprob(wpred, tpred):\n", + " p_w_inj = weather_propTable.iloc[1][wpred]\n", + " p_t_inj = traf_propTable.iloc[1][tpred]\n", + " p_inj = p_w_inj * p_t_inj\n", + "\n", + " np_w_inj = weather_propTable.iloc[0][wpred]\n", + " np_t_inj = traf_propTable.iloc[0][tpred]\n", + " np_inj = np_w_inj * np_t_inj\n", + "\n", + " return(p_inj/(p_inj+np_inj))\n", + "\n", + "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0):\", computeInjuryprob(1,0))\n", + "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0):\", computeInjuryprob(2,0))\n", + "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1):\", computeInjuryprob(1,1))\n", + "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1):\", computeInjuryprob(2,1))\n", + "print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2):\", computeInjuryprob(1,2))\n", + "print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2):\", computeInjuryprob(2,2))" ] }, { @@ -321,16 +408,79 @@ "metadata": {}, "source": [ "# iii. \n", - "Classify the 12 accidents using these probabilities and a cutoff of 0.5.\n" + "Classify the 12 accidents using these probabilities and a cutoff of 0.5." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 200, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " actual predicted 0 1\n", + "4 0 0 0.509515 0.490485\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + " actual predicted 0 1\n", + "2 0 0 0.986395 0.013605\n", + " actual predicted 0 1\n", + "3 0 0 0.989368 0.010632\n", + " actual predicted 0 1\n", + "4 0 0 0.509515 0.490485\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + " actual predicted 0 1\n", + "4 0 0 0.509515 0.490485\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + " actual predicted 0 1\n", + "10 0 1 0.447309 0.552691\n", + "1 0 1 0.447309 0.552691\n", + "Empty DataFrame\n", + "Columns: [actual, predicted, 0, 1]\n", + "Index: []\n" + ] + } + ], "source": [ - "# iii." 
+ "# iii.\n", + "predictors = ['WEATHER_R', 'TRAF_CON_R']\n", + "X = pd.get_dummies(b_records[predictors])\n", + "y = b_records['Injury']\n", + "\n", + "# split into training and validation\n", + "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n", + "\n", + "# run naive Bayes\n", + "delays_nb = MultinomialNB(alpha=0.01)\n", + "delays_nb.fit(X_train, y_train)\n", + "\n", + "# predict probabilities\n", + "predProb_train = delays_nb.predict_proba(X_train)\n", + "predProb_valid = delays_nb.predict_proba(X_valid)\n", + "\n", + "# predict class membership\n", + "y_valid_pred = delays_nb.predict(X_valid)\n", + "y_train_pred = delays_nb.predict(X_train)\n", + "# Subset a specific set\n", + "df = pd.concat([pd.DataFrame({'actual': y_valid, 'predicted': y_valid_pred}),\n", + " pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)\n", + "\n", + "for index, row in b_records.iterrows():\n", + " mask = ((X_valid.WEATHER_R == row['WEATHER_R']) & (X_valid.TRAF_CON_R == row['TRAF_CON_R']))\n", + " print(df[mask])\n" ] }, { @@ -339,16 +489,35 @@ "metadata": {}, "source": [ "# iv. \n", - "Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1.\n" + "Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 197, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WEATHER_R 1 2\n", + "Injury \n", + "0 3 6\n", + "1 2 1\n", + "TRAF_CON_R 0 1 2\n", + "Injury \n", + "0 6 2 1\n", + "1 3 0 0\n", + "0.0\n" + ] + } + ], "source": [ - "# iv." 
+ "# iv.\n", + "print(weather_freq)\n", + "print(traf_freq)\n", + "print(((3 / 12) * ((2 / 3) * (0 / 3))) / (((3 / 12) * ((2 / 3) * (0 / 3))) + ((9 / 12) * ((3 / 9) * (2 / 9)))))\n" ] }, { @@ -366,11 +535,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 201, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix (Accuracy 0.2857)\n", + "\n", + " Prediction\n", + "Actual 0 1\n", + " 0 1 3\n", + " 1 2 1\n", + "\n", + "Confusion Matrix (Accuracy 0.6000)\n", + "\n", + " Prediction\n", + "Actual 0 1\n", + " 0 3 2\n", + " 1 0 0\n" + ] + } + ], "source": [ - "# v." + "# v.\n", + "classificationSummary(y_train, y_train_pred) \n", + "\n", + "print()\n", + "\n", + "classificationSummary(y_valid, y_valid_pred) " ] }, {