Starting Naive Bayes

This commit is contained in:
Noah L. Schrick 2023-03-30 15:49:01 -05:00
parent 2510582087
commit 2710b772ed
2 changed files with 219 additions and 24 deletions

View File

@ -0,0 +1 @@
,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4;

View File

@ -14,7 +14,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 110,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -28,7 +28,7 @@
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import mean_squared_error\n", "from sklearn.metrics import mean_squared_error\n",
"from math import sqrt\n", "from math import isnan\n",
"\n", "\n",
"import matplotlib.pylab as plt\n", "import matplotlib.pylab as plt\n",
"from dmba import classificationSummary, gainsChart\n", "from dmba import classificationSummary, gainsChart\n",
@ -261,22 +261,56 @@
"Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?\n" "Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?\n"
] ]
}, },
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.5087831590925255"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# a.\n",
"accidents_df = pd.read_csv('accidentsFull.csv')\n",
"accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n",
"\n",
"accidents_df.loc[:, 'Injury'].mean()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# a.\n",
"Across the full dataset, the mean of the binary Injury indicator is 0.5088, i.e. about 50.9% of the reported accidents involved an injury (a mean of 1 would mean every report involved an injury, and a mean of 0 would mean none did). With no further information about a new accident, the prediction should therefore be INJURY = Yes, the majority class, although an accident with an injury is only slightly more likely than one without."
]
},
{ {
"attachments": {}, "attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# b. \n", "# b. \n",
"Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.\n" "Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 88,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# b." "# b.\n",
"b_records = accidents_df[0:12]"
] ]
}, },
{ {
@ -285,16 +319,39 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# i. \n", "# i. \n",
"Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.\n" "Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 179,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEATHER_R 1 2\n",
"Injury \n",
"0 0.333333 0.666667\n",
"1 0.666667 0.333333\n",
"TRAF_CON_R 0 1 2\n",
"Injury \n",
"0 0.666667 0.222222 0.111111\n",
"1 1.000000 0.000000 0.000000\n"
]
}
],
"source": [ "source": [
"# i." "# i.\n",
"weather_freq = b_records[['Injury', 'WEATHER_R']].pivot_table(index='Injury', columns='WEATHER_R', aggfunc=len, fill_value=0)\n",
"weather_propTable = weather_freq.apply(lambda x: x/sum(x), axis=1)\n",
"\n",
"traf_freq = b_records[['Injury', 'TRAF_CON_R']].pivot_table(index='Injury', columns='TRAF_CON_R', aggfunc=len, fill_value=0)\n",
"traf_propTable = traf_freq.apply(lambda x: x/sum(x), axis=1)\n",
"\n",
"print(weather_propTable)\n",
"print(traf_propTable)"
] ]
}, },
{ {
@ -303,16 +360,46 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# ii. \n", "# ii. \n",
"Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.\n" "Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 189,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0): 0.75\n",
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0): 0.4285714285714286\n",
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1): 0.0\n",
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1): 0.0\n",
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2): 0.0\n",
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2): 0.0\n"
]
}
],
"source": [ "source": [
"# ii." "# ii.\n",
"def computeInjuryprob(wpred, tpred):\n",
" p_w_inj = weather_propTable.iloc[1][wpred]\n",
" p_t_inj = traf_propTable.iloc[1][tpred]\n",
" p_inj = p_w_inj * p_t_inj\n",
"\n",
" np_w_inj = weather_propTable.iloc[0][wpred]\n",
" np_t_inj = traf_propTable.iloc[0][tpred]\n",
" np_inj = np_w_inj * np_t_inj\n",
"\n",
" return(p_inj/(p_inj+np_inj))\n",
"\n",
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0):\", computeInjuryprob(1,0))\n",
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0):\", computeInjuryprob(2,0))\n",
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1):\", computeInjuryprob(1,1))\n",
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1):\", computeInjuryprob(2,1))\n",
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2):\", computeInjuryprob(1,2))\n",
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2):\", computeInjuryprob(2,2))"
] ]
}, },
{ {
@ -321,16 +408,79 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# iii. \n", "# iii. \n",
"Classify the 12 accidents using these probabilities and a cutoff of 0.5.\n" "Classify the 12 accidents using these probabilities and a cutoff of 0.5."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 200,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" actual predicted 0 1\n",
"4 0 0 0.509515 0.490485\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
" actual predicted 0 1\n",
"2 0 0 0.986395 0.013605\n",
" actual predicted 0 1\n",
"3 0 0 0.989368 0.010632\n",
" actual predicted 0 1\n",
"4 0 0 0.509515 0.490485\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
" actual predicted 0 1\n",
"4 0 0 0.509515 0.490485\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
" actual predicted 0 1\n",
"10 0 1 0.447309 0.552691\n",
"1 0 1 0.447309 0.552691\n",
"Empty DataFrame\n",
"Columns: [actual, predicted, 0, 1]\n",
"Index: []\n"
]
}
],
"source": [ "source": [
"# iii." "# iii.\n",
"predictors = ['WEATHER_R', 'TRAF_CON_R']\n",
"X = pd.get_dummies(b_records[predictors])\n",
"y = b_records['Injury']\n",
"\n",
"# split into training and validation\n",
"X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n",
"\n",
"# run naive Bayes\n",
"delays_nb = MultinomialNB(alpha=0.01)\n",
"delays_nb.fit(X_train, y_train)\n",
"\n",
"# predict probabilities\n",
"predProb_train = delays_nb.predict_proba(X_train)\n",
"predProb_valid = delays_nb.predict_proba(X_valid)\n",
"\n",
"# predict class membership\n",
"y_valid_pred = delays_nb.predict(X_valid)\n",
"y_train_pred = delays_nb.predict(X_train)\n",
"# Subset a specific set\n",
"df = pd.concat([pd.DataFrame({'actual': y_valid, 'predicted': y_valid_pred}),\n",
" pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)\n",
"\n",
"for index, row in b_records.iterrows():\n",
" mask = ((X_valid.WEATHER_R == row['WEATHER_R']) & (X_valid.TRAF_CON_R == row['TRAF_CON_R']))\n",
" print(df[mask])\n"
] ]
}, },
{ {
@ -339,16 +489,35 @@
"metadata": {}, "metadata": {},
"source": [ "source": [
"# iv. \n", "# iv. \n",
"Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1.\n" "Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 197,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEATHER_R 1 2\n",
"Injury \n",
"0 3 6\n",
"1 2 1\n",
"TRAF_CON_R 0 1 2\n",
"Injury \n",
"0 6 2 1\n",
"1 3 0 0\n",
"0.0\n"
]
}
],
"source": [ "source": [
"# iv." "# iv.\n",
"print(weather_freq)\n",
"print(traf_freq)\n",
"print(((3 / 12) * ((2 / 3) * (0 / 3))) / (((3 / 12) * ((2 / 3) * (0 / 3))) + ((9 / 12) * ((3 / 9) * (2 / 9)))))\n"
] ]
}, },
{ {
@ -366,11 +535,36 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 201,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Confusion Matrix (Accuracy 0.2857)\n",
"\n",
" Prediction\n",
"Actual 0 1\n",
" 0 1 3\n",
" 1 2 1\n",
"\n",
"Confusion Matrix (Accuracy 0.6000)\n",
"\n",
" Prediction\n",
"Actual 0 1\n",
" 0 3 2\n",
" 1 0 0\n"
]
}
],
"source": [ "source": [
"# v." "# v.\n",
"classificationSummary(y_train, y_train_pred) \n",
"\n",
"print()\n",
"\n",
"classificationSummary(y_valid, y_valid_pred) "
] ]
}, },
{ {