Starting Naive Bayes
This commit is contained in:
parent
2510582087
commit
2710b772ed
1
.~lock.accidentsFull.csv#
Normal file
1
.~lock.accidentsFull.csv#
Normal file
@ -0,0 +1 @@
|
||||
,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4;
|
||||
@ -14,7 +14,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 110,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -28,7 +28,7 @@
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"from sklearn.metrics import mean_squared_error\n",
|
||||
"from math import sqrt\n",
|
||||
"from math import isnan\n",
|
||||
"\n",
|
||||
"import matplotlib.pylab as plt\n",
|
||||
"from dmba import classificationSummary, gainsChart\n",
|
||||
@ -261,22 +261,56 @@
|
||||
"Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.5087831590925255"
|
||||
]
|
||||
},
|
||||
"execution_count": 83,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# a.\n",
|
||||
"accidents_df = pd.read_csv('accidentsFull.csv')\n",
|
||||
"accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n",
|
||||
"\n",
|
||||
"accidents_df.loc[:, 'Injury'].mean()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# a.\n",
|
||||
"In the available data, the mean of the Injury indicator is 0.5088, i.e. about 50.9% of the reported accidents involved an injury (a mean of 1 would mean every report involved an injury, and a mean of 0 would mean none did). Because this proportion is above 0.5, the prediction when no further information is available should be INJURY = Yes, although an accident with an injury is only slightly more likely than one without."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# b. \n",
|
||||
"Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.\n"
|
||||
"Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 88,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# b."
|
||||
"# b.\n",
|
||||
"b_records = accidents_df[0:12]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -285,16 +319,39 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# i. \n",
|
||||
"Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.\n"
|
||||
"Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 179,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WEATHER_R 1 2\n",
|
||||
"Injury \n",
|
||||
"0 0.333333 0.666667\n",
|
||||
"1 0.666667 0.333333\n",
|
||||
"TRAF_CON_R 0 1 2\n",
|
||||
"Injury \n",
|
||||
"0 0.666667 0.222222 0.111111\n",
|
||||
"1 1.000000 0.000000 0.000000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# i."
|
||||
"# i.\n",
|
||||
"weather_freq = b_records[['Injury', 'WEATHER_R']].pivot_table(index='Injury', columns='WEATHER_R', aggfunc=len, fill_value=0)\n",
|
||||
"weather_propTable = weather_freq.apply(lambda x: x/sum(x), axis=1)\n",
|
||||
"\n",
|
||||
"traf_freq = b_records[['Injury', 'TRAF_CON_R']].pivot_table(index='Injury', columns='TRAF_CON_R', aggfunc=len, fill_value=0)\n",
|
||||
"traf_propTable = traf_freq.apply(lambda x: x/sum(x), axis=1)\n",
|
||||
"\n",
|
||||
"print(weather_propTable)\n",
|
||||
"print(traf_propTable)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -303,16 +360,46 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# ii. \n",
|
||||
"Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.\n"
|
||||
"Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 189,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0): 0.75\n",
|
||||
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0): 0.4285714285714286\n",
|
||||
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1): 0.0\n",
|
||||
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1): 0.0\n",
|
||||
"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2): 0.0\n",
|
||||
"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2): 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ii."
|
||||
"# ii.\n",
|
||||
"def computeInjuryprob(wpred, tpred):\n",
|
||||
" p_w_inj = weather_propTable.iloc[1][wpred]\n",
|
||||
" p_t_inj = traf_propTable.iloc[1][tpred]\n",
|
||||
" p_inj = p_w_inj * p_t_inj\n",
|
||||
"\n",
|
||||
" np_w_inj = weather_propTable.iloc[0][wpred]\n",
|
||||
" np_t_inj = traf_propTable.iloc[0][tpred]\n",
|
||||
" np_inj = np_w_inj * np_t_inj\n",
|
||||
"\n",
|
||||
" return(p_inj/(p_inj+np_inj))\n",
|
||||
"\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0):\", computeInjuryprob(1,0))\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0):\", computeInjuryprob(2,0))\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1):\", computeInjuryprob(1,1))\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1):\", computeInjuryprob(2,1))\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2):\", computeInjuryprob(1,2))\n",
|
||||
"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2):\", computeInjuryprob(2,2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -321,16 +408,79 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# iii. \n",
|
||||
"Classify the 12 accidents using these probabilities and a cutoff of 0.5.\n"
|
||||
"Classify the 12 accidents using these probabilities and a cutoff of 0.5."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 200,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" actual predicted 0 1\n",
|
||||
"4 0 0 0.509515 0.490485\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
" actual predicted 0 1\n",
|
||||
"2 0 0 0.986395 0.013605\n",
|
||||
" actual predicted 0 1\n",
|
||||
"3 0 0 0.989368 0.010632\n",
|
||||
" actual predicted 0 1\n",
|
||||
"4 0 0 0.509515 0.490485\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
" actual predicted 0 1\n",
|
||||
"4 0 0 0.509515 0.490485\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
" actual predicted 0 1\n",
|
||||
"10 0 1 0.447309 0.552691\n",
|
||||
"1 0 1 0.447309 0.552691\n",
|
||||
"Empty DataFrame\n",
|
||||
"Columns: [actual, predicted, 0, 1]\n",
|
||||
"Index: []\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# iii."
|
||||
"# iii.\n",
|
||||
"predictors = ['WEATHER_R', 'TRAF_CON_R']\n",
|
||||
"X = pd.get_dummies(b_records[predictors])\n",
|
||||
"y = b_records['Injury']\n",
|
||||
"\n",
|
||||
"# split into training and validation\n",
|
||||
"X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n",
|
||||
"\n",
|
||||
"# run naive Bayes\n",
|
||||
"delays_nb = MultinomialNB(alpha=0.01)\n",
|
||||
"delays_nb.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"# predict probabilities\n",
|
||||
"predProb_train = delays_nb.predict_proba(X_train)\n",
|
||||
"predProb_valid = delays_nb.predict_proba(X_valid)\n",
|
||||
"\n",
|
||||
"# predict class membership\n",
|
||||
"y_valid_pred = delays_nb.predict(X_valid)\n",
|
||||
"y_train_pred = delays_nb.predict(X_train)\n",
|
||||
"# Subset a specific set\n",
|
||||
"df = pd.concat([pd.DataFrame({'actual': y_valid, 'predicted': y_valid_pred}),\n",
|
||||
" pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)\n",
|
||||
"\n",
|
||||
"for index, row in b_records.iterrows():\n",
|
||||
" mask = ((X_valid.WEATHER_R == row['WEATHER_R']) & (X_valid.TRAF_CON_R == row['TRAF_CON_R']))\n",
|
||||
" print(df[mask])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -339,16 +489,35 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# iv. \n",
|
||||
"Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1.\n"
|
||||
"Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 197,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WEATHER_R 1 2\n",
|
||||
"Injury \n",
|
||||
"0 3 6\n",
|
||||
"1 2 1\n",
|
||||
"TRAF_CON_R 0 1 2\n",
|
||||
"Injury \n",
|
||||
"0 6 2 1\n",
|
||||
"1 3 0 0\n",
|
||||
"0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# iv."
|
||||
"# iv.\n",
|
||||
"print(weather_freq)\n",
|
||||
"print(traf_freq)\n",
|
||||
"print(((3 / 12) * ((2 / 3) * (0 / 3))) / (((3 / 12) * ((2 / 3) * (0 / 3))) + ((9 / 12) * ((3 / 9) * (2 / 9)))))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -366,11 +535,36 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 201,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Confusion Matrix (Accuracy 0.2857)\n",
|
||||
"\n",
|
||||
" Prediction\n",
|
||||
"Actual 0 1\n",
|
||||
" 0 1 3\n",
|
||||
" 1 2 1\n",
|
||||
"\n",
|
||||
"Confusion Matrix (Accuracy 0.6000)\n",
|
||||
"\n",
|
||||
" Prediction\n",
|
||||
"Actual 0 1\n",
|
||||
" 0 3 2\n",
|
||||
" 1 0 0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# v."
|
||||
"# v.\n",
|
||||
"classificationSummary(y_train, y_train_pred) \n",
|
||||
"\n",
|
||||
"print()\n",
|
||||
"\n",
|
||||
"classificationSummary(y_valid, y_valid_pred) "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user