Starting Naive Bayes
This commit is contained in:
parent 2510582087
commit 2710b772ed
.~lock.accidentsFull.csv# (new file, 1 line)
@@ -0,0 +1 @@
+,noah,NovaArchSys,29.03.2023 17:55,file:///home/noah/.config/libreoffice/4;
@@ -14,7 +14,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 110,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -28,7 +28,7 @@
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.naive_bayes import MultinomialNB\n",
 "from sklearn.metrics import mean_squared_error\n",
-"from math import sqrt\n",
+"from math import isnan\n",
 "\n",
 "import matplotlib.pylab as plt\n",
 "from dmba import classificationSummary, gainsChart\n",
@@ -261,22 +261,56 @@
 "Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 83,
+"metadata": {},
+"outputs": [
+ {
+  "data": {
+   "text/plain": [
+    "0.5087831590925255"
+   ]
+  },
+  "execution_count": 83,
+  "metadata": {},
+  "output_type": "execute_result"
+ }
+],
+"source": [
+"# a.\n",
+"accidents_df = pd.read_csv('accidentsFull.csv')\n",
+"accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)\n",
+"\n",
+"accidents_df.loc[:, 'Injury'].mean()"
+]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# a.\n",
+"Viewing the available data, the mean Injury value is 0.5088; a value of 1 would mean every report involved an injury, and a value of 0 would mean none did. With a mean of 0.5088, the prediction should be 'YES' for injury, though an accident with an injury is only slightly more likely than an accident without."
+]
+},
 {
 "attachments": {},
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "# b. \n",
-"Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.\n"
+"Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 88,
 "metadata": {},
 "outputs": [],
 "source": [
-"# b."
+"# b.\n",
+"b_records = accidents_df[0:12]"
 ]
 },
 {
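A minimal standalone sketch of the decision step a above describes: with no other information available, predict the majority class of Injury, i.e. apply a 0.5 cutoff to the overall injury rate. Column names follow the committed cell; this is an illustrative sketch, not part of the commit.

import pandas as pd

# Majority-class prediction when nothing else is known (sketch; assumes the
# accidentsFull.csv layout used in step a above).
accidents_df = pd.read_csv('accidentsFull.csv')
accidents_df['Injury'] = (accidents_df['MAX_SEV_IR'] > 0).astype(int)

p_injury = accidents_df['Injury'].mean()        # ~0.5088 per the committed output
prediction = 'Yes' if p_injury > 0.5 else 'No'  # 0.5 cutoff on the prior alone
print(f"P(Injury) = {p_injury:.4f} -> predict INJURY = {prediction}")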
@@ -285,16 +319,39 @@
 "metadata": {},
 "source": [
 "# i. \n",
-"Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.\n"
+"Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 179,
 "metadata": {},
-"outputs": [],
+"outputs": [
+ {
+  "name": "stdout",
+  "output_type": "stream",
+  "text": [
+   "WEATHER_R 1 2\n",
+   "Injury \n",
+   "0 0.333333 0.666667\n",
+   "1 0.666667 0.333333\n",
+   "TRAF_CON_R 0 1 2\n",
+   "Injury \n",
+   "0 0.666667 0.222222 0.111111\n",
+   "1 1.000000 0.000000 0.000000\n"
+  ]
+ }
+],
 "source": [
-"# i."
+"# i.\n",
+"weather_freq = b_records[['Injury', 'WEATHER_R']].pivot_table(index='Injury', columns='WEATHER_R', aggfunc=len, fill_value=0)\n",
+"weather_propTable = weather_freq.apply(lambda x: x/sum(x), axis=1)\n",
+"\n",
+"traf_freq = b_records[['Injury', 'TRAF_CON_R']].pivot_table(index='Injury', columns='TRAF_CON_R', aggfunc=len, fill_value=0)\n",
+"traf_propTable = traf_freq.apply(lambda x: x/sum(x), axis=1)\n",
+"\n",
+"print(weather_propTable)\n",
+"print(traf_propTable)"
 ]
 },
 {
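The prompt for step i asks for a single pivot table with all three variables as rows/columns, while the committed cell builds two separate tables. A hedged sketch of the single-table variant, assuming b_records from step b (pd.crosstab is used here as a convenient stand-in for pivot_table with aggfunc=len):

# Counts of Injury (columns) for every WEATHER_R / TRAF_CON_R combination (rows).
three_way = pd.crosstab(index=[b_records['WEATHER_R'], b_records['TRAF_CON_R']],
                        columns=b_records['Injury'])
print(three_way)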
@@ -303,16 +360,46 @@
 "metadata": {},
 "source": [
 "# ii. \n",
-"Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.\n"
+"Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 189,
 "metadata": {},
-"outputs": [],
+"outputs": [
+ {
+  "name": "stdout",
+  "output_type": "stream",
+  "text": [
+   "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0): 0.75\n",
+   "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0): 0.4285714285714286\n",
+   "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1): 0.0\n",
+   "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1): 0.0\n",
+   "P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2): 0.0\n",
+   "P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2): 0.0\n"
+  ]
+ }
+],
 "source": [
-"# ii."
+"# ii.\n",
+"def computeInjuryprob(wpred, tpred):\n",
+"    p_w_inj = weather_propTable.iloc[1][wpred]\n",
+"    p_t_inj = traf_propTable.iloc[1][tpred]\n",
+"    p_inj = p_w_inj * p_t_inj\n",
+"\n",
+"    np_w_inj = weather_propTable.iloc[0][wpred]\n",
+"    np_t_inj = traf_propTable.iloc[0][tpred]\n",
+"    np_inj = np_w_inj * np_t_inj\n",
+"\n",
+"    return(p_inj/(p_inj+np_inj))\n",
+"\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 0):\", computeInjuryprob(1,0))\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 0):\", computeInjuryprob(2,0))\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 1):\", computeInjuryprob(1,1))\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 1):\", computeInjuryprob(2,1))\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 1, TRAF_CON_R = 2):\", computeInjuryprob(1,2))\n",
+"print(\"P(INJURY = Yes | WEATHER_R = 2, TRAF_CON_R = 2):\", computeInjuryprob(2,2))"
 ]
 },
 {
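Note that computeInjuryprob above multiplies the per-predictor conditional proportions, i.e. a naive-Bayes-style product with equal priors. The exact conditional probabilities the prompt asks for can also be read directly off the 12 records; a sketch, assuming b_records from step b:

# Exact P(Injury = 1 | WEATHER_R = w, TRAF_CON_R = t), straight from the 12 records.
def exact_injury_prob(w, t):
    subset = b_records[(b_records['WEATHER_R'] == w) & (b_records['TRAF_CON_R'] == t)]
    return subset['Injury'].mean() if len(subset) else float('nan')  # nan if the combination is absent

for w in (1, 2):
    for t in (0, 1, 2):
        print(f"P(INJURY = Yes | WEATHER_R = {w}, TRAF_CON_R = {t}):", exact_injury_prob(w, t))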
@@ -321,16 +408,79 @@
 "metadata": {},
 "source": [
 "# iii. \n",
-"Classify the 12 accidents using these probabilities and a cutoff of 0.5.\n"
+"Classify the 12 accidents using these probabilities and a cutoff of 0.5."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 200,
 "metadata": {},
-"outputs": [],
+"outputs": [
+ {
+  "name": "stdout",
+  "output_type": "stream",
+  "text": [
+   " actual predicted 0 1\n",
+   "4 0 0 0.509515 0.490485\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   " actual predicted 0 1\n",
+   "2 0 0 0.986395 0.013605\n",
+   " actual predicted 0 1\n",
+   "3 0 0 0.989368 0.010632\n",
+   " actual predicted 0 1\n",
+   "4 0 0 0.509515 0.490485\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   " actual predicted 0 1\n",
+   "4 0 0 0.509515 0.490485\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   " actual predicted 0 1\n",
+   "10 0 1 0.447309 0.552691\n",
+   "1 0 1 0.447309 0.552691\n",
+   "Empty DataFrame\n",
+   "Columns: [actual, predicted, 0, 1]\n",
+   "Index: []\n"
+  ]
+ }
+],
 "source": [
-"# iii."
+"# iii.\n",
+"predictors = ['WEATHER_R', 'TRAF_CON_R']\n",
+"X = pd.get_dummies(b_records[predictors])\n",
+"y = b_records['Injury']\n",
+"\n",
+"# split into training and validation\n",
+"X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=1)\n",
+"\n",
+"# run naive Bayes\n",
+"delays_nb = MultinomialNB(alpha=0.01)\n",
+"delays_nb.fit(X_train, y_train)\n",
+"\n",
+"# predict probabilities\n",
+"predProb_train = delays_nb.predict_proba(X_train)\n",
+"predProb_valid = delays_nb.predict_proba(X_valid)\n",
+"\n",
+"# predict class membership\n",
+"y_valid_pred = delays_nb.predict(X_valid)\n",
+"y_train_pred = delays_nb.predict(X_train)\n",
+"# Subset a specific set\n",
+"df = pd.concat([pd.DataFrame({'actual': y_valid, 'predicted': y_valid_pred}),\n",
+"                pd.DataFrame(predProb_valid, index=y_valid.index)], axis=1)\n",
+"\n",
+"for index, row in b_records.iterrows():\n",
+"    mask = ((X_valid.WEATHER_R == row['WEATHER_R']) & (X_valid.TRAF_CON_R == row['TRAF_CON_R']))\n",
+"    print(df[mask])\n"
 ]
 },
 {
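Since the prompt specifies a cutoff of 0.5, the class predictions can also be derived explicitly from predict_proba rather than predict. A short sketch using the fitted model and split from the cell above; for 0/1 labels delays_nb.classes_ is [0, 1], so column 1 holds P(Injury = 1):

# Classify the validation records with an explicit 0.5 cutoff on P(Injury = 1).
prob_injury_valid = delays_nb.predict_proba(X_valid)[:, 1]
pred_at_cutoff = (prob_injury_valid >= 0.5).astype(int)
print(pd.DataFrame({'actual': y_valid, 'p_injury': prob_injury_valid,
                    'predicted': pred_at_cutoff}, index=y_valid.index))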
@@ -339,16 +489,35 @@
 "metadata": {},
 "source": [
 "# iv. \n",
-"Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1.\n"
+"Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1."
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 197,
 "metadata": {},
-"outputs": [],
+"outputs": [
+ {
+  "name": "stdout",
+  "output_type": "stream",
+  "text": [
+   "WEATHER_R 1 2\n",
+   "Injury \n",
+   "0 3 6\n",
+   "1 2 1\n",
+   "TRAF_CON_R 0 1 2\n",
+   "Injury \n",
+   "0 6 2 1\n",
+   "1 3 0 0\n",
+   "0.0\n"
+  ]
+ }
+],
 "source": [
-"# iv."
+"# iv.\n",
+"print(weather_freq)\n",
+"print(traf_freq)\n",
+"print(((3 / 12) * ((2 / 3) * (0 / 3))) / (((3 / 12) * ((2 / 3) * (0 / 3))) + ((9 / 12) * ((3 / 9) * (2 / 9)))))\n"
 ]
 },
 {
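The one-line print above packs the entire naive Bayes calculation into nested fractions. Spelled out with named terms (counts read from the frequency tables the cell prints), the same computation is:

# P(Injury = 1 | WEATHER_R = 1, TRAF_CON_R = 1) via naive Bayes, term by term.
p_injury     = 3 / 12   # P(Injury = 1) among the 12 records
p_w1_given_y = 2 / 3    # P(WEATHER_R = 1 | Injury = 1)
p_t1_given_y = 0 / 3    # P(TRAF_CON_R = 1 | Injury = 1)

p_no_injury  = 9 / 12   # P(Injury = 0)
p_w1_given_n = 3 / 9    # P(WEATHER_R = 1 | Injury = 0)
p_t1_given_n = 2 / 9    # P(TRAF_CON_R = 1 | Injury = 0)

numerator   = p_injury * p_w1_given_y * p_t1_given_y
denominator = numerator + p_no_injury * p_w1_given_n * p_t1_given_n
print(numerator / denominator)   # 0.0, matching the cell's output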
@@ -366,11 +535,36 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 201,
 "metadata": {},
-"outputs": [],
+"outputs": [
+ {
+  "name": "stdout",
+  "output_type": "stream",
+  "text": [
+   "Confusion Matrix (Accuracy 0.2857)\n",
+   "\n",
+   " Prediction\n",
+   "Actual 0 1\n",
+   " 0 1 3\n",
+   " 1 2 1\n",
+   "\n",
+   "Confusion Matrix (Accuracy 0.6000)\n",
+   "\n",
+   " Prediction\n",
+   "Actual 0 1\n",
+   " 0 3 2\n",
+   " 1 0 0\n"
+  ]
+ }
+],
 "source": [
-"# v."
+"# v.\n",
+"classificationSummary(y_train, y_train_pred)\n",
+"\n",
+"print()\n",
+"\n",
+"classificationSummary(y_valid, y_valid_pred)"
 ]
 },
 {
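The dmba classificationSummary output above can be cross-checked with plain sklearn metrics; a sketch using the same splits and predictions as the cell above:

from sklearn.metrics import accuracy_score, confusion_matrix

# Should reproduce the two confusion matrices and accuracies printed above.
print(confusion_matrix(y_train, y_train_pred), accuracy_score(y_train, y_train_pred))
print(confusion_matrix(y_valid, y_valid_pred), accuracy_score(y_valid, y_valid_pred))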