QM-7063-Learning-Practice-3/Lecture-Work.ipynb
2023-02-07 18:05:20 -06:00

1463 lines
64 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Chapter 14: Association Rules and Collaborative Filtering\n",
"\n",
"> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n",
">\n",
"> Code included in\n",
">\n",
"> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n",
"> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n",
"\n",
"## Import required packages"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting package metadata (current_repodata.json): ...working... done\n",
"Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.\n",
"Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.\n",
"Collecting package metadata (repodata.json): ...working... done\n",
"Solving environment: ...working... done\n",
"\n",
"## Package Plan ##\n",
"\n",
" environment location: C:\\Users\\isa1761\\Anaconda3\n",
"\n",
" added / updated specs:\n",
" - scikit-surprise\n",
"\n",
"\n",
"The following packages will be downloaded:\n",
"\n",
" package | build\n",
" ---------------------------|-----------------\n",
" conda-4.12.0 | py39hcbf5309_0 1.0 MB conda-forge\n",
" scikit-surprise-1.1.1 | py39h5d4886f_2 538 KB conda-forge\n",
" ------------------------------------------------------------\n",
" Total: 1.5 MB\n",
"\n",
"The following NEW packages will be INSTALLED:\n",
"\n",
" scikit-surprise conda-forge/win-64::scikit-surprise-1.1.1-py39h5d4886f_2\n",
"\n",
"The following packages will be UPDATED:\n",
"\n",
" conda 4.11.0-py39hcbf5309_0 --> 4.12.0-py39hcbf5309_0\n",
"\n",
"\n",
"\n",
"Downloading and Extracting Packages\n",
"\n",
"scikit-surprise-1.1. | 538 KB | | 0% \n",
"scikit-surprise-1.1. | 538 KB | 2 | 3% \n",
"scikit-surprise-1.1. | 538 KB | ########## | 100% \n",
"scikit-surprise-1.1. | 538 KB | ########## | 100% \n",
"\n",
"conda-4.12.0 | 1.0 MB | | 0% \n",
"conda-4.12.0 | 1.0 MB | 1 | 2% \n",
"conda-4.12.0 | 1.0 MB | ########## | 100% \n",
"conda-4.12.0 | 1.0 MB | ########## | 100% \n",
"Preparing transaction: ...working... done\n",
"Verifying transaction: ...working... done\n",
"Executing transaction: ...working... done\n",
"\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Use the explicit %conda magic so the install runs through IPython's conda\n",
"# integration and does not depend on automagic being enabled.\n",
"%conda install -c conda-forge scikit-surprise"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"from pathlib import Path\n",
"\n",
"import heapq\n",
"from collections import defaultdict\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pylab as plt\n",
"from mlxtend.frequent_patterns import apriori\n",
"from mlxtend.frequent_patterns import association_rules\n",
"\n",
"from surprise import Dataset, Reader, KNNBasic\n",
"from surprise.model_selection import train_test_split\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 14.4"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Red</th>\n",
" <th>White</th>\n",
" <th>Blue</th>\n",
" <th>Orange</th>\n",
" <th>Green</th>\n",
" <th>Yellow</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Transaction</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Red White Blue Orange Green Yellow\n",
"Transaction \n",
"1 1 1 0 0 1 0\n",
"2 0 1 0 1 0 0\n",
"3 0 1 1 0 0 0\n",
"4 1 1 0 1 0 0\n",
"5 1 0 1 0 0 0\n",
"6 0 1 1 0 0 0\n",
"7 1 0 1 0 0 0\n",
"8 1 1 1 0 1 0\n",
"9 1 1 1 0 0 0\n",
"10 0 0 0 0 0 1"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the Faceplate transactions and index the rows by transaction number\n",
"fp_df = pd.read_csv('Faceplate.csv').set_index('Transaction')\n",
"fp_df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" antecedents consequents support confidence lift leverage\n",
"12 (White, Red) (Green) 0.2 0.5 2.500000 0.12\n",
"15 (Green) (White, Red) 0.2 1.0 2.500000 0.12\n",
"4 (Green) (Red) 0.2 1.0 1.666667 0.08\n",
"13 (White, Green) (Red) 0.2 1.0 1.666667 0.08\n",
"7 (Orange) (White) 0.2 1.0 1.428571 0.06\n",
"8 (Green) (White) 0.2 1.0 1.428571 0.06\n"
]
}
],
"source": [
"# create frequent itemsets: keep item combinations with support of at least 0.2\n",
"itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)\n",
"\n",
"# and convert into rules with confidence of at least 0.5\n",
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
"\n",
"# Print the six rules with the highest lift. The displayed columns are selected\n",
"# by name (instead of dropping the unwanted ones) so the table keeps the same\n",
"# layout even if association_rules returns additional metric columns.\n",
"table_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']\n",
"print(rules.sort_values(by=['lift'], ascending=False)[table_columns].head(6))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>(White, Red)</td>\n",
" <td>(Green)</td>\n",
" <td>0.4</td>\n",
" <td>0.2</td>\n",
" <td>0.2</td>\n",
" <td>0.5</td>\n",
" <td>2.500000</td>\n",
" <td>0.12</td>\n",
" <td>1.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Green)</td>\n",
" <td>(Red)</td>\n",
" <td>0.2</td>\n",
" <td>0.6</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.666667</td>\n",
" <td>0.08</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>(White, Green)</td>\n",
" <td>(Red)</td>\n",
" <td>0.2</td>\n",
" <td>0.6</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.666667</td>\n",
" <td>0.08</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>(Orange)</td>\n",
" <td>(White)</td>\n",
" <td>0.2</td>\n",
" <td>0.7</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.428571</td>\n",
" <td>0.06</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>(Green)</td>\n",
" <td>(White)</td>\n",
" <td>0.2</td>\n",
" <td>0.7</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.428571</td>\n",
" <td>0.06</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>(Red, Green)</td>\n",
" <td>(White)</td>\n",
" <td>0.2</td>\n",
" <td>0.7</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.428571</td>\n",
" <td>0.06</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support \\\n",
"12 (White, Red) (Green) 0.4 0.2 \n",
"4 (Green) (Red) 0.2 0.6 \n",
"13 (White, Green) (Red) 0.2 0.6 \n",
"7 (Orange) (White) 0.2 0.7 \n",
"8 (Green) (White) 0.2 0.7 \n",
"14 (Red, Green) (White) 0.2 0.7 \n",
"\n",
" support confidence lift leverage conviction \n",
"12 0.2 0.5 2.500000 0.12 1.6 \n",
"4 0.2 1.0 1.666667 0.08 inf \n",
"13 0.2 1.0 1.666667 0.08 inf \n",
"7 0.2 1.0 1.428571 0.06 inf \n",
"8 0.2 1.0 1.428571 0.06 inf \n",
"14 0.2 1.0 1.428571 0.06 inf "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the rules whose consequent consists of exactly one item, ranked by lift\n",
"has_single_consequent = [len(consequent) == 1 for consequent in rules['consequents']]\n",
"single_consequent_rules = rules[has_single_consequent]\n",
"single_consequent_rules.sort_values(by=['lift'], ascending=False).head(6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The apriori method accepts sparse data frames as well. If we convert the original data frame to a sparse format, only the entries that differ from the fill value are stored; the density reported below is 0.4, i.e. only 40% of the cells need to be stored. The `fill_value` argument tells the sparse conversion which value to leave out, i.e. which fields to ignore in each transaction."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Density 0.4\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>(White, Red)</td>\n",
" <td>(Green)</td>\n",
" <td>0.4</td>\n",
" <td>0.2</td>\n",
" <td>0.2</td>\n",
" <td>0.5</td>\n",
" <td>2.500000</td>\n",
" <td>0.12</td>\n",
" <td>1.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>(Green)</td>\n",
" <td>(White, Red)</td>\n",
" <td>0.2</td>\n",
" <td>0.4</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>2.500000</td>\n",
" <td>0.12</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>(Green)</td>\n",
" <td>(Red)</td>\n",
" <td>0.2</td>\n",
" <td>0.6</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.666667</td>\n",
" <td>0.08</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>(White, Green)</td>\n",
" <td>(Red)</td>\n",
" <td>0.2</td>\n",
" <td>0.6</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.666667</td>\n",
" <td>0.08</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>(Orange)</td>\n",
" <td>(White)</td>\n",
" <td>0.2</td>\n",
" <td>0.7</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.428571</td>\n",
" <td>0.06</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>(Green)</td>\n",
" <td>(White)</td>\n",
" <td>0.2</td>\n",
" <td>0.7</td>\n",
" <td>0.2</td>\n",
" <td>1.0</td>\n",
" <td>1.428571</td>\n",
" <td>0.06</td>\n",
" <td>inf</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support \\\n",
"12 (White, Red) (Green) 0.4 0.2 \n",
"15 (Green) (White, Red) 0.2 0.4 \n",
"4 (Green) (Red) 0.2 0.6 \n",
"13 (White, Green) (Red) 0.2 0.6 \n",
"7 (Orange) (White) 0.2 0.7 \n",
"8 (Green) (White) 0.2 0.7 \n",
"\n",
" support confidence lift leverage conviction \n",
"12 0.2 0.5 2.500000 0.12 1.6 \n",
"15 0.2 1.0 2.500000 0.12 inf \n",
"4 0.2 1.0 1.666667 0.08 inf \n",
"13 0.2 1.0 1.666667 0.08 inf \n",
"7 0.2 1.0 1.428571 0.06 inf \n",
"8 0.2 1.0 1.428571 0.06 inf "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert data set into a sparse data frame; entries equal to fill_value (0) are not stored.\n",
"# DataFrame.to_sparse() was removed in pandas 1.0, so convert with SparseDtype and\n",
"# read the density through the .sparse accessor instead.\n",
"sparse_df = fp_df.astype(pd.SparseDtype(int, fill_value=0))\n",
"print('Density {}'.format(sparse_df.sparse.density))\n",
"\n",
"# create frequent itemsets\n",
"itemsets = apriori(sparse_df, min_support=0.2, use_colnames=True)\n",
"\n",
"# and convert into rules\n",
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
"rules.sort_values(by=['lift'], ascending=False).head(6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data required for Tables 14.5 and 14.6"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4 5 6 7 8 9\n",
"0 0 0 0 0 0 0 0 1 0\n",
"1 0 0 1 1 0 0 0 1 0\n",
"2 0 0 0 0 0 0 0 1 0\n",
"3 0 0 1 0 0 0 0 0 1\n",
"4 0 0 0 0 0 0 0 0 1"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prepare the dataset for table 14.6 based on table 14.5\n",
"from itertools import chain\n",
"randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(),\n",
"                      {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},\n",
"                      {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},\n",
"                      {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},\n",
"                      {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]\n",
"print(randomTransactions)\n",
"\n",
"# Sorted list of every item that occurs in at least one transaction\n",
"uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))\n",
"\n",
"# Build the binary incidence matrix in one step: one row per transaction, one column\n",
"# per item, 1 if the item is in the transaction and 0 otherwise. This avoids the\n",
"# chained assignment df.loc[row][item] = 1, which may write to a temporary copy.\n",
"randomData = pd.DataFrame(\n",
"    [[int(item in transaction) for item in uniqueItems] for transaction in randomTransactions],\n",
"    columns=uniqueItems,\n",
")\n",
"randomData.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 14.6"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" antecedents consequents support confidence lift leverage\n",
"3 (8, 3) (4) 0.04 1.0 4.545455 0.0312\n",
"1 (1, 5) (8) 0.04 1.0 1.851852 0.0184\n",
"2 (2, 7) (9) 0.04 1.0 1.851852 0.0184\n",
"4 (3, 4) (8) 0.04 1.0 1.851852 0.0184\n",
"5 (3, 7) (9) 0.04 1.0 1.851852 0.0184\n",
"6 (4, 5) (9) 0.04 1.0 1.851852 0.0184\n"
]
}
],
"source": [
"# create frequent itemsets: an itemset needs a support of at least 2 transactions\n",
"itemsets = apriori(randomData, min_support=2/len(randomData), use_colnames=True)\n",
"# and convert into rules with confidence of at least 0.7\n",
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.7)\n",
"# Print the six rules with the highest lift. The displayed columns are selected by\n",
"# name so the table layout does not change if association_rules returns additional\n",
"# metric columns.\n",
"table_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage']\n",
"print(rules.sort_values(by=['lift'], ascending=False)[table_columns].head(6))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 14.8"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ChildBks</th>\n",
" <th>YouthBks</th>\n",
" <th>CookBks</th>\n",
" <th>DoItYBks</th>\n",
" <th>RefBks</th>\n",
" <th>ArtBks</th>\n",
" <th>GeogBks</th>\n",
" <th>ItalCook</th>\n",
" <th>ItalAtlas</th>\n",
" <th>ItalArt</th>\n",
" <th>Florence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks ItalCook \\\n",
"0 0 1 1 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 1 1 1 0 1 0 1 1 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
" ItalAtlas ItalArt Florence \n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the Charles Book Club data\n",
"all_books_df = pd.read_csv('CharlesBookClub.csv')\n",
"\n",
"# Columns left out of the incidence matrix\n",
"ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',\n",
"          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']\n",
"purchase_counts = all_books_df.drop(columns=ignore)\n",
"\n",
"# Binary incidence matrix: every positive value becomes 1, all other values are kept as they are\n",
"count_books = purchase_counts.mask(purchase_counts > 0, 1)\n",
"\n",
"count_books.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEgCAYAAABFO1+mAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XucXWV97/HPlwCCCgoSaQ2EBA21aK3oCCjeAYV6BLWo4KWoHKMWvJRzrCBWamw9CtUe8QCCgiJHiiCIwaMiVsUqVTLhHhQJASSgFQQB5WbC9/zxrIHFMJm9k8yz9iT7+3699it73fbvWZNkfns9V9kmIiJiMhsMugARETH9JVlERERPSRYREdFTkkVERPSUZBERET0lWURERE9JFhER0VOSRURE9JRkERERPW046AJMla222spz5swZdDEiItYpixcvvtX2zF7nrTfJYs6cOYyOjg66GBER6xRJN/RzXqqhIiKipySLiIjoKckiIiJ6SrKIiIiekiwiIqKnJIuIiOgpySIiInpKsoiIiJ7Wm0F58UjSml+bpdkjoi1PFhER0VOSRURE9JRkERERPSVZRERET1WThaS9JF0taamkwyY5bz9JljTS2nd4c93Vkl5es5wRETG5ar2hJM0AjgX2BJYDiyQttH3VuPM2A94D/LS1b0dgf+BpwJOA70rawfbKWuWNiIhVq/lksTOw1PYy2/cDpwP7TnDeR4GjgHtb+/YFTrd9n+3rgKXN50VExADUTBazgBtb28ubfQ+StBOwre1vrO61zfXzJY1KGr3lllumptQREfEINZPFREPCHhzqJWkD4F+B/7G61z64wz7R9ojtkZkze64KGBERa6jmCO7lwLat7W2Am1vbmwFPB36gMtT4T4CFkvbp49qIiOhQzSeLRcA8SXMlbUxpsF44dtD2Hba3sj3H9hzgJ8A+tkeb8/aX9ChJc4F5wEVrUxhpzV8REcOu2pOF7RWSDgHOA2YAJ9teImkBMGp74STXLpF0BnAVsAI4OD2hIiIGR15PZowbGRnx6OjoKo8P46R6w3jPEbF6JC22PdLrvIzgjoiInpIsIiKipySLiIjoKckiIiJ6SrKIiIiekiwiIqKnJIuIiOgpySIiInpKsoiIiJ6SLCIioqcki4iI6CnJIiIiekqyiIiInpIsIiKipySLiIjoKckiIiJ6qposJO0l6WpJSyUdNsHxd0q6QtKlkn4kacdm/xxJ9zT7L5X02ZrljIiIyVVbVlXSDOBYYE9gObBI0kLbV7VOO832Z5vz9wE+BezVHLvW9jNrlS8iIvpX88liZ2Cp7WW27wdOB/Ztn2D7ztbmY4As5hkRMQ3VTBazgBtb28ubfQ8j6WBJ1wJHAe9pHZor6RJJF0h6wUQBJM2XNCpp9JZbbpnKskdEREvNZKEJ9j3iycH2sbafDHwA+FCz+1fAbNs7AYcCp0nafIJrT7Q9Yntk5syZU1j0iIhoq5kslgPbtra3AW6e5PzTgVcB2L7P9m+b94uBa4EdKpUzIiJ6qJksFgHzJM2VtDGwP7CwfYKkea3NVwDXNPtnNg3kSNoemAcsq1jWiIiYRLXeULZXSDoEOA+YAZxse4mkBcCo7YXAIZL2AP4I3A4c2Fz+QmCBpBXASuCdtm+rVdaIiJic7N4dkCRtATwJuAe43vYDtQu2ukZGRjw6OrrK45qoBaVPffyIpqVhvOeIWD2SFtse6XXeKp8sJD0OOBg4ANgYuAXYBNha0k+A42x/f4rKGxER09hk1VBfBb4EvMD279oHJD0beLOk7W2fVLOAERExeKtMFrb3nOTYYmBxlRJFRMS007M3lIo3Sfpwsz1b0s71ixYREdNFP11njwOeS2m7ALiLMudTREQMiX66zu5i+1mSLgGwfXszbiIiIoZEP08Wf2wGyBnKgDlg2nWdjYiIevpJFscAXwOeKOmfgR8BH6taqoiImFZ6Vk
PZ/rKkxcDulMkBX2X7Z9VLFhER00bPZCHp08BXbKdROyJiSPVTDXUx8KFmadSjJfUcFh4REeuXnsnC9im2/4qy8t0vgE9IuqZ6ySIiYtpYnSnKnwI8FZgD/LxKaSIiYlrqZwT32JPEAmAJ8Gzbr6xesoiImDb6GZR3HfBc27fWLkxERExPk01R/lTbPwcuAmZLmt0+bvvi2oWLiIjpYbIni0OB+cAnJzhm4KW9PlzSXsCnKSvlfd72x8cdfydlzYyVwO+B+bavao4dDhzUHHuP7fN63k1ERFQx2RTl85u3e9u+t31M0ia9PriZIuRYYE9gObBI0sKxZNA4zfZnm/P3AT4F7CVpR8qa3U+jrND3XUk72F7Z/61FRMRU6ac31IV97htvZ2Cp7WW27wdOB/Ztn2D7ztbmY2jmn2rOO932fbavA5Y2nxcREQMwWZvFnwCzgE0l7USZ6gNgc+DRfXz2LODG1vZyYJcJ4hxMqfLamIeqtmYBPxl37awJrp1PqSpj9uzZ4w9HRMQUmazN4uXAW4BtKNVDY+4CPtjHZ2uCfX7EjjKNyLGS3gB8CDhwNa49ETgRYGRk5BHHIyJiakzWZnEKcIqkv7Z91hp89nJg29b2NsDNk5x/OnD8Gl4bEREV9TPr7FmSXkFpbN6ktX9Bj0sXAfMkzQVuojRYv6F9gqR5tsemDnkFMPZ+IXCapE9RGrjnUbrwRkTEAPQz6+xnKW0ULwE+D+xHH7+4ba+QdAhwHqXr7Mm2l0haAIzaXggcImkP4I/A7ZQqKJrzzgCuAlYAB6cnVETE4MievKpf0uW2n9H687HA2bZf1k0R+zMyMuLR0dFVHtdErSB96vEjmraG8Z4jYvVIWmy752zi/XSdvaf5825JT6I8Bcxdm8JFRMS6pZ+5ob4h6fHA0ZS1LUypjoqIiCHRTwP3R5u3Z0n6BrCJ7TvqFisiIqaTyQblvWaSY9g+u06RIiJiupnsyWKyNSsMJFlERAyJyQblvbXLgkRExPTVz0p5W0s6SdK3mu0dJR1Uv2gRETFd9NN19ouUgXVParZ/AbyvVoEiImL66SdZbGX7DOABKCOzKQsSRUTEkOgnWfxB0hNoZn2VtCuQrrMREUOkn0F5h1Im9nuypB8DMynzQ0XENJBpXaILkyYLSRtQZpp9EfBnlHUmrrb9xw7KFhER08SkycL2A5I+afu5wJKOyhQREdNMP20W35H019LaPOxGRMS6rN82i8cAKyTdS6mKsu3Nq5YsIiKmjX4mEtysi4JERMT0tcpqKElzJrtQxTY9ztlL0tWSlko6bILjh0q6StLlkv5d0natYyslXdq8Fva+lZgupLV7RcT0M9mTxdFNb6ivA4uBWyg9o55CWWJ1d+BIYPlEF0uaARwL7Nmcs0jSQttXtU67BBixfbekdwFHAa9vjt1j+5lrfGcRETFlJptI8LWSdgTeCLwN+FPgbuBnwDeBf7Z97ySfvTOw1PYyAEmnA/tS1tUei/H91vk/Ad60hvcREREV9eo6exVwxBp+9izgxtb2cmCXSc4/CPhWa3sTSaPACuDjts8Zf4Gk+cB8gNmzZ69hMSMiopd+ekOtqYlqnyccLyrpTcAIZfDfmNm2b5a0PfA9SVfYvvZhH2afCJwIMDIykrGoERGV9DPOYk0tB7ZtbW8D3Dz+JEl7UJ5e9rF939h+2zc3fy4DfgDsVLGsERExiZrJYhEwT9JcSRsD+1PmmHqQpJ2AEyiJ4jet/VtIelTzfitgN1ptHRER0a1+Fj86S9Irmp5RfWumMj+EshbGz4AzbC+RtEDSPs1pRwOPBc4c10X2z4FRSZcB36e0WSRZREQMiNxj2smmmuitwK7AmcAXbf+8g7KtlpGREY+Ojq7y+DDOzDmoe17bsRLr6s97UIbx33ZMHUmLbY/0Oq/n04Lt79p+I/As4HrgfEkXSnqrpI3WvqjrtwxQi4j1QV9VS83iR28B/jtlIN2nKcnj/Goli4iIaaNn11
lJZwNPBU4FXmn7V82hrzTjICIiYj3XzziL/2P7exMd6KeeKyIi1n39VEP9uaTHj2003Vr/tmKZIiJimuknWbzd9u/GNmzfDry9XpEiImK66SdZbNBeJa+ZTXbjekWKiIjppp82i/OAMyR9ljK30zuBb1ctVcQ6JmMdYn3XT7L4APAO4F2UyQG/A3y+ZqEiImJ66WdZ1QeA45tXREQMoX7GWewG/COwXXO+ANvevm7RIiJiuuinGuok4O8oS6uurFuciIiYjvpJFnfY/lbv0yIiYn3VT7L4vqSjgbOB9uJEF1crVURETCv9JIuxdbPbU3sYeOnUFyciIqajfqYof8kEr74ShaS9JF0taamkwyY4fqikqyRdLunfJW3XOnagpGua14Grd1sRETGV+lkpb2tJJ0n6VrO9o6SD+rhuBnAssDewI3CApB3HnXYJMGL7GcBXgaOaa7cEjqQ81ewMHClpi/5vK4ZV1g6JqKOf6T6+SBnF/aRm+xfA+/q4bmdgqe1ltu8HTgf2bZ9g+/u27242fwJs07x/OXC+7duauajOB/bqI2ZERFTQT7LYyvYZwAPw4Nra/XShnQXc2Npe3uxblYOAsV5Xq3ttRERU1E8D9x+alfIMIGlX4I4+rpvowX7CWXAkvYnSgP6i1blW0nxgPsDs2bP7KFJERKyJfp4sDgUWAk+W9GPgS8C7+7huObBta3sb4ObxJ0naAzgC2Mf2fatzre0TbY/YHpk5c2YfRYqIiDXRz9xQF0t6EfBnlG/8V9v+Yx+fvQiYJ2kucBOwP/CG9gmSdgJOAPay/ZvWofOAj7UatV8GHN5HzIiIqKCfuaH+ZtyuZ0nC9pcmu872CkmHUH7xzwBOtr1E0gJg1PZC4GjgscCZzZIZv7S9j+3bJH2UknAAFti+bfVuLSIiporcYzJ9SZ9pbW4C7A5cbHu/mgVbXSMjIx4dHV3l8UGtN7C2XTIHFTv3vG7EHXTsWPdJWmx7pNd5/VRDPax9QtLjgFPXomwREbGO6aeBe7y7gXlTXZCIiJi++mmzOJeHuq1uQBmNfUbNQkVExPTSzziLf2m9XwHcYHt5pfJERMQ01E+bxQVdFCQiIqavfqqh7mLikddjy6tuPuWlioiIaaWfaqh/BX5N6QEl4I3AZraPqlmwiIiYPvrpDfVy28fZvsv2nbaPB/66dsEiImL66CdZrJT0RkkzJG0g6Y30N+tsRESsJ/pJFm8AXgf8V/N6LePmeIqIiPVbP72hrmfcokURETFc+llWdYdmfewrm+1nSPpQ/aJFRMR00U811Oco04P/EcD25ZTpxiMiYkj0kywebfuicftW1ChMRERMT/0ki1slPZmHllXdD/hV1VJFRMS00s+gvIOBE4GnSroJuI4yMC8iIobEpE8WkjYARmzvAcwEnmr7+bZv6OfDJe0l6WpJSyUdNsHxF0q6WNKK5omlfWylpEub18LVuKeIiJhikz5Z2H6gWRr1DNt/WJ0PljQDOBbYE1gOLJK00PZVrdN+CbwF+J8TfMQ9tp+5OjEjIqKOftoszpf0PyVtK2nLsVcf1+0MLLW9zPb9wOmMG69h+/qmd9UDq1/0iIjoSj9tFm9r/jy4tc/A9j2umwXc2NpeDuzSf9HYRNIopefVx22fsxrXRkTEFFplspD0WttnArvbXrYGnz3RMvKrszz8bNs3S9oe+J6kK2xfO66M84H5ALNnz16DIkZERD8mq4Y6vPnzq2v42cuBbVvb2wA393ux7ZubP5cBPwB2muCcE22P2B6ZOXPmGhYzIiJ6mawa6reSvg/Mnag3ku19enz2ImCepLnATZRR331NQChpC+Bu2/dJ2grYDcj6GRERAzJZsngF8CzKokefXN0Ptr2i6Ul1HjADONn2EkkLgFHbCyU9B/gasAXwSkkfsf004M+BEyQ9QHn6+fi4XlQREdEh2ZM3I0iaafuWjsqzxkZGRjw6OrrK45qoBaVPPX5Ek1qbuIOMnXteN+IOOn
as+yQttj3S67yeXWfXhUQRERF19TPOIiIihlySRURE9NRzUF7Tm+ndwJz2+X30hoqIiPVEPyO4zwFOAs4l03JERAylfpLFvbaPqV6SiIiYtvpJFp+WdCTwHeC+sZ22L65WqoiImFb6SRZ/AbwZeCkPVUO52Y6IiCHQT7J4NbB9M814REQMoX66zl4GPL52QSIiYvrq58lia+Dnkhbx8DaLdJ2NiBgS/SSLI6uXIiIiprWeycL2BZK2A+bZ/q6kR1NmkY2IiCHRs81C0tspCyCd0OyaRRmoFxERQ6KfBu6DKYsP3Qlg+xrgiTULFRER00s/yeK+drdZSRuyemtpR0TEOq6fZHGBpA8Cm0raEziTMk9UT5L2knS1pKWSDpvg+AslXSxphaT9xh07UNI1zevAfuJFRHekNX/FuqefZHEYcAtwBfAO4Ju2j+h1kaQZwLHA3sCOwAGSdhx32i+BtwCnjbt2S0ovrF2AnYEjm3W5IyJiAPpJFu+2/Tnbr7W9n+3PSXpvH9ftDCy1vaypxjod2Ld9gu3rbV/OI2ezfTlwvu3bbN8OnA/s1UfMiIiooJ9kMVEV0Fv6uG4WcGNre3mzrx99XStpvqRRSaO33JLVXyMialnlOAtJBwBvAOZKWtg6tBnw2z4+e6KayX4bxvu61vaJwIkAIyMjaXSPiKhkskF5FwK/ArYCPtnafxdweR+fvRzYtrW9DXBzn+VaDrx43LU/6PPaiIiYYqtMFrZvAG4AnruGn70ImNcsy3oTsD/lSaUf5wEfazVqvww4fA3LERERa2myaqi7mLjaSIBtbz7ZB9teIekQyi/+GcDJtpdIWgCM2l4o6TnA14AtgFdK+ojtp9m+TdJHKQkHYIHt21b/9iIiYirIXj+q+kdGRjw6OrrK42vTt3ttfkRr26d8ULFzz+tG3EHGHuQ9x9SRtNj2SK/z+ukNFRERQy7JIiIieupnPYuIiBigQVbtjsmTRURE9JRkERERPSVZRERET0kWERHRU5JFRET0lGQRERE9JVlERERPSRYREdFTkkVERPSUZBERET0lWURERE9JFhER0VOSRURE9FQ1WUjaS9LVkpZKOmyC44+S9JXm+E8lzWn2z5F0j6RLm9dna5YzIiImV22KckkzgGOBPYHlwCJJC21f1TrtIOB220+RtD/wCeD1zbFrbT+zVvkiIqJ/NZ8sdgaW2l5m+37gdGDfcefsC5zSvP8qsLu0tjO3R0TEVKuZLGYBN7a2lzf7JjzH9grgDuAJzbG5ki6RdIGkF0wUQNJ8SaOSRm+55ZapLX1ERDyoZrKY6Alh/HpNqzrnV8Bs2zsBhwKnSdr8ESfaJ9oesT0yc+bMtS5wRERMrGayWA5s29reBrh5VedI2hB4HHCb7fts/xbA9mLgWmCHimWNiIhJ1EwWi4B5kuZK2hjYH1g47pyFwIHN+/2A79m2pJlNAzmStgfmAcsqljUiIiZRrTeU7RWSDgHOA2YAJ9teImkBMGp7IXAScKqkpcBtlIQC8EJggaQVwErgnbZvq1XWiIh+rE33G4+vhF/HyOv6HTRGRkY8Ojq6yuOD+kte275dg4qde1434g4y9jD+4lwXf9a9YktabHuk12dkBHdERPSUZBERET0lWURERE/VGrgjImoZxvaSQcuTRURE9JRkERERPSVZRERET0kWERHRU5JFRET0lGQRERE9JVlERERPSRYREdFTkkVERPSUZBERET0lWURERE9JFhER0VPVZCFpL0lXS1oq6bAJjj9K0lea4z+VNKd17PBm/9WSXl6znBERMblqyaJZQ/tYYG9gR+AASTuOO+0g4HbbTwH+FfhEc+2OlCVWnwbsBRw3tiZ3RER0r+aTxc7AUtvLbN8PnA7sO+6cfYFTmvdfBXaXpGb/6bbvs30dsLT5vIiIGICa61nMAm5sbS8HdlnVObZXSLoDeEKz/yfjrp01PoCk+cD8ZvP3kq5ei/JuBdw60YG1Xf92TeMOMvZ6es
/5WU+T2Lnn7uL2EXu7fgLUTBYTFW/8siOrOqefa7F9InDi6hftkSSN9rNo+VQbVNxBxh62uIOMnXsejthdxK1ZDbUc2La1vQ1w86rOkbQh8Djgtj6vjYiIjtRMFouAeZLmStqY0mC9cNw5C4EDm/f7Ad+z7Wb//k1vqbnAPOCiimWNiIhJVKuGatogDgHOA2YAJ9teImkBMGp7IXAScKqkpZQniv2ba5dIOgO4ClgBHGx7Za2yNqakOmsdijvI2MMWd5Cxc8/DEbt6XDmrl0dERA8ZwR0RET0lWURERE9JFhER0VOSRURE9DS0yULSayVt1rz/kKSzJT2rg7j/S9LmkjaUdJ6k/5L0htpxY3AkbSHpGR3G203SY5r3b5L0KUl9jdKdgtiPmmDfll3EbsXr+uf92n72VYq9naQ9mvebjv1Oq2FokwXwD7bvkvR84OWUOaqO7yDu3rbvBP4b8BvKZIkf6CAuko5qEtVGkv5d0q2S3tRR7Pc2sSXpJEkXS3pZB3EHcs+SftDE3RK4DPiCpE/Vjts4Hrhb0l8Cfw/cAHypo9hnS9pobEPSnwLn1w464J/34X3um1KS3k6ZU++EZtc2wDm14g1zshgbt/EK4HjbXwc27iDu2NiWvwL+zfatTDCVSSUvayWq5cAOwPs7iv22JvbLgJnAW4GPdxB3UPf8uCbua4Av2H42sEcHcQFWNINb9wU+bfvTQLVvnOOcA5wpaUaz5MB5dPCLkwH8vCXtLekzwCxJx7ReX6SMD6vtYGA34E4A29cAT6wVrObcUNPdTZJOoPyD+kTz+NxF8vyWpCspyepgSVsB93UQF2DsG99YorpNlWc3axkL9FeU/8yXqZvgg7rnDZtv1a8DjugiYMtdkg4H3gS8sJnef6Me10wJ259rZmw4B5gDvMP2hR2EHsTP+2ZgFNgHWNzafxfwdx3Ev8/2/WP/npspk6p98RzmZPE6yloZ/2L7d80/tOrfOG2/X9LRwG3NKPd7gVfXjts4V9LPgXuAv5U0E7i3o9iLJX0HmAsc3tStPtBB3EHd8wLKt+of2V4kaXvgmg7iArweeANwkO1fS5oNHF0zoKRD25uUud0uBXaVtKvt2lVCnf+8my88V1KeXk/pecHUu0DSB4FNJe0J/C1wbq1gQzuCW9JBtk8at+/jth+xot8Ux/2c7be3th8DnGN7z5pxW/G2AO60vbKJ/Vjb/9VB3A2AZwLLmuT8BGCW7cs7iN35PUva0vZt4/bNbdZnqUrS3ra/NW7fO21/tmLMIyc7bvsjtWIPmqRvA/s06/Z0GXcDygJyL6Mk6POAz7vSL/VhfrLYT9K9tr8MIOk44BE9OSq4RdJnbL9b0uOBbwBf7CAukk62/bb2LuA0YPcOwv+j7Q+3tn8HHAO8sWbQAd7zuc0v7TubcuwInAE8vXJcgH+QdJ/t7zWxPwC8GKiWLMYng+bJ0bZ/XyvmuHibUH5xPg3YpFWut63yoqlzA/BjSQuBP7Ri136a2pQy597n4MHVSTcF7q4RbJgbuF8DvEXSAZK+BNxv+6DaQW1/ELhf0rGUbwLH2P587biNmyQdDw9+2/4O8H87ij27qUcf6175NbqplhnUPX+MkjAeK+nZwJmUNoQu7AN8TNILJP0zZZXJfboILOnpki4BrgSWSFos6WkdhD4V+BNKz8YLKD2D7uogLpS2i29Qfp9u1rwe20Hcf6ckhzGbAt+tFWzoqqH08D7fm1Ea4n4MfBhgfNXBFMZt/2cV8BHgp8D/a+KOn769CkmfoKwb8mzg47bP6iiugC8DVwAvAb5l+187ij2oe34VpevqZsBrmt4qnZD0RMovjsWUnmid/EeXdCFwhO3vN9svBj5m+3mV415ieydJl9t+RtN99zzbL60ZdxVl2QR4pe0zK8e51PYze+2bsnhDmCyu46HV+Mavymfb21eKe+okh237b2rEbWK/pr0J/ANlfZBvN8HPrhi7PdBxI0qf8B9TpqfH9sWV4g
7knpuulO3/VC8FlgHXN3HfUyNuE/uucbE3pnThdAntzWvFbpXhMtt/2WtfhbgX2d5Z0g8pDb2/Bi6q9f95gvgzKG0HBzR//sj2fpVj/hh499j/oeYJ9v/Yfm6VeMOWLIaRpC9Mctg163Ulfb9H7Crf/AZ1z5IOnOz4gHrNdEbS14CLKdVCUKreRmy/qnLc/w6cBTwD+AKlGujDNRv1m7gvpPQ8ewXly8huwPa2q7QbjIv9HOB0HlpF9E+B19tevOqr1iLeMCYLlakP/mD7Vkm7As8HltquNvqxqaffD7jd9jebroYvBK4F/rlW9VcMl6av/UrblrQtsAvl3/alHcXfglLF+nzKE90PKZ0bbu8ifpckLQd+SRkxf04zI8R1tud2WIaNgD+j/Kx/bvuP1WINW7KQ9GHKUq6mZOU9gB9Q/lNdZvt9leKeTvkLfXTzuobSKPZ8YEfb1RogmzrU1wO3U/phv5+HEtVHm1Hk1Uh6ESVJXi7pda3Yx9muMiBxUPcsaR7wwSbup4DPAS9o4h5ke7RG3Cb224FPAL8HPkq554uBnSi9Zj5RK/YEZdkceKB2b6hx4zseoWaPJEmfBl5FaYc7Dfg6cEVXVV9NGZ5HGfz4YM9W21WmdhnGZHEVpb//oynfCv7E9t3NN7JLbVfp2ijpSttPb+LcZHvr1rHLbVeb+Exlido/Ao8BtqD0VDmXkqieafu/VYx9LKVqYBPgakr1wLeB5wEzbFfpOjuoe5b0I8o8TJtTRvG+r4n7AuCfbO9SI24Tewnl/jYDfgZs1zw9PxpYZLt6ryRJf0G5/7GOJLcCB9q+slK8ycZ32PaCGnFb8UXpsHEAZZaAzSldeL/ZQaI8FXgyZfDj2PRFrtYuZnuoXsDFrfeXrOpY5bgXdxW3+fwrmz83BH497thllWNf1fy5CfBbSoKA8pR1xfp2z5QvHGPvl67qWKXYl7TeX7aqY5XLcCHwktb2i4ELO4i7Wz/7KpdhI0oX5dOAWzuI9zOaL/xdvIZxUN7jm54yAjZv9ZoRpXtlLduozIKp1vuxuLMqxgW4H8BlepGbxx1bOcH5U+neJva9km6wvbLZtqRq9asM7p7bU5jcOcmxGjaVtBOlv//GzXs1r00mvXJ/IfSxAAALvUlEQVTqPMZNt1kA2z9oRs3X9hlg/BIDE+2rxqW9YCGwUNKmvc6fAldSxpb8qoNYQ5ksLgBe2bz/Yev92HYth6/iPZQ67pq2kXQMDyWqY5r9XSSqJzb1ymq9H4s9s2LcQd3zUyVd3sR5cvN+LG7tuuxfUdpJoHQdbdfX/7py7DHLJP0DD+8NVW2KE0nPpVRpzhzXfrE5MKNW3Cb2FUw+cV/tNTW2Aq6SdBGtyUhdqf1z6NoshtEgu3P2qFPGleYMGtQ9q8ciQ7ZvqBF3uhjXGwrKF7CPuFJvqKbzxIuBd/Lw6UzuAs51xYGQg/67bu59orgXVIk3bMlikL0nmvhPAQ7lkT0YulgIaI7t68fte47tRRVj7mr7J7U+f5K4p9p+s6T3uqzn0HX8T9j+QK99lWK/ZoLdd1DaiH5TKeYmwGa2bxm3f2vgDttVZ/qVtF37l3NXo6gHrUlY82x/t+nIMMN2lWlOhnFuqLG5W0aAd1GqJGZRvpns2EH8r1Iapv6JMqp47NWFsyQ9WAXTfDM5uXLM4yWdIKlme9BEnt38R3qbyjKbW7ZfHcSfaBbhvTuIC6U3zucpkzS+kdJ991DKZHdvrhTzGEqPr/H2AKpP62L7BpUFl/ZWmevtekrX6eok7SppkaTfS7pf0kpJ49urasQdv1LeLCqulNdZT4Hp9qJMKLdZa3sz4NsdxK3a86lH7OcAiyiNYn9F6XK3beWYG1C6j/4CeHOH9/oeSlK+jzLdxnWt17KKcd9F6Xd/N3B563Ud8H87uvdzga1b21sDZ1O6s15ZKeZVkxxbUvl+X0ipgrqRMor718CjO/y3Ngo8BbiE0k7yVspA29pxL6VM6dLuBV
eth+EwNnCPmU3TY6ZxP6VqqIpmkBLA1yXNp8y62m6Uqv5NxGVRmPdQEuW9wJ4eV21QIeYDwP9WWfjoP1Wmgn9wbi5Xmq/I9jHAMZKOt/2uGjFW4TTgW8D/Atpro9zl7kbpz/HD1+v4DbCDyyqBtXqgTbb8YLUajHGjqN/vh0ZRV59uo832UkkzXHr7fUFlQsXaslJeR04FLmrmsjFltbqai9ov4eETF7arnkxJXlVIOpeH/yN6NKUO+yRJuOLo8Sb+QZRfnEcAx7r5CtQF2++S9HxKve4XVJax3cyVFiGyfYfKhH5/4cE1Zv+HpG9QpkWHMs3MD5surL+rFPM3kna2fVF7p8r8RTW/kJxFGUX9emClpK/T3Zr2Y+5WWUr2UklHUXqlddFd+AJlpbxuqMyIOlbP+kPbl3QQcyOPm79lon1THHPCXhNjXKn3RBP7Qkr98aG2u+q+2Y5/JKV96s9s7yDpScCZtnerHPfLwOG2f1kzzipii7Jey9j8TD8CzqqZpCXtTFnc6Ys8tB71CPA3wP62f1ox9sBGUTfxtwP+i1Il9HeU8VrH2r62ctxOV8obumTRq3GzdlWBpIttP6vXvorxt6a0XUCZwrlK75hWvD1tny9pN9s/HnfsEfsqxL+UMjfSxbZ3avZVnV6lifE9ys/5Ih5aPc22960ZtxV/a8qiR6aDv+dWzL/lodUAl1CmzK4eu1WGjSgdCfanrI29VQcxH9HjrnYvPJUp0U+x3dWCWkOZLNrrWcBDj6xjdei11rN4ImUK4dOB17Xib075NvDUGnHHleF1wNGUiRNFeap6v+2vdhB7IElSD61zcLHtZzVVMf/ZQbJoP82J8i3/AHczP9PA/p6nE0mb2r6ngzgT/du+ZOzLScW451G6B3ey9vfQtVm4w+mDx3kF8DbKco/HtfbfRXddZ48AnjP2TU/STMpqatV+iahMAb8bAxhh2zhD0gmUaV7eTvk7qL6Mre0LJD2TstbB6yi9oaqurdAyiL/nVY1mHvsSViU5TxJ3TM0JOg+g/P3OVVl/e8xmlHnQarueDtf+HrpkIemptn+uh6/g9iBXWrnN9hcovSReZ/uMGjH6sMG4KoHfUn+szaMoM81uSPlPNOZOSsNrVbb/pWn8u5My7/+HbZ9fK56kHShVIAdQfr5foTzBv6RWzAkM4u+52szF0zQulEkTf0WZduOTrf13UbpL13Zz8xpb+7uqYayGOtH2fE28gptdec1eSUcwwTch2x+rGbeJfTTlm9a/NbteD1zubkYVbzfA3kHtcsygNLh+udLnPwD8B2XtiqXNvmW1qjdXUYaJ/p6vsP33XZUhuiNpM8rvrrpTog9bshg0Se1fzJtQqqeW2H5rR/HbvWR+aPtrleON77b7MLW67TbjWg6mjGpdCJzfbL+fMlV4lYZmSa+mPFk8j7Jux+mUNqlOqz+7/ntuxd2VMtvrn1N6B82grEpZdf3vQcTVI9c8f/AQHax5LunplCEA7bVD/sb2kirxhjlZqMNVpiYpwyaUJRn36ihep71kBtVtt+lvfzvwn8DulAWQNgbe6w6WGG0a0l9FqY56KXAK8DXb36kdu4nf2ZxB4+KOUpLlmTzUdfYpto9YH+MOUtMt/Qg3U8JLejHwMdvPqxJvWJOFul5latXleBwwanteB7EG2kumGbi0Q7N5deWxJVfY/ovm/QzKt67ZXfzCnKAsWwKvBV5fu5qzifd2YD6wpe0nqyz1+lnbu3cQe9T2SLt7sqQLa/0CG3TcQZJ0me2/7LVvqgxdA3fLCGXt606zpaRLeOjRdQalO2319opG571kxjTfek6h9OAQsK2kA23XWkPkwURke6XKFBCdJ4om/m2Uyd5O6HXuFDmY8vT40yb+NU3X7S4MajTzoOIOUqdrhwxzsuh0lamWdg+gFZQlP+9b1clTbBC9ZMZ8kjJI6mp4sNfQvwHPrhTvL/XQzJ+iTIlwJx3VJw9Yp3MGjfNmyr+pQyijmbeljC
ZfX+MO0tsoa4ecTdM2RZnEsIqhSxatBtfN6HCVqdbnX9s0TLUXh7mqZsyWbzcDedq9ZL7ZUeyNxhIFgO1fNKNtq7DdxRiO6eoCdThn0DivakYu30v5RYak9wK11xQZVNyBcVlQqrNq86Frs2jqc7emdG9sexFwk+2TKsc/hPKfd2ze+X0p88gct+qr1jrmUyhTVv94XC+Z24Evu/IcNk0ZTqYk6bFH5jcCG3bVC2yY6OFzBgGcZ7v6QMQm9qBGMw8k7iAMrIfhECaLbwAftH35uP0jwJG2XznxlVMW/3LgeWN9oiU9Friw5vQTg77nJtajKHXpD3bnBI7rsApuvSdpX2Ab28c22xdR1jk38Pc1OzK0RjM/n4d/EdsMWGl7j/Up7iANqofh0FVDUeb6f8ToStujkuZ0EF+0Gl+b95OtBTAVBn3P2L6v6YF2qiuvoTHE/p7SfXTMxpQ2occCX6BuR4ZBjWYe9CjqQbjOA5jNeBiTxSaTHNu0g/inAj+RdFaz/WpKL6GaBnbPKq2sR1IaHtXsWgl8xvaCmrGH0Ma2b2xt/6jpiXVbM+6jmmZ0/g3Ac2vGmS5xB+wc4FkAks6y/dddBB3GNbgXNe0WD6OyQM/iCc6fEpK+KWmO7aOAd1CW3bwHeKftf6kVtzGQe268jzKR4HNsP8H2lsAuwG6S/q5y7GGzRXvD9iGtzZk1A0u6S9KdE7zuUsX1qAcVd8DaNRHdTSMzhG0WW1OWNL2fhy/SsjHwaldaoKcZEPdPlKeIo2oOSJsg9kDuuYl9CWX51lvH7Z8JfGd9bIAcFJUFl35g+3Pj9r8DeLHtAwZTsphK7cb8iRr2q8UdtmQxRtJLaC3SYvt7HcR8DPBhYC9KddQDY8dcaVrhcfEHcc9X2n766h6L1dcMvDuH0hV8bPbkZ1Nm/n2VH74ud6yjmmrcP9CMH6LUUkDlMUTD2GYBQDOfykQzz9b0R8pf8qMovTUemPz0qTWge55sYZZOFm0ZFs2Ay+dJeikwtsjS/+viS0F0Z1BjiIb2yaJrkvYCPkWZAXWB7bt7XLJeaH0LesQhYBPb1QbmRcTUSbLoiKT/oDRmV5k+OCKipiSLiIjoaRi7zkZExGpKsoiIiJ6SLCIioqcki4iI6CnJIiIievr/1jx69dQ3U/0AAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Relative item frequencies: column sums divided by the number of rows\n",
"itemFrequency = count_books.sum(axis=0) / count_books.shape[0]\n",
"\n",
"# Draw the frequencies as a bar chart, one bar per column of count_books\n",
"ax = itemFrequency.plot.bar(color='blue')\n",
"ax.set_ylabel('Item frequency (relative)')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rules 81\n",
" antecedents consequents support confidence lift leverage\n",
"64 (RefBks, YouthBks) (CookBks, ChildBks) 0.05525 0.68000 2.80992 0.03559\n",
"73 (RefBks, DoItYBks) (CookBks, ChildBks) 0.06125 0.66216 2.73621 0.03886\n",
"60 (DoItYBks, YouthBks) (CookBks, ChildBks) 0.06700 0.64891 2.68145 0.04201\n",
"80 (RefBks, GeogBks) (CookBks, ChildBks) 0.05025 0.61468 2.54000 0.03047\n",
"69 (GeogBks, YouthBks) (CookBks, ChildBks) 0.06325 0.60526 2.50109 0.03796\n",
"77 (DoItYBks, GeogBks) (CookBks, ChildBks) 0.06050 0.59901 2.47525 0.03606\n",
"65 (CookBks, ChildBks, GeogBks) (YouthBks) 0.06325 0.57763 2.42445 0.03716\n",
"71 (CookBks, RefBks, ChildBks) (DoItYBks) 0.06125 0.59179 2.32301 0.03488\n",
"47 (DoItYBks, GeogBks) (YouthBks) 0.05450 0.53960 2.26486 0.03044\n",
"61 (CookBks, RefBks, ChildBks) (YouthBks) 0.05525 0.53382 2.24057 0.03059\n",
"56 (CookBks, DoItYBks, ChildBks) (YouthBks) 0.06700 0.52446 2.20131 0.03656\n",
"58 (CookBks, ChildBks, YouthBks) (DoItYBks) 0.06700 0.55833 2.19169 0.03643\n",
"34 (RefBks, ChildBks) (DoItYBks) 0.07100 0.55361 2.17314 0.03833\n",
"75 (CookBks, ChildBks, GeogBks) (DoItYBks) 0.06050 0.55251 2.16884 0.03260\n",
"19 (ChildBks, GeogBks) (YouthBks) 0.07550 0.51624 2.16680 0.04066\n",
"45 (CookBks, GeogBks) (YouthBks) 0.08025 0.51360 2.15572 0.04302\n",
"63 (RefBks, ChildBks, YouthBks) (CookBks) 0.05525 0.89113 2.14471 0.02949\n",
"17 (ChildBks, YouthBks) (DoItYBks) 0.08025 0.54407 2.13569 0.04267\n",
"50 (CookBks, RefBks) (DoItYBks) 0.07450 0.53309 2.09262 0.03890\n",
"28 (RefBks) (CookBks, ChildBks) 0.10350 0.50549 2.08882 0.05395\n",
"70 (CookBks, DoItYBks, RefBks) (ChildBks) 0.06125 0.82215 2.08667 0.03190\n",
"15 (YouthBks) (CookBks, ChildBks) 0.12000 0.50367 2.08129 0.06234\n",
"72 (RefBks, DoItYBks, ChildBks) (CookBks) 0.06125 0.86268 2.07624 0.03175\n",
"23 (CookBks, ChildBks) (DoItYBks) 0.12775 0.52789 2.07220 0.06610\n",
"25 (DoItYBks) (CookBks, ChildBks) 0.12775 0.50147 2.07220 0.06610\n"
]
}
],
"source": [
"# create frequent itemsets and rules\n",
"itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)\n",
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
"\n",
"print('Number of rules', len(rules))\n",
"\n",
"# Select the 25 rules with the highest lift, leaving out the columns not shown here\n",
"top_rules = (\n",
"    rules.sort_values(by=['lift'], ascending=False)\n",
"    .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
"    .head(25)\n",
")\n",
"\n",
"# Use the full option key: the bare 'precision' pattern is ambiguous in pandas >= 1.4\n",
"# (it also matches 'styler.format.precision') and raises OptionError there.\n",
"pd.set_option('display.precision', 5)\n",
"pd.set_option('display.width', 100)\n",
"print(top_rules)\n",
"# restore the precision used by the following cells\n",
"pd.set_option('display.precision', 6)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>antecedents</th>\n",
" <th>consequents</th>\n",
" <th>antecedent support</th>\n",
" <th>consequent support</th>\n",
" <th>support</th>\n",
" <th>confidence</th>\n",
" <th>lift</th>\n",
" <th>leverage</th>\n",
" <th>conviction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>(DoItYBks, GeogBks)</td>\n",
" <td>(YouthBks)</td>\n",
" <td>0.10100</td>\n",
" <td>0.23825</td>\n",
" <td>0.05450</td>\n",
" <td>0.539604</td>\n",
" <td>2.264864</td>\n",
" <td>0.030437</td>\n",
" <td>1.654554</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>(RefBks, ChildBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.12825</td>\n",
" <td>0.25475</td>\n",
" <td>0.07100</td>\n",
" <td>0.553606</td>\n",
" <td>2.173135</td>\n",
" <td>0.038328</td>\n",
" <td>1.669490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>(ChildBks, GeogBks)</td>\n",
" <td>(YouthBks)</td>\n",
" <td>0.14625</td>\n",
" <td>0.23825</td>\n",
" <td>0.07550</td>\n",
" <td>0.516239</td>\n",
" <td>2.166797</td>\n",
" <td>0.040656</td>\n",
" <td>1.574642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>(CookBks, GeogBks)</td>\n",
" <td>(YouthBks)</td>\n",
" <td>0.15625</td>\n",
" <td>0.23825</td>\n",
" <td>0.08025</td>\n",
" <td>0.513600</td>\n",
" <td>2.155719</td>\n",
" <td>0.043023</td>\n",
" <td>1.566098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>(ChildBks, YouthBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.14750</td>\n",
" <td>0.25475</td>\n",
" <td>0.08025</td>\n",
" <td>0.544068</td>\n",
" <td>2.135693</td>\n",
" <td>0.042674</td>\n",
" <td>1.634563</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50</th>\n",
" <td>(CookBks, RefBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.13975</td>\n",
" <td>0.25475</td>\n",
" <td>0.07450</td>\n",
" <td>0.533095</td>\n",
" <td>2.092619</td>\n",
" <td>0.038899</td>\n",
" <td>1.596148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>(CookBks, ChildBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.24200</td>\n",
" <td>0.25475</td>\n",
" <td>0.12775</td>\n",
" <td>0.527893</td>\n",
" <td>2.072198</td>\n",
" <td>0.066101</td>\n",
" <td>1.578560</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>(GeogBks, YouthBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.10450</td>\n",
" <td>0.25475</td>\n",
" <td>0.05450</td>\n",
" <td>0.521531</td>\n",
" <td>2.047227</td>\n",
" <td>0.027879</td>\n",
" <td>1.557573</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>(CookBks, YouthBks)</td>\n",
" <td>(DoItYBks)</td>\n",
" <td>0.16100</td>\n",
" <td>0.25475</td>\n",
" <td>0.08375</td>\n",
" <td>0.520186</td>\n",
" <td>2.041948</td>\n",
" <td>0.042735</td>\n",
" <td>1.553207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>(RefBks, YouthBks)</td>\n",
" <td>(CookBks)</td>\n",
" <td>0.08125</td>\n",
" <td>0.41550</td>\n",
" <td>0.06825</td>\n",
" <td>0.840000</td>\n",
" <td>2.021661</td>\n",
" <td>0.034491</td>\n",
" <td>3.653125</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" antecedents consequents antecedent support consequent support support confidence \\\n",
"47 (DoItYBks, GeogBks) (YouthBks) 0.10100 0.23825 0.05450 0.539604 \n",
"34 (RefBks, ChildBks) (DoItYBks) 0.12825 0.25475 0.07100 0.553606 \n",
"19 (ChildBks, GeogBks) (YouthBks) 0.14625 0.23825 0.07550 0.516239 \n",
"45 (CookBks, GeogBks) (YouthBks) 0.15625 0.23825 0.08025 0.513600 \n",
"17 (ChildBks, YouthBks) (DoItYBks) 0.14750 0.25475 0.08025 0.544068 \n",
"50 (CookBks, RefBks) (DoItYBks) 0.13975 0.25475 0.07450 0.533095 \n",
"23 (CookBks, ChildBks) (DoItYBks) 0.24200 0.25475 0.12775 0.527893 \n",
"49 (GeogBks, YouthBks) (DoItYBks) 0.10450 0.25475 0.05450 0.521531 \n",
"41 (CookBks, YouthBks) (DoItYBks) 0.16100 0.25475 0.08375 0.520186 \n",
"43 (RefBks, YouthBks) (CookBks) 0.08125 0.41550 0.06825 0.840000 \n",
"\n",
" lift leverage conviction \n",
"47 2.264864 0.030437 1.654554 \n",
"34 2.173135 0.038328 1.669490 \n",
"19 2.166797 0.040656 1.574642 \n",
"45 2.155719 0.043023 1.566098 \n",
"17 2.135693 0.042674 1.634563 \n",
"50 2.092619 0.038899 1.596148 \n",
"23 2.072198 0.066101 1.578560 \n",
"49 2.047227 0.027879 1.557573 \n",
"41 2.041948 0.042735 1.553207 \n",
"43 2.021661 0.034491 3.653125 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Filter rules by number of antecedents (maximum 2) and consequents (maximum 1)\n",
"# Boolean Series masks are used instead of plain Python lists: with no rules at all,\n",
"# rules[[]] would be read as an empty column selection and drop every column.\n",
"few_antecedents = rules['antecedents'].map(len) <= 2\n",
"single_consequent = rules['consequents'].map(len) == 1\n",
"rules = rules[few_antecedents & single_consequent]\n",
"\n",
"rules.sort_values(by=['lift'], ascending=False).head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Section 14.2 Collaborative Filtering"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"user: 823519 item: 30 r_ui = 4.00 est = 3.54 {'was_impossible': True, 'reason': 'User and/or item is unkown.'}\n"
]
}
],
"source": [
"# Small ratings table: one [customerID, movieID, rating] triple per observation\n",
"ratings = pd.DataFrame([\n",
"    [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5],\n",
"    [124105, 1, 4],\n",
"    [822109, 1, 5],\n",
"    [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5],\n",
"    [885013, 1, 4], [885013, 5, 5],\n",
"    [893988, 1, 3], [893988, 30, 4], [893988, 44, 4],\n",
"    [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3],\n",
"    [1503895, 1, 4],\n",
"    [1842128, 1, 4], [1842128, 30, 3],\n",
"    [2238063, 1, 3],\n",
"], columns=['customerID', 'movieID', 'rating'])\n",
"\n",
"reader = Reader(rating_scale=(1, 5))\n",
"data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)\n",
"trainset = data.build_full_trainset()\n",
"sim_options = {'name': 'cosine', 'user_based': False}  # compute cosine similarities between items\n",
"algo = KNNBasic(sim_options=sim_options)\n",
"algo.fit(trainset)\n",
"\n",
"# The ids in `ratings` are integers, so the ids passed to predict() must be integers too.\n",
"# With str(823519) / str(30) the lookup matched nothing and the prediction came back\n",
"# flagged 'was_impossible' with a fallback estimate instead of a neighbour-based one.\n",
"pred = algo.predict(823519, 30, r_ui=4, verbose=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table 14.11"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"\n",
"Top-3 recommended items for each user\n",
"User 6\n",
" Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)\n",
"User 222\n",
" Item 77 (3.50) Item 75 (2.78)\n",
"User 424\n",
" Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)\n",
"User 87\n",
" Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)\n",
"User 121\n",
" Item 98 (3.48) Item 32 (2.83)\n",
"\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"\n",
"Top-3 recommended items for each user\n",
"User 6\n",
" Item 77 (3.00) Item 60 (3.00) Item 6 (3.00)\n",
"User 222\n",
" Item 77 (2.24) Item 75 (2.00)\n",
"User 424\n",
" Item 54 (3.47) Item 14 (3.44) Item 45 (3.00)\n",
"User 87\n",
" Item 27 (3.00) Item 32 (3.00) Item 82 (3.00) Item 54 (2.50)\n",
"User 121\n",
" Item 32 (3.06) Item 98 (2.31)\n"
]
}
],
"source": [
"import random\n",
"\n",
"# Fixed seed so the simulated ratings are identical on every run\n",
"random.seed(0)\n",
"nratings = 5000\n",
"\n",
"# Inclusive value range per column. The columns are drawn in this order from the\n",
"# same seeded stream, so the order of the entries must not change.\n",
"column_ranges = {'itemID': (0, 99), 'userID': (0, 999), 'rating': (1, 5)}\n",
"randomData = pd.DataFrame({\n",
"    column: [random.randint(low, high) for _ in range(nratings)]\n",
"    for column, (low, high) in column_ranges.items()\n",
"})\n",
"\n",
"def get_top_n(predictions, n=10):\n",
"    \"\"\"Group predictions by user and keep each user's n highest estimates.\n",
"\n",
"    predictions: iterable of objects exposing `uid` and `est` attributes.\n",
"    n: maximum number of predictions kept per user.\n",
"    Returns a defaultdict(list) mapping each uid to its predictions, highest `est` first.\n",
"    \"\"\"\n",
"    # First map the predictions to each user.\n",
"    byUser = defaultdict(list)\n",
"    for p in predictions:\n",
"        byUser[p.uid].append(p)\n",
"\n",
"    # For each user, reduce predictions to top-n\n",
"    # (only existing keys are reassigned, so iterating over items() stays valid)\n",
"    for uid, userPredictions in byUser.items():\n",
"        byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)\n",
"    return byUser\n",
"\n",
"# Convert the data set into the format required by the surprise package\n",
"# The columns must correspond to user id, item id and ratings (in that order)\n",
"reader = Reader(rating_scale=(1, 5))\n",
"data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)\n",
"\n",
"# Split into training and test set\n",
"trainset, testset = train_test_split(data, test_size=.25, random_state=1)\n",
"\n",
"\n",
"def _report_top_n(trainset, testset, user_based, n=4, n_users=5):\n",
"    \"\"\"Fit a cosine-similarity KNNBasic model and print the top-n test predictions.\n",
"\n",
"    trainset, testset: the split produced by train_test_split above.\n",
"    user_based: True compares users with each other, False compares items.\n",
"    n: number of predictions kept per user (also shown in the printed header).\n",
"    n_users: how many users are printed.\n",
"    Returns the fitted model, all test predictions and the per-user top-n mapping.\n",
"    \"\"\"\n",
"    sim_options = {'name': 'cosine', 'user_based': user_based}\n",
"    algo = KNNBasic(sim_options=sim_options)\n",
"    algo.fit(trainset)\n",
"\n",
"    # Then predict ratings for the (u, i) pairs of the test set, which were held out\n",
"    # of the training set.\n",
"    predictions = algo.test(testset)\n",
"    top_n = get_top_n(predictions, n=n)\n",
"\n",
"    # Print the recommended items for the first n_users users. The header is built\n",
"    # from n so that it cannot disagree with the number of items requested.\n",
"    print()\n",
"    print('Top-{} recommended items for each user'.format(n))\n",
"    for uid, user_ratings in list(top_n.items())[:n_users]:\n",
"        print('User {}'.format(uid))\n",
"        for prediction in user_ratings:\n",
"            print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n",
"        print()\n",
"    return algo, predictions, top_n\n",
"\n",
"\n",
"## User-based filtering\n",
"# compute cosine similarity between users\n",
"algo, predictions, top_n = _report_top_n(trainset, testset, user_based=True)\n",
"print()\n",
"\n",
"## Item-based filtering\n",
"# compute cosine similarity between items\n",
"algo, predictions, top_n = _report_top_n(trainset, testset, user_based=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"data": {
"text/plain": [
"Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Build a model using the full dataset\n",
"trainset = data.build_full_trainset()\n",
"sim_options = {'name': 'cosine', 'user_based': False}\n",
"algo = KNNBasic(sim_options=sim_options)\n",
"algo.fit(trainset)\n",
"\n",
"# Predict rating for user 383 and item 7\n",
"algo.predict(383, 7)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}