1463 lines
64 KiB
Plaintext
1463 lines
64 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Chapter 14: Association Rules and Collaborative Filtering\n",
|
|
"\n",
|
|
"> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n",
|
|
">\n",
|
|
"> Code included in\n",
|
|
">\n",
|
|
"> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n",
|
|
"> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n",
|
|
"\n",
|
|
"## Import required packages"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Collecting package metadata (current_repodata.json): ...working... done\n",
|
|
"Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.\n",
|
|
"Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.\n",
|
|
"Collecting package metadata (repodata.json): ...working... done\n",
|
|
"Solving environment: ...working... done\n",
|
|
"\n",
|
|
"## Package Plan ##\n",
|
|
"\n",
|
|
" environment location: C:\\Users\\isa1761\\Anaconda3\n",
|
|
"\n",
|
|
" added / updated specs:\n",
|
|
" - scikit-surprise\n",
|
|
"\n",
|
|
"\n",
|
|
"The following packages will be downloaded:\n",
|
|
"\n",
|
|
" package | build\n",
|
|
" ---------------------------|-----------------\n",
|
|
" conda-4.12.0 | py39hcbf5309_0 1.0 MB conda-forge\n",
|
|
" scikit-surprise-1.1.1 | py39h5d4886f_2 538 KB conda-forge\n",
|
|
" ------------------------------------------------------------\n",
|
|
" Total: 1.5 MB\n",
|
|
"\n",
|
|
"The following NEW packages will be INSTALLED:\n",
|
|
"\n",
|
|
" scikit-surprise conda-forge/win-64::scikit-surprise-1.1.1-py39h5d4886f_2\n",
|
|
"\n",
|
|
"The following packages will be UPDATED:\n",
|
|
"\n",
|
|
" conda 4.11.0-py39hcbf5309_0 --> 4.12.0-py39hcbf5309_0\n",
|
|
"\n",
|
|
"\n",
|
|
"\n",
|
|
"Downloading and Extracting Packages\n",
|
|
"\n",
|
|
"scikit-surprise-1.1. | 538 KB | | 0% \n",
|
|
"scikit-surprise-1.1. | 538 KB | 2 | 3% \n",
|
|
"scikit-surprise-1.1. | 538 KB | ########## | 100% \n",
|
|
"scikit-surprise-1.1. | 538 KB | ########## | 100% \n",
|
|
"\n",
|
|
"conda-4.12.0 | 1.0 MB | | 0% \n",
|
|
"conda-4.12.0 | 1.0 MB | 1 | 2% \n",
|
|
"conda-4.12.0 | 1.0 MB | ########## | 100% \n",
|
|
"conda-4.12.0 | 1.0 MB | ########## | 100% \n",
|
|
"Preparing transaction: ...working... done\n",
|
|
"Verifying transaction: ...working... done\n",
|
|
"Executing transaction: ...working... done\n",
|
|
"\n",
|
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# One-time setup: install scikit-surprise with conda from the conda-forge channel.\n",
|
|
"# The explicit %conda magic keeps working when IPython automagic is switched off.\n",
|
|
"%conda install --yes --channel conda-forge scikit-surprise"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"import heapq\n",
|
|
"from collections import defaultdict\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pylab as plt\n",
|
|
"from mlxtend.frequent_patterns import apriori\n",
|
|
"from mlxtend.frequent_patterns import association_rules\n",
|
|
"\n",
|
|
"from surprise import Dataset, Reader, KNNBasic\n",
|
|
"from surprise.model_selection import train_test_split\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.4"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Red</th>\n",
|
|
" <th>White</th>\n",
|
|
" <th>Blue</th>\n",
|
|
" <th>Orange</th>\n",
|
|
" <th>Green</th>\n",
|
|
" <th>Yellow</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Transaction</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Red White Blue Orange Green Yellow\n",
|
|
"Transaction \n",
|
|
"1 1 1 0 0 1 0\n",
|
|
"2 0 1 0 1 0 0\n",
|
|
"3 0 1 1 0 0 0\n",
|
|
"4 1 1 0 1 0 0\n",
|
|
"5 1 0 1 0 0 0\n",
|
|
"6 0 1 1 0 0 0\n",
|
|
"7 1 0 1 0 0 0\n",
|
|
"8 1 1 1 0 1 0\n",
|
|
"9 1 1 1 0 0 0\n",
|
|
"10 0 0 0 0 0 1"
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Read the Faceplate transactions and use the Transaction column as the row index\n",
|
|
"fp_df = pd.read_csv('Faceplate.csv').set_index('Transaction')\n",
|
|
"fp_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" antecedents consequents support confidence lift leverage\n",
|
|
"12 (White, Red) (Green) 0.2 0.5 2.500000 0.12\n",
|
|
"15 (Green) (White, Red) 0.2 1.0 2.500000 0.12\n",
|
|
"4 (Green) (Red) 0.2 1.0 1.666667 0.08\n",
|
|
"13 (White, Green) (Red) 0.2 1.0 1.666667 0.08\n",
|
|
"7 (Orange) (White) 0.2 1.0 1.428571 0.06\n",
|
|
"8 (Green) (White) 0.2 1.0 1.428571 0.06\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create frequent itemsets (minimum support 0.2)\n",
|
|
"itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)\n",
|
|
"\n",
|
|
"# and convert into rules, keeping those with a confidence of at least 0.5\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"\n",
|
|
"# print the six rules with the highest lift, without the auxiliary columns\n",
|
|
"print(rules.sort_values(by=['lift'], ascending=False)\n",
|
|
"           .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
|
|
"           .head(6))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.5</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>1.6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>(White, Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>(Orange)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>14</th>\n",
|
|
" <td>(Red, Green)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support \\\n",
|
|
"12 (White, Red) (Green) 0.4 0.2 \n",
|
|
"4 (Green) (Red) 0.2 0.6 \n",
|
|
"13 (White, Green) (Red) 0.2 0.6 \n",
|
|
"7 (Orange) (White) 0.2 0.7 \n",
|
|
"8 (Green) (White) 0.2 0.7 \n",
|
|
"14 (Red, Green) (White) 0.2 0.7 \n",
|
|
"\n",
|
|
" support confidence lift leverage conviction \n",
|
|
"12 0.2 0.5 2.500000 0.12 1.6 \n",
|
|
"4 0.2 1.0 1.666667 0.08 inf \n",
|
|
"13 0.2 1.0 1.666667 0.08 inf \n",
|
|
"7 0.2 1.0 1.428571 0.06 inf \n",
|
|
"8 0.2 1.0 1.428571 0.06 inf \n",
|
|
"14 0.2 1.0 1.428571 0.06 inf "
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# keep only the rules whose consequent consists of exactly one item\n",
|
|
"single_consequent = [len(consequent) == 1 for consequent in rules.consequents]\n",
|
|
"rules_by_lift = rules[single_consequent].sort_values(by=['lift'], ascending=False)\n",
|
|
"rules_by_lift.head(6)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The apriori method accepts sparse data frames as well. Converting the original data frame to a sparse format stores only the non-zero entries, which here are 40% of the cells (the density printed below), so the memory requirements go down to roughly 40%. The `fill_value` argument of the conversion specifies the value that marks an item as absent from a transaction; entries equal to it are not stored."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Density 0.4\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.5</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>1.6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>15</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>(White, Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>(Orange)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support \\\n",
|
|
"12 (White, Red) (Green) 0.4 0.2 \n",
|
|
"15 (Green) (White, Red) 0.2 0.4 \n",
|
|
"4 (Green) (Red) 0.2 0.6 \n",
|
|
"13 (White, Green) (Red) 0.2 0.6 \n",
|
|
"7 (Orange) (White) 0.2 0.7 \n",
|
|
"8 (Green) (White) 0.2 0.7 \n",
|
|
"\n",
|
|
" support confidence lift leverage conviction \n",
|
|
"12 0.2 0.5 2.500000 0.12 1.6 \n",
|
|
"15 0.2 1.0 2.500000 0.12 inf \n",
|
|
"4 0.2 1.0 1.666667 0.08 inf \n",
|
|
"13 0.2 1.0 1.666667 0.08 inf \n",
|
|
"7 0.2 1.0 1.428571 0.06 inf \n",
|
|
"8 0.2 1.0 1.428571 0.06 inf "
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Convert data set into a sparse data frame. DataFrame.to_sparse() was removed in\n",
|
|
"# pandas 1.0; casting the columns to a SparseDtype is the supported replacement.\n",
|
|
"# fill_value=0 marks the zero entries as the ones that are not stored.\n",
|
|
"sparse_df = fp_df.astype(pd.SparseDtype('int64', fill_value=0))\n",
|
|
"\n",
|
|
"# the density (share of stored entries) is exposed through the .sparse accessor\n",
|
|
"print('Density {}'.format(sparse_df.sparse.density))\n",
|
|
"\n",
|
|
"# create frequent itemsets\n",
|
|
"itemsets = apriori(sparse_df, min_support=0.2, use_colnames=True)\n",
|
|
"\n",
|
|
"# and convert into rules\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(6)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Data required for Table 14.5 and 14.6"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>1</th>\n",
|
|
" <th>2</th>\n",
|
|
" <th>3</th>\n",
|
|
" <th>4</th>\n",
|
|
" <th>5</th>\n",
|
|
" <th>6</th>\n",
|
|
" <th>7</th>\n",
|
|
" <th>8</th>\n",
|
|
" <th>9</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" 1 2 3 4 5 6 7 8 9\n",
|
|
"0 0 0 0 0 0 0 0 1 0\n",
|
|
"1 0 0 1 1 0 0 0 1 0\n",
|
|
"2 0 0 0 0 0 0 0 1 0\n",
|
|
"3 0 0 1 0 0 0 0 0 1\n",
|
|
"4 0 0 0 0 0 0 0 0 1"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Prepare the dataset for table 14.6 based on table 14.5\n",
|
|
"from itertools import chain\n",
|
|
"randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(), \n",
|
|
"    {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},\n",
|
|
"    {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},\n",
|
|
"    {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},\n",
|
|
"    {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]\n",
|
|
"print(randomTransactions)\n",
|
|
"\n",
|
|
"# every item that occurs in at least one transaction, in ascending order\n",
|
|
"uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))\n",
|
|
"\n",
|
|
"# Binary incidence matrix: one row per transaction, one column per item.\n",
|
|
"# It is built in a single step instead of writing cells through chained indexing\n",
|
|
"# (randomData.loc[row][item] = 1), which assigns to a temporary object and leaves\n",
|
|
"# the frame unchanged when pandas Copy-on-Write is active.\n",
|
|
"randomData = pd.DataFrame(\n",
|
|
"    [[int(item in transaction) for item in uniqueItems] for transaction in randomTransactions],\n",
|
|
"    index=range(len(randomTransactions)),\n",
|
|
"    columns=uniqueItems,\n",
|
|
")\n",
|
|
"randomData.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.6"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" antecedents consequents support confidence lift leverage\n",
|
|
"3 (8, 3) (4) 0.04 1.0 4.545455 0.0312\n",
|
|
"1 (1, 5) (8) 0.04 1.0 1.851852 0.0184\n",
|
|
"2 (2, 7) (9) 0.04 1.0 1.851852 0.0184\n",
|
|
"4 (3, 4) (8) 0.04 1.0 1.851852 0.0184\n",
|
|
"5 (3, 7) (9) 0.04 1.0 1.851852 0.0184\n",
|
|
"6 (4, 5) (9) 0.04 1.0 1.851852 0.0184\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# frequent itemsets: an itemset must occur in at least 2 of the transactions\n",
|
|
"min_support = 2 / len(randomData)\n",
|
|
"itemsets = apriori(randomData, min_support=min_support, use_colnames=True)\n",
|
|
"\n",
|
|
"# derive the rules whose confidence is at least 0.7\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.7)\n",
|
|
"\n",
|
|
"# print the six rules with the highest lift, without the auxiliary columns\n",
|
|
"hidden_columns = ['antecedent support', 'consequent support', 'conviction']\n",
|
|
"rules_by_lift = rules.sort_values(by=['lift'], ascending=False)\n",
|
|
"print(rules_by_lift.drop(columns=hidden_columns).head(6))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.8"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>ChildBks</th>\n",
|
|
" <th>YouthBks</th>\n",
|
|
" <th>CookBks</th>\n",
|
|
" <th>DoItYBks</th>\n",
|
|
" <th>RefBks</th>\n",
|
|
" <th>ArtBks</th>\n",
|
|
" <th>GeogBks</th>\n",
|
|
" <th>ItalCook</th>\n",
|
|
" <th>ItalAtlas</th>\n",
|
|
" <th>ItalArt</th>\n",
|
|
" <th>Florence</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks ItalCook \\\n",
|
|
"0 0 1 1 0 0 0 0 0 \n",
|
|
"1 0 0 0 0 0 0 0 0 \n",
|
|
"2 1 1 1 0 1 0 1 1 \n",
|
|
"3 0 0 0 0 0 0 0 0 \n",
|
|
"4 0 0 0 0 0 0 0 0 \n",
|
|
"\n",
|
|
" ItalAtlas ItalArt Florence \n",
|
|
"0 0 0 0 \n",
|
|
"1 0 0 0 \n",
|
|
"2 0 0 0 \n",
|
|
"3 0 0 0 \n",
|
|
"4 0 0 0 "
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# read the Charles Book Club data\n",
|
|
"all_books_df = pd.read_csv('CharlesBookClub.csv')\n",
|
|
"\n",
|
|
"# columns that are excluded from the incidence matrix\n",
|
|
"ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',\n",
|
|
"          'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']\n",
|
|
"\n",
|
|
"# binary incidence matrix: every value greater than zero becomes 1\n",
|
|
"purchase_counts = all_books_df.drop(columns=ignore)\n",
|
|
"count_books = purchase_counts.mask(purchase_counts > 0, 1)\n",
|
|
"\n",
|
|
"count_books.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 432x288 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"needs_background": "light"
|
|
},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# relative item frequencies: column sums divided by the number of rows\n",
|
|
"itemFrequency = count_books.sum(axis=0) / len(count_books)\n",
|
|
"\n",
|
|
"# draw the frequencies as a bar chart and label the y axis on the returned axes\n",
|
|
"ax = itemFrequency.plot.bar(color='blue')\n",
|
|
"ax.set_ylabel('Item frequency (relative)')\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of rules 81\n",
|
|
" antecedents consequents support confidence lift leverage\n",
|
|
"64 (RefBks, YouthBks) (CookBks, ChildBks) 0.05525 0.68000 2.80992 0.03559\n",
|
|
"73 (RefBks, DoItYBks) (CookBks, ChildBks) 0.06125 0.66216 2.73621 0.03886\n",
|
|
"60 (DoItYBks, YouthBks) (CookBks, ChildBks) 0.06700 0.64891 2.68145 0.04201\n",
|
|
"80 (RefBks, GeogBks) (CookBks, ChildBks) 0.05025 0.61468 2.54000 0.03047\n",
|
|
"69 (GeogBks, YouthBks) (CookBks, ChildBks) 0.06325 0.60526 2.50109 0.03796\n",
|
|
"77 (DoItYBks, GeogBks) (CookBks, ChildBks) 0.06050 0.59901 2.47525 0.03606\n",
|
|
"65 (CookBks, ChildBks, GeogBks) (YouthBks) 0.06325 0.57763 2.42445 0.03716\n",
|
|
"71 (CookBks, RefBks, ChildBks) (DoItYBks) 0.06125 0.59179 2.32301 0.03488\n",
|
|
"47 (DoItYBks, GeogBks) (YouthBks) 0.05450 0.53960 2.26486 0.03044\n",
|
|
"61 (CookBks, RefBks, ChildBks) (YouthBks) 0.05525 0.53382 2.24057 0.03059\n",
|
|
"56 (CookBks, DoItYBks, ChildBks) (YouthBks) 0.06700 0.52446 2.20131 0.03656\n",
|
|
"58 (CookBks, ChildBks, YouthBks) (DoItYBks) 0.06700 0.55833 2.19169 0.03643\n",
|
|
"34 (RefBks, ChildBks) (DoItYBks) 0.07100 0.55361 2.17314 0.03833\n",
|
|
"75 (CookBks, ChildBks, GeogBks) (DoItYBks) 0.06050 0.55251 2.16884 0.03260\n",
|
|
"19 (ChildBks, GeogBks) (YouthBks) 0.07550 0.51624 2.16680 0.04066\n",
|
|
"45 (CookBks, GeogBks) (YouthBks) 0.08025 0.51360 2.15572 0.04302\n",
|
|
"63 (RefBks, ChildBks, YouthBks) (CookBks) 0.05525 0.89113 2.14471 0.02949\n",
|
|
"17 (ChildBks, YouthBks) (DoItYBks) 0.08025 0.54407 2.13569 0.04267\n",
|
|
"50 (CookBks, RefBks) (DoItYBks) 0.07450 0.53309 2.09262 0.03890\n",
|
|
"28 (RefBks) (CookBks, ChildBks) 0.10350 0.50549 2.08882 0.05395\n",
|
|
"70 (CookBks, DoItYBks, RefBks) (ChildBks) 0.06125 0.82215 2.08667 0.03190\n",
|
|
"15 (YouthBks) (CookBks, ChildBks) 0.12000 0.50367 2.08129 0.06234\n",
|
|
"72 (RefBks, DoItYBks, ChildBks) (CookBks) 0.06125 0.86268 2.07624 0.03175\n",
|
|
"23 (CookBks, ChildBks) (DoItYBks) 0.12775 0.52789 2.07220 0.06610\n",
|
|
"25 (DoItYBks) (CookBks, ChildBks) 0.12775 0.50147 2.07220 0.06610\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create frequent itemsets and rules\n",
|
|
"itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"\n",
|
|
"print('Number of rules', len(rules))\n",
|
|
"\n",
|
|
"# Display 25 rules with highest lift.\n",
|
|
"# The fully qualified option name is required: the short form 'precision' also matches\n",
|
|
"# 'styler.format.precision' in current pandas and is rejected as ambiguous.\n",
|
|
"# option_context restores the previous precision once the table has been printed;\n",
|
|
"# the display width stays at 100 for the following cells, as before.\n",
|
|
"pd.set_option('display.width', 100)\n",
|
|
"with pd.option_context('display.precision', 5):\n",
|
|
"    print(rules.sort_values(by=['lift'], ascending=False)\n",
|
|
"               .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
|
|
"               .head(25))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>47</th>\n",
|
|
" <td>(DoItYBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.10100</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.05450</td>\n",
|
|
" <td>0.539604</td>\n",
|
|
" <td>2.264864</td>\n",
|
|
" <td>0.030437</td>\n",
|
|
" <td>1.654554</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>34</th>\n",
|
|
" <td>(RefBks, ChildBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.12825</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.07100</td>\n",
|
|
" <td>0.553606</td>\n",
|
|
" <td>2.173135</td>\n",
|
|
" <td>0.038328</td>\n",
|
|
" <td>1.669490</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>19</th>\n",
|
|
" <td>(ChildBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.14625</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.07550</td>\n",
|
|
" <td>0.516239</td>\n",
|
|
" <td>2.166797</td>\n",
|
|
" <td>0.040656</td>\n",
|
|
" <td>1.574642</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>45</th>\n",
|
|
" <td>(CookBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.15625</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.08025</td>\n",
|
|
" <td>0.513600</td>\n",
|
|
" <td>2.155719</td>\n",
|
|
" <td>0.043023</td>\n",
|
|
" <td>1.566098</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>17</th>\n",
|
|
" <td>(ChildBks, YouthBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.14750</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.08025</td>\n",
|
|
" <td>0.544068</td>\n",
|
|
" <td>2.135693</td>\n",
|
|
" <td>0.042674</td>\n",
|
|
" <td>1.634563</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50</th>\n",
|
|
" <td>(CookBks, RefBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.13975</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.07450</td>\n",
|
|
" <td>0.533095</td>\n",
|
|
" <td>2.092619</td>\n",
|
|
" <td>0.038899</td>\n",
|
|
" <td>1.596148</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>23</th>\n",
|
|
" <td>(CookBks, ChildBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.24200</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.12775</td>\n",
|
|
" <td>0.527893</td>\n",
|
|
" <td>2.072198</td>\n",
|
|
" <td>0.066101</td>\n",
|
|
" <td>1.578560</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>49</th>\n",
|
|
" <td>(GeogBks, YouthBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.10450</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.05450</td>\n",
|
|
" <td>0.521531</td>\n",
|
|
" <td>2.047227</td>\n",
|
|
" <td>0.027879</td>\n",
|
|
" <td>1.557573</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>41</th>\n",
|
|
" <td>(CookBks, YouthBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.16100</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.08375</td>\n",
|
|
" <td>0.520186</td>\n",
|
|
" <td>2.041948</td>\n",
|
|
" <td>0.042735</td>\n",
|
|
" <td>1.553207</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>43</th>\n",
|
|
" <td>(RefBks, YouthBks)</td>\n",
|
|
" <td>(CookBks)</td>\n",
|
|
" <td>0.08125</td>\n",
|
|
" <td>0.41550</td>\n",
|
|
" <td>0.06825</td>\n",
|
|
" <td>0.840000</td>\n",
|
|
" <td>2.021661</td>\n",
|
|
" <td>0.034491</td>\n",
|
|
" <td>3.653125</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support support confidence \\\n",
|
|
"47 (DoItYBks, GeogBks) (YouthBks) 0.10100 0.23825 0.05450 0.539604 \n",
|
|
"34 (RefBks, ChildBks) (DoItYBks) 0.12825 0.25475 0.07100 0.553606 \n",
|
|
"19 (ChildBks, GeogBks) (YouthBks) 0.14625 0.23825 0.07550 0.516239 \n",
|
|
"45 (CookBks, GeogBks) (YouthBks) 0.15625 0.23825 0.08025 0.513600 \n",
|
|
"17 (ChildBks, YouthBks) (DoItYBks) 0.14750 0.25475 0.08025 0.544068 \n",
|
|
"50 (CookBks, RefBks) (DoItYBks) 0.13975 0.25475 0.07450 0.533095 \n",
|
|
"23 (CookBks, ChildBks) (DoItYBks) 0.24200 0.25475 0.12775 0.527893 \n",
|
|
"49 (GeogBks, YouthBks) (DoItYBks) 0.10450 0.25475 0.05450 0.521531 \n",
|
|
"41 (CookBks, YouthBks) (DoItYBks) 0.16100 0.25475 0.08375 0.520186 \n",
|
|
"43 (RefBks, YouthBks) (CookBks) 0.08125 0.41550 0.06825 0.840000 \n",
|
|
"\n",
|
|
" lift leverage conviction \n",
|
|
"47 2.264864 0.030437 1.654554 \n",
|
|
"34 2.173135 0.038328 1.669490 \n",
|
|
"19 2.166797 0.040656 1.574642 \n",
|
|
"45 2.155719 0.043023 1.566098 \n",
|
|
"17 2.135693 0.042674 1.634563 \n",
|
|
"50 2.092619 0.038899 1.596148 \n",
|
|
"23 2.072198 0.066101 1.578560 \n",
|
|
"49 2.047227 0.027879 1.557573 \n",
|
|
"41 2.041948 0.042735 1.553207 \n",
|
|
"43 2.021661 0.034491 3.653125 "
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Filter rules by number of antecedents (maximum 2) and consequents (maximum 1)\n",
|
|
"rules = rules[[len(c) <= 2 for c in rules.antecedents]]\n",
|
|
"rules = rules[[len(c) == 1 for c in rules.consequents]]\n",
|
|
"\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Section 14.2 Collaborative Filtering"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"user: 823519 item: 30 r_ui = 4.00 est = 3.54 {'was_impossible': True, 'reason': 'User and/or item is unkown.'}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"ratings = pd.DataFrame([\n",
|
|
" [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5], \n",
|
|
" [124105, 1, 4], \n",
|
|
" [822109, 1, 5], \n",
|
|
" [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5], \n",
|
|
" [885013, 1, 4], [885013, 5, 5], \n",
|
|
" [893988, 1, 3], [893988, 30, 4], [893988, 44, 4], \n",
|
|
" [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3], \n",
|
|
" [1503895, 1, 4], \n",
|
|
" [1842128, 1, 4], [1842128, 30, 3], \n",
|
|
" [2238063, 1, 3], \n",
|
|
"], columns=['customerID', 'movieID', 'rating'])\n",
|
|
"\n",
|
|
"reader = Reader(rating_scale=(1, 5))\n",
|
|
"data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)\n",
|
|
"trainset = data.build_full_trainset()\n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False} # compute cosine similarities between items\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.11"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"\n",
|
|
"Top-3 recommended items for each user\n",
|
|
"User 6\n",
|
|
" Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)\n",
|
|
"User 222\n",
|
|
" Item 77 (3.50) Item 75 (2.78)\n",
|
|
"User 424\n",
|
|
" Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)\n",
|
|
"User 87\n",
|
|
" Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)\n",
|
|
"User 121\n",
|
|
" Item 98 (3.48) Item 32 (2.83)\n",
|
|
"\n",
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"\n",
|
|
"Top-3 recommended items for each user\n",
|
|
"User 6\n",
|
|
" Item 77 (3.00) Item 60 (3.00) Item 6 (3.00)\n",
|
|
"User 222\n",
|
|
" Item 77 (2.24) Item 75 (2.00)\n",
|
|
"User 424\n",
|
|
" Item 54 (3.47) Item 14 (3.44) Item 45 (3.00)\n",
|
|
"User 87\n",
|
|
" Item 27 (3.00) Item 32 (3.00) Item 82 (3.00) Item 54 (2.50)\n",
|
|
"User 121\n",
|
|
" Item 32 (3.06) Item 98 (2.31)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import random\n",
|
|
"\n",
|
|
"random.seed(0)\n",
|
|
"nratings = 5000\n",
|
|
"randomData = pd.DataFrame({\n",
|
|
" 'itemID': [random.randint(0,99) for _ in range(nratings)],\n",
|
|
" 'userID': [random.randint(0,999) for _ in range(nratings)],\n",
|
|
" 'rating': [random.randint(1,5) for _ in range(nratings)],\n",
|
|
"})\n",
|
|
"\n",
|
|
"def get_top_n(predictions, n=10):\n",
|
|
" # First map the predictions to each user.\n",
|
|
" byUser = defaultdict(list)\n",
|
|
" for p in predictions:\n",
|
|
" byUser[p.uid].append(p)\n",
|
|
" \n",
|
|
" # For each user, reduce predictions to top-n\n",
|
|
" for uid, userPredictions in byUser.items():\n",
|
|
" byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)\n",
|
|
" return byUser\n",
|
|
"\n",
|
|
"# Convert thes data set into the format required by the surprise package\n",
|
|
"# The columns must correspond to user id, item id and ratings (in that order)\n",
|
|
"reader = Reader(rating_scale=(1, 5))\n",
|
|
"data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)\n",
|
|
"\n",
|
|
"# Split into training and test set\n",
|
|
"trainset, testset = train_test_split(data, test_size=.25, random_state=1)\n",
|
|
"\n",
|
|
"## User-based filtering\n",
|
|
"# compute cosine similarity between users \n",
|
|
"sim_options = {'name': 'cosine', 'user_based': True}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Than predict ratings for all pairs (u, i) that are NOT in the training set.\n",
|
|
"predictions = algo.test(testset)\n",
|
|
"\n",
|
|
"top_n = get_top_n(predictions, n=4)\n",
|
|
"\n",
|
|
"# Print the recommended items for each user\n",
|
|
"print()\n",
|
|
"print('Top-3 recommended items for each user')\n",
|
|
"for uid, user_ratings in list(top_n.items())[:5]:\n",
|
|
" print('User {}'.format(uid))\n",
|
|
" for prediction in user_ratings:\n",
|
|
" print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n",
|
|
" print()\n",
|
|
"print()\n",
|
|
"\n",
|
|
" \n",
|
|
"## Item-based filtering\n",
|
|
"# compute cosine similarity between users \n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Than predict ratings for all pairs (u, i) that are NOT in the training set.\n",
|
|
"predictions = algo.test(testset)\n",
|
|
"top_n = get_top_n(predictions, n=4)\n",
|
|
"\n",
|
|
"# Print the recommended items for each user\n",
|
|
"print()\n",
|
|
"print('Top-3 recommended items for each user')\n",
|
|
"for uid, user_ratings in list(top_n.items())[:5]:\n",
|
|
" print('User {}'.format(uid))\n",
|
|
" for prediction in user_ratings:\n",
|
|
" print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n",
|
|
" print()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"## Build a model using the full dataset\n",
|
|
"trainset = data.build_full_trainset()\n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Predict rating for user 383 and item 7\n",
|
|
"algo.predict(383, 7)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|