1490 lines
81 KiB
Plaintext
1490 lines
81 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Chapter 14: Association Rules and Collaborative Filtering\n",
|
|
"\n",
|
|
"> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n",
|
|
">\n",
|
|
"> Code included in\n",
|
|
">\n",
|
|
"> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n",
|
|
"> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n",
|
|
"\n",
|
|
"## Import required packages"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"%matplotlib inline\n",
|
|
"\n",
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"import heapq\n",
|
|
"from collections import defaultdict\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"import matplotlib.pylab as plt\n",
|
|
"from mlxtend.frequent_patterns import apriori\n",
|
|
"from mlxtend.frequent_patterns import association_rules\n",
|
|
"\n",
|
|
"from surprise import Dataset, Reader, KNNBasic\n",
|
|
"from surprise.model_selection import train_test_split\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.4"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"scrolled": true,
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Red</th>\n",
|
|
" <th>White</th>\n",
|
|
" <th>Blue</th>\n",
|
|
" <th>Orange</th>\n",
|
|
" <th>Green</th>\n",
|
|
" <th>Yellow</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Transaction</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>9</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>10</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Red White Blue Orange Green Yellow\n",
|
|
"Transaction \n",
|
|
"1 1 1 0 0 1 0\n",
|
|
"2 0 1 0 1 0 0\n",
|
|
"3 0 1 1 0 0 0\n",
|
|
"4 1 1 0 1 0 0\n",
|
|
"5 1 0 1 0 0 0\n",
|
|
"6 0 1 1 0 0 0\n",
|
|
"7 1 0 1 0 0 0\n",
|
|
"8 1 1 1 0 1 0\n",
|
|
"9 1 1 1 0 0 0\n",
|
|
"10 0 0 0 0 0 1"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Load and preprocess data set \n",
|
|
"fp_df = pd.read_csv('Faceplate.csv')\n",
|
|
"fp_df.set_index('Transaction', inplace=True)\n",
|
|
"fp_df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" antecedents consequents support confidence lift leverage\n",
|
|
"13 (White, Red) (Green) 0.2 0.5 2.500000 0.12\n",
|
|
"15 (Green) (White, Red) 0.2 1.0 2.500000 0.12\n",
|
|
"4 (Green) (Red) 0.2 1.0 1.666667 0.08\n",
|
|
"12 (White, Green) (Red) 0.2 1.0 1.666667 0.08\n",
|
|
"7 (Orange) (White) 0.2 1.0 1.428571 0.06\n",
|
|
"8 (Green) (White) 0.2 1.0 1.428571 0.06\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/noah/.local/lib/python3.10/site-packages/mlxtend/frequent_patterns/fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create frequent itemsets\n",
|
|
"itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)\n",
|
|
"\n",
|
|
"# and convert into rules\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(6)\n",
|
|
"\n",
|
|
"print(rules.sort_values(by=['lift'], ascending=False)\n",
|
|
" .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
|
|
" .head(6))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.5</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>1.6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>(White, Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>(Orange)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>14</th>\n",
|
|
" <td>(Green, Red)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support \\\n",
|
|
"13 (White, Red) (Green) 0.4 0.2 \n",
|
|
"4 (Green) (Red) 0.2 0.6 \n",
|
|
"12 (White, Green) (Red) 0.2 0.6 \n",
|
|
"7 (Orange) (White) 0.2 0.7 \n",
|
|
"8 (Green) (White) 0.2 0.7 \n",
|
|
"14 (Green, Red) (White) 0.2 0.7 \n",
|
|
"\n",
|
|
" support confidence lift leverage conviction \n",
|
|
"13 0.2 0.5 2.500000 0.12 1.6 \n",
|
|
"4 0.2 1.0 1.666667 0.08 inf \n",
|
|
"12 0.2 1.0 1.666667 0.08 inf \n",
|
|
"7 0.2 1.0 1.428571 0.06 inf \n",
|
|
"8 0.2 1.0 1.428571 0.06 inf \n",
|
|
"14 0.2 1.0 1.428571 0.06 inf "
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# filter to get rules with single consequents only\n",
|
|
"rules[[len(c) == 1 for c in rules.consequents]].sort_values(by=['lift'], ascending=False).head(6)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The apriori method accepts sparse data frames as well. If we convert the original data frame to sparse format, the memory requirements go down to 40%: only 24 of the 60 cells are non-zero, which is the value reported by `sparse_df.sparse.density`. The `fill_value` argument of `pd.SparseDtype` tells pandas which value is left unstored in each transaction; here it is 0, i.e. the item is absent."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/noah/.local/lib/python3.10/site-packages/mlxtend/frequent_patterns/fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>13</th>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.5</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>1.6</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>15</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White, Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.4</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>2.500000</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>12</th>\n",
|
|
" <td>(White, Green)</td>\n",
|
|
" <td>(Red)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.6</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.666667</td>\n",
|
|
" <td>0.08</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>(Orange)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>(Green)</td>\n",
|
|
" <td>(White)</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>0.7</td>\n",
|
|
" <td>0.2</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>1.428571</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>inf</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support \\\n",
|
|
"13 (White, Red) (Green) 0.4 0.2 \n",
|
|
"15 (Green) (White, Red) 0.2 0.4 \n",
|
|
"4 (Green) (Red) 0.2 0.6 \n",
|
|
"12 (White, Green) (Red) 0.2 0.6 \n",
|
|
"7 (Orange) (White) 0.2 0.7 \n",
|
|
"8 (Green) (White) 0.2 0.7 \n",
|
|
"\n",
|
|
" support confidence lift leverage conviction \n",
|
|
"13 0.2 0.5 2.500000 0.12 1.6 \n",
|
|
"15 0.2 1.0 2.500000 0.12 inf \n",
|
|
"4 0.2 1.0 1.666667 0.08 inf \n",
|
|
"12 0.2 1.0 1.666667 0.08 inf \n",
|
|
"7 0.2 1.0 1.428571 0.06 inf \n",
|
|
"8 0.2 1.0 1.428571 0.06 inf "
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Convert data set into a sparse data frame\n",
|
|
"#sparse_df = fp_df.to_sparse(fill_value=0)\n",
|
|
"sparse_df = fp_df.astype(pd.SparseDtype(int, fill_value=0))\n",
|
|
"#print('Density {}'.format(sparse_df.density))\n",
|
|
"\n",
|
|
"# create frequent itemsets\n",
|
|
"itemsets = apriori(sparse_df, min_support=0.2, use_colnames=True)\n",
|
|
"\n",
|
|
"# and convert into rules\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(6)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Data required for Table 14.5 and 14.6"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>1</th>\n",
|
|
" <th>2</th>\n",
|
|
" <th>3</th>\n",
|
|
" <th>4</th>\n",
|
|
" <th>5</th>\n",
|
|
" <th>6</th>\n",
|
|
" <th>7</th>\n",
|
|
" <th>8</th>\n",
|
|
" <th>9</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" 1 2 3 4 5 6 7 8 9\n",
|
|
"0 0 0 0 0 0 0 0 1 0\n",
|
|
"1 0 0 1 1 0 0 0 1 0\n",
|
|
"2 0 0 0 0 0 0 0 1 0\n",
|
|
"3 0 0 1 0 0 0 0 0 1\n",
|
|
"4 0 0 0 0 0 0 0 0 1"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Prepare the dataset for table 14.6 based on table 14.5\n",
|
|
"from itertools import chain\n",
|
|
"randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(), \n",
|
|
" {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},\n",
|
|
" {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},\n",
|
|
" {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},\n",
|
|
" {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]\n",
|
|
"print(randomTransactions)\n",
|
|
"uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))\n",
|
|
"randomData = pd.DataFrame(0, index=range(len(randomTransactions)), columns=uniqueItems)\n",
|
|
"for row, transaction in enumerate(randomTransactions):\n",
|
|
" for item in transaction:\n",
|
|
"        randomData.loc[row, item] = 1\n",
|
|
"randomData.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.6"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" antecedents consequents support confidence lift leverage\n",
|
|
"3 (8, 3) (4) 0.04 1.0 4.545455 0.0312\n",
|
|
"1 (1, 5) (8) 0.04 1.0 1.851852 0.0184\n",
|
|
"2 (2, 7) (9) 0.04 1.0 1.851852 0.0184\n",
|
|
"4 (3, 4) (8) 0.04 1.0 1.851852 0.0184\n",
|
|
"5 (3, 7) (9) 0.04 1.0 1.851852 0.0184\n",
|
|
"6 (4, 5) (9) 0.04 1.0 1.851852 0.0184\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/noah/.local/lib/python3.10/site-packages/mlxtend/frequent_patterns/fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create frequent itemsets\n",
|
|
"itemsets = apriori(randomData, min_support=2/len(randomData), use_colnames=True)\n",
|
|
"# and convert into rules\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.7)\n",
|
|
"print(rules.sort_values(by=['lift'], ascending=False)\n",
|
|
" .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
|
|
" .head(6))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.8"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>ChildBks</th>\n",
|
|
" <th>YouthBks</th>\n",
|
|
" <th>CookBks</th>\n",
|
|
" <th>DoItYBks</th>\n",
|
|
" <th>RefBks</th>\n",
|
|
" <th>ArtBks</th>\n",
|
|
" <th>GeogBks</th>\n",
|
|
" <th>ItalCook</th>\n",
|
|
" <th>ItalAtlas</th>\n",
|
|
" <th>ItalArt</th>\n",
|
|
" <th>Florence</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks ItalCook \\\n",
|
|
"0 0 1 1 0 0 0 0 0 \n",
|
|
"1 0 0 0 0 0 0 0 0 \n",
|
|
"2 1 1 1 0 1 0 1 1 \n",
|
|
"3 0 0 0 0 0 0 0 0 \n",
|
|
"4 0 0 0 0 0 0 0 0 \n",
|
|
"\n",
|
|
" ItalAtlas ItalArt Florence \n",
|
|
"0 0 0 0 \n",
|
|
"1 0 0 0 \n",
|
|
"2 0 0 0 \n",
|
|
"3 0 0 0 \n",
|
|
"4 0 0 0 "
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# load dataset\n",
|
|
"all_books_df = pd.read_csv('CharlesBookClub.csv')\n",
|
|
"\n",
|
|
"# create the binary incidence matrix\n",
|
|
"ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',\n",
|
|
" 'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']\n",
|
|
"count_books = all_books_df.drop(columns=ignore)\n",
|
|
"count_books[count_books > 0] = 1\n",
|
|
"\n",
|
|
"count_books.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# determine item frequencies\n",
|
|
"itemFrequency = count_books.sum(axis=0) / len(count_books)\n",
|
|
"\n",
|
|
"# and plot as a bar chart\n",
|
|
"ax = itemFrequency.plot.bar(color='blue')\n",
|
|
"plt.ylabel('Item frequency (relative)')\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Number of rules 81\n",
|
|
" antecedents consequents support confidence \\\n",
|
|
"64 (RefBks, YouthBks) (ChildBks, CookBks) 0.05525 0.680000 \n",
|
|
"73 (DoItYBks, RefBks) (ChildBks, CookBks) 0.06125 0.662162 \n",
|
|
"60 (DoItYBks, YouthBks) (ChildBks, CookBks) 0.06700 0.648910 \n",
|
|
"80 (RefBks, GeogBks) (ChildBks, CookBks) 0.05025 0.614679 \n",
|
|
"69 (YouthBks, GeogBks) (ChildBks, CookBks) 0.06325 0.605263 \n",
|
|
"77 (DoItYBks, GeogBks) (ChildBks, CookBks) 0.06050 0.599010 \n",
|
|
"68 (ChildBks, CookBks, GeogBks) (YouthBks) 0.06325 0.577626 \n",
|
|
"72 (RefBks, ChildBks, CookBks) (DoItYBks) 0.06125 0.591787 \n",
|
|
"48 (DoItYBks, GeogBks) (YouthBks) 0.05450 0.539604 \n",
|
|
"63 (RefBks, ChildBks, CookBks) (YouthBks) 0.05525 0.533816 \n",
|
|
"58 (DoItYBks, ChildBks, CookBks) (YouthBks) 0.06700 0.524462 \n",
|
|
"59 (YouthBks, ChildBks, CookBks) (DoItYBks) 0.06700 0.558333 \n",
|
|
"34 (RefBks, ChildBks) (DoItYBks) 0.07100 0.553606 \n",
|
|
"76 (ChildBks, CookBks, GeogBks) (DoItYBks) 0.06050 0.552511 \n",
|
|
"21 (ChildBks, GeogBks) (YouthBks) 0.07550 0.516239 \n",
|
|
"46 (CookBks, GeogBks) (YouthBks) 0.08025 0.513600 \n",
|
|
"61 (RefBks, YouthBks, ChildBks) (CookBks) 0.05525 0.891129 \n",
|
|
"17 (YouthBks, ChildBks) (DoItYBks) 0.08025 0.544068 \n",
|
|
"51 (RefBks, CookBks) (DoItYBks) 0.07450 0.533095 \n",
|
|
"28 (RefBks) (ChildBks, CookBks) 0.10350 0.505495 \n",
|
|
"71 (DoItYBks, RefBks, CookBks) (ChildBks) 0.06125 0.822148 \n",
|
|
"15 (YouthBks) (ChildBks, CookBks) 0.12000 0.503673 \n",
|
|
"70 (DoItYBks, RefBks, ChildBks) (CookBks) 0.06125 0.862676 \n",
|
|
"24 (ChildBks, CookBks) (DoItYBks) 0.12775 0.527893 \n",
|
|
"25 (DoItYBks) (ChildBks, CookBks) 0.12775 0.501472 \n",
|
|
"\n",
|
|
" lift leverage \n",
|
|
"64 2.809917 0.035588 \n",
|
|
"73 2.736207 0.038865 \n",
|
|
"60 2.681448 0.042014 \n",
|
|
"80 2.539995 0.030467 \n",
|
|
"69 2.501087 0.037961 \n",
|
|
"77 2.475248 0.036058 \n",
|
|
"68 2.424452 0.037162 \n",
|
|
"72 2.323013 0.034883 \n",
|
|
"48 2.264864 0.030437 \n",
|
|
"63 2.240573 0.030591 \n",
|
|
"58 2.201309 0.036564 \n",
|
|
"59 2.191691 0.036430 \n",
|
|
"34 2.173135 0.038328 \n",
|
|
"76 2.168838 0.032605 \n",
|
|
"21 2.166797 0.040656 \n",
|
|
"46 2.155719 0.043023 \n",
|
|
"61 2.144715 0.029489 \n",
|
|
"17 2.135693 0.042674 \n",
|
|
"51 2.092619 0.038899 \n",
|
|
"28 2.088820 0.053950 \n",
|
|
"71 2.086669 0.031897 \n",
|
|
"15 2.081292 0.062344 \n",
|
|
"70 2.076236 0.031749 \n",
|
|
"24 2.072198 0.066101 \n",
|
|
"25 2.072198 0.066101 \n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/noah/.local/lib/python3.10/site-packages/mlxtend/frequent_patterns/fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type\n",
|
|
" warnings.warn(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create frequent itemsets and rules\n",
|
|
"pd.options.display.max_rows = 100\n",
|
|
"itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)\n",
|
|
"rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n",
|
|
"\n",
|
|
"print('Number of rules', len(rules))\n",
|
|
"\n",
|
|
"# Display 25 rules with highest lift\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(25)\n",
|
|
"\n",
|
|
"print(rules.sort_values(by=['lift'], ascending=False).drop(columns=['antecedent support', 'consequent support', 'conviction']).head(25))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>antecedents</th>\n",
|
|
" <th>consequents</th>\n",
|
|
" <th>antecedent support</th>\n",
|
|
" <th>consequent support</th>\n",
|
|
" <th>support</th>\n",
|
|
" <th>confidence</th>\n",
|
|
" <th>lift</th>\n",
|
|
" <th>leverage</th>\n",
|
|
" <th>conviction</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>48</th>\n",
|
|
" <td>(DoItYBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.10100</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.05450</td>\n",
|
|
" <td>0.539604</td>\n",
|
|
" <td>2.264864</td>\n",
|
|
" <td>0.030437</td>\n",
|
|
" <td>1.654554</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>34</th>\n",
|
|
" <td>(RefBks, ChildBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.12825</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.07100</td>\n",
|
|
" <td>0.553606</td>\n",
|
|
" <td>2.173135</td>\n",
|
|
" <td>0.038328</td>\n",
|
|
" <td>1.669490</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>21</th>\n",
|
|
" <td>(ChildBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.14625</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.07550</td>\n",
|
|
" <td>0.516239</td>\n",
|
|
" <td>2.166797</td>\n",
|
|
" <td>0.040656</td>\n",
|
|
" <td>1.574642</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>46</th>\n",
|
|
" <td>(CookBks, GeogBks)</td>\n",
|
|
" <td>(YouthBks)</td>\n",
|
|
" <td>0.15625</td>\n",
|
|
" <td>0.23825</td>\n",
|
|
" <td>0.08025</td>\n",
|
|
" <td>0.513600</td>\n",
|
|
" <td>2.155719</td>\n",
|
|
" <td>0.043023</td>\n",
|
|
" <td>1.566098</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>17</th>\n",
|
|
" <td>(YouthBks, ChildBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.14750</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.08025</td>\n",
|
|
" <td>0.544068</td>\n",
|
|
" <td>2.135693</td>\n",
|
|
" <td>0.042674</td>\n",
|
|
" <td>1.634563</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>51</th>\n",
|
|
" <td>(RefBks, CookBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.13975</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.07450</td>\n",
|
|
" <td>0.533095</td>\n",
|
|
" <td>2.092619</td>\n",
|
|
" <td>0.038899</td>\n",
|
|
" <td>1.596148</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>24</th>\n",
|
|
" <td>(ChildBks, CookBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.24200</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.12775</td>\n",
|
|
" <td>0.527893</td>\n",
|
|
" <td>2.072198</td>\n",
|
|
" <td>0.066101</td>\n",
|
|
" <td>1.578560</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>49</th>\n",
|
|
" <td>(YouthBks, GeogBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.10450</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.05450</td>\n",
|
|
" <td>0.521531</td>\n",
|
|
" <td>2.047227</td>\n",
|
|
" <td>0.027879</td>\n",
|
|
" <td>1.557573</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>42</th>\n",
|
|
" <td>(YouthBks, CookBks)</td>\n",
|
|
" <td>(DoItYBks)</td>\n",
|
|
" <td>0.16100</td>\n",
|
|
" <td>0.25475</td>\n",
|
|
" <td>0.08375</td>\n",
|
|
" <td>0.520186</td>\n",
|
|
" <td>2.041948</td>\n",
|
|
" <td>0.042735</td>\n",
|
|
" <td>1.553207</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>43</th>\n",
|
|
" <td>(RefBks, YouthBks)</td>\n",
|
|
" <td>(CookBks)</td>\n",
|
|
" <td>0.08125</td>\n",
|
|
" <td>0.41550</td>\n",
|
|
" <td>0.06825</td>\n",
|
|
" <td>0.840000</td>\n",
|
|
" <td>2.021661</td>\n",
|
|
" <td>0.034491</td>\n",
|
|
" <td>3.653125</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" antecedents consequents antecedent support consequent support \\\n",
|
|
"48 (DoItYBks, GeogBks) (YouthBks) 0.10100 0.23825 \n",
|
|
"34 (RefBks, ChildBks) (DoItYBks) 0.12825 0.25475 \n",
|
|
"21 (ChildBks, GeogBks) (YouthBks) 0.14625 0.23825 \n",
|
|
"46 (CookBks, GeogBks) (YouthBks) 0.15625 0.23825 \n",
|
|
"17 (YouthBks, ChildBks) (DoItYBks) 0.14750 0.25475 \n",
|
|
"51 (RefBks, CookBks) (DoItYBks) 0.13975 0.25475 \n",
|
|
"24 (ChildBks, CookBks) (DoItYBks) 0.24200 0.25475 \n",
|
|
"49 (YouthBks, GeogBks) (DoItYBks) 0.10450 0.25475 \n",
|
|
"42 (YouthBks, CookBks) (DoItYBks) 0.16100 0.25475 \n",
|
|
"43 (RefBks, YouthBks) (CookBks) 0.08125 0.41550 \n",
|
|
"\n",
|
|
" support confidence lift leverage conviction \n",
|
|
"48 0.05450 0.539604 2.264864 0.030437 1.654554 \n",
|
|
"34 0.07100 0.553606 2.173135 0.038328 1.669490 \n",
|
|
"21 0.07550 0.516239 2.166797 0.040656 1.574642 \n",
|
|
"46 0.08025 0.513600 2.155719 0.043023 1.566098 \n",
|
|
"17 0.08025 0.544068 2.135693 0.042674 1.634563 \n",
|
|
"51 0.07450 0.533095 2.092619 0.038899 1.596148 \n",
|
|
"24 0.12775 0.527893 2.072198 0.066101 1.578560 \n",
|
|
"49 0.05450 0.521531 2.047227 0.027879 1.557573 \n",
|
|
"42 0.08375 0.520186 2.041948 0.042735 1.553207 \n",
|
|
"43 0.06825 0.840000 2.021661 0.034491 3.653125 "
|
|
]
|
|
},
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Filter rules by number of antecedents (maximum 2) and consequents (maximum 1)\n",
|
|
"rules = rules[[len(c) <= 2 for c in rules.antecedents]]\n",
|
|
"rules = rules[[len(c) == 1 for c in rules.consequents]]\n",
|
|
"\n",
|
|
"rules.sort_values(by=['lift'], ascending=False).head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Section 14.2 Collaborative Filtering"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"user: 823519 item: 30 r_ui = 4.00 est = 3.54 {'was_impossible': True, 'reason': 'User and/or item is unknown.'}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"ratings = pd.DataFrame([\n",
|
|
" [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5], \n",
|
|
" [124105, 1, 4], \n",
|
|
" [822109, 1, 5], \n",
|
|
" [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5], \n",
|
|
" [885013, 1, 4], [885013, 5, 5], \n",
|
|
" [893988, 1, 3], [893988, 30, 4], [893988, 44, 4], \n",
|
|
" [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3], \n",
|
|
" [1503895, 1, 4], \n",
|
|
" [1842128, 1, 4], [1842128, 30, 3], \n",
|
|
" [2238063, 1, 3], \n",
|
|
"], columns=['customerID', 'movieID', 'rating'])\n",
|
|
"\n",
|
|
"reader = Reader(rating_scale=(1, 5))\n",
|
|
"data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)\n",
|
|
"trainset = data.build_full_trainset()\n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False} # compute cosine similarities between items\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Table 14.11"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"\n",
|
|
"Top-3 recommended items for each user\n",
|
|
"User 6\n",
|
|
" Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)\n",
|
|
"User 222\n",
|
|
" Item 77 (3.50) Item 75 (2.78)\n",
|
|
"User 424\n",
|
|
" Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)\n",
|
|
"User 87\n",
|
|
" Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)\n",
|
|
"User 121\n",
|
|
" Item 98 (3.48) Item 32 (2.83)\n",
|
|
"\n",
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n",
|
|
"\n",
|
|
"Top-3 recommended items for each user\n",
|
|
"User 6\n",
|
|
" Item 77 (3.00) Item 60 (3.00) Item 6 (3.00)\n",
|
|
"User 222\n",
|
|
" Item 77 (2.24) Item 75 (2.00)\n",
|
|
"User 424\n",
|
|
" Item 54 (3.47) Item 14 (3.44) Item 45 (3.00)\n",
|
|
"User 87\n",
|
|
" Item 27 (3.00) Item 32 (3.00) Item 82 (3.00) Item 54 (2.50)\n",
|
|
"User 121\n",
|
|
" Item 32 (3.06) Item 98 (2.31)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import random\n",
|
|
"\n",
|
|
"random.seed(0)\n",
|
|
"nratings = 5000\n",
|
|
"randomData = pd.DataFrame({\n",
|
|
" 'itemID': [random.randint(0,99) for _ in range(nratings)],\n",
|
|
" 'userID': [random.randint(0,999) for _ in range(nratings)],\n",
|
|
" 'rating': [random.randint(1,5) for _ in range(nratings)],\n",
|
|
"})\n",
|
|
"\n",
|
|
"def get_top_n(predictions, n=10):\n",
|
|
" # First map the predictions to each user.\n",
|
|
" byUser = defaultdict(list)\n",
|
|
" for p in predictions:\n",
|
|
" byUser[p.uid].append(p)\n",
|
|
" \n",
|
|
" # For each user, reduce predictions to top-n\n",
|
|
" for uid, userPredictions in byUser.items():\n",
|
|
" byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)\n",
|
|
" return byUser\n",
|
|
"\n",
|
|
"# Convert the data set into the format required by the surprise package\n",
|
|
"# The columns must correspond to user id, item id and ratings (in that order)\n",
|
|
"reader = Reader(rating_scale=(1, 5))\n",
|
|
"data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)\n",
|
|
"\n",
|
|
"# Split into training and test set\n",
|
|
"trainset, testset = train_test_split(data, test_size=.25, random_state=1)\n",
|
|
"\n",
|
|
"## User-based filtering\n",
|
|
"# compute cosine similarity between users \n",
|
|
"sim_options = {'name': 'cosine', 'user_based': True}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Then predict ratings for all pairs (u, i) that are NOT in the training set.\n",
|
|
"predictions = algo.test(testset)\n",
|
|
"\n",
|
|
"top_n = get_top_n(predictions, n=4)\n",
|
|
"\n",
|
|
"# Print the recommended items for each user\n",
|
|
"print()\n",
|
|
"print('Top-3 recommended items for each user')\n",
|
|
"for uid, user_ratings in list(top_n.items())[:5]:\n",
|
|
" print('User {}'.format(uid))\n",
|
|
" for prediction in user_ratings:\n",
|
|
" print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n",
|
|
" print()\n",
|
|
"print()\n",
|
|
"\n",
|
|
" \n",
|
|
"## Item-based filtering\n",
|
|
"# compute cosine similarity between items \n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Then predict ratings for all pairs (u, i) that are NOT in the training set.\n",
|
|
"predictions = algo.test(testset)\n",
|
|
"top_n = get_top_n(predictions, n=4)\n",
|
|
"\n",
|
|
"# Print the recommended items for each user\n",
|
|
"print()\n",
|
|
"print('Top-3 recommended items for each user')\n",
|
|
"for uid, user_ratings in list(top_n.items())[:5]:\n",
|
|
" print('User {}'.format(uid))\n",
|
|
" for prediction in user_ratings:\n",
|
|
" print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n",
|
|
" print()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {
|
|
"vscode": {
|
|
"languageId": "python"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Computing the cosine similarity matrix...\n",
|
|
"Done computing similarity matrix.\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})"
|
|
]
|
|
},
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"## Build a model using the full dataset\n",
|
|
"trainset = data.build_full_trainset()\n",
|
|
"sim_options = {'name': 'cosine', 'user_based': False}\n",
|
|
"algo = KNNBasic(sim_options=sim_options)\n",
|
|
"algo.fit(trainset)\n",
|
|
"\n",
|
|
"# Predict rating for user 383 and item 7\n",
|
|
"algo.predict(383, 7)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|