From ba60fd77c6423316003b720b261d99af42e6f291 Mon Sep 17 00:00:00 2001 From: noah Date: Tue, 7 Feb 2023 18:05:20 -0600 Subject: [PATCH] Adding lecture work --- Lecture-Work.ipynb | 1462 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1462 insertions(+) create mode 100644 Lecture-Work.ipynb diff --git a/Lecture-Work.ipynb b/Lecture-Work.ipynb new file mode 100644 index 0000000..0b65ab2 --- /dev/null +++ b/Lecture-Work.ipynb @@ -0,0 +1,1462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 14: Association Rules and Collaborative Filtering\n", + "\n", + "> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n", + ">\n", + "> Code included in\n", + ">\n", + "> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n", + "> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n", + "\n", + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting package metadata (current_repodata.json): ...working... done\n", + "Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.\n", + "Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.\n", + "Collecting package metadata (repodata.json): ...working... done\n", + "Solving environment: ...working... 
done\n", + "\n", + "## Package Plan ##\n", + "\n", + " environment location: C:\\Users\\isa1761\\Anaconda3\n", + "\n", + " added / updated specs:\n", + " - scikit-surprise\n", + "\n", + "\n", + "The following packages will be downloaded:\n", + "\n", + " package | build\n", + " ---------------------------|-----------------\n", + " conda-4.12.0 | py39hcbf5309_0 1.0 MB conda-forge\n", + " scikit-surprise-1.1.1 | py39h5d4886f_2 538 KB conda-forge\n", + " ------------------------------------------------------------\n", + " Total: 1.5 MB\n", + "\n", + "The following NEW packages will be INSTALLED:\n", + "\n", + " scikit-surprise conda-forge/win-64::scikit-surprise-1.1.1-py39h5d4886f_2\n", + "\n", + "The following packages will be UPDATED:\n", + "\n", + " conda 4.11.0-py39hcbf5309_0 --> 4.12.0-py39hcbf5309_0\n", + "\n", + "\n", + "\n", + "Downloading and Extracting Packages\n", + "\n", + "scikit-surprise-1.1. | 538 KB | | 0% \n", + "scikit-surprise-1.1. | 538 KB | 2 | 3% \n", + "scikit-surprise-1.1. | 538 KB | ########## | 100% \n", + "scikit-surprise-1.1. | 538 KB | ########## | 100% \n", + "\n", + "conda-4.12.0 | 1.0 MB | | 0% \n", + "conda-4.12.0 | 1.0 MB | 1 | 2% \n", + "conda-4.12.0 | 1.0 MB | ########## | 100% \n", + "conda-4.12.0 | 1.0 MB | ########## | 100% \n", + "Preparing transaction: ...working... done\n", + "Verifying transaction: ...working... done\n", + "Executing transaction: ...working... 
done\n", + "\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "conda install -c conda-forge scikit-surprise" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "from pathlib import Path\n", + "\n", + "import heapq\n", + "from collections import defaultdict\n", + "\n", + "import pandas as pd\n", + "import matplotlib.pylab as plt\n", + "from mlxtend.frequent_patterns import apriori\n", + "from mlxtend.frequent_patterns import association_rules\n", + "\n", + "from surprise import Dataset, Reader, KNNBasic\n", + "from surprise.model_selection import train_test_split\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table 14.4" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RedWhiteBlueOrangeGreenYellow
Transaction
1110010
2010100
3011000
4110100
5101000
6011000
7101000
8111010
9111000
10000001
\n", + "
" + ], + "text/plain": [ + " Red White Blue Orange Green Yellow\n", + "Transaction \n", + "1 1 1 0 0 1 0\n", + "2 0 1 0 1 0 0\n", + "3 0 1 1 0 0 0\n", + "4 1 1 0 1 0 0\n", + "5 1 0 1 0 0 0\n", + "6 0 1 1 0 0 0\n", + "7 1 0 1 0 0 0\n", + "8 1 1 1 0 1 0\n", + "9 1 1 1 0 0 0\n", + "10 0 0 0 0 0 1" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load and preprocess data set \n", + "fp_df = pd.read_csv('Faceplate.csv')\n", + "fp_df.set_index('Transaction', inplace=True)\n", + "fp_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " antecedents consequents support confidence lift leverage\n", + "12 (White, Red) (Green) 0.2 0.5 2.500000 0.12\n", + "15 (Green) (White, Red) 0.2 1.0 2.500000 0.12\n", + "4 (Green) (Red) 0.2 1.0 1.666667 0.08\n", + "13 (White, Green) (Red) 0.2 1.0 1.666667 0.08\n", + "7 (Orange) (White) 0.2 1.0 1.428571 0.06\n", + "8 (Green) (White) 0.2 1.0 1.428571 0.06\n" + ] + } + ], + "source": [ + "# create frequent itemsets\n", + "itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)\n", + "\n", + "# and convert into rules\n", + "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", + "rules.sort_values(by=['lift'], ascending=False).head(6)\n", + "\n", + "print(rules.sort_values(by=['lift'], ascending=False)\n", + " .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n", + " .head(6))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
12(White, Red)(Green)0.40.20.20.52.5000000.121.6
4(Green)(Red)0.20.60.21.01.6666670.08inf
13(White, Green)(Red)0.20.60.21.01.6666670.08inf
7(Orange)(White)0.20.70.21.01.4285710.06inf
8(Green)(White)0.20.70.21.01.4285710.06inf
14(Red, Green)(White)0.20.70.21.01.4285710.06inf
\n", + "
" + ], + "text/plain": [ + " antecedents consequents antecedent support consequent support \\\n", + "12 (White, Red) (Green) 0.4 0.2 \n", + "4 (Green) (Red) 0.2 0.6 \n", + "13 (White, Green) (Red) 0.2 0.6 \n", + "7 (Orange) (White) 0.2 0.7 \n", + "8 (Green) (White) 0.2 0.7 \n", + "14 (Red, Green) (White) 0.2 0.7 \n", + "\n", + " support confidence lift leverage conviction \n", + "12 0.2 0.5 2.500000 0.12 1.6 \n", + "4 0.2 1.0 1.666667 0.08 inf \n", + "13 0.2 1.0 1.666667 0.08 inf \n", + "7 0.2 1.0 1.428571 0.06 inf \n", + "8 0.2 1.0 1.428571 0.06 inf \n", + "14 0.2 1.0 1.428571 0.06 inf " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter to get rules with single consequents only\n", + "rules[[len(c) == 1 for c in rules.consequents]].sort_values(by=['lift'], ascending=False).head(6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The apriori method accepts sparse data frames as well. If we convert the original data frame to sparse format, we can see that the memory requirements go down to 40%. The `fill_value` argument informs the `to_sparse` method here which fields to ignore in each transaction." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Density 0.4\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
12(White, Red)(Green)0.40.20.20.52.5000000.121.6
15(Green)(White, Red)0.20.40.21.02.5000000.12inf
4(Green)(Red)0.20.60.21.01.6666670.08inf
13(White, Green)(Red)0.20.60.21.01.6666670.08inf
7(Orange)(White)0.20.70.21.01.4285710.06inf
8(Green)(White)0.20.70.21.01.4285710.06inf
\n", + "
" + ], + "text/plain": [ + " antecedents consequents antecedent support consequent support \\\n", + "12 (White, Red) (Green) 0.4 0.2 \n", + "15 (Green) (White, Red) 0.2 0.4 \n", + "4 (Green) (Red) 0.2 0.6 \n", + "13 (White, Green) (Red) 0.2 0.6 \n", + "7 (Orange) (White) 0.2 0.7 \n", + "8 (Green) (White) 0.2 0.7 \n", + "\n", + " support confidence lift leverage conviction \n", + "12 0.2 0.5 2.500000 0.12 1.6 \n", + "15 0.2 1.0 2.500000 0.12 inf \n", + "4 0.2 1.0 1.666667 0.08 inf \n", + "13 0.2 1.0 1.666667 0.08 inf \n", + "7 0.2 1.0 1.428571 0.06 inf \n", + "8 0.2 1.0 1.428571 0.06 inf " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert data set into a sparse data frame\n", + "sparse_df = fp_df.to_sparse(fill_value=0)\n", + "print('Density {}'.format(sparse_df.density))\n", + "\n", + "# create frequent itemsets\n", + "itemsets = apriori(sparse_df, min_support=0.2, use_colnames=True)\n", + "\n", + "# and convert into rules\n", + "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", + "rules.sort_values(by=['lift'], ascending=False).head(6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data required for Table 14.5 and 14.6" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
123456789
0000000010
1001100010
2000000010
3001000001
4000000001
\n", + "
" + ], + "text/plain": [ + " 1 2 3 4 5 6 7 8 9\n", + "0 0 0 0 0 0 0 0 1 0\n", + "1 0 0 1 1 0 0 0 1 0\n", + "2 0 0 0 0 0 0 0 1 0\n", + "3 0 0 1 0 0 0 0 0 1\n", + "4 0 0 0 0 0 0 0 0 1" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Prepare the dataset for table 14.6 based on table 14.5\n", + "from itertools import chain\n", + "randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(), \n", + " {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},\n", + " {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},\n", + " {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},\n", + " {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]\n", + "print(randomTransactions)\n", + "uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))\n", + "randomData = pd.DataFrame(0, index=range(len(randomTransactions)), columns=uniqueItems)\n", + "for row, transaction in enumerate(randomTransactions):\n", + " for item in transaction:\n", + " randomData.loc[row][item] = 1\n", + "randomData.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table 14.6" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " antecedents consequents support confidence lift leverage\n", + "3 (8, 3) (4) 0.04 1.0 4.545455 0.0312\n", + "1 (1, 5) (8) 0.04 1.0 1.851852 0.0184\n", + "2 (2, 7) (9) 0.04 1.0 1.851852 0.0184\n", + "4 (3, 4) (8) 0.04 1.0 1.851852 0.0184\n", + "5 (3, 7) (9) 0.04 1.0 1.851852 0.0184\n", + "6 (4, 5) (9) 0.04 1.0 1.851852 0.0184\n" + ] + } + ], + "source": [ + "# create frequent itemsets\n", + "itemsets = apriori(randomData, min_support=2/len(randomData), use_colnames=True)\n", + "# and convert into rules\n", + "rules = 
association_rules(itemsets, metric='confidence', min_threshold=0.7)\n", + "print(rules.sort_values(by=['lift'], ascending=False)\n", + " .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n", + " .head(6))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table 14.8" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ChildBksYouthBksCookBksDoItYBksRefBksArtBksGeogBksItalCookItalAtlasItalArtFlorence
001100000000
100000000000
211101011000
300000000000
400000000000
\n", + "
" + ], + "text/plain": [ + " ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks ItalCook \\\n", + "0 0 1 1 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 0 0 \n", + "2 1 1 1 0 1 0 1 1 \n", + "3 0 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 0 \n", + "\n", + " ItalAtlas ItalArt Florence \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# load dataset\n", + "all_books_df = pd.read_csv('CharlesBookClub.csv')\n", + "\n", + "# create the binary incidence matrix\n", + "ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',\n", + " 'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']\n", + "count_books = all_books_df.drop(columns=ignore)\n", + "count_books[count_books > 0] = 1\n", + "\n", + "count_books.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# determine item frequencies\n", + "itemFrequency = count_books.sum(axis=0) / len(count_books)\n", + "\n", + "# and plot as histogram\n", + "ax = itemFrequency.plot.bar(color='blue')\n", + "plt.ylabel('Item frequency (relative)')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of rules 81\n", + " antecedents consequents support confidence lift leverage\n", + "64 (RefBks, YouthBks) (CookBks, ChildBks) 0.05525 0.68000 2.80992 0.03559\n", + "73 (RefBks, DoItYBks) (CookBks, ChildBks) 0.06125 0.66216 2.73621 0.03886\n", + "60 (DoItYBks, YouthBks) (CookBks, ChildBks) 0.06700 0.64891 2.68145 0.04201\n", + "80 (RefBks, GeogBks) (CookBks, ChildBks) 0.05025 0.61468 2.54000 0.03047\n", + "69 (GeogBks, YouthBks) (CookBks, ChildBks) 0.06325 0.60526 2.50109 0.03796\n", + "77 (DoItYBks, GeogBks) (CookBks, ChildBks) 0.06050 0.59901 2.47525 0.03606\n", + "65 (CookBks, ChildBks, GeogBks) (YouthBks) 0.06325 0.57763 2.42445 0.03716\n", + "71 (CookBks, RefBks, ChildBks) (DoItYBks) 0.06125 0.59179 2.32301 0.03488\n", + "47 (DoItYBks, GeogBks) (YouthBks) 0.05450 0.53960 2.26486 0.03044\n", + "61 (CookBks, RefBks, ChildBks) (YouthBks) 0.05525 0.53382 2.24057 0.03059\n", + "56 (CookBks, DoItYBks, ChildBks) (YouthBks) 0.06700 0.52446 2.20131 0.03656\n", + "58 (CookBks, ChildBks, YouthBks) (DoItYBks) 0.06700 0.55833 2.19169 0.03643\n", + "34 (RefBks, ChildBks) (DoItYBks) 0.07100 0.55361 2.17314 0.03833\n", + "75 (CookBks, ChildBks, GeogBks) (DoItYBks) 0.06050 0.55251 2.16884 0.03260\n", + "19 (ChildBks, GeogBks) (YouthBks) 0.07550 0.51624 2.16680 0.04066\n", + "45 (CookBks, GeogBks) (YouthBks) 0.08025 0.51360 2.15572 0.04302\n", + "63 (RefBks, ChildBks, YouthBks) (CookBks) 0.05525 0.89113 2.14471 0.02949\n", + "17 (ChildBks, YouthBks) 
(DoItYBks) 0.08025 0.54407 2.13569 0.04267\n", + "50 (CookBks, RefBks) (DoItYBks) 0.07450 0.53309 2.09262 0.03890\n", + "28 (RefBks) (CookBks, ChildBks) 0.10350 0.50549 2.08882 0.05395\n", + "70 (CookBks, DoItYBks, RefBks) (ChildBks) 0.06125 0.82215 2.08667 0.03190\n", + "15 (YouthBks) (CookBks, ChildBks) 0.12000 0.50367 2.08129 0.06234\n", + "72 (RefBks, DoItYBks, ChildBks) (CookBks) 0.06125 0.86268 2.07624 0.03175\n", + "23 (CookBks, ChildBks) (DoItYBks) 0.12775 0.52789 2.07220 0.06610\n", + "25 (DoItYBks) (CookBks, ChildBks) 0.12775 0.50147 2.07220 0.06610\n" + ] + } + ], + "source": [ + "# create frequent itemsets and rules\n", + "itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)\n", + "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", + "\n", + "print('Number of rules', len(rules))\n", + "\n", + "# Display 25 rules with highest lift\n", + "rules.sort_values(by=['lift'], ascending=False).head(25)\n", + "\n", + "pd.set_option('precision', 5)\n", + "pd.set_option('display.width', 100)\n", + "print(rules.sort_values(by=['lift'], ascending=False).drop(columns=['antecedent support', 'consequent support', 'conviction']).head(25))\n", + "pd.set_option('precision', 6)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
47(DoItYBks, GeogBks)(YouthBks)0.101000.238250.054500.5396042.2648640.0304371.654554
34(RefBks, ChildBks)(DoItYBks)0.128250.254750.071000.5536062.1731350.0383281.669490
19(ChildBks, GeogBks)(YouthBks)0.146250.238250.075500.5162392.1667970.0406561.574642
45(CookBks, GeogBks)(YouthBks)0.156250.238250.080250.5136002.1557190.0430231.566098
17(ChildBks, YouthBks)(DoItYBks)0.147500.254750.080250.5440682.1356930.0426741.634563
50(CookBks, RefBks)(DoItYBks)0.139750.254750.074500.5330952.0926190.0388991.596148
23(CookBks, ChildBks)(DoItYBks)0.242000.254750.127750.5278932.0721980.0661011.578560
49(GeogBks, YouthBks)(DoItYBks)0.104500.254750.054500.5215312.0472270.0278791.557573
41(CookBks, YouthBks)(DoItYBks)0.161000.254750.083750.5201862.0419480.0427351.553207
43(RefBks, YouthBks)(CookBks)0.081250.415500.068250.8400002.0216610.0344913.653125
\n", + "
" + ], + "text/plain": [ + " antecedents consequents antecedent support consequent support support confidence \\\n", + "47 (DoItYBks, GeogBks) (YouthBks) 0.10100 0.23825 0.05450 0.539604 \n", + "34 (RefBks, ChildBks) (DoItYBks) 0.12825 0.25475 0.07100 0.553606 \n", + "19 (ChildBks, GeogBks) (YouthBks) 0.14625 0.23825 0.07550 0.516239 \n", + "45 (CookBks, GeogBks) (YouthBks) 0.15625 0.23825 0.08025 0.513600 \n", + "17 (ChildBks, YouthBks) (DoItYBks) 0.14750 0.25475 0.08025 0.544068 \n", + "50 (CookBks, RefBks) (DoItYBks) 0.13975 0.25475 0.07450 0.533095 \n", + "23 (CookBks, ChildBks) (DoItYBks) 0.24200 0.25475 0.12775 0.527893 \n", + "49 (GeogBks, YouthBks) (DoItYBks) 0.10450 0.25475 0.05450 0.521531 \n", + "41 (CookBks, YouthBks) (DoItYBks) 0.16100 0.25475 0.08375 0.520186 \n", + "43 (RefBks, YouthBks) (CookBks) 0.08125 0.41550 0.06825 0.840000 \n", + "\n", + " lift leverage conviction \n", + "47 2.264864 0.030437 1.654554 \n", + "34 2.173135 0.038328 1.669490 \n", + "19 2.166797 0.040656 1.574642 \n", + "45 2.155719 0.043023 1.566098 \n", + "17 2.135693 0.042674 1.634563 \n", + "50 2.092619 0.038899 1.596148 \n", + "23 2.072198 0.066101 1.578560 \n", + "49 2.047227 0.027879 1.557573 \n", + "41 2.041948 0.042735 1.553207 \n", + "43 2.021661 0.034491 3.653125 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter rules by number of antecedents (maximum 2) and consequents (maximum 1)\n", + "rules = rules[[len(c) <= 2 for c in rules.antecedents]]\n", + "rules = rules[[len(c) == 1 for c in rules.consequents]]\n", + "\n", + "rules.sort_values(by=['lift'], ascending=False).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 14.2 Collaborative Filtering" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing the cosine similarity matrix...\n", + 
"Done computing similarity matrix.\n", + "user: 823519 item: 30 r_ui = 4.00 est = 3.54 {'was_impossible': True, 'reason': 'User and/or item is unkown.'}\n" + ] + } + ], + "source": [ + "ratings = pd.DataFrame([\n", + " [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5], \n", + " [124105, 1, 4], \n", + " [822109, 1, 5], \n", + " [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5], \n", + " [885013, 1, 4], [885013, 5, 5], \n", + " [893988, 1, 3], [893988, 30, 4], [893988, 44, 4], \n", + " [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3], \n", + " [1503895, 1, 4], \n", + " [1842128, 1, 4], [1842128, 30, 3], \n", + " [2238063, 1, 3], \n", + "], columns=['customerID', 'movieID', 'rating'])\n", + "\n", + "reader = Reader(rating_scale=(1, 5))\n", + "data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)\n", + "trainset = data.build_full_trainset()\n", + "sim_options = {'name': 'cosine', 'user_based': False} # compute cosine similarities between items\n", + "algo = KNNBasic(sim_options=sim_options)\n", + "algo.fit(trainset)\n", + "pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table 14.11" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n", + "\n", + "Top-3 recommended items for each user\n", + "User 6\n", + " Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)\n", + "User 222\n", + " Item 77 (3.50) Item 75 (2.78)\n", + "User 424\n", + " Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)\n", + "User 87\n", + " Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)\n", + "User 121\n", + " Item 98 (3.48) Item 32 (2.83)\n", + "\n", + "Computing the cosine similarity matrix...\n", + 
"Done computing similarity matrix.\n", + "\n", + "Top-3 recommended items for each user\n", + "User 6\n", + " Item 77 (3.00) Item 60 (3.00) Item 6 (3.00)\n", + "User 222\n", + " Item 77 (2.24) Item 75 (2.00)\n", + "User 424\n", + " Item 54 (3.47) Item 14 (3.44) Item 45 (3.00)\n", + "User 87\n", + " Item 27 (3.00) Item 32 (3.00) Item 82 (3.00) Item 54 (2.50)\n", + "User 121\n", + " Item 32 (3.06) Item 98 (2.31)\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "random.seed(0)\n", + "nratings = 5000\n", + "randomData = pd.DataFrame({\n", + " 'itemID': [random.randint(0,99) for _ in range(nratings)],\n", + " 'userID': [random.randint(0,999) for _ in range(nratings)],\n", + " 'rating': [random.randint(1,5) for _ in range(nratings)],\n", + "})\n", + "\n", + "def get_top_n(predictions, n=10):\n", + " # First map the predictions to each user.\n", + " byUser = defaultdict(list)\n", + " for p in predictions:\n", + " byUser[p.uid].append(p)\n", + " \n", + " # For each user, reduce predictions to top-n\n", + " for uid, userPredictions in byUser.items():\n", + " byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)\n", + " return byUser\n", + "\n", + "# Convert the data set into the format required by the surprise package\n", + "# The columns must correspond to user id, item id and ratings (in that order)\n", + "reader = Reader(rating_scale=(1, 5))\n", + "data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)\n", + "\n", + "# Split into training and test set\n", + "trainset, testset = train_test_split(data, test_size=.25, random_state=1)\n", + "\n", + "## User-based filtering\n", + "# compute cosine similarity between users \n", + "sim_options = {'name': 'cosine', 'user_based': True}\n", + "algo = KNNBasic(sim_options=sim_options)\n", + "algo.fit(trainset)\n", + "\n", + "# Then predict ratings for all pairs (u, i) that are NOT in the training set.\n", + "predictions = algo.test(testset)\n", + "\n", + "top_n = 
get_top_n(predictions, n=4)\n", + "\n", + "# Print the recommended items for each user\n", + "print()\n", + "print('Top-3 recommended items for each user')\n", + "for uid, user_ratings in list(top_n.items())[:5]:\n", + " print('User {}'.format(uid))\n", + " for prediction in user_ratings:\n", + " print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n", + " print()\n", + "print()\n", + "\n", + " \n", + "## Item-based filtering\n", + "# compute cosine similarity between items \n", + "sim_options = {'name': 'cosine', 'user_based': False}\n", + "algo = KNNBasic(sim_options=sim_options)\n", + "algo.fit(trainset)\n", + "\n", + "# Then predict ratings for all pairs (u, i) that are NOT in the training set.\n", + "predictions = algo.test(testset)\n", + "top_n = get_top_n(predictions, n=4)\n", + "\n", + "# Print the recommended items for each user\n", + "print()\n", + "print('Top-3 recommended items for each user')\n", + "for uid, user_ratings in list(top_n.items())[:5]:\n", + " print('User {}'.format(uid))\n", + " for prediction in user_ratings:\n", + " print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing the cosine similarity matrix...\n", + "Done computing similarity matrix.\n" + ] + }, + { + "data": { + "text/plain": [ + "Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Build a model using the full dataset\n", + "trainset = data.build_full_trainset()\n", + "sim_options = {'name': 'cosine', 'user_based': False}\n", + "algo = KNNBasic(sim_options=sim_options)\n", + "algo.fit(trainset)\n", + "\n", + "# Predict rating for user 383 and item 7\n", + "algo.predict(383, 7)" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}