{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Chapter 14: Association Rules and Collaborative Filtering\n", "\n", "> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck \n", ">\n", "> Code included in\n", ">\n", "> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) \n", "> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.\n", "\n", "## Import required packages" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting package metadata (current_repodata.json): ...working... done\n", "Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.\n", "Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.\n", "Collecting package metadata (repodata.json): ...working... done\n", "Solving environment: ...working... done\n", "\n", "## Package Plan ##\n", "\n", " environment location: C:\\Users\\isa1761\\Anaconda3\n", "\n", " added / updated specs:\n", " - scikit-surprise\n", "\n", "\n", "The following packages will be downloaded:\n", "\n", " package | build\n", " ---------------------------|-----------------\n", " conda-4.12.0 | py39hcbf5309_0 1.0 MB conda-forge\n", " scikit-surprise-1.1.1 | py39h5d4886f_2 538 KB conda-forge\n", " ------------------------------------------------------------\n", " Total: 1.5 MB\n", "\n", "The following NEW packages will be INSTALLED:\n", "\n", " scikit-surprise conda-forge/win-64::scikit-surprise-1.1.1-py39h5d4886f_2\n", "\n", "The following packages will be UPDATED:\n", "\n", " conda 4.11.0-py39hcbf5309_0 --> 4.12.0-py39hcbf5309_0\n", "\n", "\n", "\n", "Downloading and Extracting Packages\n", "\n", "scikit-surprise-1.1. | 538 KB | | 0% \n", "scikit-surprise-1.1. | 538 KB | 2 | 3% \n", "scikit-surprise-1.1. 
| 538 KB | ########## | 100% \n", "scikit-surprise-1.1. | 538 KB | ########## | 100% \n", "\n", "conda-4.12.0 | 1.0 MB | | 0% \n", "conda-4.12.0 | 1.0 MB | 1 | 2% \n", "conda-4.12.0 | 1.0 MB | ########## | 100% \n", "conda-4.12.0 | 1.0 MB | ########## | 100% \n", "Preparing transaction: ...working... done\n", "Verifying transaction: ...working... done\n", "Executing transaction: ...working... done\n", "\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "conda install -c conda-forge scikit-surprise" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "from pathlib import Path\n", "\n", "import heapq\n", "from collections import defaultdict\n", "\n", "import pandas as pd\n", "import matplotlib.pylab as plt\n", "from mlxtend.frequent_patterns import apriori\n", "from mlxtend.frequent_patterns import association_rules\n", "\n", "from surprise import Dataset, Reader, KNNBasic\n", "from surprise.model_selection import train_test_split\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table 14.4" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RedWhiteBlueOrangeGreenYellow
Transaction
1110010
2010100
3011000
4110100
5101000
6011000
7101000
8111010
9111000
10000001
\n", "
" ], "text/plain": [ " Red White Blue Orange Green Yellow\n", "Transaction \n", "1 1 1 0 0 1 0\n", "2 0 1 0 1 0 0\n", "3 0 1 1 0 0 0\n", "4 1 1 0 1 0 0\n", "5 1 0 1 0 0 0\n", "6 0 1 1 0 0 0\n", "7 1 0 1 0 0 0\n", "8 1 1 1 0 1 0\n", "9 1 1 1 0 0 0\n", "10 0 0 0 0 0 1" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load and preprocess data set \n", "fp_df = pd.read_csv('Faceplate.csv')\n", "fp_df.set_index('Transaction', inplace=True)\n", "fp_df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " antecedents consequents support confidence lift leverage\n", "12 (White, Red) (Green) 0.2 0.5 2.500000 0.12\n", "15 (Green) (White, Red) 0.2 1.0 2.500000 0.12\n", "4 (Green) (Red) 0.2 1.0 1.666667 0.08\n", "13 (White, Green) (Red) 0.2 1.0 1.666667 0.08\n", "7 (Orange) (White) 0.2 1.0 1.428571 0.06\n", "8 (Green) (White) 0.2 1.0 1.428571 0.06\n" ] } ], "source": [ "# create frequent itemsets\n", "itemsets = apriori(fp_df, min_support=0.2, use_colnames=True)\n", "\n", "# and convert into rules\n", "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", "rules.sort_values(by=['lift'], ascending=False).head(6)\n", "\n", "print(rules.sort_values(by=['lift'], ascending=False)\n", " .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n", " .head(6))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
12(White, Red)(Green)0.40.20.20.52.5000000.121.6
4(Green)(Red)0.20.60.21.01.6666670.08inf
13(White, Green)(Red)0.20.60.21.01.6666670.08inf
7(Orange)(White)0.20.70.21.01.4285710.06inf
8(Green)(White)0.20.70.21.01.4285710.06inf
14(Red, Green)(White)0.20.70.21.01.4285710.06inf
\n", "
" ], "text/plain": [ " antecedents consequents antecedent support consequent support \\\n", "12 (White, Red) (Green) 0.4 0.2 \n", "4 (Green) (Red) 0.2 0.6 \n", "13 (White, Green) (Red) 0.2 0.6 \n", "7 (Orange) (White) 0.2 0.7 \n", "8 (Green) (White) 0.2 0.7 \n", "14 (Red, Green) (White) 0.2 0.7 \n", "\n", " support confidence lift leverage conviction \n", "12 0.2 0.5 2.500000 0.12 1.6 \n", "4 0.2 1.0 1.666667 0.08 inf \n", "13 0.2 1.0 1.666667 0.08 inf \n", "7 0.2 1.0 1.428571 0.06 inf \n", "8 0.2 1.0 1.428571 0.06 inf \n", "14 0.2 1.0 1.428571 0.06 inf " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# filter to get rules with single consequents only\n", "rules[[len(c) == 1 for c in rules.consequents]].sort_values(by=['lift'], ascending=False).head(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `apriori` function accepts sparse data frames as well. If we convert the original data frame to a sparse format, only the entries that differ from the fill value are stored; the density of 0.4 printed below means that just 40% of the cells have to be kept. The `fill_value` argument tells pandas which value to leave out of the sparse representation (here 0, i.e. the item is not part of the transaction)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Density 0.4\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
12(White, Red)(Green)0.40.20.20.52.5000000.121.6
15(Green)(White, Red)0.20.40.21.02.5000000.12inf
4(Green)(Red)0.20.60.21.01.6666670.08inf
13(White, Green)(Red)0.20.60.21.01.6666670.08inf
7(Orange)(White)0.20.70.21.01.4285710.06inf
8(Green)(White)0.20.70.21.01.4285710.06inf
\n", "
" ], "text/plain": [ " antecedents consequents antecedent support consequent support \\\n", "12 (White, Red) (Green) 0.4 0.2 \n", "15 (Green) (White, Red) 0.2 0.4 \n", "4 (Green) (Red) 0.2 0.6 \n", "13 (White, Green) (Red) 0.2 0.6 \n", "7 (Orange) (White) 0.2 0.7 \n", "8 (Green) (White) 0.2 0.7 \n", "\n", " support confidence lift leverage conviction \n", "12 0.2 0.5 2.500000 0.12 1.6 \n", "15 0.2 1.0 2.500000 0.12 inf \n", "4 0.2 1.0 1.666667 0.08 inf \n", "13 0.2 1.0 1.666667 0.08 inf \n", "7 0.2 1.0 1.428571 0.06 inf \n", "8 0.2 1.0 1.428571 0.06 inf " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert data set into a sparse data frame\n", "sparse_df = fp_df.to_sparse(fill_value=0)\n", "print('Density {}'.format(sparse_df.density))\n", "\n", "# create frequent itemsets\n", "itemsets = apriori(sparse_df, min_support=0.2, use_colnames=True)\n", "\n", "# and convert into rules\n", "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", "rules.sort_values(by=['lift'], ascending=False).head(6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data required for Table 14.5 and 14.6" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{8}, {8, 3, 4}, {8}, {9, 3}, {9}, {8, 1}, {9, 6}, {9, 3, 5, 7}, {8}, set(), {1, 9, 7}, {1, 4, 5, 8, 9}, {9, 5, 7}, {8, 6, 7}, {9, 3, 7}, {1, 4, 9}, {8, 6, 7}, {8}, set(), {9}, {8, 2, 5, 6}, {9, 4, 6}, {9, 4}, {8, 9}, {8, 6}, {8, 1, 6}, {8, 5}, {8, 9, 4}, {9}, {8}, {8, 1, 5}, {9, 3, 6}, {9, 7}, {8, 9, 7}, {8, 3, 4, 6}, {8, 1, 4}, {8, 4, 7}, {8, 9}, {9, 4, 5, 7}, {8, 9, 2}, {9, 2, 5}, {1, 2, 9, 7}, {8, 5}, {8, 1, 7}, {8}, {9, 2, 7}, {9, 4, 6}, {9}, {9}, {8, 6, 7}]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123456789
0000000010
1001100010
2000000010
3001000001
4000000001
\n", "
" ], "text/plain": [ " 1 2 3 4 5 6 7 8 9\n", "0 0 0 0 0 0 0 0 1 0\n", "1 0 0 1 1 0 0 0 1 0\n", "2 0 0 0 0 0 0 0 1 0\n", "3 0 0 1 0 0 0 0 0 1\n", "4 0 0 0 0 0 0 0 0 1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Prepare the dataset for table 14.6 based on table 14.5\n", "from itertools import chain\n", "randomTransactions = [{8}, {3,4,8}, {8}, {3,9}, {9}, {1,8}, {6,9}, {3,5,7,9}, {8}, set(), \n", " {1,7,9}, {1,4,5,8,9}, {5,7,9}, {6,7,8}, {3,7,9}, {1,4,9}, {6,7,8}, {8}, set(), {9},\n", " {2,5,6,8}, {4,6,9}, {4,9}, {8,9}, {6,8}, {1,6,8}, {5,8}, {4,8,9}, {9}, {8},\n", " {1,5,8}, {3,6,9}, {7,9}, {7,8,9}, {3,4,6,8}, {1,4,8}, {4,7,8}, {8,9}, {4,5,7,9}, {2,8,9},\n", " {2,5,9}, {1,2,7,9}, {5,8}, {1,7,8}, {8}, {2,7,9}, {4,6,9}, {9}, {9}, {6,7,8}]\n", "print(randomTransactions)\n", "uniqueItems = sorted(set(chain.from_iterable(randomTransactions)))\n", "randomData = pd.DataFrame(0, index=range(len(randomTransactions)), columns=uniqueItems)\n", "for row, transaction in enumerate(randomTransactions):\n", " for item in transaction:\n", " randomData.loc[row][item] = 1\n", "randomData.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table 14.6" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " antecedents consequents support confidence lift leverage\n", "3 (8, 3) (4) 0.04 1.0 4.545455 0.0312\n", "1 (1, 5) (8) 0.04 1.0 1.851852 0.0184\n", "2 (2, 7) (9) 0.04 1.0 1.851852 0.0184\n", "4 (3, 4) (8) 0.04 1.0 1.851852 0.0184\n", "5 (3, 7) (9) 0.04 1.0 1.851852 0.0184\n", "6 (4, 5) (9) 0.04 1.0 1.851852 0.0184\n" ] } ], "source": [ "# create frequent itemsets\n", "itemsets = apriori(randomData, min_support=2/len(randomData), use_colnames=True)\n", "# and convert into rules\n", "rules = association_rules(itemsets, metric='confidence', min_threshold=0.7)\n", "print(rules.sort_values(by=['lift'], ascending=False)\n", " 
.drop(columns=['antecedent support', 'consequent support', 'conviction'])\n", " .head(6))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table 14.8" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ChildBksYouthBksCookBksDoItYBksRefBksArtBksGeogBksItalCookItalAtlasItalArtFlorence
001100000000
100000000000
211101011000
300000000000
400000000000
\n", "
" ], "text/plain": [ " ChildBks YouthBks CookBks DoItYBks RefBks ArtBks GeogBks ItalCook \\\n", "0 0 1 1 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 \n", "2 1 1 1 0 1 0 1 1 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 \n", "\n", " ItalAtlas ItalArt Florence \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load dataset\n", "all_books_df = pd.read_csv('CharlesBookClub.csv')\n", "\n", "# create the binary incidence matrix\n", "ignore = ['Seq#', 'ID#', 'Gender', 'M', 'R', 'F', 'FirstPurch', 'Related Purchase',\n", " 'Mcode', 'Rcode', 'Fcode', 'Yes_Florence', 'No_Florence']\n", "count_books = all_books_df.drop(columns=ignore)\n", "count_books[count_books > 0] = 1\n", "\n", "count_books.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# determine item frequencies\n", "itemFrequency = count_books.sum(axis=0) / len(count_books)\n", "\n", "# and plot as histogram\n", "ax = itemFrequency.plot.bar(color='blue')\n", "plt.ylabel('Item frequency (relative)')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of rules 81\n", " antecedents consequents support confidence lift leverage\n", "64 (RefBks, YouthBks) (CookBks, ChildBks) 0.05525 0.68000 2.80992 0.03559\n", "73 (RefBks, DoItYBks) (CookBks, ChildBks) 0.06125 0.66216 2.73621 0.03886\n", "60 (DoItYBks, YouthBks) (CookBks, ChildBks) 0.06700 0.64891 2.68145 0.04201\n", "80 (RefBks, GeogBks) (CookBks, ChildBks) 0.05025 0.61468 2.54000 0.03047\n", "69 (GeogBks, YouthBks) (CookBks, ChildBks) 0.06325 0.60526 2.50109 0.03796\n", "77 (DoItYBks, GeogBks) (CookBks, ChildBks) 0.06050 0.59901 2.47525 0.03606\n", "65 (CookBks, ChildBks, GeogBks) (YouthBks) 0.06325 0.57763 2.42445 0.03716\n", "71 (CookBks, RefBks, ChildBks) (DoItYBks) 0.06125 0.59179 2.32301 0.03488\n", "47 (DoItYBks, GeogBks) (YouthBks) 0.05450 0.53960 2.26486 0.03044\n", "61 (CookBks, RefBks, ChildBks) (YouthBks) 0.05525 0.53382 2.24057 0.03059\n", "56 (CookBks, DoItYBks, ChildBks) (YouthBks) 0.06700 0.52446 2.20131 0.03656\n", "58 (CookBks, ChildBks, YouthBks) (DoItYBks) 0.06700 0.55833 2.19169 0.03643\n", "34 (RefBks, ChildBks) (DoItYBks) 0.07100 0.55361 2.17314 0.03833\n", "75 (CookBks, ChildBks, GeogBks) (DoItYBks) 0.06050 0.55251 2.16884 0.03260\n", "19 (ChildBks, GeogBks) (YouthBks) 0.07550 0.51624 2.16680 0.04066\n", "45 (CookBks, GeogBks) (YouthBks) 0.08025 0.51360 2.15572 0.04302\n", "63 (RefBks, ChildBks, YouthBks) (CookBks) 0.05525 0.89113 2.14471 0.02949\n", "17 (ChildBks, YouthBks) (DoItYBks) 0.08025 0.54407 2.13569 0.04267\n", "50 (CookBks, RefBks) (DoItYBks) 0.07450 0.53309 
2.09262 0.03890\n", "28 (RefBks) (CookBks, ChildBks) 0.10350 0.50549 2.08882 0.05395\n", "70 (CookBks, DoItYBks, RefBks) (ChildBks) 0.06125 0.82215 2.08667 0.03190\n", "15 (YouthBks) (CookBks, ChildBks) 0.12000 0.50367 2.08129 0.06234\n", "72 (RefBks, DoItYBks, ChildBks) (CookBks) 0.06125 0.86268 2.07624 0.03175\n", "23 (CookBks, ChildBks) (DoItYBks) 0.12775 0.52789 2.07220 0.06610\n", "25 (DoItYBks) (CookBks, ChildBks) 0.12775 0.50147 2.07220 0.06610\n" ] } ], "source": [ "# create frequent itemsets and rules\n", "itemsets = apriori(count_books, min_support=200/4000, use_colnames=True)\n", "rules = association_rules(itemsets, metric='confidence', min_threshold=0.5)\n", "\n", "print('Number of rules', len(rules))\n", "\n", "# Display 25 rules with highest lift\n", "rules.sort_values(by=['lift'], ascending=False).head(25)\n", "\n", "pd.set_option('precision', 5)\n", "pd.set_option('display.width', 100)\n", "print(rules.sort_values(by=['lift'], ascending=False).drop(columns=['antecedent support', 'consequent support', 'conviction']).head(25))\n", "pd.set_option('precision', 6)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconviction
47(DoItYBks, GeogBks)(YouthBks)0.101000.238250.054500.5396042.2648640.0304371.654554
34(RefBks, ChildBks)(DoItYBks)0.128250.254750.071000.5536062.1731350.0383281.669490
19(ChildBks, GeogBks)(YouthBks)0.146250.238250.075500.5162392.1667970.0406561.574642
45(CookBks, GeogBks)(YouthBks)0.156250.238250.080250.5136002.1557190.0430231.566098
17(ChildBks, YouthBks)(DoItYBks)0.147500.254750.080250.5440682.1356930.0426741.634563
50(CookBks, RefBks)(DoItYBks)0.139750.254750.074500.5330952.0926190.0388991.596148
23(CookBks, ChildBks)(DoItYBks)0.242000.254750.127750.5278932.0721980.0661011.578560
49(GeogBks, YouthBks)(DoItYBks)0.104500.254750.054500.5215312.0472270.0278791.557573
41(CookBks, YouthBks)(DoItYBks)0.161000.254750.083750.5201862.0419480.0427351.553207
43(RefBks, YouthBks)(CookBks)0.081250.415500.068250.8400002.0216610.0344913.653125
\n", "
" ], "text/plain": [ " antecedents consequents antecedent support consequent support support confidence \\\n", "47 (DoItYBks, GeogBks) (YouthBks) 0.10100 0.23825 0.05450 0.539604 \n", "34 (RefBks, ChildBks) (DoItYBks) 0.12825 0.25475 0.07100 0.553606 \n", "19 (ChildBks, GeogBks) (YouthBks) 0.14625 0.23825 0.07550 0.516239 \n", "45 (CookBks, GeogBks) (YouthBks) 0.15625 0.23825 0.08025 0.513600 \n", "17 (ChildBks, YouthBks) (DoItYBks) 0.14750 0.25475 0.08025 0.544068 \n", "50 (CookBks, RefBks) (DoItYBks) 0.13975 0.25475 0.07450 0.533095 \n", "23 (CookBks, ChildBks) (DoItYBks) 0.24200 0.25475 0.12775 0.527893 \n", "49 (GeogBks, YouthBks) (DoItYBks) 0.10450 0.25475 0.05450 0.521531 \n", "41 (CookBks, YouthBks) (DoItYBks) 0.16100 0.25475 0.08375 0.520186 \n", "43 (RefBks, YouthBks) (CookBks) 0.08125 0.41550 0.06825 0.840000 \n", "\n", " lift leverage conviction \n", "47 2.264864 0.030437 1.654554 \n", "34 2.173135 0.038328 1.669490 \n", "19 2.166797 0.040656 1.574642 \n", "45 2.155719 0.043023 1.566098 \n", "17 2.135693 0.042674 1.634563 \n", "50 2.092619 0.038899 1.596148 \n", "23 2.072198 0.066101 1.578560 \n", "49 2.047227 0.027879 1.557573 \n", "41 2.041948 0.042735 1.553207 \n", "43 2.021661 0.034491 3.653125 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Filter rules by number of antecedents (maximum 2) and consequents (maximum 1)\n", "rules = rules[[len(c) <= 2 for c in rules.antecedents]]\n", "rules = rules[[len(c) == 1 for c in rules.consequents]]\n", "\n", "rules.sort_values(by=['lift'], ascending=False).head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Section 14.2 Collaborative Filtering" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "user: 823519 item: 30 r_ui = 4.00 est = 3.54 {'was_impossible': True, 'reason': 
'User and/or item is unkown.'}\n" ] } ], "source": [ "ratings = pd.DataFrame([\n", " [30878, 1, 4], [30878, 5, 1], [30878, 18, 3], [30878, 28, 3], [30878, 30, 4], [30878, 44, 5], \n", " [124105, 1, 4], \n", " [822109, 1, 5], \n", " [823519, 1, 3], [823519, 8, 1], [823519, 17, 4], [823519, 28, 4], [823519, 30, 5], \n", " [885013, 1, 4], [885013, 5, 5], \n", " [893988, 1, 3], [893988, 30, 4], [893988, 44, 4], \n", " [1248029, 1, 3], [1248029, 28, 2], [1248029, 30, 4], [1248029, 48, 3], \n", " [1503895, 1, 4], \n", " [1842128, 1, 4], [1842128, 30, 3], \n", " [2238063, 1, 3], \n", "], columns=['customerID', 'movieID', 'rating'])\n", "\n", "reader = Reader(rating_scale=(1, 5))\n", "data = Dataset.load_from_df(ratings[['customerID', 'movieID', 'rating']], reader)\n", "trainset = data.build_full_trainset()\n", "sim_options = {'name': 'cosine', 'user_based': False} # compute cosine similarities between items\n", "algo = KNNBasic(sim_options=sim_options)\n", "algo.fit(trainset)\n", "pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table 14.11" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "\n", "Top-3 recommended items for each user\n", "User 6\n", " Item 6 (5.00) Item 77 (2.50) Item 60 (1.00)\n", "User 222\n", " Item 77 (3.50) Item 75 (2.78)\n", "User 424\n", " Item 14 (3.50) Item 45 (3.10) Item 54 (2.34)\n", "User 87\n", " Item 27 (3.00) Item 54 (3.00) Item 82 (3.00) Item 32 (1.00)\n", "User 121\n", " Item 98 (3.48) Item 32 (2.83)\n", "\n", "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n", "\n", "Top-3 recommended items for each user\n", "User 6\n", " Item 77 (3.00) Item 60 (3.00) Item 6 (3.00)\n", "User 222\n", " Item 77 (2.24) Item 75 (2.00)\n", "User 424\n", " Item 54 (3.47) Item 14 
(3.44) Item 45 (3.00)\n", "User 87\n", " Item 27 (3.00) Item 32 (3.00) Item 82 (3.00) Item 54 (2.50)\n", "User 121\n", " Item 32 (3.06) Item 98 (2.31)\n" ] } ], "source": [ "import random\n", "\n", "random.seed(0)\n", "nratings = 5000\n", "randomData = pd.DataFrame({\n", " 'itemID': [random.randint(0,99) for _ in range(nratings)],\n", " 'userID': [random.randint(0,999) for _ in range(nratings)],\n", " 'rating': [random.randint(1,5) for _ in range(nratings)],\n", "})\n", "\n", "def get_top_n(predictions, n=10):\n", " # First map the predictions to each user.\n", " byUser = defaultdict(list)\n", " for p in predictions:\n", " byUser[p.uid].append(p)\n", " \n", " # For each user, reduce predictions to top-n\n", " for uid, userPredictions in byUser.items():\n", " byUser[uid] = heapq.nlargest(n, userPredictions, key=lambda p: p.est)\n", " return byUser\n", "\n", "# Convert thes data set into the format required by the surprise package\n", "# The columns must correspond to user id, item id and ratings (in that order)\n", "reader = Reader(rating_scale=(1, 5))\n", "data = Dataset.load_from_df(randomData[['userID', 'itemID', 'rating']], reader)\n", "\n", "# Split into training and test set\n", "trainset, testset = train_test_split(data, test_size=.25, random_state=1)\n", "\n", "## User-based filtering\n", "# compute cosine similarity between users \n", "sim_options = {'name': 'cosine', 'user_based': True}\n", "algo = KNNBasic(sim_options=sim_options)\n", "algo.fit(trainset)\n", "\n", "# Than predict ratings for all pairs (u, i) that are NOT in the training set.\n", "predictions = algo.test(testset)\n", "\n", "top_n = get_top_n(predictions, n=4)\n", "\n", "# Print the recommended items for each user\n", "print()\n", "print('Top-3 recommended items for each user')\n", "for uid, user_ratings in list(top_n.items())[:5]:\n", " print('User {}'.format(uid))\n", " for prediction in user_ratings:\n", " print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n", " print()\n", 
"print()\n", "\n", " \n", "## Item-based filtering\n", "# compute cosine similarity between users \n", "sim_options = {'name': 'cosine', 'user_based': False}\n", "algo = KNNBasic(sim_options=sim_options)\n", "algo.fit(trainset)\n", "\n", "# Than predict ratings for all pairs (u, i) that are NOT in the training set.\n", "predictions = algo.test(testset)\n", "top_n = get_top_n(predictions, n=4)\n", "\n", "# Print the recommended items for each user\n", "print()\n", "print('Top-3 recommended items for each user')\n", "for uid, user_ratings in list(top_n.items())[:5]:\n", " print('User {}'.format(uid))\n", " for prediction in user_ratings:\n", " print(' Item {0.iid} ({0.est:.2f})'.format(prediction), end='')\n", " print()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Computing the cosine similarity matrix...\n", "Done computing similarity matrix.\n" ] }, { "data": { "text/plain": [ "Prediction(uid=383, iid=7, r_ui=None, est=2.3661840936304324, details={'actual_k': 4, 'was_impossible': False})" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Build a model using the full dataset\n", "trainset = data.build_full_trainset()\n", "sim_options = {'name': 'cosine', 'user_based': False}\n", "algo = KNNBasic(sim_options=sim_options)\n", "algo.fit(trainset)\n", "\n", "# Predict rating for user 383 and item 7\n", "algo.predict(383, 7)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 2 }