QM-7063-Learning-Practice-3/Schrick-Noah_Learning-Practice-3.ipynb
2023-02-12 18:59:05 -06:00

119 lines
7.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Learning Practice 3 for the University of Tulsa's QM-7063 Data Mining Course\n",
"# Association Rules and Collaborative Filtering\n",
"# Professor: Dr. Abdulrashid, Spring 2023\n",
"# Noah L. Schrick - 1492657\n",
"\n",
"import heapq\n",
"from collections import defaultdict\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pylab as plt\n",
"from mlxtend.frequent_patterns import apriori\n",
"from mlxtend.frequent_patterns import association_rules\n",
"\n",
"from surprise import Dataset, Reader, KNNBasic\n",
"from surprise.model_selection import train_test_split\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Problem 14.1\n",
"This is a good approach for exploring associative relationships between customers. Because the company's own data is combined with demographic data, the association rules can yield more informative results: purchases can be examined with respect to age, location, number of dependents, and any other demographic data available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Problem 14.3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Read in Course Topics data\n",
"courses_df = pd.read_csv('Coursetopics.csv')\n",
"\n",
"# surprise expects long-format (user, item, rating) triples, in that order.\n",
"# NOTE(review): assumes the CSV is a wide 0/1 matrix with one row per student\n",
"# and one column per course - confirm against Coursetopics.csv.\n",
"ratings_df = (\n",
"    courses_df\n",
"    .rename_axis('userID')  # the row index serves as the student id\n",
"    .reset_index()\n",
"    .melt(id_vars='userID', var_name='itemID', value_name='rating')\n",
")\n",
"\n",
"reader = Reader(rating_scale=(0, 1))\n",
"data = Dataset.load_from_df(ratings_df[['userID', 'itemID', 'rating']], reader)\n",
"trainset = data.build_full_trainset()\n",
"sim_options = {'name': 'cosine', 'user_based': True}  # compute cosine similarities between users\n",
"algo = KNNBasic(sim_options=sim_options)\n",
"algo.fit(trainset)\n",
"\n",
"# Predict for a (user, item) pair taken from the data itself, so the raw ids\n",
"# are known to the trainset and r_ui lies inside the declared rating scale.\n",
"sample_user, sample_item, sample_rating = next(ratings_df.itertuples(index=False))\n",
"pred = algo.predict(sample_user, sample_item, r_ui=sample_rating, verbose=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Problem 14.4"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"vscode": {
"interpreter": {
"hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}