QM-7063-Learning-Practice-3/Schrick-Noah_Learning-Practice-3.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learning Practice 2 for the University of Tulsa's QM-7063 Data Mining Course\n",
    "# Dimension Reduction\n",
    "# Professor: Dr. Abdulrashid, Spring 2023\n",
    "# Noah L. Schrick - 1492657\n",
    "\n",
    "import heapq\n",
    "from collections import defaultdict\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pylab as plt\n",
    "from mlxtend.frequent_patterns import apriori\n",
    "from mlxtend.frequent_patterns import association_rules\n",
    "\n",
    "from surprise import Dataset, Reader, KNNBasic\n",
    "from surprise.model_selection import train_test_split\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Problem 14.1\n",
    "This is a good approach for exploring associative relationships between customers. Since there is company data mixed with demographic data, the association rules can yield better results and demonstrate better associations since purchases can be examined with respect to age, location, number of dependents, and any other demographic data available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Problem 14.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "\"None of [Index(['userID', 'itemID', 'rating'], dtype='object')] are in the [columns]\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 18\u001b[0m\n\u001b[1;32m     15\u001b[0m \u001b[39m# Convert the data set into the format required by the surprise package\u001b[39;00m\n\u001b[1;32m     16\u001b[0m \u001b[39m# The columns must correspond to user id, item id and ratings (in that order)\u001b[39;00m\n\u001b[1;32m     17\u001b[0m reader \u001b[39m=\u001b[39m Reader(rating_scale\u001b[39m=\u001b[39m(\u001b[39m1\u001b[39m, \u001b[39m5\u001b[39m))\n\u001b[0;32m---> 18\u001b[0m data \u001b[39m=\u001b[39m Dataset\u001b[39m.\u001b[39mload_from_df(courses_df[[\u001b[39m'\u001b[39;49m\u001b[39muserID\u001b[39;49m\u001b[39m'\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39mitemID\u001b[39;49m\u001b[39m'\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39mrating\u001b[39;49m\u001b[39m'\u001b[39;49m]], reader)\n\u001b[1;32m     20\u001b[0m \u001b[39m# Split into training and test set\u001b[39;00m\n\u001b[1;32m     21\u001b[0m trainset, testset \u001b[39m=\u001b[39m train_test_split(data, test_size\u001b[39m=\u001b[39m\u001b[39m.25\u001b[39m, random_state\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/frame.py:3811\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3809\u001b[0m     \u001b[39mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m   3810\u001b[0m         key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(key)\n\u001b[0;32m-> 3811\u001b[0m     indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcolumns\u001b[39m.\u001b[39;49m_get_indexer_strict(key, \u001b[39m\"\u001b[39;49m\u001b[39mcolumns\u001b[39;49m\u001b[39m\"\u001b[39;49m)[\u001b[39m1\u001b[39m]\n\u001b[1;32m   3813\u001b[0m \u001b[39m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m   3814\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(indexer, \u001b[39m\"\u001b[39m\u001b[39mdtype\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m) \u001b[39m==\u001b[39m \u001b[39mbool\u001b[39m:\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:6113\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m   6110\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m   6111\u001b[0m     keyarr, indexer, new_indexer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6113\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_raise_if_missing(keyarr, indexer, axis_name)\n\u001b[1;32m   6115\u001b[0m keyarr \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtake(indexer)\n\u001b[1;32m   6116\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(key, Index):\n\u001b[1;32m   6117\u001b[0m     \u001b[39m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "File \u001b[0;32m~/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py:6173\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m   6171\u001b[0m     \u001b[39mif\u001b[39;00m use_interval_msg:\n\u001b[1;32m   6172\u001b[0m         key \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(key)\n\u001b[0;32m-> 6173\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mNone of [\u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m] are in the [\u001b[39m\u001b[39m{\u001b[39;00maxis_name\u001b[39m}\u001b[39;00m\u001b[39m]\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m   6175\u001b[0m not_found \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[39m.\u001b[39mnonzero()[\u001b[39m0\u001b[39m]]\u001b[39m.\u001b[39munique())\n\u001b[1;32m   6176\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mnot_found\u001b[39m}\u001b[39;00m\u001b[39m not in index\u001b[39m\u001b[39m\"\u001b[39m)\n",
      "\u001b[0;31mKeyError\u001b[0m: \"None of [Index(['userID', 'itemID', 'rating'], dtype='object')] are in the [columns]\""
     ]
    }
   ],
   "source": [
    "## Read in Course Topics data\n",
    "courses_df = pd.read_csv('Coursetopics.csv')\n",
    "\n",
    "reader = Reader(rating_scale=(0, 1))\n",
    "data = Dataset.load_from_df(courses_df['customerID', 'movieID', 'rating']], reader)\n",
    "trainset = data.build_full_trainset()\n",
    "sim_options = {'name': 'cosine', 'user_based': True}  # compute cosine similarities between items\n",
    "algo = KNNBasic(sim_options=sim_options)\n",
    "algo.fit(trainset)\n",
    "pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Problem 14.4"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "767d51c1340bd893661ea55ea3124f6de3c7a262a8b4abca0554b478b1e2ff90"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}