14.4
This commit is contained in:
parent
b24483b7a3
commit
11e917b369
Cosmetics.csv (new file, 1001 lines)
File diff suppressed because it is too large
@@ -45,31 +45,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": 28,
+"execution_count": 30,
 "metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Computing the cosine similarity matrix...\n"
-]
-},
-{
-"ename": "ZeroDivisionError",
-"evalue": "float division",
-"output_type": "error",
-"traceback": [
-"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-"\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)",
-"Cell \u001b[0;32mIn[28], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m sim_options \u001b[39m=\u001b[39m {\u001b[39m'\u001b[39m\u001b[39mname\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mcosine\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39muser_based\u001b[39m\u001b[39m'\u001b[39m: \u001b[39mTrue\u001b[39;00m} \u001b[39m# compute cosine similarities between users\u001b[39;00m\n\u001b[1;32m 15\u001b[0m algo \u001b[39m=\u001b[39m KNNBasic(sim_options\u001b[39m=\u001b[39msim_options)\n\u001b[0;32m---> 16\u001b[0m algo\u001b[39m.\u001b[39;49mfit(trainset)\n\u001b[1;32m 17\u001b[0m \u001b[39m#pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)\u001b[39;00m\n",
-"File \u001b[0;32m~/.local/lib/python3.10/site-packages/surprise/prediction_algorithms/knns.py:98\u001b[0m, in \u001b[0;36mKNNBasic.fit\u001b[0;34m(self, trainset)\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mfit\u001b[39m(\u001b[39mself\u001b[39m, trainset):\n\u001b[1;32m 97\u001b[0m SymmetricAlgo\u001b[39m.\u001b[39mfit(\u001b[39mself\u001b[39m, trainset)\n\u001b[0;32m---> 98\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msim \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mcompute_similarities()\n\u001b[1;32m 100\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\n",
-"File \u001b[0;32m~/.local/lib/python3.10/site-packages/surprise/prediction_algorithms/algo_base.py:248\u001b[0m, in \u001b[0;36mAlgoBase.compute_similarities\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mverbose\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m 247\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mComputing the \u001b[39m\u001b[39m{\u001b[39;00mname\u001b[39m}\u001b[39;00m\u001b[39m similarity matrix...\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 248\u001b[0m sim \u001b[39m=\u001b[39m construction_func[name](\u001b[39m*\u001b[39;49margs)\n\u001b[1;32m 249\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mverbose\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m 250\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mDone computing similarity matrix.\u001b[39m\u001b[39m\"\u001b[39m)\n",
-"File \u001b[0;32m~/.local/lib/python3.10/site-packages/surprise/similarities.pyx:83\u001b[0m, in \u001b[0;36msurprise.similarities.cosine\u001b[0;34m()\u001b[0m\n",
-"\u001b[0;31mZeroDivisionError\u001b[0m: float division"
-]
-}
-],
+"outputs": [],
 "source": [
 "## Read in Course Topics data\n",
 "courses_df = pd.read_csv('Coursetopics.csv')\n",
@@ -79,16 +57,14 @@
 "course_melt = courses_df.melt(id_vars =['Index'], value_vars =['Intro', 'DataMining', 'Survey', 'Cat Data', 'Regression', 'Forecast', 'DOE', 'SW'], \n",
 " var_name ='Course', value_name ='Taken')\n",
 "\n",
 "\n",
 "reader = Reader(rating_scale=(0, 1))\n",
 "data = Dataset.load_from_df(course_melt[['Index', 'Course', 'Taken']], reader)\n",
 "trainset = data.build_full_trainset()\n",
 "\n",
-"sim_options = {'name': 'cosine', 'user_based': True} # compute cosine similarities between users\n",
-"algo = KNNBasic(sim_options=sim_options)\n",
-"algo.fit(trainset)\n",
-"#pred = algo.predict(str(823519), str(30), r_ui=4, verbose=True)"
+"# NOTE: The following will error. This is expected and part of the question. Explanation in the corresponding answer.\n",
+"#sim_options = {'name': 'cosine', 'user_based': True} # compute cosine similarities between users\n",
+"#algo = KNNBasic(sim_options=sim_options)\n",
+"#algo.fit(trainset)\n"
 ]
 },
 {
@@ -96,7 +72,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"The provided dataset is composed of Boolean values for \"have taken\" or \"have not taken\" various courses. The dataset represents \"have not taken\" with a 0, and \"taken\" with a 1. The dataset is considered a sparse matrix, since each user has only taken a few of the listed courses. Due to the sparsity, when computing the cosine between users, many computations involve comparing a user's \"not taken\" course to another user's \"not taken\" course. This leads to difficulties with the cosine computation since the denominator will be zero, causing a float division error. This can be remedied by using \"NULL\" values, which are supported in the surprise package."
+"The provided dataset is composed of binary values for \"have taken\" or \"have not taken\" various courses. The dataset represents \"have not taken\" with a 0, and \"taken\" with a 1. The dataset is considered a sparse matrix, since each user has only taken a few of the listed courses. Due to the sparsity, when computing the cosine between users, many computations involve comparing a user's \"not taken\" course to another user's \"not taken\" course. This leads to difficulties with the cosine computation since the denominator will be zero, causing a float division error. This can be remedied by using \"NULL\" values, which are supported in the surprise package, by using a sparse matrix cosine similarity function, or error-checking for zeroes during the cosine similarity computation."
 ]
 },
 {
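The explanation above attributes the ZeroDivisionError to user pairs whose only overlap is "not taken" (all-zero) entries. Here is a minimal sketch of the "NULL values" remedy it mentions, not part of this commit: load only the "taken" rows so that zeros become missing entries instead of literal 0 ratings. Column names follow the Coursetopics code in the hunk above.

import pandas as pd
from surprise import Dataset, KNNBasic, Reader

courses_df = pd.read_csv('Coursetopics.csv')
course_melt = courses_df.melt(id_vars=['Index'],
                              value_vars=['Intro', 'DataMining', 'Survey', 'Cat Data',
                                          'Regression', 'Forecast', 'DOE', 'SW'],
                              var_name='Course', value_name='Taken')

# Keep only the "taken" events; "not taken" becomes a missing entry rather
# than a 0 rating, so no pair of users overlaps on zeros alone.
taken_only = course_melt[course_melt['Taken'] == 1]

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(taken_only[['Index', 'Course', 'Taken']], reader)
trainset = data.build_full_trainset()

algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
# Every compared pair now shares at least one taken course, so the cosine
# denominator is nonzero; pairs with no common course are skipped by surprise.
algo.fit(trainset)

The trade-off is that users who took no courses drop out of the trainset entirely, which is the price of treating zeros as missing.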
@@ -117,6 +93,105 @@
 " i. Interpret the first three rules in the output in words.\n",
 " ii. Reviewing the first couple of dozen rules, comment on their redundancy and how you would assess their utility."
 ]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"a)\n",
+" Transaction 1: Customer purchased Blush, Nail Polish, Brushes, Concealer, and Bronzer.\n",
+" Transaction 6: Customer purchased Concealer.\n",
+" Transaction 11: Customer purchased Nail Polish and Bronzer.\n",
+"\n",
+"b) \n",
" i) Confidence is the measure of uncertainty regarding the association rule. The confidence is calculated using the number of transactions that contain both antecedents and consequents, divided by the number of transactions with the antecedent. For the first row, a 30.23% confidence is obtained. The confidence is rather low due to the size of the dataset. Only 12 transactions occurred, and of these 12, only 1 transaction purchased eyebrow pencils.\n",
|
||||
"\n",
|
||||
" ii) Support is the degree in which the data supports the validity of an association rule. It is the number of antecedent and consequent itemsets. Support may also be represented in percentage form, which is the number of antecedent and consequent itemset occurrences divided by the total number of entries. The first row has a support value of 0.013, which corresponds to the one entry for eyebrow pencils, divided by the 77 total entries.\n",
|
||||
"\n",
|
||||
" iii) Lift is the strength of an association rule assuming the antecedents and consequents are independent. The support is obtained similar to the calculation for confidence, but is now based on independence. Lift is equal to the confidence divided by the benchmark confidence, where benchmark confidence is obtained by the number of transactions with the consequent, divided by the total number of transactions. The first row has a lift of 7.198... which corresponds to the confidence (0.3023) divided by (1/24).\n",
|
||||
"\n",
|
||||
" iv) If Blush, Concealer, Mascara, Eye Shadow, and Lipstick is purchased, then Eyebrow Pencils are also purchased."
|
||||
]
|
||||
},
|
||||
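To make the definitions above concrete, they can be checked against the first row of the rule table printed in the next cell. A small helper, with inputs taken from that table (an illustration, not notebook output):

def rule_measures(support_antecedent, support_consequent, support_both):
    # Measures as defined in the answer above, all as fractions of transactions.
    confidence = support_both / support_antecedent   # how often the rule holds
    lift = confidence / support_consequent           # strength vs. independence
    return confidence, lift

# Row 0 below: (Brushes) -> (Nail Polish), with antecedent support 0.149,
# consequent support 0.280, and rule support 0.149.
confidence, lift = rule_measures(0.149, 0.280, 0.149)
print(confidence, lift)  # 1.0 and ~3.571, matching the printed table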
+{
+"cell_type": "code",
+"execution_count": 50,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" antecedents consequents antecedent support \\\n",
+"0 (Brushes) (Nail Polish) 0.149 \n",
+"1 (Mascara) (Eye shadow) 0.357 \n",
+"2 (Eye shadow) (Mascara) 0.381 \n",
+"3 (Blush, Lip liner) (Concealer) 0.124 \n",
+"4 (Blush, Mascara) (Eye shadow) 0.184 \n",
+"5 (Blush, Eye shadow) (Mascara) 0.182 \n",
+"6 (Nail Polish, Mascara) (Eye shadow) 0.134 \n",
+"7 (Nail Polish, Eye shadow) (Mascara) 0.131 \n",
+"8 (Lip liner, Bronzer) (Concealer) 0.128 \n",
+"9 (Eyeliner, Bronzer) (Concealer) 0.146 \n",
+"10 (Lip liner, Eyeliner) (Concealer) 0.130 \n",
+"11 (Mascara, Concealer) (Eye shadow) 0.204 \n",
+"\n",
+" consequent support support confidence lift leverage conviction \n",
+"0 0.280 0.149 1.000000 3.571429 0.107280 inf \n",
+"1 0.381 0.321 0.899160 2.359999 0.184983 6.138417 \n",
+"2 0.357 0.321 0.842520 2.359999 0.184983 4.083050 \n",
+"3 0.442 0.108 0.870968 1.970515 0.053192 4.324500 \n",
+"4 0.381 0.169 0.918478 2.410704 0.098896 7.593067 \n",
+"5 0.357 0.169 0.928571 2.601040 0.104026 9.002000 \n",
+"6 0.381 0.119 0.888060 2.330865 0.067946 5.529733 \n",
+"7 0.357 0.119 0.908397 2.544529 0.072233 7.019417 \n",
+"8 0.442 0.103 0.804687 1.820560 0.046424 2.856960 \n",
+"9 0.442 0.119 0.815068 1.844046 0.054468 3.017333 \n",
+"10 0.442 0.120 0.923077 2.088409 0.062540 7.254000 \n",
+"11 0.381 0.179 0.877451 2.303021 0.101276 5.051040 \n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/noah/.local/lib/python3.10/site-packages/mlxtend/frequent_patterns/fpcommon.py:111: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type\n",
+" warnings.warn(\n"
+]
+}
+],
"source": [
|
||||
"## c\n",
|
||||
"# Read in Cosmetics data\n",
|
||||
"cosmetics_df = pd.read_csv('Cosmetics.csv')\n",
|
||||
"cosmetics_df = cosmetics_df.drop('Trans. ', axis=1)\n",
|
||||
"# create frequent itemsets\n",
|
||||
"itemsets = apriori(cosmetics_df, min_support=0.1, use_colnames=True)\n",
|
||||
"\n",
|
||||
"# and convert into rules\n",
|
||||
"rules = association_rules(itemsets)\n",
|
||||
"#rules.sort_values(by=['lift'], ascending=False).head(6)\n",
|
||||
"\n",
|
||||
"print(rules.head(12))\n",
|
||||
"#print(rules.sort_values(by=['lift'], ascending=False)\n",
|
||||
"# .drop(columns=['antecedent support', 'consequent support', 'conviction'])\n",
|
||||
"# .head(6))\n"
|
||||
]
|
||||
},
|
||||
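The stderr output above is mlxtend warning that the 0/1 purchase matrix is not boolean. A sketch of the cast the warning asks for, reusing the cell's own calls (not part of this commit):

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Cast the 0/1 item columns to bool before mining to avoid the warning.
cosmetics_df = pd.read_csv('Cosmetics.csv').drop('Trans. ', axis=1).astype(bool)
itemsets = apriori(cosmetics_df, min_support=0.1, use_colnames=True)
# Called with no metric arguments, association_rules uses its documented
# defaults: metric='confidence', min_threshold=0.8.
rules = association_rules(itemsets)

Those defaults are why every row of the printed table has confidence above 0.8.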
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"c)\n",
+" i) \n",
+" If Brushes are purchased, then Nail Polish is purchased.\n",
+" If Mascara is purchased, then Eye Shadow is purchased.\n",
+" If Eye Shadow is purchased, then Mascara is purchased. \n",
+" ii)\n",
+" The first couple dozen rules are largely permutations of one another, and therefore carry a degree of redundancy. Sorting the list of rules by lift would allow for an easier assessment of utility, since rules with greater lift would be grouped together at the top of the list."
+]
+}
 ],
 "metadata": {
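Finally, a sketch of the assessment suggested in answer c) ii, not part of the commit: sort by lift, then collapse mirrored rules such as (Mascara) -> (Eye shadow) and (Eye shadow) -> (Mascara), which cover the same itemset and share identical support and lift. It assumes the rules DataFrame produced by association_rules in the cell above.

# mlxtend stores antecedents/consequents as frozensets, so their union is
# hashable and can be used to drop mirrored duplicates.
rules = rules.sort_values(by='lift', ascending=False)
rules['itemset'] = rules.apply(
    lambda r: r['antecedents'] | r['consequents'], axis=1)
deduped = rules.drop_duplicates(subset='itemset')  # keep one direction per itemset
print(deduped[['antecedents', 'consequents', 'support', 'confidence', 'lift']])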