15.1

2023-02-20 22:52:41 -06:00 · 2023-02-20 22:52:41 -06:00 · a946316cc6
commit a946316cc6
parent 7e39baeed3
1 changed files with 63 additions and 6 deletions
--- a/Schrick-Noah_Learning-Practice-4.ipynb
+++ b/Schrick-Noah_Learning-Practice-4.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
@ -12,9 +12,13 @@
    "# Noah L. Schrick - 1492657\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import pairwise\n",
    "import matplotlib.pylab as plt\n",
    "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
    "from tabulate import tabulate\n",
    "from scipy import cluster\n",
    "import seaborn as sns\n",
    "\n",
    "pd.options.mode.chained_assignment = None  # default='warn'\n"
   ]
@ -46,7 +50,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
@ -58,6 +62,15 @@
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Colorado Christian University\n",
      "475   NaN\n",
      "Name: # PT undergrad, dtype: float64\n"
     ]
    }
   ],
   "source": [
@ -77,6 +90,8 @@
    "                                     metric='euclidean')\n",
    "pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n",
    "\n",
    "# Build the complete-linkage tree once; the flat 6-cluster labels and the dendrogram both reuse it\n",
    "Z = linkage(university_df_num_norm, method='complete')\n",
    "\n",
    "# Flat cluster labels: cut the tree so that at most 6 clusters remain\n",
    "uni_hclust = fcluster(Z, 6, criterion='maxclust')\n",
    "\n",
    "fig = plt.figure(figsize=(10, 6))\n",
@ -88,7 +103,36 @@
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.show()\n",
    "# reasonable number of clusters for describing the data:\n",
-    "# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters"
+    "# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters\n",
    "\n",
    "# c:  Compare the summary statistics for each cluster\n",
    "# cut_tree returns one label column per requested cluster count: column 0 -> 5 clusters, column 1 -> 10 clusters\n",
    "cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])\n",
    "# Overall (un-grouped) mean/median per measurement; shape kept because other code indexes it as clust_stats[column][statistic]\n",
    "clust_stats = university_df_num_norm.agg(['mean', 'median'])\n",
    "# Per-cluster mean/median for the 5-cluster cut: one row per cluster id, (measurement, statistic) columns\n",
    "clust_stats_by_cluster = university_df_num_norm.groupby(cutree[:, 0]).agg(['mean', 'median'])\n",
    "\n",
    "# d: Use the categorical measurements to categorize\n",
    "# Cross-tabulate each categorical measurement against the 5-cluster labels (column 0 of cutree):\n",
    "# rows are category values, columns are cluster ids, cells are membership counts.\n",
    "# Plain arrays are passed so rows are matched by position, not by index label.\n",
    "state_table = pd.crosstab(university_df['State'].to_numpy(), cutree[:, 0],\n",
    "                          rownames=['State'], colnames=['cluster'])\n",
    "pub_priv_table = pd.crosstab(university_df['Public (1)/ Private (2)'].to_numpy(), cutree[:, 0],\n",
    "                             rownames=['Public (1)/ Private (2)'], colnames=['cluster'])\n",
    "\n",
    "#university_df_num_norm.index = ['{}: {}'.format(cluster, state) for cluster, state in zip(uni_hclust, university_df.index)]\n",
    "#sns.clustermap(university_df_num_norm, method='average', col_cluster=False,  cmap=\"mako_r\")\n",
    "#plt.show()\n",
    "\n",
    "# e: Other external information\n",
    "# Text answer\n",
    "\n",
    "# f: Compute the Euclidean distance of this record from each of the clusters that you found above (using only the measurements that you have)\n",
    "PT_COL = '# PT undergrad'\n",
    "tufts_df = raw_university_df.loc[raw_university_df['College Name'] == 'Tufts University']\n",
    "tufts_df = tufts_df.drop([PT_COL], axis=1)\n",
    "\n",
    "# Only the clustered measurements Tufts actually has, in the same column order as the normalized frame,\n",
    "# so every distance compares like with like.\n",
    "shared_cols = [col for col in university_df_num_norm.columns if col != PT_COL]\n",
    "tufts_df_num = tufts_df[shared_cols]\n",
    "\n",
    "# Put Tufts on the normalized scale before measuring distances.\n",
    "# NOTE(review): assumes university_df_num_norm is a per-column affine rescaling (e.g. z-score) of the\n",
    "# same-named, row-aligned university_df columns - confirm against the normalization cell.\n",
    "raw_shared = university_df[shared_cols]\n",
    "norm_shared = university_df_num_norm[shared_cols]\n",
    "col_scale = raw_shared.std().to_numpy() / norm_shared.std().to_numpy()\n",
    "col_center = raw_shared.mean().to_numpy() - norm_shared.mean().to_numpy() * col_scale\n",
    "tufts_norm = (tufts_df_num.to_numpy(dtype=float) - col_center) / col_scale\n",
    "\n",
    "# One centroid per cluster (uni_hclust labels), then the distance from Tufts to each centroid\n",
    "centroids = norm_shared.groupby(uni_hclust).mean()\n",
    "tufts_dist = pairwise.pairwise_distances(tufts_norm, Y=centroids.to_numpy(), metric='euclidean')\n",
    "print(pd.Series(tufts_dist[0], index=centroids.index, name='distance to centroid'))\n",
    "\n",
    "# Closest cluster:\n",
    "closest_cluster = centroids.index[tufts_dist.argmin()]\n",
    "print(closest_cluster)\n",
    "\n",
    "# impute missing (from raw data - non-normalized): mean over the members of the closest cluster\n",
    "tufts_df[PT_COL] = university_df.loc[uni_hclust == closest_cluster, PT_COL].mean()\n",
    "print(tufts_df[PT_COL])"
   ]
  },
  {
@ -125,10 +169,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 136,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
-   "source": []
+    {
     "data": {
      "text/plain": [
       "3.771458257750001e-17"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Mean row of the summary table, '# PT undergrad' column, fetched in a single label lookup\n",
    "clust_stats.loc['mean', '# PT undergrad']"
   ]
  }
 ],
 "metadata": {