From a946316cc640b0fb49c0422586c3152320d6c45d Mon Sep 17 00:00:00 2001 From: noah Date: Mon, 20 Feb 2023 22:52:41 -0600 Subject: [PATCH] 15.1 --- Schrick-Noah_Learning-Practice-4.ipynb | 69 +++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/Schrick-Noah_Learning-Practice-4.ipynb b/Schrick-Noah_Learning-Practice-4.ipynb index 6075bac..a1dcae3 100644 --- a/Schrick-Noah_Learning-Practice-4.ipynb +++ b/Schrick-Noah_Learning-Practice-4.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 55, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -12,9 +12,13 @@ "# Noah L. Schrick - 1492657\n", "\n", "import pandas as pd\n", + "import numpy as np\n", "from sklearn.metrics import pairwise\n", "import matplotlib.pylab as plt\n", "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", + "from tabulate import tabulate\n", + "from scipy import cluster\n", + "import seaborn as sns\n", "\n", "pd.options.mode.chained_assignment = None # default='warn'\n" ] @@ -46,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 134, "metadata": {}, "outputs": [ { @@ -58,6 +62,15 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Colorado Christian University\n", + "475 NaN\n", + "Name: # PT undergrad, dtype: float64\n" + ] } ], "source": [ @@ -77,6 +90,8 @@ " metric='euclidean')\n", "pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n", "\n", + "uni_hclust = fcluster(linkage(university_df_num_norm, 'complete'), 6, criterion='maxclust')\n", + "\n", "Z = linkage(university_df_num_norm, method='complete')\n", "\n", "fig = plt.figure(figsize=(10, 6))\n", @@ -88,7 +103,36 @@ "plt.xticks(rotation=45, ha='right')\n", "plt.show()\n", "# reasonable number of clusters for describing the data:\n", - "# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters" + "# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters\n", + "\n", + "# c: Compare the summary statistics for each cluster\n", + "cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])\n", + "clust_stats = university_df_num_norm.agg(['mean', 'median'])\n", + "\n", + "# d: Use the categorical measurements to categorize\n", + "state_table = tabulate(university_df[['State']], cutree)\n", + "pub_priv_table = tabulate(university_df[['Public (1)/ Private (2)']], cutree)\n", + "\n", + "#university_df_num_norm.index = ['{}: {}'.format(cluster, state) for cluster, state in zip(uni_hclust, university_df.index)]\n", + "#sns.clustermap(university_df_num_norm, method='average', col_cluster=False, cmap=\"mako_r\")\n", + "#plt.show()\n", + "\n", + "# e: Other external information\n", + "# Text answer\n", + "\n", + "# f: Compute the Euclidean distance of this record from each of the clusters that you found above (using only the measurements that you have)\n", + "tufts_df = raw_university_df.loc[raw_university_df['College Name'] == 'Tufts University']\n", + "tufts_df = tufts_df.drop(['# PT undergrad'], axis=1)\n", + "tufts_df_num = tufts_df.select_dtypes(include='number') # get numeric cols only\n", + "\n", + "tufts_dist = pairwise.pairwise_distances(tufts_df_num, Y=university_df_num_norm, metric='euclidean')\n", + "\n", + "# Closest cluster:\n", + "print(raw_university_df.iloc[np.where(tufts_dist == tufts_dist.min())[1][0]]['College Name'])\n", + "\n", + "# impute missing (from raw data - non-normalized)\n", + "tufts_df['# PT undergrad'] = clust_stats['# PT undergrad']['mean']\n", + "print(tufts_df['# PT undergrad'])" ] }, { @@ -125,10 +169,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "3.771458257750001e-17" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clust_stats['# PT undergrad']['mean']" + ] } ], "metadata": {