This commit is contained in:
Noah L. Schrick 2023-02-20 22:52:41 -06:00
parent 7e39baeed3
commit a946316cc6

View File

@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 55, "execution_count": 122,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -12,9 +12,13 @@
"# Noah L. Schrick - 1492657\n", "# Noah L. Schrick - 1492657\n",
"\n", "\n",
"import pandas as pd\n", "import pandas as pd\n",
"import numpy as np\n",
"from sklearn.metrics import pairwise\n", "from sklearn.metrics import pairwise\n",
"import matplotlib.pylab as plt\n", "import matplotlib.pylab as plt\n",
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n", "from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
"from tabulate import tabulate\n",
"from scipy import cluster\n",
"import seaborn as sns\n",
"\n", "\n",
"pd.options.mode.chained_assignment = None # default='warn'\n" "pd.options.mode.chained_assignment = None # default='warn'\n"
] ]
@ -46,7 +50,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 134,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -58,6 +62,15 @@
}, },
"metadata": {}, "metadata": {},
"output_type": "display_data" "output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Colorado Christian University\n",
"475 NaN\n",
"Name: # PT undergrad, dtype: float64\n"
]
} }
], ],
"source": [ "source": [
@ -77,6 +90,8 @@
" metric='euclidean')\n", " metric='euclidean')\n",
"pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n", "pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n",
"\n", "\n",
"uni_hclust = fcluster(linkage(university_df_num_norm, 'complete'), 6, criterion='maxclust')\n",
"\n",
"Z = linkage(university_df_num_norm, method='complete')\n", "Z = linkage(university_df_num_norm, method='complete')\n",
"\n", "\n",
"fig = plt.figure(figsize=(10, 6))\n", "fig = plt.figure(figsize=(10, 6))\n",
@ -88,7 +103,36 @@
"plt.xticks(rotation=45, ha='right')\n", "plt.xticks(rotation=45, ha='right')\n",
"plt.show()\n", "plt.show()\n",
"# reasonable number of clusters for describing the data:\n", "# reasonable number of clusters for describing the data:\n",
"# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters" "# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters\n",
"\n",
"# c: Compare the summary statistics for each cluster\n",
"cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])\n",
"clust_stats = university_df_num_norm.agg(['mean', 'median'])\n",
"\n",
"# d: Use the categorical measurements to categorize\n",
"state_table = tabulate(university_df[['State']], cutree)\n",
"pub_priv_table = tabulate(university_df[['Public (1)/ Private (2)']], cutree)\n",
"\n",
"#university_df_num_norm.index = ['{}: {}'.format(cluster, state) for cluster, state in zip(uni_hclust, university_df.index)]\n",
"#sns.clustermap(university_df_num_norm, method='average', col_cluster=False, cmap=\"mako_r\")\n",
"#plt.show()\n",
"\n",
"# e: Other external information\n",
"# Text answer\n",
"\n",
"# f: Compute the Euclidean distance of this record from each of the clusters that you found above (using only the measurements that you have)\n",
"tufts_df = raw_university_df.loc[raw_university_df['College Name'] == 'Tufts University']\n",
"tufts_df = tufts_df.drop(['# PT undergrad'], axis=1)\n",
"tufts_df_num = tufts_df.select_dtypes(include='number') # get numeric cols only\n",
"\n",
"tufts_dist = pairwise.pairwise_distances(tufts_df_num, Y=university_df_num_norm, metric='euclidean')\n",
"\n",
"# Closest cluster:\n",
"print(raw_university_df.iloc[np.where(tufts_dist == tufts_dist.min())[1][0]]['College Name'])\n",
"\n",
"# impute missing (from raw data - non-normalized)\n",
"tufts_df['# PT undergrad'] = clust_stats['# PT undergrad']['mean']\n",
"print(tufts_df['# PT undergrad'])"
] ]
}, },
{ {
@ -125,10 +169,23 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 136,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
"source": [] {
"data": {
"text/plain": [
"3.771458257750001e-17"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clust_stats['# PT undergrad']['mean']"
]
} }
], ],
"metadata": { "metadata": {