15.1
This commit is contained in:
parent
7e39baeed3
commit
a946316cc6
@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 55,
|
"execution_count": 122,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -12,9 +12,13 @@
|
|||||||
"# Noah L. Schrick - 1492657\n",
|
"# Noah L. Schrick - 1492657\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
"from sklearn.metrics import pairwise\n",
|
"from sklearn.metrics import pairwise\n",
|
||||||
"import matplotlib.pylab as plt\n",
|
"import matplotlib.pylab as plt\n",
|
||||||
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
"from scipy.cluster.hierarchy import dendrogram, linkage, fcluster\n",
|
||||||
|
"from tabulate import tabulate\n",
|
||||||
|
"from scipy import cluster\n",
|
||||||
|
"import seaborn as sns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"pd.options.mode.chained_assignment = None # default='warn'\n"
|
"pd.options.mode.chained_assignment = None # default='warn'\n"
|
||||||
]
|
]
|
||||||
@ -46,7 +50,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 60,
|
"execution_count": 134,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -58,6 +62,15 @@
|
|||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "display_data"
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Colorado Christian University\n",
|
||||||
|
"475 NaN\n",
|
||||||
|
"Name: # PT undergrad, dtype: float64\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
@ -77,6 +90,8 @@
|
|||||||
" metric='euclidean')\n",
|
" metric='euclidean')\n",
|
||||||
"pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n",
|
"pd.DataFrame(university_dist, columns=university_df.index, index=university_df.index).head(5)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"uni_hclust = fcluster(linkage(university_df_num_norm, 'complete'), 6, criterion='maxclust')\n",
|
||||||
|
"\n",
|
||||||
"Z = linkage(university_df_num_norm, method='complete')\n",
|
"Z = linkage(university_df_num_norm, method='complete')\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig = plt.figure(figsize=(10, 6))\n",
|
"fig = plt.figure(figsize=(10, 6))\n",
|
||||||
@ -88,7 +103,36 @@
|
|||||||
"plt.xticks(rotation=45, ha='right')\n",
|
"plt.xticks(rotation=45, ha='right')\n",
|
||||||
"plt.show()\n",
|
"plt.show()\n",
|
||||||
"# reasonable number of clusters for describing the data:\n",
|
"# reasonable number of clusters for describing the data:\n",
|
||||||
"# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters"
|
"# At distance of 10.5 (horizontal line in the dendrogram image) data can be reduced to 9 clusters\n",
|
||||||
|
"\n",
|
||||||
|
"# c: Compare the summary statistics for each cluster\n",
|
||||||
|
"cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])\n",
|
||||||
|
"clust_stats = university_df_num_norm.agg(['mean', 'median'])\n",
|
||||||
|
"\n",
|
||||||
|
"# d: Use the categorical measurements to categorize\n",
|
||||||
|
"state_table = tabulate(university_df[['State']], cutree)\n",
|
||||||
|
"pub_priv_table = tabulate(university_df[['Public (1)/ Private (2)']], cutree)\n",
|
||||||
|
"\n",
|
||||||
|
"#university_df_num_norm.index = ['{}: {}'.format(cluster, state) for cluster, state in zip(uni_hclust, university_df.index)]\n",
|
||||||
|
"#sns.clustermap(university_df_num_norm, method='average', col_cluster=False, cmap=\"mako_r\")\n",
|
||||||
|
"#plt.show()\n",
|
||||||
|
"\n",
|
||||||
|
"# e: Other external information\n",
|
||||||
|
"# Text answer\n",
|
||||||
|
"\n",
|
||||||
|
"# f: Compute the Euclidean distance of this record from each of the clusters that you found above (using only the measurements that you have)\n",
|
||||||
|
"tufts_df = raw_university_df.loc[raw_university_df['College Name'] == 'Tufts University']\n",
|
||||||
|
"tufts_df = tufts_df.drop(['# PT undergrad'], axis=1)\n",
|
||||||
|
"tufts_df_num = tufts_df.select_dtypes(include='number') # get numeric cols only\n",
|
||||||
|
"\n",
|
||||||
|
"tufts_dist = pairwise.pairwise_distances(tufts_df_num, Y=university_df_num_norm, metric='euclidean')\n",
|
||||||
|
"\n",
|
||||||
|
"# Closest cluster:\n",
|
||||||
|
"print(raw_university_df.iloc[np.where(tufts_dist == tufts_dist.min())[1][0]]['College Name'])\n",
|
||||||
|
"\n",
|
||||||
|
"# impute missing (from raw data - non-normalized)\n",
|
||||||
|
"tufts_df['# PT undergrad'] = clust_stats['# PT undergrad']['mean']\n",
|
||||||
|
"print(tufts_df['# PT undergrad'])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -125,10 +169,23 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 136,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
"source": []
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3.771458257750001e-17"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 136,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"clust_stats['# PT undergrad']['mean']"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user