{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Learning Practice 2 for the University of Tulsa's QM-7063 Data Mining Course\n",
"# Dimension Reduction\n",
"# Professor: Dr. Abdulrashid, Spring 2023\n",
"# Noah L. Schrick - 1492657\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mean | \n",
" sd | \n",
" min | \n",
" max | \n",
" median | \n",
" length | \n",
" miss.val | \n",
"
\n",
" \n",
" \n",
" \n",
" | CRIM | \n",
" 3.613524 | \n",
" 8.601545 | \n",
" 0.00632 | \n",
" 88.9762 | \n",
" 0.25651 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | ZN | \n",
" 11.363636 | \n",
" 23.322453 | \n",
" 0.00000 | \n",
" 100.0000 | \n",
" 0.00000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | INDUS | \n",
" 11.136779 | \n",
" 6.860353 | \n",
" 0.46000 | \n",
" 27.7400 | \n",
" 9.69000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | CHAS | \n",
" 0.069170 | \n",
" 0.253994 | \n",
" 0.00000 | \n",
" 1.0000 | \n",
" 0.00000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | NOX | \n",
" 0.554695 | \n",
" 0.115878 | \n",
" 0.38500 | \n",
" 0.8710 | \n",
" 0.53800 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | RM | \n",
" 6.284634 | \n",
" 0.702617 | \n",
" 3.56100 | \n",
" 8.7800 | \n",
" 6.20850 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | AGE | \n",
" 68.574901 | \n",
" 28.148861 | \n",
" 2.90000 | \n",
" 100.0000 | \n",
" 77.50000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | DIS | \n",
" 3.795043 | \n",
" 2.105710 | \n",
" 1.12960 | \n",
" 12.1265 | \n",
" 3.20745 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | RAD | \n",
" 9.549407 | \n",
" 8.707259 | \n",
" 1.00000 | \n",
" 24.0000 | \n",
" 5.00000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | TAX | \n",
" 408.237154 | \n",
" 168.537116 | \n",
" 187.00000 | \n",
" 711.0000 | \n",
" 330.00000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | PTRATIO | \n",
" 18.455534 | \n",
" 2.164946 | \n",
" 12.60000 | \n",
" 22.0000 | \n",
" 19.05000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | LSTAT | \n",
" 12.653063 | \n",
" 7.141062 | \n",
" 1.73000 | \n",
" 37.9700 | \n",
" 11.36000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | MEDV | \n",
" 22.532806 | \n",
" 9.197104 | \n",
" 5.00000 | \n",
" 50.0000 | \n",
" 21.20000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
" | CAT. MEDV | \n",
" 0.166008 | \n",
" 0.372456 | \n",
" 0.00000 | \n",
" 1.0000 | \n",
" 0.00000 | \n",
" 506 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mean sd min max median length \\\n",
"CRIM 3.613524 8.601545 0.00632 88.9762 0.25651 506 \n",
"ZN 11.363636 23.322453 0.00000 100.0000 0.00000 506 \n",
"INDUS 11.136779 6.860353 0.46000 27.7400 9.69000 506 \n",
"CHAS 0.069170 0.253994 0.00000 1.0000 0.00000 506 \n",
"NOX 0.554695 0.115878 0.38500 0.8710 0.53800 506 \n",
"RM 6.284634 0.702617 3.56100 8.7800 6.20850 506 \n",
"AGE 68.574901 28.148861 2.90000 100.0000 77.50000 506 \n",
"DIS 3.795043 2.105710 1.12960 12.1265 3.20745 506 \n",
"RAD 9.549407 8.707259 1.00000 24.0000 5.00000 506 \n",
"TAX 408.237154 168.537116 187.00000 711.0000 330.00000 506 \n",
"PTRATIO 18.455534 2.164946 12.60000 22.0000 19.05000 506 \n",
"LSTAT 12.653063 7.141062 1.73000 37.9700 11.36000 506 \n",
"MEDV 22.532806 9.197104 5.00000 50.0000 21.20000 506 \n",
"CAT. MEDV 0.166008 0.372456 0.00000 1.0000 0.00000 506 \n",
"\n",
" miss.val \n",
"CRIM 0 \n",
"ZN 0 \n",
"INDUS 0 \n",
"CHAS 0 \n",
"NOX 0 \n",
"RM 0 \n",
"AGE 0 \n",
"DIS 0 \n",
"RAD 0 \n",
"TAX 0 \n",
"PTRATIO 0 \n",
"LSTAT 0 \n",
"MEDV 0 \n",
"CAT. MEDV 0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the Boston Housing data set from the working directory.\n",
"bostonHousing_df = pd.read_csv('BostonHousing.csv')\n",
"\n",
"# The saved outputs label the last column 'CAT. MEDV' (with a space), so the former\n",
"# 'CAT.MEDV' key never matched and the rename silently did nothing.\n",
"# Both spellings are mapped; rename() ignores keys that are not present.\n",
"bostonHousing_df = bostonHousing_df.rename(\n",
"    columns={'CAT. MEDV': 'CAT_MEDV', 'CAT.MEDV': 'CAT_MEDV'}\n",
")\n",
"\n",
"# Per-column summary: mean, standard deviation, min, max, median, row count, and missing values.\n",
"# 'length' is a single scalar (the number of rows) that is broadcast to every row of the table.\n",
"pd.DataFrame({\n",
"    'mean': bostonHousing_df.mean(),\n",
"    'sd': bostonHousing_df.std(),\n",
"    'min': bostonHousing_df.min(),\n",
"    'max': bostonHousing_df.max(),\n",
"    'median': bostonHousing_df.median(),\n",
"    'length': len(bostonHousing_df),\n",
"    'miss.val': bostonHousing_df.isnull().sum(),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" CRIM | \n",
" ZN | \n",
" INDUS | \n",
" CHAS | \n",
" NOX | \n",
" RM | \n",
" AGE | \n",
" DIS | \n",
" RAD | \n",
" TAX | \n",
" PTRATIO | \n",
" LSTAT | \n",
" MEDV | \n",
" CAT. MEDV | \n",
"
\n",
" \n",
" \n",
" \n",
" | CRIM | \n",
" 1.00 | \n",
" -0.20 | \n",
" 0.41 | \n",
" -0.06 | \n",
" 0.42 | \n",
" -0.22 | \n",
" 0.35 | \n",
" -0.38 | \n",
" 0.63 | \n",
" 0.58 | \n",
" 0.29 | \n",
" 0.46 | \n",
" -0.39 | \n",
" -0.15 | \n",
"
\n",
" \n",
" | ZN | \n",
" -0.20 | \n",
" 1.00 | \n",
" -0.53 | \n",
" -0.04 | \n",
" -0.52 | \n",
" 0.31 | \n",
" -0.57 | \n",
" 0.66 | \n",
" -0.31 | \n",
" -0.31 | \n",
" -0.39 | \n",
" -0.41 | \n",
" 0.36 | \n",
" 0.37 | \n",
"
\n",
" \n",
" | INDUS | \n",
" 0.41 | \n",
" -0.53 | \n",
" 1.00 | \n",
" 0.06 | \n",
" 0.76 | \n",
" -0.39 | \n",
" 0.64 | \n",
" -0.71 | \n",
" 0.60 | \n",
" 0.72 | \n",
" 0.38 | \n",
" 0.60 | \n",
" -0.48 | \n",
" -0.37 | \n",
"
\n",
" \n",
" | CHAS | \n",
" -0.06 | \n",
" -0.04 | \n",
" 0.06 | \n",
" 1.00 | \n",
" 0.09 | \n",
" 0.09 | \n",
" 0.09 | \n",
" -0.10 | \n",
" -0.01 | \n",
" -0.04 | \n",
" -0.12 | \n",
" -0.05 | \n",
" 0.18 | \n",
" 0.11 | \n",
"
\n",
" \n",
" | NOX | \n",
" 0.42 | \n",
" -0.52 | \n",
" 0.76 | \n",
" 0.09 | \n",
" 1.00 | \n",
" -0.30 | \n",
" 0.73 | \n",
" -0.77 | \n",
" 0.61 | \n",
" 0.67 | \n",
" 0.19 | \n",
" 0.59 | \n",
" -0.43 | \n",
" -0.23 | \n",
"
\n",
" \n",
" | RM | \n",
" -0.22 | \n",
" 0.31 | \n",
" -0.39 | \n",
" 0.09 | \n",
" -0.30 | \n",
" 1.00 | \n",
" -0.24 | \n",
" 0.21 | \n",
" -0.21 | \n",
" -0.29 | \n",
" -0.36 | \n",
" -0.61 | \n",
" 0.70 | \n",
" 0.64 | \n",
"
\n",
" \n",
" | AGE | \n",
" 0.35 | \n",
" -0.57 | \n",
" 0.64 | \n",
" 0.09 | \n",
" 0.73 | \n",
" -0.24 | \n",
" 1.00 | \n",
" -0.75 | \n",
" 0.46 | \n",
" 0.51 | \n",
" 0.26 | \n",
" 0.60 | \n",
" -0.38 | \n",
" -0.19 | \n",
"
\n",
" \n",
" | DIS | \n",
" -0.38 | \n",
" 0.66 | \n",
" -0.71 | \n",
" -0.10 | \n",
" -0.77 | \n",
" 0.21 | \n",
" -0.75 | \n",
" 1.00 | \n",
" -0.49 | \n",
" -0.53 | \n",
" -0.23 | \n",
" -0.50 | \n",
" 0.25 | \n",
" 0.12 | \n",
"
\n",
" \n",
" | RAD | \n",
" 0.63 | \n",
" -0.31 | \n",
" 0.60 | \n",
" -0.01 | \n",
" 0.61 | \n",
" -0.21 | \n",
" 0.46 | \n",
" -0.49 | \n",
" 1.00 | \n",
" 0.91 | \n",
" 0.46 | \n",
" 0.49 | \n",
" -0.38 | \n",
" -0.20 | \n",
"
\n",
" \n",
" | TAX | \n",
" 0.58 | \n",
" -0.31 | \n",
" 0.72 | \n",
" -0.04 | \n",
" 0.67 | \n",
" -0.29 | \n",
" 0.51 | \n",
" -0.53 | \n",
" 0.91 | \n",
" 1.00 | \n",
" 0.46 | \n",
" 0.54 | \n",
" -0.47 | \n",
" -0.27 | \n",
"
\n",
" \n",
" | PTRATIO | \n",
" 0.29 | \n",
" -0.39 | \n",
" 0.38 | \n",
" -0.12 | \n",
" 0.19 | \n",
" -0.36 | \n",
" 0.26 | \n",
" -0.23 | \n",
" 0.46 | \n",
" 0.46 | \n",
" 1.00 | \n",
" 0.37 | \n",
" -0.51 | \n",
" -0.44 | \n",
"
\n",
" \n",
" | LSTAT | \n",
" 0.46 | \n",
" -0.41 | \n",
" 0.60 | \n",
" -0.05 | \n",
" 0.59 | \n",
" -0.61 | \n",
" 0.60 | \n",
" -0.50 | \n",
" 0.49 | \n",
" 0.54 | \n",
" 0.37 | \n",
" 1.00 | \n",
" -0.74 | \n",
" -0.47 | \n",
"
\n",
" \n",
" | MEDV | \n",
" -0.39 | \n",
" 0.36 | \n",
" -0.48 | \n",
" 0.18 | \n",
" -0.43 | \n",
" 0.70 | \n",
" -0.38 | \n",
" 0.25 | \n",
" -0.38 | \n",
" -0.47 | \n",
" -0.51 | \n",
" -0.74 | \n",
" 1.00 | \n",
" 0.79 | \n",
"
\n",
" \n",
" | CAT. MEDV | \n",
" -0.15 | \n",
" 0.37 | \n",
" -0.37 | \n",
" 0.11 | \n",
" -0.23 | \n",
" 0.64 | \n",
" -0.19 | \n",
" 0.12 | \n",
" -0.20 | \n",
" -0.27 | \n",
" -0.44 | \n",
" -0.47 | \n",
" 0.79 | \n",
" 1.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
"CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 \n",
"ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 \n",
"INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 \n",
"CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 \n",
"NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 \n",
"RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 \n",
"AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 \n",
"DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 \n",
"RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 \n",
"TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 \n",
"PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 \n",
"LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 \n",
"MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 \n",
"CAT. MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 \n",
"\n",
" PTRATIO LSTAT MEDV CAT. MEDV \n",
"CRIM 0.29 0.46 -0.39 -0.15 \n",
"ZN -0.39 -0.41 0.36 0.37 \n",
"INDUS 0.38 0.60 -0.48 -0.37 \n",
"CHAS -0.12 -0.05 0.18 0.11 \n",
"NOX 0.19 0.59 -0.43 -0.23 \n",
"RM -0.36 -0.61 0.70 0.64 \n",
"AGE 0.26 0.60 -0.38 -0.19 \n",
"DIS -0.23 -0.50 0.25 0.12 \n",
"RAD 0.46 0.49 -0.38 -0.20 \n",
"TAX 0.46 0.54 -0.47 -0.27 \n",
"PTRATIO 1.00 0.37 -0.51 -0.44 \n",
"LSTAT 0.37 1.00 -0.74 -0.47 \n",
"MEDV -0.51 -0.74 1.00 0.79 \n",
"CAT. MEDV -0.44 -0.47 0.79 1.00 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pairwise Pearson correlation between every pair of columns, shown to two decimals\n",
"bostonHousing_df.corr(method='pearson').round(decimals=2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RM_bin CHAS\n",
"3 0 25.300000\n",
"4 0 15.407143\n",
"5 0 17.200000\n",
" 1 22.218182\n",
"6 0 21.769170\n",
" 1 25.918750\n",
"7 0 35.964444\n",
" 1 44.066667\n",
"8 0 45.700000\n",
" 1 35.950000\n",
"Name: MEDV, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin RM into unit-width intervals using the edges 0, 1, ..., 9.\n",
"# labels=False stores the integer position of each interval instead of an Interval label.\n",
"bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df['RM'], bins=range(0, 10), labels=False)\n",
"\n",
"# Average MEDV within every (RM_bin, CHAS) group:\n",
"# group the rows, keep only the MEDV column, then take the mean of each group.\n",
"bostonHousing_df.groupby(by=['RM_bin', 'CHAS']).MEDV.mean()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | CHAS | \n",
" 0 | \n",
" 1 | \n",
" All | \n",
"
\n",
" \n",
" | RM_bin | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" 25.300000 | \n",
" NaN | \n",
" 25.300000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 15.407143 | \n",
" NaN | \n",
" 15.407143 | \n",
"
\n",
" \n",
" | 5 | \n",
" 17.200000 | \n",
" 22.218182 | \n",
" 17.551592 | \n",
"
\n",
" \n",
" | 6 | \n",
" 21.769170 | \n",
" 25.918750 | \n",
" 22.015985 | \n",
"
\n",
" \n",
" | 7 | \n",
" 35.964444 | \n",
" 44.066667 | \n",
" 36.917647 | \n",
"
\n",
" \n",
" | 8 | \n",
" 45.700000 | \n",
" 35.950000 | \n",
" 44.200000 | \n",
"
\n",
" \n",
" | All | \n",
" 22.093843 | \n",
" 28.440000 | \n",
" 22.532806 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"CHAS 0 1 All\n",
"RM_bin \n",
"3 25.300000 NaN 25.300000\n",
"4 15.407143 NaN 15.407143\n",
"5 17.200000 22.218182 17.551592\n",
"6 21.769170 25.918750 22.015985\n",
"7 35.964444 44.066667 36.917647\n",
"8 45.700000 35.950000 44.200000\n",
"All 22.093843 28.440000 22.532806"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin RM into unit-width intervals; labels=False stores each row's integer bin index.\n",
"# Re-assigning the column is idempotent and keeps this cell runnable without the groupby cell above.\n",
"bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df.RM, range(0, 10), labels=False)\n",
"\n",
"# Mean MEDV with RM_bin as rows and CHAS as columns; margins=True appends the 'All' row and column.\n",
"# The aggregation is named by string ('mean') because passing the NumPy callable np.mean is\n",
"# deprecated in recent pandas releases and emits a FutureWarning.\n",
"pd.pivot_table(\n",
"    bostonHousing_df,\n",
"    values='MEDV',\n",
"    index=['RM_bin'],\n",
"    columns=['CHAS'],\n",
"    aggfunc='mean',\n",
"    margins=True,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}