{ "cells": [ { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Learning Practice 2 for the University of Tulsa's QM-7063 Data Mining Course\n", "# Dimension Reduction\n", "# Professor: Dr. Abdulrashid, Spring 2023\n", "# Noah L. Schrick - 1492657\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meansdminmaxmedianlengthmiss.val
CRIM3.6135248.6015450.0063288.97620.256515060
ZN11.36363623.3224530.00000100.00000.000005060
INDUS11.1367796.8603530.4600027.74009.690005060
CHAS0.0691700.2539940.000001.00000.000005060
NOX0.5546950.1158780.385000.87100.538005060
RM6.2846340.7026173.561008.78006.208505060
AGE68.57490128.1488612.90000100.000077.500005060
DIS3.7950432.1057101.1296012.12653.207455060
RAD9.5494078.7072591.0000024.00005.000005060
TAX408.237154168.537116187.00000711.0000330.000005060
PTRATIO18.4555342.16494612.6000022.000019.050005060
LSTAT12.6530637.1410621.7300037.970011.360005060
MEDV22.5328069.1971045.0000050.000021.200005060
CAT. MEDV0.1660080.3724560.000001.00000.000005060
\n", "
" ], "text/plain": [ " mean sd min max median length \\\n", "CRIM 3.613524 8.601545 0.00632 88.9762 0.25651 506 \n", "ZN 11.363636 23.322453 0.00000 100.0000 0.00000 506 \n", "INDUS 11.136779 6.860353 0.46000 27.7400 9.69000 506 \n", "CHAS 0.069170 0.253994 0.00000 1.0000 0.00000 506 \n", "NOX 0.554695 0.115878 0.38500 0.8710 0.53800 506 \n", "RM 6.284634 0.702617 3.56100 8.7800 6.20850 506 \n", "AGE 68.574901 28.148861 2.90000 100.0000 77.50000 506 \n", "DIS 3.795043 2.105710 1.12960 12.1265 3.20745 506 \n", "RAD 9.549407 8.707259 1.00000 24.0000 5.00000 506 \n", "TAX 408.237154 168.537116 187.00000 711.0000 330.00000 506 \n", "PTRATIO 18.455534 2.164946 12.60000 22.0000 19.05000 506 \n", "LSTAT 12.653063 7.141062 1.73000 37.9700 11.36000 506 \n", "MEDV 22.532806 9.197104 5.00000 50.0000 21.20000 506 \n", "CAT. MEDV 0.166008 0.372456 0.00000 1.0000 0.00000 506 \n", "\n", " miss.val \n", "CRIM 0 \n", "ZN 0 \n", "INDUS 0 \n", "CHAS 0 \n", "NOX 0 \n", "RM 0 \n", "AGE 0 \n", "DIS 0 \n", "RAD 0 \n", "TAX 0 \n", "PTRATIO 0 \n", "LSTAT 0 \n", "MEDV 0 \n", "CAT. MEDV 0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bostonHousing_df = pd.read_csv('BostonHousing.csv')\n", "bostonHousing_df = bostonHousing_df.rename(columns={'CAT.MEDV': 'CAT_MEDV'})\n", "\n", "# Compute mean, standard dev., min, max, median, length, and missing values for all variables\n", "pd.DataFrame({'mean': bostonHousing_df.mean(),\n", "'sd': bostonHousing_df.std(),\n", "'min': bostonHousing_df.min(),\n", "'max': bostonHousing_df.max(),\n", "'median': bostonHousing_df.median(),\n", "'length': len(bostonHousing_df),\n", "'miss.val': bostonHousing_df.isnull().sum(),\n", "})\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOLSTATMEDVCAT. MEDV
CRIM1.00-0.200.41-0.060.42-0.220.35-0.380.630.580.290.46-0.39-0.15
ZN-0.201.00-0.53-0.04-0.520.31-0.570.66-0.31-0.31-0.39-0.410.360.37
INDUS0.41-0.531.000.060.76-0.390.64-0.710.600.720.380.60-0.48-0.37
CHAS-0.06-0.040.061.000.090.090.09-0.10-0.01-0.04-0.12-0.050.180.11
NOX0.42-0.520.760.091.00-0.300.73-0.770.610.670.190.59-0.43-0.23
RM-0.220.31-0.390.09-0.301.00-0.240.21-0.21-0.29-0.36-0.610.700.64
AGE0.35-0.570.640.090.73-0.241.00-0.750.460.510.260.60-0.38-0.19
DIS-0.380.66-0.71-0.10-0.770.21-0.751.00-0.49-0.53-0.23-0.500.250.12
RAD0.63-0.310.60-0.010.61-0.210.46-0.491.000.910.460.49-0.38-0.20
TAX0.58-0.310.72-0.040.67-0.290.51-0.530.911.000.460.54-0.47-0.27
PTRATIO0.29-0.390.38-0.120.19-0.360.26-0.230.460.461.000.37-0.51-0.44
LSTAT0.46-0.410.60-0.050.59-0.610.60-0.500.490.540.371.00-0.74-0.47
MEDV-0.390.36-0.480.18-0.430.70-0.380.25-0.38-0.47-0.51-0.741.000.79
CAT. MEDV-0.150.37-0.370.11-0.230.64-0.190.12-0.20-0.27-0.44-0.470.791.00
\n", "
" ], "text/plain": [ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", "CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 \n", "ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 \n", "INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 \n", "CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 \n", "NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 \n", "RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 \n", "AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 \n", "DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 \n", "RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 \n", "TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 \n", "PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 \n", "LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 \n", "MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 \n", "CAT. MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 \n", "\n", " PTRATIO LSTAT MEDV CAT. MEDV \n", "CRIM 0.29 0.46 -0.39 -0.15 \n", "ZN -0.39 -0.41 0.36 0.37 \n", "INDUS 0.38 0.60 -0.48 -0.37 \n", "CHAS -0.12 -0.05 0.18 0.11 \n", "NOX 0.19 0.59 -0.43 -0.23 \n", "RM -0.36 -0.61 0.70 0.64 \n", "AGE 0.26 0.60 -0.38 -0.19 \n", "DIS -0.23 -0.50 0.25 0.12 \n", "RAD 0.46 0.49 -0.38 -0.20 \n", "TAX 0.46 0.54 -0.47 -0.27 \n", "PTRATIO 1.00 0.37 -0.51 -0.44 \n", "LSTAT 0.37 1.00 -0.74 -0.47 \n", "MEDV -0.51 -0.74 1.00 0.79 \n", "CAT. 
MEDV -0.44 -0.47 0.79 1.00 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pairwise correlation matrix of all columns, rounded to two decimals\n", "bostonHousing_df.corr().round(decimals=2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RM_bin CHAS\n", "3 0 25.300000\n", "4 0 15.407143\n", "5 0 17.200000\n", " 1 22.218182\n", "6 0 21.769170\n", " 1 25.918750\n", "7 0 35.964444\n", " 1 44.066667\n", "8 0 45.700000\n", " 1 35.950000\n", "Name: MEDV, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bin RM into unit-width intervals with edges 0, 1, ..., 9.\n", "# labels=False stores the integer bin index instead of an Interval label.\n", "rm_bin_edges = range(0, 10)\n", "bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df['RM'], bins=rm_bin_edges, labels=False)\n", "\n", "# Average MEDV within each (RM_bin, CHAS) group:\n", "# group the rows, keep only the MEDV column, then take the mean per group.\n", "medv_by_rm_chas = bostonHousing_df.groupby(['RM_bin', 'CHAS'])['MEDV']\n", "medv_by_rm_chas.mean()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CHAS01All
RM_bin
325.300000NaN25.300000
415.407143NaN15.407143
517.20000022.21818217.551592
621.76917025.91875022.015985
735.96444444.06666736.917647
845.70000035.95000044.200000
All22.09384328.44000022.532806
\n", "
" ], "text/plain": [ "CHAS 0 1 All\n", "RM_bin \n", "3 25.300000 NaN 25.300000\n", "4 15.407143 NaN 15.407143\n", "5 17.200000 22.218182 17.551592\n", "6 21.769170 25.918750 22.015985\n", "7 35.964444 44.066667 36.917647\n", "8 45.700000 35.950000 44.200000\n", "All 22.093843 28.440000 22.532806" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create bins of size 1 for RM\n", "bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df.RM, range(0, 10), labels=False)\n", "\n", "# use pivot_table() to reshape data and generate pivot table\n", "pd.pivot_table(bostonHousing_df, values='MEDV', index=['RM_bin'], columns=['CHAS'], aggfunc=np.mean, margins=True)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }