927 lines
54 KiB
Plaintext
927 lines
54 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Lecture 3 for the University of Tulsa's QM-7063 Data Mining Course\n",
|
|
"# Dimension Reduction\n",
|
|
"# Professor: Dr. Abdulrashid, Spring 2023\n",
|
|
"# Noah L. Schrick - 1492657\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.decomposition import PCA\n",
|
|
"import matplotlib.pyplot as plt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>mean</th>\n",
|
|
" <th>sd</th>\n",
|
|
" <th>min</th>\n",
|
|
" <th>max</th>\n",
|
|
" <th>median</th>\n",
|
|
" <th>length</th>\n",
|
|
" <th>miss.val</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>CRIM</th>\n",
|
|
" <td>3.613524</td>\n",
|
|
" <td>8.601545</td>\n",
|
|
" <td>0.00632</td>\n",
|
|
" <td>88.9762</td>\n",
|
|
" <td>0.25651</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>ZN</th>\n",
|
|
" <td>11.363636</td>\n",
|
|
" <td>23.322453</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>100.0000</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>INDUS</th>\n",
|
|
" <td>11.136779</td>\n",
|
|
" <td>6.860353</td>\n",
|
|
" <td>0.46000</td>\n",
|
|
" <td>27.7400</td>\n",
|
|
" <td>9.69000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>CHAS</th>\n",
|
|
" <td>0.069170</td>\n",
|
|
" <td>0.253994</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>NOX</th>\n",
|
|
" <td>0.554695</td>\n",
|
|
" <td>0.115878</td>\n",
|
|
" <td>0.38500</td>\n",
|
|
" <td>0.8710</td>\n",
|
|
" <td>0.53800</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>RM</th>\n",
|
|
" <td>6.284634</td>\n",
|
|
" <td>0.702617</td>\n",
|
|
" <td>3.56100</td>\n",
|
|
" <td>8.7800</td>\n",
|
|
" <td>6.20850</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>AGE</th>\n",
|
|
" <td>68.574901</td>\n",
|
|
" <td>28.148861</td>\n",
|
|
" <td>2.90000</td>\n",
|
|
" <td>100.0000</td>\n",
|
|
" <td>77.50000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>DIS</th>\n",
|
|
" <td>3.795043</td>\n",
|
|
" <td>2.105710</td>\n",
|
|
" <td>1.12960</td>\n",
|
|
" <td>12.1265</td>\n",
|
|
" <td>3.20745</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>RAD</th>\n",
|
|
" <td>9.549407</td>\n",
|
|
" <td>8.707259</td>\n",
|
|
" <td>1.00000</td>\n",
|
|
" <td>24.0000</td>\n",
|
|
" <td>5.00000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>TAX</th>\n",
|
|
" <td>408.237154</td>\n",
|
|
" <td>168.537116</td>\n",
|
|
" <td>187.00000</td>\n",
|
|
" <td>711.0000</td>\n",
|
|
" <td>330.00000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>PTRATIO</th>\n",
|
|
" <td>18.455534</td>\n",
|
|
" <td>2.164946</td>\n",
|
|
" <td>12.60000</td>\n",
|
|
" <td>22.0000</td>\n",
|
|
" <td>19.05000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>LSTAT</th>\n",
|
|
" <td>12.653063</td>\n",
|
|
" <td>7.141062</td>\n",
|
|
" <td>1.73000</td>\n",
|
|
" <td>37.9700</td>\n",
|
|
" <td>11.36000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>MEDV</th>\n",
|
|
" <td>22.532806</td>\n",
|
|
" <td>9.197104</td>\n",
|
|
" <td>5.00000</td>\n",
|
|
" <td>50.0000</td>\n",
|
|
" <td>21.20000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>CAT_MEDV</th>\n",
|
|
" <td>0.166008</td>\n",
|
|
" <td>0.372456</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>0.00000</td>\n",
|
|
" <td>506</td>\n",
|
|
" <td>0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" mean sd min max median length \\\n",
|
|
"CRIM 3.613524 8.601545 0.00632 88.9762 0.25651 506 \n",
|
|
"ZN 11.363636 23.322453 0.00000 100.0000 0.00000 506 \n",
|
|
"INDUS 11.136779 6.860353 0.46000 27.7400 9.69000 506 \n",
|
|
"CHAS 0.069170 0.253994 0.00000 1.0000 0.00000 506 \n",
|
|
"NOX 0.554695 0.115878 0.38500 0.8710 0.53800 506 \n",
|
|
"RM 6.284634 0.702617 3.56100 8.7800 6.20850 506 \n",
|
|
"AGE 68.574901 28.148861 2.90000 100.0000 77.50000 506 \n",
|
|
"DIS 3.795043 2.105710 1.12960 12.1265 3.20745 506 \n",
|
|
"RAD 9.549407 8.707259 1.00000 24.0000 5.00000 506 \n",
|
|
"TAX 408.237154 168.537116 187.00000 711.0000 330.00000 506 \n",
|
|
"PTRATIO 18.455534 2.164946 12.60000 22.0000 19.05000 506 \n",
|
|
"LSTAT 12.653063 7.141062 1.73000 37.9700 11.36000 506 \n",
|
|
"MEDV 22.532806 9.197104 5.00000 50.0000 21.20000 506 \n",
|
|
"CAT_MEDV 0.166008 0.372456 0.00000 1.0000 0.00000 506 \n",
|
|
"\n",
|
|
" miss.val \n",
|
|
"CRIM 0 \n",
|
|
"ZN 0 \n",
|
|
"INDUS 0 \n",
|
|
"CHAS 0 \n",
|
|
"NOX 0 \n",
|
|
"RM 0 \n",
|
|
"AGE 0 \n",
|
|
"DIS 0 \n",
|
|
"RAD 0 \n",
|
|
"TAX 0 \n",
|
|
"PTRATIO 0 \n",
|
|
"LSTAT 0 \n",
|
|
"MEDV 0 \n",
|
|
"CAT_MEDV 0 "
|
|
]
|
|
},
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Load the Boston Housing data; CAT.MEDV is renamed to CAT_MEDV so later cells can use attribute access\n",
|
|
"bostonHousing_df = pd.read_csv('BostonHousing.csv').rename(columns={'CAT.MEDV': 'CAT_MEDV'})\n",
|
|
"\n",
|
|
"# Per-variable summary table: mean, standard deviation, min, max, median, row count, and missing-value count\n",
|
|
"pd.DataFrame({\n",
|
|
"    'mean': bostonHousing_df.mean(),\n",
|
|
"    'sd': bostonHousing_df.std(),\n",
|
|
"    'min': bostonHousing_df.min(),\n",
|
|
"    'max': bostonHousing_df.max(),\n",
|
|
"    'median': bostonHousing_df.median(),\n",
|
|
"    'length': len(bostonHousing_df),  # scalar row count, repeated for every variable\n",
|
|
"    'miss.val': bostonHousing_df.isna().sum(),\n",
|
|
"})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>CRIM</th>\n",
|
|
" <th>ZN</th>\n",
|
|
" <th>INDUS</th>\n",
|
|
" <th>CHAS</th>\n",
|
|
" <th>NOX</th>\n",
|
|
" <th>RM</th>\n",
|
|
" <th>AGE</th>\n",
|
|
" <th>DIS</th>\n",
|
|
" <th>RAD</th>\n",
|
|
" <th>TAX</th>\n",
|
|
" <th>PTRATIO</th>\n",
|
|
" <th>LSTAT</th>\n",
|
|
" <th>MEDV</th>\n",
|
|
" <th>CAT_MEDV</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>CRIM</th>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.20</td>\n",
|
|
" <td>0.41</td>\n",
|
|
" <td>-0.06</td>\n",
|
|
" <td>0.42</td>\n",
|
|
" <td>-0.22</td>\n",
|
|
" <td>0.35</td>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>0.63</td>\n",
|
|
" <td>0.58</td>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>-0.15</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>ZN</th>\n",
|
|
" <td>-0.20</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.53</td>\n",
|
|
" <td>-0.04</td>\n",
|
|
" <td>-0.52</td>\n",
|
|
" <td>0.31</td>\n",
|
|
" <td>-0.57</td>\n",
|
|
" <td>0.66</td>\n",
|
|
" <td>-0.31</td>\n",
|
|
" <td>-0.31</td>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>-0.41</td>\n",
|
|
" <td>0.36</td>\n",
|
|
" <td>0.37</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>INDUS</th>\n",
|
|
" <td>0.41</td>\n",
|
|
" <td>-0.53</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>0.76</td>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>0.64</td>\n",
|
|
" <td>-0.71</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>0.72</td>\n",
|
|
" <td>0.38</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>-0.48</td>\n",
|
|
" <td>-0.37</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>CHAS</th>\n",
|
|
" <td>-0.06</td>\n",
|
|
" <td>-0.04</td>\n",
|
|
" <td>0.06</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>-0.10</td>\n",
|
|
" <td>-0.01</td>\n",
|
|
" <td>-0.04</td>\n",
|
|
" <td>-0.12</td>\n",
|
|
" <td>-0.05</td>\n",
|
|
" <td>0.18</td>\n",
|
|
" <td>0.11</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>NOX</th>\n",
|
|
" <td>0.42</td>\n",
|
|
" <td>-0.52</td>\n",
|
|
" <td>0.76</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.30</td>\n",
|
|
" <td>0.73</td>\n",
|
|
" <td>-0.77</td>\n",
|
|
" <td>0.61</td>\n",
|
|
" <td>0.67</td>\n",
|
|
" <td>0.19</td>\n",
|
|
" <td>0.59</td>\n",
|
|
" <td>-0.43</td>\n",
|
|
" <td>-0.23</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>RM</th>\n",
|
|
" <td>-0.22</td>\n",
|
|
" <td>0.31</td>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>-0.30</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.24</td>\n",
|
|
" <td>0.21</td>\n",
|
|
" <td>-0.21</td>\n",
|
|
" <td>-0.29</td>\n",
|
|
" <td>-0.36</td>\n",
|
|
" <td>-0.61</td>\n",
|
|
" <td>0.70</td>\n",
|
|
" <td>0.64</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>AGE</th>\n",
|
|
" <td>0.35</td>\n",
|
|
" <td>-0.57</td>\n",
|
|
" <td>0.64</td>\n",
|
|
" <td>0.09</td>\n",
|
|
" <td>0.73</td>\n",
|
|
" <td>-0.24</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.75</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>0.51</td>\n",
|
|
" <td>0.26</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>-0.19</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>DIS</th>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>0.66</td>\n",
|
|
" <td>-0.71</td>\n",
|
|
" <td>-0.10</td>\n",
|
|
" <td>-0.77</td>\n",
|
|
" <td>0.21</td>\n",
|
|
" <td>-0.75</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.49</td>\n",
|
|
" <td>-0.53</td>\n",
|
|
" <td>-0.23</td>\n",
|
|
" <td>-0.50</td>\n",
|
|
" <td>0.25</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>RAD</th>\n",
|
|
" <td>0.63</td>\n",
|
|
" <td>-0.31</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>-0.01</td>\n",
|
|
" <td>0.61</td>\n",
|
|
" <td>-0.21</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>-0.49</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.91</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>0.49</td>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>-0.20</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>TAX</th>\n",
|
|
" <td>0.58</td>\n",
|
|
" <td>-0.31</td>\n",
|
|
" <td>0.72</td>\n",
|
|
" <td>-0.04</td>\n",
|
|
" <td>0.67</td>\n",
|
|
" <td>-0.29</td>\n",
|
|
" <td>0.51</td>\n",
|
|
" <td>-0.53</td>\n",
|
|
" <td>0.91</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>0.54</td>\n",
|
|
" <td>-0.47</td>\n",
|
|
" <td>-0.27</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>PTRATIO</th>\n",
|
|
" <td>0.29</td>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>0.38</td>\n",
|
|
" <td>-0.12</td>\n",
|
|
" <td>0.19</td>\n",
|
|
" <td>-0.36</td>\n",
|
|
" <td>0.26</td>\n",
|
|
" <td>-0.23</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.37</td>\n",
|
|
" <td>-0.51</td>\n",
|
|
" <td>-0.44</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>LSTAT</th>\n",
|
|
" <td>0.46</td>\n",
|
|
" <td>-0.41</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>-0.05</td>\n",
|
|
" <td>0.59</td>\n",
|
|
" <td>-0.61</td>\n",
|
|
" <td>0.60</td>\n",
|
|
" <td>-0.50</td>\n",
|
|
" <td>0.49</td>\n",
|
|
" <td>0.54</td>\n",
|
|
" <td>0.37</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>-0.74</td>\n",
|
|
" <td>-0.47</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>MEDV</th>\n",
|
|
" <td>-0.39</td>\n",
|
|
" <td>0.36</td>\n",
|
|
" <td>-0.48</td>\n",
|
|
" <td>0.18</td>\n",
|
|
" <td>-0.43</td>\n",
|
|
" <td>0.70</td>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>0.25</td>\n",
|
|
" <td>-0.38</td>\n",
|
|
" <td>-0.47</td>\n",
|
|
" <td>-0.51</td>\n",
|
|
" <td>-0.74</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" <td>0.79</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>CAT_MEDV</th>\n",
|
|
" <td>-0.15</td>\n",
|
|
" <td>0.37</td>\n",
|
|
" <td>-0.37</td>\n",
|
|
" <td>0.11</td>\n",
|
|
" <td>-0.23</td>\n",
|
|
" <td>0.64</td>\n",
|
|
" <td>-0.19</td>\n",
|
|
" <td>0.12</td>\n",
|
|
" <td>-0.20</td>\n",
|
|
" <td>-0.27</td>\n",
|
|
" <td>-0.44</td>\n",
|
|
" <td>-0.47</td>\n",
|
|
" <td>0.79</td>\n",
|
|
" <td>1.00</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
|
|
"CRIM 1.00 -0.20 0.41 -0.06 0.42 -0.22 0.35 -0.38 0.63 0.58 \n",
|
|
"ZN -0.20 1.00 -0.53 -0.04 -0.52 0.31 -0.57 0.66 -0.31 -0.31 \n",
|
|
"INDUS 0.41 -0.53 1.00 0.06 0.76 -0.39 0.64 -0.71 0.60 0.72 \n",
|
|
"CHAS -0.06 -0.04 0.06 1.00 0.09 0.09 0.09 -0.10 -0.01 -0.04 \n",
|
|
"NOX 0.42 -0.52 0.76 0.09 1.00 -0.30 0.73 -0.77 0.61 0.67 \n",
|
|
"RM -0.22 0.31 -0.39 0.09 -0.30 1.00 -0.24 0.21 -0.21 -0.29 \n",
|
|
"AGE 0.35 -0.57 0.64 0.09 0.73 -0.24 1.00 -0.75 0.46 0.51 \n",
|
|
"DIS -0.38 0.66 -0.71 -0.10 -0.77 0.21 -0.75 1.00 -0.49 -0.53 \n",
|
|
"RAD 0.63 -0.31 0.60 -0.01 0.61 -0.21 0.46 -0.49 1.00 0.91 \n",
|
|
"TAX 0.58 -0.31 0.72 -0.04 0.67 -0.29 0.51 -0.53 0.91 1.00 \n",
|
|
"PTRATIO 0.29 -0.39 0.38 -0.12 0.19 -0.36 0.26 -0.23 0.46 0.46 \n",
|
|
"LSTAT 0.46 -0.41 0.60 -0.05 0.59 -0.61 0.60 -0.50 0.49 0.54 \n",
|
|
"MEDV -0.39 0.36 -0.48 0.18 -0.43 0.70 -0.38 0.25 -0.38 -0.47 \n",
|
|
"CAT_MEDV -0.15 0.37 -0.37 0.11 -0.23 0.64 -0.19 0.12 -0.20 -0.27 \n",
|
|
"\n",
|
|
" PTRATIO LSTAT MEDV CAT_MEDV \n",
|
|
"CRIM 0.29 0.46 -0.39 -0.15 \n",
|
|
"ZN -0.39 -0.41 0.36 0.37 \n",
|
|
"INDUS 0.38 0.60 -0.48 -0.37 \n",
|
|
"CHAS -0.12 -0.05 0.18 0.11 \n",
|
|
"NOX 0.19 0.59 -0.43 -0.23 \n",
|
|
"RM -0.36 -0.61 0.70 0.64 \n",
|
|
"AGE 0.26 0.60 -0.38 -0.19 \n",
|
|
"DIS -0.23 -0.50 0.25 0.12 \n",
|
|
"RAD 0.46 0.49 -0.38 -0.20 \n",
|
|
"TAX 0.46 0.54 -0.47 -0.27 \n",
|
|
"PTRATIO 1.00 0.37 -0.51 -0.44 \n",
|
|
"LSTAT 0.37 1.00 -0.74 -0.47 \n",
|
|
"MEDV -0.51 -0.74 1.00 0.79 \n",
|
|
"CAT_MEDV -0.44 -0.47 0.79 1.00 "
|
|
]
|
|
},
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Correlation matrix of all columns, rounded to two decimals for display\n",
|
|
"bostonHousing_df.corr().round(decimals=2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"RM_bin CHAS\n",
|
|
"3 0 25.300000\n",
|
|
"4 0 15.407143\n",
|
|
"5 0 17.200000\n",
|
|
" 1 22.218182\n",
|
|
"6 0 21.769170\n",
|
|
" 1 25.918750\n",
|
|
"7 0 35.964444\n",
|
|
" 1 44.066667\n",
|
|
"8 0 45.700000\n",
|
|
" 1 35.950000\n",
|
|
"Name: MEDV, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Bin RM into unit-width intervals (edges 0 through 9); labels=False stores integer bin codes instead of interval labels\n",
|
|
"bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df['RM'], bins=range(0, 10), labels=False)\n",
|
|
"\n",
|
|
"# Average MEDV for each (RM_bin, CHAS) combination:\n",
|
|
"# group the rows, keep only the MEDV column, then take the mean within each group\n",
|
|
"bostonHousing_df.groupby(by=['RM_bin', 'CHAS'])['MEDV'].mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th>CHAS</th>\n",
|
|
" <th>0</th>\n",
|
|
" <th>1</th>\n",
|
|
" <th>All</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>RM_bin</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>25.300000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>25.300000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>15.407143</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>15.407143</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>5</th>\n",
|
|
" <td>17.200000</td>\n",
|
|
" <td>22.218182</td>\n",
|
|
" <td>17.551592</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>6</th>\n",
|
|
" <td>21.769170</td>\n",
|
|
" <td>25.918750</td>\n",
|
|
" <td>22.015985</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>7</th>\n",
|
|
" <td>35.964444</td>\n",
|
|
" <td>44.066667</td>\n",
|
|
" <td>36.917647</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>8</th>\n",
|
|
" <td>45.700000</td>\n",
|
|
" <td>35.950000</td>\n",
|
|
" <td>44.200000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>All</th>\n",
|
|
" <td>22.093843</td>\n",
|
|
" <td>28.440000</td>\n",
|
|
" <td>22.532806</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
"CHAS 0 1 All\n",
|
|
"RM_bin \n",
|
|
"3 25.300000 NaN 25.300000\n",
|
|
"4 15.407143 NaN 15.407143\n",
|
|
"5 17.200000 22.218182 17.551592\n",
|
|
"6 21.769170 25.918750 22.015985\n",
|
|
"7 35.964444 44.066667 36.917647\n",
|
|
"8 45.700000 35.950000 44.200000\n",
|
|
"All 22.093843 28.440000 22.532806"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Recompute the unit-width RM bins so this cell does not rely on RM_bin already existing\n",
|
|
"bostonHousing_df['RM_bin'] = pd.cut(bostonHousing_df.RM, range(0, 10), labels=False)\n",
|
|
"\n",
|
|
"# Pivot to the mean MEDV with RM_bin as rows and CHAS as columns; margins=True adds the 'All' row and column.\n",
|
|
"# aggfunc is given as the string 'mean' because passing np.mean is deprecated in recent pandas releases.\n",
|
|
"pd.pivot_table(\n",
|
|
"    bostonHousing_df,\n",
|
|
"    values='MEDV',\n",
|
|
"    index=['RM_bin'],\n",
|
|
"    columns=['CHAS'],\n",
|
|
"    aggfunc='mean',\n",
|
|
"    margins=True,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"## Stacked Bar Charts\n",
|
|
"# Cross-tabulate CAT_MEDV (rows) against ZN (columns) to get counts\n",
|
|
"tbl = pd.crosstab(bostonHousing_df.CAT_MEDV, bostonHousing_df.ZN)\n",
|
|
"\n",
|
|
"# Divide by the column totals so the counts for each ZN value become proportions that sum to 1\n",
|
|
"propTbl = tbl / tbl.sum()\n",
|
|
"\n",
|
|
"# Transpose so each ZN value is one bar, stacked by CAT_MEDV\n",
|
|
"ax = propTbl.transpose().plot(kind='bar', stacked=True)\n",
|
|
"ax.set_title('Distribution of CAT_MEDV by ZN')\n",
|
|
"ax.set_ylabel('Proportion')\n",
|
|
"# Place the legend outside the axes, to the right of the plot\n",
|
|
"ax.legend(title='CAT_MEDV', loc='center left', bbox_to_anchor=(1, 0.5))\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 53,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>PC1</th>\n",
|
|
" <th>PC2</th>\n",
|
|
" <th>PC3</th>\n",
|
|
" <th>PC4</th>\n",
|
|
" <th>PC5</th>\n",
|
|
" <th>PC6</th>\n",
|
|
" <th>PC7</th>\n",
|
|
" <th>PC8</th>\n",
|
|
" <th>PC9</th>\n",
|
|
" <th>PC10</th>\n",
|
|
" <th>PC11</th>\n",
|
|
" <th>PC12</th>\n",
|
|
" <th>PC13</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>Standard deviation</th>\n",
|
|
" <td>83.7641</td>\n",
|
|
" <td>70.9143</td>\n",
|
|
" <td>22.6437</td>\n",
|
|
" <td>19.1815</td>\n",
|
|
" <td>8.4232</td>\n",
|
|
" <td>2.0917</td>\n",
|
|
" <td>1.6994</td>\n",
|
|
" <td>0.7796</td>\n",
|
|
" <td>0.6578</td>\n",
|
|
" <td>0.3704</td>\n",
|
|
" <td>0.1864</td>\n",
|
|
" <td>0.063</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Proportion of variance</th>\n",
|
|
" <td>0.5395</td>\n",
|
|
" <td>0.3867</td>\n",
|
|
" <td>0.0394</td>\n",
|
|
" <td>0.0283</td>\n",
|
|
" <td>0.0055</td>\n",
|
|
" <td>0.0003</td>\n",
|
|
" <td>0.0002</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" <td>0.000</td>\n",
|
|
" <td>0.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Cumulative proportion</th>\n",
|
|
" <td>0.5395</td>\n",
|
|
" <td>0.9262</td>\n",
|
|
" <td>0.9656</td>\n",
|
|
" <td>0.9939</td>\n",
|
|
" <td>0.9993</td>\n",
|
|
" <td>0.9997</td>\n",
|
|
" <td>0.9999</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>1.0000</td>\n",
|
|
" <td>1.000</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" PC1 PC2 PC3 PC4 PC5 PC6 \\\n",
|
|
"Standard deviation 83.7641 70.9143 22.6437 19.1815 8.4232 2.0917 \n",
|
|
"Proportion of variance 0.5395 0.3867 0.0394 0.0283 0.0055 0.0003 \n",
|
|
"Cumulative proportion 0.5395 0.9262 0.9656 0.9939 0.9993 0.9997 \n",
|
|
"\n",
|
|
" PC7 PC8 PC9 PC10 PC11 PC12 PC13 \n",
|
|
"Standard deviation 1.6994 0.7796 0.6578 0.3704 0.1864 0.063 0.0 \n",
|
|
"Proportion of variance 0.0002 0.0000 0.0000 0.0000 0.0000 0.000 0.0 \n",
|
|
"Cumulative proportion 0.9999 1.0000 1.0000 1.0000 1.0000 1.000 1.0 "
|
|
]
|
|
},
|
|
"execution_count": 53,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"## PCA\n",
|
|
"cereals_df = pd.read_csv('Cereals.csv')\n",
|
|
"\n",
|
|
"# Fit PCA on the columns from position 3 onward, using only rows with no missing values.\n",
|
|
"# NOTE(review): presumably the first three columns are non-numeric identifiers -- confirm against Cereals.csv.\n",
|
|
"# No scaling is applied before the fit.\n",
|
|
"pcs = PCA()\n",
|
|
"pcs.fit(cereals_df.iloc[:, 3:].dropna(axis=0))\n",
|
|
"\n",
|
|
"# One row per component: standard deviation, share of total variance, and running total of that share\n",
|
|
"pcsSummary_df = pd.DataFrame({\n",
|
|
"    'Standard deviation': np.sqrt(pcs.explained_variance_),\n",
|
|
"    'Proportion of variance': pcs.explained_variance_ratio_,\n",
|
|
"    'Cumulative proportion': np.cumsum(pcs.explained_variance_ratio_),\n",
|
|
"})\n",
|
|
"\n",
|
|
"# Transpose so the components become columns, then label them PC1, PC2, ...\n",
|
|
"pcsSummary_df = pcsSummary_df.transpose()\n",
|
|
"pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, len(pcsSummary_df.columns) + 1)]\n",
|
|
"pcsSummary_df.round(4)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 57,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"array([[ 379.63089542, -188.68156228],\n",
|
|
" [-188.68156228, 197.32632105]])"
|
|
]
|
|
},
|
|
"execution_count": 57,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# 2x2 sample covariance matrix of calories and rating\n",
|
|
"np.cov(cereals_df['calories'], cereals_df['rating'])"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"orig_nbformat": 4
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|