From 0f3ac3ae6c19b151f8fd5c0d19ff7d02b64bc22c Mon Sep 17 00:00:00 2001 From: noah Date: Wed, 29 Mar 2023 17:48:15 -0500 Subject: [PATCH] k-NN for Boston Housing data --- .~lock.BostonHousing.csv# | 1 + Schrick-Noah_Learning-Practice-8.ipynb | 141 ++++++++++++++++++++++--- 2 files changed, 126 insertions(+), 16 deletions(-) create mode 100644 .~lock.BostonHousing.csv# diff --git a/.~lock.BostonHousing.csv# b/.~lock.BostonHousing.csv# new file mode 100644 index 0000000..9c34378 --- /dev/null +++ b/.~lock.BostonHousing.csv# @@ -0,0 +1 @@ +,noah,NovaArchSys,29.03.2023 16:42,file:///home/noah/.config/libreoffice/4; \ No newline at end of file diff --git a/Schrick-Noah_Learning-Practice-8.ipynb b/Schrick-Noah_Learning-Practice-8.ipynb index 1f104e5..6f81ae4 100644 --- a/Schrick-Noah_Learning-Practice-8.ipynb +++ b/Schrick-Noah_Learning-Practice-8.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -27,12 +27,15 @@ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import mean_squared_error\n", + "from math import sqrt\n", + "\n", "import matplotlib.pylab as plt\n", "from dmba import classificationSummary, gainsChart\n", "\n", "from sklearn import preprocessing\n", "from sklearn.metrics import accuracy_score\n", - "from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier" + "from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor" ] }, { @@ -49,11 +52,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ - "# Pre-processing" + "# Pre-processing\n", + "housing_df = pd.read_csv('BostonHousing.csv')\n", + "trainData, validData = train_test_split(housing_df, test_size=0.4, random_state=26)" ] }, { @@ -67,11 +72,72 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ - "# a" + "# a\n", + "## Normalize\n", + "scaler = preprocessing.StandardScaler()\n", + "\n", + "predictors = housing_df.columns.values.tolist()\n", + "predictors.remove('CAT. MEDV')\n", + "predictors.remove('MEDV')\n", + "\n", + "scaler.fit(trainData[predictors]) # Note the use of an array of column names\n", + "\n", + "# Transform the full dataset\n", + "housingNorm = pd.concat([pd.DataFrame(scaler.transform(housing_df[predictors]), \n", + " columns=predictors),\n", + " housing_df[['MEDV']]], axis=1)\n", + "\n", + "trainNorm = housingNorm.iloc[trainData.index]\n", + "validNorm = housingNorm.iloc[validData.index]\n", + "#newHousingNorm = pd.DataFrame(scaler.transform(housingNorm), columns=predictors)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " k accuracy\n", + "0 1 0.643141\n", + "1 2 0.720241\n", + "2 3 0.710570\n", + "3 4 0.676896\n" + ] + } + ], + "source": [ + "## K-NN\n", + "results = []\n", + "train_X = trainNorm[predictors]\n", + "train_y = trainNorm['MEDV']\n", + "valid_X = validNorm[predictors]\n", + "valid_y = validNorm['MEDV']\n", + "\n", + "for k in range(1,5):\n", + " knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)\n", + " results.append({\n", + " 'k': k,\n", + " 'accuracy': knn.score(valid_X, valid_y)\n", + " })\n", + "\n", + "# Convert results to a pandas data frame\n", + "results = pd.DataFrame(results)\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The best k is k=2, which gives the highest validation score in the table above (0.720; for a k-NN regressor this score is $R^2$, not classification accuracy). This means predictions on the validation set are most accurate when each record's MEDV is estimated as the average MEDV of its 2 nearest neighbors in the training set." 
] }, { @@ -87,11 +153,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[19.45]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n", + " warnings.warn(\n", + "/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but KNeighborsRegressor was fitted with feature names\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "# b" + "# b\n", + "sample = [[0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10]]\n", + "sample_norm = scaler.transform(sample)\n", + "ans_b = knn.predict(sample_norm)\n", + "print(ans_b)" ] }, { @@ -104,12 +192,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# c" + "Error will be zero, or close to zero. Since the training data was used to build the model, every training record is its own nearest neighbor (distance 0), so checking the training data against itself should yield very high accuracy." ] }, { @@ -126,7 +213,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# d." + "# d.\n", + "This model was built and tuned with a specific k value (k=2). This value of k was the best for the validation data used to select it, so the validation error is optimistically biased; it does not mean that this value of k is the best for future data." ] }, { @@ -143,7 +231,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# e." + "# e.\n", + "\n", + "1. Select k-nearest samples\n", + "2. Compute average value for k-nearest neighbors\n", + "3. Score value with new data\n", + "4. Repeat for all new data samples\n", + "\n", + "If there are several thousand new tracts, then k-NN will take much longer to run. 
Because k-NN does all of its work at prediction time, the neighbor search over the training data will need to be repeated for each of the several thousand new records. The lack of scalability will be a disadvantage. " ] }, { @@ -388,8 +483,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" }, "orig_nbformat": 4 },