Compare commits

...

2 Commits

Author SHA1 Message Date
2510582087 Removing lock 2023-03-29 17:48:29 -05:00
0f3ac3ae6c k-NN for Boston Housing data 2023-03-29 17:48:15 -05:00

View File

@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@ -27,12 +27,15 @@
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import mean_squared_error\n",
"from math import sqrt\n",
"\n",
"import matplotlib.pylab as plt\n",
"from dmba import classificationSummary, gainsChart\n",
"\n",
"from sklearn import preprocessing\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier"
"from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor"
]
},
{
@ -49,11 +52,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Pre-processing"
"# Pre-processing\n",
"housing_df = pd.read_csv('BostonHousing.csv')\n",
"trainData, validData = train_test_split(housing_df, test_size=0.4, random_state=26)"
]
},
{
@ -67,11 +72,72 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# a"
"# a\n",
"## Normalize\n",
"scaler = preprocessing.StandardScaler()\n",
"\n",
"predictors = housing_df.columns.values.tolist()\n",
"predictors.remove('CAT. MEDV')\n",
"predictors.remove('MEDV')\n",
"\n",
"scaler.fit(trainData[predictors]) # Note the use of an array of column names\n",
"\n",
"# Transform the full dataset\n",
"housingNorm = pd.concat([pd.DataFrame(scaler.transform(housing_df[predictors]), \n",
" columns=predictors),\n",
" housing_df[['MEDV']]], axis=1)\n",
"\n",
"trainNorm = housingNorm.iloc[trainData.index]\n",
"validNorm = housingNorm.iloc[validData.index]\n",
"#newHousingNorm = pd.DataFrame(scaler.transform(housingNorm), columns=predictors)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" k accuracy\n",
"0 1 0.643141\n",
"1 2 0.720241\n",
"2 3 0.710570\n",
"3 4 0.676896\n"
]
}
],
"source": [
"## K-NN\n",
"results = []\n",
"train_X = trainNorm[predictors]\n",
"train_y = trainNorm['MEDV']\n",
"valid_X = validNorm[predictors]\n",
"valid_y = validNorm['MEDV']\n",
"\n",
"for k in range(1,5):\n",
" knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)\n",
" results.append({\n",
" 'k': k,\n",
" 'accuracy': knn.score(valid_X, valid_y)\n",
" })\n",
"\n",
"# Convert results to a pandas data frame\n",
"results = pd.DataFrame(results)\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The best k is k=2. This means the model achieves its highest validation accuracy when each prediction is averaged over the 2 nearest neighbors."
]
},
{
@ -87,11 +153,33 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 73,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[19.45]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
" warnings.warn(\n",
"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but KNeighborsRegressor was fitted with feature names\n",
" warnings.warn(\n"
]
}
],
"source": [
"# b"
"# b\n",
"sample = [[0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10]]\n",
"sample_norm = scaler.transform(sample)\n",
"ans_b = knn.predict(sample_norm)\n",
"print(ans_b)"
]
},
{
@ -104,12 +192,11 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# c"
"Error will be zero, or close to zero. Since the training data was used to build the model, checking the training data against itself should yield very high accuracy."
]
},
{
@ -126,7 +213,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# d."
"# d.\n",
"This model was built and tuned with a specific value of k (k=2). That value was the best for the data used here, but it is not guaranteed to be the best for future data."
]
},
{
@ -143,7 +231,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# e."
"# e.\n",
"\n",
"1. Select k-nearest samples\n",
"2. Compute average value for k-nearest neighbors\n",
"3. Score value with new data\n",
"4. Repeat for all new data samples\n",
"\n",
"If there are several thousand new tracts, then k-NN will take much longer to run: k-NN is a lazy learner, so for each new tract the distances to every training record must be computed to find its nearest neighbors. This lack of scalability at prediction time is a disadvantage."
]
},
{
@ -388,8 +483,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"orig_nbformat": 4
},