Compare commits: faa108bc90 ... 2510582087

2 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 2510582087 | |
| | 0f3ac3ae6c | |

@@ -14,7 +14,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 16,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -27,12 +27,15 @@
 "import pandas as pd\n",
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.naive_bayes import MultinomialNB\n",
 "from sklearn.metrics import mean_squared_error\n",
 "from math import sqrt\n",
 "\n",
 "import matplotlib.pylab as plt\n",
 "from dmba import classificationSummary, gainsChart\n",
-"from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier"
+"\n",
+"from sklearn import preprocessing\n",
+"from sklearn.metrics import accuracy_score\n",
+"from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor"
 ]
 },
 {
@@ -49,11 +52,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 64,
 "metadata": {},
 "outputs": [],
 "source": [
-"# Pre-processing"
+"# Pre-processing\n",
+"housing_df = pd.read_csv('BostonHousing.csv')\n",
+"trainData, validData = train_test_split(housing_df, test_size=0.4, random_state=26)"
 ]
 },
 {
@@ -67,11 +72,72 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 68,
 "metadata": {},
 "outputs": [],
 "source": [
-"# a"
+"# a\n",
+"## Normalize\n",
+"scaler = preprocessing.StandardScaler()\n",
+"\n",
+"predictors = housing_df.columns.values.tolist()\n",
+"predictors.remove('CAT. MEDV')\n",
+"predictors.remove('MEDV')\n",
+"\n",
+"scaler.fit(trainData[predictors]) # Note the use of an array of column names\n",
+"\n",
+"# Transform the full dataset\n",
+"housingNorm = pd.concat([pd.DataFrame(scaler.transform(housing_df[predictors]), \n",
+"                                       columns=predictors),\n",
+"                         housing_df[['MEDV']]], axis=1)\n",
+"\n",
+"trainNorm = housingNorm.iloc[trainData.index]\n",
+"validNorm = housingNorm.iloc[validData.index]\n",
+"#newHousingNorm = pd.DataFrame(scaler.transform(housingNorm), columns=predictors)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 69,
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"   k  accuracy\n",
+"0  1  0.643141\n",
+"1  2  0.720241\n",
+"2  3  0.710570\n",
+"3  4  0.676896\n"
+]
+}
+],
+"source": [
+"## K-NN\n",
+"results = []\n",
+"train_X = trainNorm[predictors]\n",
+"train_y = trainNorm['MEDV']\n",
+"valid_X = validNorm[predictors]\n",
+"valid_y = validNorm['MEDV']\n",
+"\n",
+"for k in range(1,5):\n",
+"    knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)\n",
+"    results.append({\n",
+"        'k': k,\n",
+"        'accuracy': knn.score(valid_X, valid_y)\n",
+"    })\n",
+"\n",
+"# Convert results to a pandas data frame\n",
+"results = pd.DataFrame(results)\n",
+"print(results)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"The best k is k=2. This means we have greater accuracy when each point has its nearest 2 neighbors identified."
+]
+},
 {
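
A note on the k-selection cell in this hunk: for `KNeighborsRegressor`, `.score()` returns R^2 rather than classification accuracy, since `MEDV` is a continuous target, so the "accuracy" column is really R^2. Below is a minimal sketch (not part of the commit) that reuses `trainNorm`, `validNorm`, and `predictors` from the cells above, together with the `mean_squared_error` and `sqrt` imports already added, to report validation RMSE alongside R^2 for each k:

```python
# Sketch (not part of the commit): compare k values by validation RMSE as well
# as the R^2 that KNeighborsRegressor.score() returns.
# Assumes trainNorm, validNorm, and predictors exist as defined above.
from math import sqrt

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

train_X, train_y = trainNorm[predictors], trainNorm['MEDV']
valid_X, valid_y = validNorm[predictors], validNorm['MEDV']

results = []
for k in range(1, 5):
    knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'r2': knn.score(valid_X, valid_y),   # the notebook labels this "accuracy"
        'rmse': sqrt(mean_squared_error(valid_y, knn.predict(valid_X))),
    })
print(pd.DataFrame(results))
```
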
@@ -87,11 +153,33 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 73,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[19.45]\n"
+]
+},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
+" warnings.warn(\n",
+"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but KNeighborsRegressor was fitted with feature names\n",
+" warnings.warn(\n"
+]
+}
+],
 "source": [
-"# b"
+"# b\n",
+"sample = [[0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10]]\n",
+"sample_norm = scaler.transform(sample)\n",
+"ans_b = knn.predict(sample_norm)\n",
+"print(ans_b)"
 ]
 },
 {
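
The stderr output in this hunk appears because `scaler` and `knn` were fitted on DataFrames (which carry feature names) while `sample` is a plain list of lists. A small sketch (not part of the commit) of one way to keep the warning from firing, assuming `scaler`, `knn`, and `predictors` from the earlier cells:

```python
# Sketch (not part of the commit): pass the new record through the pipeline as
# a DataFrame with the same predictor columns, so the fitted feature names match.
# Assumes scaler, knn, and predictors exist as defined above.
import pandas as pd

sample = [[0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10]]
sample_df = pd.DataFrame(sample, columns=predictors)

# Wrap the scaled values back into a DataFrame so knn.predict also sees names.
sample_norm = pd.DataFrame(scaler.transform(sample_df), columns=predictors)
print(knn.predict(sample_norm))
```
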
@@ -104,12 +192,11 @@
 ]
 },
 {
-"cell_type": "code",
-"execution_count": null,
+"attachments": {},
+"cell_type": "markdown",
 "metadata": {},
-"outputs": [],
 "source": [
-"# c"
+"Error will be zero, or close to zero. Since the training data was used to build the model, checking the training data against itself should yield very high accuracy."
 ]
 },
 {
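
The answer for part (c) can be checked directly by scoring the training records with a model fitted on them: with k=1 every record is its own nearest neighbor, so the training error is exactly zero, while k=2 gives a small but generally nonzero error. A brief sketch (not part of the commit), assuming `train_X` and `train_y` from the K-NN cell:

```python
# Sketch (not part of the commit): training-set error for small k.
# Assumes train_X and train_y exist as defined above.
from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

for k in (1, 2):
    knn_k = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)
    rmse = sqrt(mean_squared_error(train_y, knn_k.predict(train_X)))
    print(f"k={k}: training RMSE = {rmse:.3f}")
```
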
@@ -126,7 +213,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# d."
+"# d.\n",
+"This model was built and tuned with a specific k value (k=2). This value of k was the best for the data used, but does not mean that this value of k is the best for future data."
 ]
 },
 {
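
One way to address the concern in part (d) is to choose k by cross-validation on the training partition rather than a single train/validation split, so the selected k is less tied to one particular random split. A sketch (not part of the commit) using scikit-learn's `GridSearchCV`, assuming `train_X` and `train_y` from the K-NN cell:

```python
# Sketch (not part of the commit): pick k by 5-fold cross-validation on the
# training partition. Assumes train_X and train_y exist as defined above.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

param_grid = {'n_neighbors': list(range(1, 11))}
search = GridSearchCV(KNeighborsRegressor(), param_grid,
                      cv=5, scoring='neg_root_mean_squared_error')
search.fit(train_X, train_y)
print(search.best_params_, -search.best_score_)  # best k and its CV RMSE
```
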
@@ -143,7 +231,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# e."
+"# e.\n",
+"\n",
+"1. Select k-nearest samples\n",
+"2. Compute average value for k-nearest neighbors\n",
+"3. Score value with new data\n",
+"4. Repeat for all new data samples\n",
+"\n",
+"If there are several thousands of new tracts, then k-NN will take much longer to run. The model will need to be rebuilt numerous times over the several thousand pieces of data. The lack of scalability will be a disadvantage. "
 ]
 },
 {
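
For part (e), scoring several thousand new tracts can be done in one vectorized `predict` call, though every prediction still has to compute distances to all stored training records, so the cost grows with (new records) x (training records); that per-query search is where the runtime concern comes from. A sketch (not part of the commit), where `new_tracts_df` is a hypothetical DataFrame of new tracts and `scaler`, `knn`, and `predictors` come from the earlier cells:

```python
# Sketch (not part of the commit): batch-scoring many new tracts.
# new_tracts_df is a hypothetical DataFrame with the same predictor columns.
import pandas as pd

new_norm = pd.DataFrame(scaler.transform(new_tracts_df[predictors]),
                        columns=predictors)
predicted_medv = knn.predict(new_norm)  # one call; distance search over all training rows
print(predicted_medv[:5])
```
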
@@ -388,8 +483,22 @@
 }
 ],
 "metadata": {
+"kernelspec": {
+"display_name": "Python 3",
+"language": "python",
+"name": "python3"
+},
 "language_info": {
-"name": "python"
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.10.10"
 },
 "orig_nbformat": 4
 },