Compare commits

..

No commits in common. "25105820877e3f6a2380d05352588858f7170c47" and "faa108bc90b4f246c43f30bc4cf1f773f3f893db" have entirely different histories.

View File

@ -14,7 +14,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -27,15 +27,12 @@
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.metrics import mean_squared_error\n",
"from math import sqrt\n",
"\n",
"import matplotlib.pylab as plt\n", "import matplotlib.pylab as plt\n",
"from dmba import classificationSummary, gainsChart\n", "from dmba import classificationSummary, gainsChart\n",
"\n", "\n",
"from sklearn import preprocessing\n", "from sklearn import preprocessing\n",
"from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import accuracy_score\n",
"from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor" "from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier"
] ]
}, },
{ {
@ -52,13 +49,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Pre-processing\n", "# Pre-processing"
"housing_df = pd.read_csv('BostonHousing.csv')\n",
"trainData, validData = train_test_split(housing_df, test_size=0.4, random_state=26)"
] ]
}, },
{ {
@ -72,72 +67,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 68, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# a\n", "# a"
"## Normalize\n",
"scaler = preprocessing.StandardScaler()\n",
"\n",
"predictors = housing_df.columns.values.tolist()\n",
"predictors.remove('CAT. MEDV')\n",
"predictors.remove('MEDV')\n",
"\n",
"scaler.fit(trainData[predictors]) # Note the use of an array of column names\n",
"\n",
"# Transform the full dataset\n",
"housingNorm = pd.concat([pd.DataFrame(scaler.transform(housing_df[predictors]), \n",
" columns=predictors),\n",
" housing_df[['MEDV']]], axis=1)\n",
"\n",
"trainNorm = housingNorm.iloc[trainData.index]\n",
"validNorm = housingNorm.iloc[validData.index]\n",
"#newHousingNorm = pd.DataFrame(scaler.transform(housingNorm), columns=predictors)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" k accuracy\n",
"0 1 0.643141\n",
"1 2 0.720241\n",
"2 3 0.710570\n",
"3 4 0.676896\n"
]
}
],
"source": [
"## K-NN\n",
"results = []\n",
"train_X = trainNorm[predictors]\n",
"train_y = trainNorm['MEDV']\n",
"valid_X = validNorm[predictors]\n",
"valid_y = validNorm['MEDV']\n",
"\n",
"for k in range(1,5):\n",
" knn = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)\n",
" results.append({\n",
" 'k': k,\n",
" 'accuracy': knn.score(valid_X, valid_y)\n",
" })\n",
"\n",
"# Convert results to a pandas data frame\n",
"results = pd.DataFrame(results)\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
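"The best k is k=2, which gives the highest validation score (0.720; for KNeighborsRegressor, score is $R^2$, not classification accuracy). This means predictions on the validation set are most accurate when each record's MEDV is estimated from its 2 nearest neighbors."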
] ]
}, },
{ {
@ -153,33 +87,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 73, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"[19.45]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but StandardScaler was fitted with feature names\n",
" warnings.warn(\n",
"/home/noah/.local/lib/python3.10/site-packages/sklearn/base.py:420: UserWarning: X does not have valid feature names, but KNeighborsRegressor was fitted with feature names\n",
" warnings.warn(\n"
]
}
],
"source": [ "source": [
"# b\n", "# b"
"sample = [[0.2, 0, 7, 0, 0.538, 6, 62, 4.7, 4, 307, 21, 10]]\n",
"sample_norm = scaler.transform(sample)\n",
"ans_b = knn.predict(sample_norm)\n",
"print(ans_b)"
] ]
}, },
{ {
@ -192,11 +104,12 @@
] ]
}, },
{ {
"attachments": {}, "cell_type": "code",
"cell_type": "markdown", "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"The training error will be zero, or close to zero. The training data was used to build the model, so each training record finds itself among its own nearest neighbors, and checking the training data against itself should yield very high accuracy." "# c"
] ]
}, },
{ {
@ -213,8 +126,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# d.\n", "# d."
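"This model was built and tuned with a specific k value (k=2), chosen because it performed best on the validation data. Because the same validation data was used to select k, its error is optimistic: this value of k was the best for the data used, but that does not mean it is the best for future data."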
] ]
}, },
{ {
@ -231,14 +143,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# e.\n", "# e."
"\n",
"1. Select k-nearest samples\n",
"2. Compute average value for k-nearest neighbors\n",
"3. Score value with new data\n",
"4. Repeat for all new data samples\n",
"\n",
"If there are several thousand new tracts, then k-NN will take much longer to run. k-NN has no compact fitted model to apply: the nearest neighbors among the training records must be searched at prediction time, and this is repeated for each of the several thousand new records. This lack of scalability is a disadvantage."
] ]
}, },
{ {
@ -483,22 +388,8 @@
} }
], ],
"metadata": { "metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": { "language_info": {
"codemirror_mode": { "name": "python"
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },