This is a machine learning question. Using the Kaggle diamonds dataset, build a KNN-based estimator...

Using the Kaggle diamonds dataset, build a KNN-based estimator for estimating the price of a diamond and propose an appropriate K value.

Please use Python and Google Colab format. Thank you!


import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import pandas as pd

import cufflinks as cf

import sklearn

from sklearn import svm, preprocessing

import seaborn as sns

import plotly.graph_objs as go

import plotly.plotly as py

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


import os

# Load the diamonds table; the path is relative to the notebook's working
# directory.
diamonds_csv = '../input/diamonds.csv'
df = pd.read_csv(diamonds_csv)

# Exploratory plot: one price distribution per value of the 'cut' column,
# drawn on a shared grid and told apart by hue, with a legend.
price_by_cut = sns.FacetGrid(df, hue='cut', height=6)
price_by_cut.map(sns.distplot, 'price')
price_by_cut.add_legend()


# Integer codes for the three categorical columns. Each scale is listed in the
# order its codes are assigned (presumably worst -> best grade; confirm
# against the dataset documentation).
cut_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'], start=1)
}
clarity_dict = {
    grade: rank
    for rank, grade in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1)
}
# Colour letters run D..J while their codes count down from 7 to 1.
color_dict = dict(zip('DEFGHIJ', range(7, 0, -1)))

# Replace each label with its integer code. Series.map yields NaN for any
# label that is missing from the mapping.
for column, codes in (('cut', cut_dict),
                      ('clarity', clarity_dict),
                      ('color', color_dict)):
    df[column] = df[column].map(codes)

# 'Unnamed: 0' looks like the CSV's saved row index -- TODO confirm; it is
# removed so it cannot be used as a feature.
df = df.drop(columns='Unnamed: 0')



# Shuffle the rows with a fixed seed so the hold-out slice taken below is a
# reproducible random sample instead of following the CSV's row order.
df = sklearn.utils.shuffle(df, random_state=42)

# Feature matrix: every column except the target. Target vector: price.
X = df.drop(['price'], axis=1).values
y = df['price'].values.astype(float)

# The last `test_size` shuffled rows are held out for evaluation.
test_size = 200

# Standardise the features with the mean/std of the TRAINING rows only.
# Computing the statistics over all rows before splitting (as
# preprocessing.scale(X) did) lets information about the hold-out rows leak
# into the training data. Scaling itself matters because the neighbour search
# further down is distance-based, so unscaled wide-range columns would dominate.
x_scaler = preprocessing.StandardScaler().fit(X[:-test_size])
X = x_scaler.transform(X)

# The target stays standardised, again using training-row statistics only.
# StandardScaler works on 2-D input, hence the reshape to a single column and
# the ravel back to 1-D. y_scaler.inverse_transform() converts standardised
# values back to the original price units.
y_scaler = preprocessing.StandardScaler().fit(y[:-test_size].reshape(-1, 1))
y = y_scaler.transform(y.reshape(-1, 1)).ravel()

# Train on everything but the last `test_size` rows; evaluate on those rows.
X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

from sklearn.neighbors import KNeighborsRegressor

# Candidate neighbour counts to compare.
k_values = range(1, 20)

# score[i] holds the hold-out .score() for k_values[i]. For scikit-learn
# regressors .score() is the coefficient of determination (R^2), not a
# classification accuracy, despite the wording of the message printed below.
score = []
for k in k_values:
    # The estimator must be constructed and then fitted; the previous line had
    # an unbalanced parenthesis and no .fit(X_train, ...) call, so it did not
    # parse. weights='distance' makes closer neighbours count more, and p=1
    # selects the Manhattan (L1) metric.
    clf = KNeighborsRegressor(n_neighbors=k, weights='distance', p=1)
    clf.fit(X_train, y_train)
    score.append(clf.score(X_test, y_test))

# First K that reaches the highest hold-out score.
k_max = k_values[score.index(max(score))]

# NOTE(review): K is picked on the same rows the score is reported on, so the
# printed figure is optimistic; cross-validating on the training rows would
# give an unbiased choice.
print("At K = {}, Max Accuracy = {}".format(k_max, max(score)*100))


# Refit a single model with the selected K. The previous line had an
# unbalanced parenthesis and no .fit(X_train, ...) call, so it did not parse.
clf = KNeighborsRegressor(n_neighbors=k_max, weights='distance', p=1)
clf.fit(X_train, y_train)

# R^2 of the final model on the hold-out rows.
print(clf.score(X_test, y_test))

# Predictions for the hold-out rows. They are in the same units as y_train,
# which this script standardises earlier, so they are not raw prices.
y_pred = clf.predict(X_test)

