In: Computer Science
This is a Python question.
Developing a machine learning system for a given dataset.
Dataset
Diabetes dataset - https://www.kaggle.com/uciml/pima-indians-diabetes-database
Algorithms
Either one of the following:
1. K-Nearest Neighbors
2. Support Vector Machines
3. Neural Networks
Notes:
Explain your choice of algorithms and analyze the models developed.
Show what patterns/insights can be extracted from your chosen dataset and the selected algorithms.
Sorry that link is not working
I am choosing the support vector machine (SVM) because SVMs work very well on labeled data.
If you have any problem with the code, please let me know. If you get the desired output, please upvote; if there is any problem, please mention it in the comment section.
import pandas as pd
import requests
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import numpy as np
# Form fields posted to the Kaggle login endpoint. 'username' and 'password'
# are left blank here and must be filled in before running; the verification
# token is filled in below from the login page.
payload = {
    '__RequestVerificationToken': '',
    'username': '',
    'password': '',
    'rememberme': 'false'
}
loginURL = 'https://www.kaggle.com/account/login'
dataURL = "https://www.kaggle.com/uciml/pima-indians-diabetes-database/downloads/diabetes.csv"

with requests.Session() as c:
    response = c.get(loginURL).text
    # The token is sliced out of the page text between the 'antiForgeryToken'
    # and 'isAnonymous: ' markers; the fixed offsets depend on the page layout.
    AFToken = response[response.index('antiForgeryToken')+19:response.index('isAnonymous: ')-12]
    payload['__RequestVerificationToken'] = AFToken
    c.post(loginURL + "?isModal=true&returnUrl=/", data=payload)
    download = c.get(dataURL)
    # Fail here on an HTTP error instead of parsing an error page as CSV.
    download.raise_for_status()

decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
if not my_list:
    raise ValueError("downloaded dataset is empty")

# The rest of the script selects columns by name ('Glucose', 'BMI',
# 'Outcome'), so the first CSV row must become the column labels rather than
# a data row. csv.reader yields strings, so convert every column to numbers.
df = pd.DataFrame(my_list[1:], columns=my_list[0]).apply(pd.to_numeric)
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features.

    The classifier is evaluated on a grid spanning the range of the two
    columns of X (padded by 1 on each side), the predictions are drawn as
    filled contours, and the samples are scattered on top, one marker and
    colour per distinct value in y. Drawing happens on the current
    matplotlib figure; nothing is returned and ``plt.show()`` is not called.

    Parameters:
        X: 2-D NumPy array; columns 0 and 1 are used as the plot axes.
        y: 1-D NumPy array of class labels aligned with the rows of X.
            At most five distinct labels are supported (one per marker).
        classifier: object with a ``predict`` method accepting an
            (n_points, 2) array.
        test_idx: optional row indices into X to highlight as the test set.
        resolution: step size of the evaluation grid along both axes.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'v', '^', 'o')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot all samples
    for idx, cl in enumerate(classes):
        # Pass the colour by name: a bare RGBA tuple given as ``c`` is
        # ambiguous to scatter (it may be read as four scalar values).
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=colors[idx],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # ``is not None`` rather than truthiness, so a NumPy index array works.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]
        # Hollow markers: an empty colour string is not a valid colour.
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolors='black',
                    alpha=1.0, linewidth=1, marker='v',
                    s=55, label='test set')
# Two features are used so the decision regions can be drawn in a plane.
X = df[['Glucose','BMI']]
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# Standardize the features using statistics from the training split only.
# SVMs are sensitive to feature scale, and the plotting helper samples the
# plane on a 0.02-step grid, which is only practical on standardized axes.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

machine1 = svm.SVC(kernel = 'linear')
machine1.fit(X_train,y_train)
y_pred = machine1.predict(X_test)
# Report held-out performance so the model can actually be analyzed.
print("Test accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))

plot_decision_regions(X_train, y_train, machine1)
plt.xlabel('Glucose (standardized)')
plt.ylabel('BMI (standardized)')
plt.legend(loc='upper left')
# The call parentheses are required; a bare ``plt.show`` displays nothing.
plt.show()