In: Computer Science
Please find implementation of KNN algorithm (in C++) below
Please explain how each code implements the following steps of the algorithm in question:
1. Determine parameter K = number of nearest neighbors
2. Calculate the distance between the query-instance and all the
training samples
3. Sort the distance and determine nearest neighbors based on the
K-th minimumdistance
4. Gather the category Y of the nearest neighbors
5. Use simple majority of the category of nearest neighbors as the
prediction value of the query instance
// C++ program to find groups of unknown
// Points using K nearest neighbour algorithm.
#include <bits/stdc++.h>
using namespace std;
struct Point
{
int val; // Group of point
double x, y; // Co-ordinate of point
double distance; // Distance from test point
};
// Used to sort an array of points by increasing
// order of distance
bool comparison(Point a, Point b)
{
return (a.distance < b.distance);
}
// This function finds classification of point p using
// k nearest neighbour algorithm. It assumes only two
// groups and returns 0 if p belongs to group 0, else
// 1 (belongs to group 1).
int classifyAPoint(Point arr[], int n, int k, Point p)
{
// Fill distances of all points from p
for (int i = 0; i < n; i++)
arr[i].distance =
sqrt((arr[i].x - p.x) * (arr[i].x - p.x) +
(arr[i].y - p.y) * (arr[i].y - p.y));
// Sort the Points by distance from p
sort(arr, arr+n, comparison);
// Now consider the first k elements and only
// two groups
int freq1 = 0; // Frequency of group 0
int freq2 = 0; // Frequency of group 1
for (int i = 0; i < k; i++)
{
if (arr[i].val == 0)
freq1++;
else if (arr[i].val == 1)
freq2++;
}
return (freq1 > freq2 ? 0 : 1);
}
// Driver code
int main()
{
int n = 17; // Number of data points
Point arr[n];
arr[0].x = 1;
arr[0].y = 12;
arr[0].val = 0;
arr[1].x = 2;
arr[1].y = 5;
arr[1].val = 0;
arr[2].x = 5;
arr[2].y = 3;
arr[2].val = 1;
arr[3].x = 3;
arr[3].y = 2;
arr[3].val = 1;
arr[4].x = 3;
arr[4].y = 6;
arr[4].val = 0;
arr[5].x = 1.5;
arr[5].y = 9;
arr[5].val = 1;
arr[6].x = 7;
arr[6].y = 2;
arr[6].val = 1;
arr[7].x = 6;
arr[7].y = 1;
arr[7].val = 1;
arr[8].x = 3.8;
arr[8].y = 3;
arr[8].val = 1;
arr[9].x = 3;
arr[9].y = 10;
arr[9].val = 0;
arr[10].x = 5.6;
arr[10].y = 4;
arr[10].val = 1;
arr[11].x = 4;
arr[11].y = 2;
arr[11].val = 1;
arr[12].x = 3.5;
arr[12].y = 8;
arr[12].val = 0;
arr[13].x = 2;
arr[13].y = 11;
arr[13].val = 0;
arr[14].x = 2;
arr[14].y = 5;
arr[14].val = 1;
arr[15].x = 2;
arr[15].y = 9;
arr[15].val = 0;
arr[16].x = 1;
arr[16].y = 7;
arr[16].val = 0;
/*Testing Point*/
Point p;
p.x = 2.5;
p.y = 7;
// Parameter to decide group of the testing point
int k = 3;
printf ("The value classified to unknown point"
" is %d.\n", classifyAPoint(arr, n, k, p));
return 0;
}
Output:
The value classified to unknown point is 0.
Weighted kNN is a modified version of k nearest neighbors. One
of the many issues that affect the performance of the kNN algorithm
is the choice of the hyper parameter k. If k is too small, the
algorithm would be more sensitive to outliers. If k is too large,
then the neighbour hood may include too many points from other
classes.
Another issue is the approach to combining the class labels. The
simplest method is to take the majority vote, but this can be a
problem if the nearest neighbours vary widely in their distance and
the closest neighbours more reliably indicate the class of the
object.
Intuition:
Consider the following training set
The red labels indicate the class 0 points and the green labels
indicate class 1 points.
Consider the white point as the query point( the point whose class
label has to be predicted)
If we give the above dataset to a kNN based classifier, then the
classifier would declare the query point to belong to the class 0.
But in the plot, it is clear that the point is more closer to the
class 1 points compared to the class 0 points. To overcome this
disadvantage, weighted kNN is used. In weighted kNN, the nearest k
points are given a weight using a function called as the kernel
function. The intuition behind weighted kNN, is to give more weight
to the points which are nearby and less weight to the points which
are farther away. Any function can be used as a kernel function for
the weighted knn classifier whose value decreases as the distance
increases. The simple function which is used is the inverse
distance function.
Algorithm:
Let L = { ( xi , yi ) , i = 1, . . . ,n } be a training set of observations xi with given class yi and let x be a new observation(query point), whose class label y has to be predicted.
Compute d(xi, x) for i = 1, . . . ,n , the distance between the query point and every other point in the training set.
Select D’ ⊆ D, the set of k nearest training data points to the query points
Predict the class of the query point, using distance-weighted voting. The v represents the class labels. Use the following formula
Implementation:
Consider 0 as the label for class 0 and 1 as the label for class 1.
Below is the implementation of weighted-kNN algorithm.
// C++ program to implement the
// weighted K nearest neighbour algorithm.
#include <bits/stdc++.h>
using namespace std;
struct Point
{
int val; // Class of point
double x, y; // Co-ordinate of point
double distance; // Distance from test point
};
// Used to sort an array of points by increasing
// order of weighted distance
bool comparison(Point a, Point b)
{
return (a.distance < b.distance);
}
// This function finds classification of point p using
// weighted k nearest neighbour algorithm. It assumes only
// two groups and returns 0 if p belongs to class 0, else
// 1 (belongs to class 1).
int weightedkNN(Point arr[], int n, int k, Point p)
{
// Fill weighted distances of all points from p
for (int i = 0; i < n; i++)
arr[i].distance =
(sqrt((arr[i].x - p.x) * (arr[i].x - p.x) +
(arr[i].y - p.y) * (arr[i].y - p.y)));
// Sort the Points by weighted distance from p
sort(arr, arr+n, comparison);
// Now consider the first k elements and only
// two groups
double freq1 = 0; // weighted sum of group 0
double freq2 = 0; // weighted sum of group 1
for (int i = 0; i < k; i++)
{
if (arr[i].val == 0)
freq1 += double(1/arr[i].distance);
else if (arr[i].val == 1)
freq2 += double(1/arr[i].distance);
}
return (freq1 > freq2 ? 0 : 1);
}
// Driver code
int main()
{
int n = 13; // Number of data points
Point arr[n];
arr[0].x = 0;
arr[0].y = 4;
arr[0].val = 0;
arr[1].x = 1;
arr[1].y = 4.9;
arr[1].val = 0;
arr[2].x = 1.6;
arr[2].y = 5.4;
arr[2].val = 0;
arr[3].x = 2.2;
arr[3].y = 6;
arr[3].val = 0;
arr[4].x = 2.8;
arr[4].y = 7;
arr[4].val = 0;
arr[5].x = 3.2;
arr[5].y = 8;
arr[5].val = 0;
arr[6].x = 3.4;
arr[6].y = 9;
arr[6].val = 0;
arr[7].x = 1.8;
arr[7].y = 1;
arr[7].val = 1;
arr[8].x = 2.2;
arr[8].y = 3;
arr[8].val = 1;
arr[9].x = 3;
arr[9].y = 4;
arr[9].val = 1;
arr[10].x = 4;
arr[10].y = 4.5;
arr[10].val = 1;
arr[11].x = 5;
arr[11].y = 5;
arr[11].val = 1;
arr[12].x = 6;
arr[12].y = 5.5;
arr[12].val = 1;
/*Testing Point*/
Point p;
p.x = 2;
p.y = 4;
// Parameter to decide the class of the query point
int k = 5;
printf ("The value classified to query point"
" is: %d.\n", weightedkNN(arr, n, k, p));
return 0;
}
Determine parameter K = number of nearest neighbors
K-Nearest Neighbours
K-Nearest Neighbors is one of the most basic yet essential classification algorithms in Machine Learning. It belongs to the supervised learning domain and finds intense application in pattern recognition, data mining and intrusion detection.
We are given some prior data (also called training data), which classifies coordinates into groups identified by an attribute.
As an example, consider the following table of data points containing two features:
Now, given another set of data points (also called testing data), allocate these points a group by analyzing the training set. Note that the unclassified points are marked as ‘White’.
Intuition
If we plot these points on a graph, we may be able to locate some
clusters or groups. Now, given an unclassified point, we can assign
it to a group by observing what group its nearest neighbors belong
to. This means a point close to a cluster of points classified as
‘Red’ has a higher probability of getting classified as ‘Red’.
Intuitively, we can see that the first point (2.5, 7) should be classified as ‘Green’ and the second point (5.5, 4.5) should be classified as ‘Red’.
Algorithm
Let m be the number of training data samples. Let p be an unknown
point.
Store the training samples in an array of data points arr[]. This means each element of this array represents a tuple (x, y).
for i=0 to m:
Calculate Euclidean distance d(arr[i], p).
Make set S of K smallest distances obtained. Each of these distances corresponds to an already classified data point.
Return the majority label among S.
K can be kept as an odd number so that we can calculate a clear majority in the case where only two groups are possible (e.g. Red/Blue). With increasing K, we get smoother, more defined boundaries across different classifications. Also, the accuracy of the above classifier increases as we increase the number of data points in the training set.
Example Program
Assume 0 and 1 as the two classifiers (groups).
// C++ program to find groups of unknown
// Points using K nearest neighbour algorithm.
#include <bits/stdc++.h>
using namespace std;
struct Point
{
int val; // Group of point
double x, y; // Co-ordinate of point
double distance; // Distance from test point
};
// Used to sort an array of points by increasing
// order of distance
bool comparison(Point a, Point b)
{
return (a.distance < b.distance);
}
// This function finds classification of point p using
// k nearest neighbour algorithm. It assumes only two
// groups and returns 0 if p belongs to group 0, else
// 1 (belongs to group 1).
int classifyAPoint(Point arr[], int n, int k, Point p)
{
// Fill distances of all points from p
for (int i = 0; i < n; i++)
arr[i].distance =
sqrt((arr[i].x - p.x) * (arr[i].x - p.x) +
(arr[i].y - p.y) * (arr[i].y - p.y));
// Sort the Points by distance from p
sort(arr, arr+n, comparison);
// Now consider the first k elements and only
// two groups
int freq1 = 0; // Frequency of group 0
int freq2 = 0; // Frequency of group 1
for (int i = 0; i < k; i++)
{
if (arr[i].val == 0)
freq1++;
else if (arr[i].val == 1)
freq2++;
}
return (freq1 > freq2 ? 0 : 1);
}
// Driver code
int main()
{
int n = 17; // Number of data points
Point arr[n];
arr[0].x = 1;
arr[0].y = 12;
arr[0].val = 0;
arr[1].x = 2;
arr[1].y = 5;
arr[1].val = 0;
arr[2].x = 5;
arr[2].y = 3;
arr[2].val = 1;
arr[3].x = 3;
arr[3].y = 2;
arr[3].val = 1;
arr[4].x = 3;
arr[4].y = 6;
arr[4].val = 0;
arr[5].x = 1.5;
arr[5].y = 9;
arr[5].val = 1;
arr[6].x = 7;
arr[6].y = 2;
arr[6].val = 1;
arr[7].x = 6;
arr[7].y = 1;
arr[7].val = 1;
arr[8].x = 3.8;
arr[8].y = 3;
arr[8].val = 1;
arr[9].x = 3;
arr[9].y = 10;
arr[9].val = 0;
arr[10].x = 5.6;
arr[10].y = 4;
arr[10].val = 1;
arr[11].x = 4;
arr[11].y = 2;
arr[11].val = 1;
arr[12].x = 3.5;
arr[12].y = 8;
arr[12].val = 0;
arr[13].x = 2;
arr[13].y = 11;
arr[13].val = 0;
arr[14].x = 2;
arr[14].y = 5;
arr[14].val = 1;
arr[15].x = 2;
arr[15].y = 9;
arr[15].val = 0;
arr[16].x = 1;
arr[16].y = 7;
arr[16].val = 0;
/*Testing Point*/
Point p;
p.x = 2.5;
p.y = 7;
// Parameter to decide group of the testing point
int k = 3;
printf ("The value classified to unknown point"
" is %d.\n", classifyAPoint(arr, n, k, p));
return 0;
References : Shaji Vellaoor Computer Algorithm Analysis and Design Test