1
2
3
4
5 """
6 This module provides code for doing k-nearest-neighbors classification.
7
8 k Nearest Neighbors is a supervised learning algorithm that classifies
9 a new observation based the classes in its surrounding neighborhood.
10
11 Glossary:
12
13 - distance The distance between two points in the feature space.
14 - weight The importance given to each point for classification.
15
16
17 Classes:
18
19 - kNN Holds information for a nearest neighbors classifier.
20
21
22 Functions:
23
24 - train Train a new kNN classifier.
25 - calculate Calculate the probabilities of each class, given an observation.
26 - classify Classify an observation into a class.
27
28 Weighting Functions:
29
30 - equal_weight Every example is given a weight of 1.
31
32 """
33
34 import numpy
35
36
38 """Holds information necessary to do nearest neighbors classification.
39
40 Members:
41
42 - classes Set of the possible classes.
43 - xs List of the neighbors.
44 - ys List of the classes that the neighbors belong to.
45 - k Number of neighbors to look at.
46
47 """
49 """kNN()"""
50 self.classes = set()
51 self.xs = []
52 self.ys = []
53 self.k = None
54
55
57 """equal_weight(x, y) -> 1"""
58
59 return 1
60
61
62 -def train(xs, ys, k, typecode=None):
63 """train(xs, ys, k) -> kNN
64
65 Train a k nearest neighbors classifier on a training set. xs is a
66 list of observations and ys is a list of the class assignments.
67 Thus, xs and ys should contain the same number of elements. k is
68 the number of neighbors that should be examined when doing the
69 classification.
70 """
71 knn = kNN()
72 knn.classes = set(ys)
73 knn.xs = numpy.asarray(xs, typecode)
74 knn.ys = ys
75 knn.k = k
76 return knn
77
78
80 """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
81
82 Calculate the probability for each class. knn is a kNN object. x
83 is the observed data. weight_fn is an optional function that
84 takes x and a training example, and returns a weight. distance_fn
85 is an optional function that takes two points and returns the
86 distance between them. If distance_fn is None (the default), the
87 Euclidean distance is used. Returns a dictionary of the class to
88 the weight given to the class.
89 """
90 x = numpy.asarray(x)
91
92 order = []
93 if distance_fn:
94 for i in range(len(knn.xs)):
95 dist = distance_fn(x, knn.xs[i])
96 order.append((dist, i))
97 else:
98
99 temp = numpy.zeros(len(x))
100
101
102 for i in range(len(knn.xs)):
103 temp[:] = x - knn.xs[i]
104 dist = numpy.sqrt(numpy.dot(temp, temp))
105 order.append((dist, i))
106 order.sort()
107
108
109 weights = {}
110 for k in knn.classes:
111 weights[k] = 0.0
112 for dist, i in order[:knn.k]:
113 klass = knn.ys[i]
114 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
115
116 return weights
117
118
120 """classify(knn, x[, weight_fn][, distance_fn]) -> class
121
122 Classify an observation into a class. If not specified, weight_fn will
123 give all neighbors equal weight. distance_fn is an optional function
124 that takes two points and returns the distance between them. If
125 distance_fn is None (the default), the Euclidean distance is used.
126 """
127 weights = calculate(
128 knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
129
130 most_class = None
131 most_weight = None
132 for klass, weight in weights.items():
133 if most_class is None or weight > most_weight:
134 most_class = klass
135 most_weight = weight
136 return most_class
137