Package Bio :: Module kNN
[hide private]
[frames] | no frames]

Source Code for Module Bio.kNN

  1  #!/usr/bin/env python 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code for doing k-nearest-neighbors classification. 
  7   
  8  k Nearest Neighbors is a supervised learning algorithm that classifies 
  9  a new observation based the classes in its surrounding neighborhood. 
 10   
 11  Glossary: 
 12   
 13      - distance   The distance between two points in the feature space. 
 14      - weight     The importance given to each point for classification. 
 15   
 16   
 17  Classes: 
 18   
 19      - kNN           Holds information for a nearest neighbors classifier. 
 20   
 21   
 22  Functions: 
 23   
 24      - train        Train a new kNN classifier. 
 25      - calculate    Calculate the probabilities of each class, given an observation. 
 26      - classify     Classify an observation into a class. 
 27   
 28  Weighting Functions: 
 29   
 30      - equal_weight    Every example is given a weight of 1. 
 31   
 32  """ 
 33   
 34  import numpy 
 35   
 36  __docformat__ = "restructuredtext en" 
 37   
 38   
39 -class kNN(object):
40 """Holds information necessary to do nearest neighbors classification. 41 42 Members: 43 44 - classes Set of the possible classes. 45 - xs List of the neighbors. 46 - ys List of the classes that the neighbors belong to. 47 - k Number of neighbors to look at. 48 49 """
50 - def __init__(self):
51 """kNN()""" 52 self.classes = set() 53 self.xs = [] 54 self.ys = [] 55 self.k = None
56 57
58 -def equal_weight(x, y):
59 """equal_weight(x, y) -> 1""" 60 # everything gets 1 vote 61 return 1
62 63
64 -def train(xs, ys, k, typecode=None):
65 """train(xs, ys, k) -> kNN 66 67 Train a k nearest neighbors classifier on a training set. xs is a 68 list of observations and ys is a list of the class assignments. 69 Thus, xs and ys should contain the same number of elements. k is 70 the number of neighbors that should be examined when doing the 71 classification. 72 """ 73 knn = kNN() 74 knn.classes = set(ys) 75 knn.xs = numpy.asarray(xs, typecode) 76 knn.ys = ys 77 knn.k = k 78 return knn
79 80
81 -def calculate(knn, x, weight_fn=equal_weight, distance_fn=None):
82 """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict 83 84 Calculate the probability for each class. knn is a kNN object. x 85 is the observed data. weight_fn is an optional function that 86 takes x and a training example, and returns a weight. distance_fn 87 is an optional function that takes two points and returns the 88 distance between them. If distance_fn is None (the default), the 89 Euclidean distance is used. Returns a dictionary of the class to 90 the weight given to the class. 91 """ 92 x = numpy.asarray(x) 93 94 order = [] # list of (distance, index) 95 if distance_fn: 96 for i in range(len(knn.xs)): 97 dist = distance_fn(x, knn.xs[i]) 98 order.append((dist, i)) 99 else: 100 # Default: Use a fast implementation of the Euclidean distance 101 temp = numpy.zeros(len(x)) 102 # Predefining temp allows reuse of this array, making this 103 # function about twice as fast. 104 for i in range(len(knn.xs)): 105 temp[:] = x - knn.xs[i] 106 dist = numpy.sqrt(numpy.dot(temp, temp)) 107 order.append((dist, i)) 108 order.sort() 109 110 # first 'k' are the ones I want. 111 weights = {} # class -> number of votes 112 for k in knn.classes: 113 weights[k] = 0.0 114 for dist, i in order[:knn.k]: 115 klass = knn.ys[i] 116 weights[klass] = weights[klass] + weight_fn(x, knn.xs[i]) 117 118 return weights
119 120
121 -def classify(knn, x, weight_fn=equal_weight, distance_fn=None):
122 """classify(knn, x[, weight_fn][, distance_fn]) -> class 123 124 Classify an observation into a class. If not specified, weight_fn will 125 give all neighbors equal weight. distance_fn is an optional function 126 that takes two points and returns the distance between them. If 127 distance_fn is None (the default), the Euclidean distance is used. 128 """ 129 weights = calculate( 130 knn, x, weight_fn=weight_fn, distance_fn=distance_fn) 131 132 most_class = None 133 most_weight = None 134 for klass, weight in weights.items(): 135 if most_class is None or weight > most_weight: 136 most_class = klass 137 most_weight = weight 138 return most_class
139