Package Bio :: Module LogisticRegression
[hide private]
[frames] | no frames]

Source Code for Module Bio.LogisticRegression

  1  #!/usr/bin/env python 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code for doing logistic regressions. 
  7   
  8   
  9  Classes: 
 10  LogisticRegression    Holds information for a LogisticRegression classifier. 
 11   
 12   
 13  Functions: 
 14  train        Train a new classifier. 
 15  calculate    Calculate the probabilities of each class, given an observation. 
 16  classify     Classify an observation into a class. 
 17  """ 
 18   
 19  import numpy 
 20  import numpy.linalg 
 21   
 22   
23 -class LogisticRegression(object):
24 """Holds information necessary to do logistic regression 25 classification. 26 27 Members: 28 beta List of the weights for each dimension. 29 30 """
31 - def __init__(self):
32 """LogisticRegression()""" 33 self.beta = []
34 35
36 -def train(xs, ys, update_fn=None, typecode=None):
37 """train(xs, ys[, update_fn]) -> LogisticRegression 38 39 Train a logistic regression classifier on a training set. xs is a 40 list of observations and ys is a list of the class assignments, 41 which should be 0 or 1. xs and ys should contain the same number 42 of elements. update_fn is an optional callback function that 43 takes as parameters that iteration number and log likelihood. 44 45 """ 46 if len(xs) != len(ys): 47 raise ValueError("xs and ys should be the same length.") 48 classes = set(ys) 49 if classes != set([0, 1]): 50 raise ValueError("Classes should be 0's and 1's") 51 if typecode is None: 52 typecode = 'd' 53 54 # Dimensionality of the data is the dimensionality of the 55 # observations plus a constant dimension. 56 N, ndims = len(xs), len(xs[0]) + 1 57 if N==0 or ndims==1: 58 raise ValueError("No observations or observation of 0 dimension.") 59 60 # Make an X array, with a constant first dimension. 61 X = numpy.ones((N, ndims), typecode) 62 X[:, 1:] = xs 63 Xt = numpy.transpose(X) 64 y = numpy.asarray(ys, typecode) 65 66 # Initialize the beta parameter to 0. 67 beta = numpy.zeros(ndims, typecode) 68 69 MAX_ITERATIONS = 500 70 CONVERGE_THRESHOLD = 0.01 71 stepsize = 1.0 72 # Now iterate using Newton-Raphson until the log-likelihoods 73 # converge. 74 i = 0 75 old_beta = old_llik = None 76 while i < MAX_ITERATIONS: 77 # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X)) 78 ebetaX = numpy.exp(numpy.dot(beta, Xt)) 79 p = ebetaX / (1+ebetaX) 80 81 # Find the log likelihood score and see if I've converged. 82 logp = y*numpy.log(p) + (1-y)*numpy.log(1-p) 83 llik = sum(logp) 84 if update_fn is not None: 85 update_fn(iter, llik) 86 if old_llik is not None: 87 # Check to see if the likelihood decreased. If it did, then 88 # restore the old beta parameters and half the step size. 89 if llik < old_llik: 90 stepsize = stepsize / 2.0 91 beta = old_beta 92 # If I've converged, then stop. 93 if numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD: 94 break 95 old_llik, old_beta = llik, beta 96 i += 1 97 98 W = numpy.identity(N) * p 99 Xtyp = numpy.dot(Xt, y-p) # Calculate the first derivative. 100 XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative. 101 #u, s, vt = singular_value_decomposition(XtWX) 102 #print "U", u 103 #print "S", s 104 delta = numpy.linalg.solve(XtWX, Xtyp) 105 if numpy.fabs(stepsize-1.0) > 0.001: 106 delta = delta * stepsize 107 beta = beta + delta # Update beta. 108 else: 109 raise RuntimeError("Didn't converge.") 110 111 lr = LogisticRegression() 112 lr.beta = map(float, beta) # Convert back to regular array. 113 return lr
114 115
116 -def calculate(lr, x):
117 """calculate(lr, x) -> list of probabilities 118 119 Calculate the probability for each class. lr is a 120 LogisticRegression object. x is the observed data. Returns a 121 list of the probability that it fits each class. 122 123 """ 124 # Insert a constant term for x. 125 x = numpy.asarray([1.0] + x) 126 # Calculate the probability. p = e^(beta X) / (1+e^(beta X)) 127 ebetaX = numpy.exp(numpy.dot(lr.beta, x)) 128 p = ebetaX / (1+ebetaX) 129 return [1-p, p]
130 131
132 -def classify(lr, x):
133 """classify(lr, x) -> 1 or 0 134 135 Classify an observation into a class. 136 137 """ 138 probs = calculate(lr, x) 139 if probs[0] > probs[1]: 140 return 0 141 return 1
142