
Source Code for Module Bio.LogisticRegression

#!/usr/bin/env python
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Code for doing logistic regressions.

Classes:
LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
train        Train a new classifier.
calculate    Calculate the probabilities of each class, given an observation.
classify     Classify an observation into a class.
"""

from __future__ import print_function

import numpy
import numpy.linalg

class LogisticRegression(object):
    """Holds information necessary to do logistic regression classification.

    Members:
    beta    List of the weights for each dimension.

    """

    def __init__(self):
        """LogisticRegression()"""
        self.beta = []

def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression

    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != set([0, 1]):
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = 'd'

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        W = numpy.identity(N) * p
        Xtyp = numpy.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        # Update beta (not in place, so old_beta keeps the previous values).
        beta = beta + delta
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = [float(x) for x in beta]  # Convert back to a regular list.
    return lr

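# An illustrative sketch (not part of the module source): train() calls
# update_fn once per iteration with the iteration number and the current
# log likelihood, so a hypothetical progress callback could look like:
#
#     def show_progress(iteration, loglikelihood):
#         print("Iteration:", iteration, "Log-likelihood:", loglikelihood)
#
#     lr = train(xs, ys, update_fn=show_progress)
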
def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class.  lr is a
    LogisticRegression object.  x is the observed data.  Returns a
    list of the probability that it fits each class.

    """
    # Insert a constant term for x.
    x = numpy.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = numpy.exp(numpy.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]

def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.

    """
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
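
A minimal end-to-end sketch, assuming Biopython is installed so the module
imports as Bio.LogisticRegression; the observations and class labels below
are invented purely for illustration:

    from Bio import LogisticRegression

    # Invented toy training set: one-dimensional observations with
    # overlapping 0/1 classes, so the likelihood has a finite maximum.
    xs = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
    ys = [0, 0, 1, 0, 1, 1]

    lr = LogisticRegression.train(xs, ys)
    print(lr.beta)  # Fitted weights; the constant (intercept) term comes first.

    # calculate() returns [P(class 0), P(class 1)] for an observation;
    # classify() returns whichever class is more probable.
    print(LogisticRegression.calculate(lr, [3.5]))
    print(LogisticRegression.classify(lr, [3.5]))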