1
2
3
4
5 """
6 This module provides code for doing logistic regressions.
7
8
9 Classes:
10 LogisticRegression Holds information for a LogisticRegression classifier.
11
12
13 Functions:
14 train Train a new classifier.
15 calculate Calculate the probabilities of each class, given an observation.
16 classify Classify an observation into a class.
17 """
18
19 import numpy
20 import numpy.linalg
21
22
24 """Holds information necessary to do logistic regression
25 classification.
26
27 Members:
28 beta List of the weights for each dimension.
29
30 """
32 """LogisticRegression()"""
33 self.beta = []
34
35
36 -def train(xs, ys, update_fn=None, typecode=None):
37 """train(xs, ys[, update_fn]) -> LogisticRegression
38
39 Train a logistic regression classifier on a training set. xs is a
40 list of observations and ys is a list of the class assignments,
41 which should be 0 or 1. xs and ys should contain the same number
42 of elements. update_fn is an optional callback function that
43 takes as parameters that iteration number and log likelihood.
44
45 """
46 if len(xs) != len(ys):
47 raise ValueError("xs and ys should be the same length.")
48 classes = set(ys)
49 if classes != set([0, 1]):
50 raise ValueError("Classes should be 0's and 1's")
51 if typecode is None:
52 typecode = 'd'
53
54
55
56 N, ndims = len(xs), len(xs[0]) + 1
57 if N==0 or ndims==1:
58 raise ValueError("No observations or observation of 0 dimension.")
59
60
61 X = numpy.ones((N, ndims), typecode)
62 X[:, 1:] = xs
63 Xt = numpy.transpose(X)
64 y = numpy.asarray(ys, typecode)
65
66
67 beta = numpy.zeros(ndims, typecode)
68
69 MAX_ITERATIONS = 500
70 CONVERGE_THRESHOLD = 0.01
71 stepsize = 1.0
72
73
74 i = 0
75 old_beta = old_llik = None
76 while i < MAX_ITERATIONS:
77
78 ebetaX = numpy.exp(numpy.dot(beta, Xt))
79 p = ebetaX / (1+ebetaX)
80
81
82 logp = y*numpy.log(p) + (1-y)*numpy.log(1-p)
83 llik = sum(logp)
84 if update_fn is not None:
85 update_fn(iter, llik)
86 if old_llik is not None:
87
88
89 if llik < old_llik:
90 stepsize = stepsize / 2.0
91 beta = old_beta
92
93 if numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
94 break
95 old_llik, old_beta = llik, beta
96 i += 1
97
98 W = numpy.identity(N) * p
99 Xtyp = numpy.dot(Xt, y-p)
100 XtWX = numpy.dot(numpy.dot(Xt, W), X)
101
102
103
104 delta = numpy.linalg.solve(XtWX, Xtyp)
105 if numpy.fabs(stepsize-1.0) > 0.001:
106 delta = delta * stepsize
107 beta = beta + delta
108 else:
109 raise RuntimeError("Didn't converge.")
110
111 lr = LogisticRegression()
112 lr.beta = map(float, beta)
113 return lr
114
115
117 """calculate(lr, x) -> list of probabilities
118
119 Calculate the probability for each class. lr is a
120 LogisticRegression object. x is the observed data. Returns a
121 list of the probability that it fits each class.
122
123 """
124
125 x = numpy.asarray([1.0] + x)
126
127 ebetaX = numpy.exp(numpy.dot(lr.beta, x))
128 p = ebetaX / (1+ebetaX)
129 return [1-p, p]
130
131
133 """classify(lr, x) -> 1 or 0
134
135 Classify an observation into a class.
136
137 """
138 probs = calculate(lr, x)
139 if probs[0] > probs[1]:
140 return 0
141 return 1
142