
Source Code for Module Bio.NaiveBayes

```
# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""General Naive Bayes learner.

Naive Bayes is a supervised classification algorithm that uses Bayes
rule to compute the fit between a new observation and some previously
observed data.  The observations are discrete feature vectors, with
the Bayes assumption that the features are independent.  Although this
is hardly ever true, the classifier works well enough in practice.

Glossary:
 - observation - A feature vector of discrete data.
 - class       - A possible classification for an observation.

Classes:
 - NaiveBayes - Holds information for a naive Bayes classifier.

Functions:
 - train     - Train a new naive Bayes classifier.
 - calculate - Calculate the probabilities of each class,
   given an observation.
 - classify  - Classify an observation into a class.

"""

from __future__ import print_function

import numpy


def _contents(items):
    """Return a dictionary where the key is the item and the value is its relative frequency (PRIVATE)."""
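    # For example, _contents(["a", "a", "b"]) returns approximately
    # {"a": 0.667, "b": 0.333}: each occurrence contributes
    # 1/len(items) to its item's value.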
    term = 1.0 / len(items)
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + term
    return counts


class NaiveBayes(object):
    """Hold information for a NaiveBayes classifier.

    Attributes:
     - classes        - List of the possible classes of data.
     - p_conditional  - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
     - p_prior        - List of the prior probabilities for every class.
     - dimensionality - Dimensionality of the data.

    """

    def __init__(self):
        """Initialize the class."""
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None


def calculate(nb, observation, scale=False):
    """Calculate the logarithmic conditional probability for each class.

    Arguments:
     - nb          - A NaiveBayes classifier that has been trained.
     - observation - A list representing the observed data.
     - scale       - Boolean to indicate whether the probability should be
       scaled by ``P(observation)``.  By default, no scaling is done.

    A dictionary is returned where the key is the class and the value is
    the log probability of the class.
    """
    # P(class|observation) = P(observation|class)*P(class)/P(observation)
    # Taking the log:
    # lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)
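    # For example (illustrative numbers only): with two equally likely
    # classes, P(observation|class1) = 0.9 and P(observation|class2) = 0.1,
    # the result maps class1 -> log(0.9) + log(0.5) and
    # class2 -> log(0.1) + log(0.5).  With scale=True,
    # log(0.9*0.5 + 0.1*0.5) is subtracted from both, so the exponentials
    # of the returned values sum to 1.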

    # Make sure the observation has the right dimensionality.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in {0} dimension, but classifier in {1}".format(len(observation),
                                                                                      nb.dimensionality))

    # Calculate log P(observation|class) for every class.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)   # array of log P(observation|class)
    for i in range(n):
        # log P(observation|class) = SUM_i log P(observation_i|class)
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # Calculate log P(class).
    lp_prior = numpy.log(nb.p_prior)

    # Calculate log P(observation).
    lp_observation = 0.0          # P(observation)
    if scale:   # Only calculate this if requested.
        # log P(observation) = log SUM_i P(observation|class_i)P(class_i)
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Calculate log P(class|observation).
    lp_class_observation = {}      # Dict of class : log P(class|observation)
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = \
            lp_observation_class[i] + lp_prior[i] - lp_observation

    return lp_class_observation


def classify(nb, observation):
    """Classify an observation into a class."""
    # The class is the one with the highest probability.
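    # Ties are broken in favor of the class that sorts first, since
    # nb.classes is kept sorted and only a strictly greater probability
    # replaces the current maximum.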
    probs = calculate(nb, observation, scale=False)
    max_prob = max_class = None
    for klass in nb.classes:
        if max_prob is None or probs[klass] > max_prob:
            max_prob, max_class = probs[klass], klass
    return max_class


def train(training_set, results, priors=None, typecode=None):
    """Train a NaiveBayes classifier on a training set.

    Arguments:
     - training_set - List of observations.
     - results      - List of the class assignments for each observation.
       Thus, training_set and results must be the same length.
     - priors       - Optional dictionary specifying the prior probabilities
       for each type of result.  If not specified, the priors will
       be estimated from the training results.
     - typecode     - Optional typecode (dtype) used when the observations
       for each class are converted to a numpy array.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # If no typecode is specified, try to pick a reasonable one.  If
    # training_set is a Numeric array, then use that typecode.
    # Otherwise, choose a reasonable default.
    # XXX NOT IMPLEMENTED

    # Check to make sure each vector in the training set has the same
    # dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get a list of all the classes, and
    # estimate the prior probabilities for the classes.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()   # keep it tidy
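    # For example (hypothetical values): results == ["yes", "yes", "no"]
    # with priors=None gives nb.classes == ["no", "yes"] and
    # percs == {"yes": 0.666..., "no": 0.333...}.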

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Collect all the observations in class.  For each class, make a
    # matrix of training instances versus dimensions.  I might be able
    # to optimize this with Numeric, if the training_set parameter
    # were guaranteed to be a matrix.  However, this may not be the
    # case, because the client may be hacking up a sparse matrix or
    # something.
    c2i = {}      # class to index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]  # separate observations by class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Now make the observations Numeric matrix.
    for i in range(len(observations)):
        # XXX typecode must be specified!
        observations[i] = numpy.asarray(observations[i], typecode)

    # Calculate P(value|class,dim) for every class.
    # This is a good loop to optimize.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]   # observations for this class
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # Collect all the values in this dimension.
            values = class_observations[:, j]

            # Add pseudocounts here.  This needs to be parameterized.
            # values = list(values) + range(len(nb.classes))  # XXX add 1

            # Estimate P(value|class,dim)
            nb.p_conditional[i][j] = _contents(values)
    return nb


```
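
A minimal usage sketch (not from the module's documentation; the feature
values and class labels below are invented for illustration):

```
from Bio.NaiveBayes import calculate, classify, train

# Five observations, each a two-dimensional vector of discrete features,
# with a parallel list of class assignments.
training_set = [
    ["GG", "A"],
    ["GG", "T"],
    ["GC", "A"],
    ["CC", "T"],
    ["CC", "T"],
]
results = ["yes", "yes", "yes", "no", "no"]

model = train(training_set, results)
print(classify(model, ["GG", "A"]))  # -> 'yes'
print(calculate(model, ["GG", "A"], scale=True))  # class -> log P(class|observation)
```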
