[frames] | no frames]

# Source Code for Package Bio.Cluster

```  1  # This code is part of the Biopython distribution and governed by its
3  # as part of this package.
4  #
5  """Cluster Analysis.
6
7  The Bio.Cluster provides commonly used clustering algorithms and was
8  designed with the application to gene expression data in mind. However,
9  this module can also be used for cluster analysis of other types of data.
10
11  Bio.Cluster and the underlying C Clustering Library is described in
12  M. De Hoon et al. (2004) http://dx.doi.org/10.1093/bioinformatics/bth078
13  """
14
15  import numpy
16
17  from Bio.Cluster.cluster import version, kcluster, kmedoids, treecluster
18  from Bio.Cluster.cluster import somcluster, median, mean, clusterdistance
19  from Bio.Cluster.cluster import clustercentroids, distancematrix, pca
20  from Bio.Cluster.cluster import Node, Tree
21
22
23 -def _treesort(order, nodeorder, nodecounts, tree):
24      # Find the order of the nodes consistent with the hierarchical clustering
25      # tree, taking into account the preferred order of nodes.
26      nNodes = len(tree)
27      nElements = nNodes + 1
28      neworder = numpy.zeros(nElements)
29      clusterids = numpy.arange(nElements)
30      for i in range(nNodes):
31          i1 = tree[i].left
32          i2 = tree[i].right
33          if i1 < 0:
34              order1 = nodeorder[-i1 - 1]
35              count1 = nodecounts[-i1 - 1]
36          else:
37              order1 = order[i1]
38              count1 = 1
39          if i2 < 0:
40              order2 = nodeorder[-i2 - 1]
41              count2 = nodecounts[-i2 - 1]
42          else:
43              order2 = order[i2]
44              count2 = 1
45          # If order1 and order2 are equal, their order is determined
46          # by the order in which they were clustered
47          if i1 < i2:
48              if order1 < order2:
49                  increase = count1
50              else:
51                  increase = count2
52              for j in range(nElements):
53                  clusterid = clusterids[j]
54                  if clusterid == i1 and order1 >= order2:
55                      neworder[j] += increase
56                  if clusterid == i2 and order1 < order2:
57                      neworder[j] += increase
58                  if clusterid == i1 or clusterid == i2:
59                      clusterids[j] = -i - 1
60          else:
61              if order1 <= order2:
62                  increase = count1
63              else:
64                  increase = count2
65              for j in range(nElements):
66                  clusterid = clusterids[j]
67                  if clusterid == i1 and order1 > order2:
68                      neworder[j] += increase
69                  if clusterid == i2 and order1 <= order2:
70                      neworder[j] += increase
71                  if clusterid == i1 or clusterid == i2:
72                      clusterids[j] = -i - 1
73      return numpy.argsort(neworder)
74
75
76 -def _savetree(jobname, tree, order, transpose):
77      # Save the hierarchical clustering solution given by the tree, following
78      # the specified order, in a file whose name is based on jobname.
79      if transpose == 0:
80          extension = ".gtr"
81          keyword = "GENE"
82      else:
83          extension = ".atr"
84          keyword = "ARRY"
85      nnodes = len(tree)
86      with open(jobname + extension, "w") as outputfile:
87          nodeindex = 0
88          nodeID = [''] * nnodes
89          nodecounts = numpy.zeros(nnodes, int)
90          nodeorder = numpy.zeros(nnodes)
91          nodedist = numpy.array([node.distance for node in tree])
92          for nodeindex in range(nnodes):
93              min1 = tree[nodeindex].left
94              min2 = tree[nodeindex].right
95              nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1)
96              outputfile.write(nodeID[nodeindex])
97              outputfile.write("\t")
98              if min1 < 0:
99                  index1 = -min1 - 1
100                  order1 = nodeorder[index1]
101                  counts1 = nodecounts[index1]
102                  outputfile.write(nodeID[index1] + "\t")
103                  nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1])
104              else:
105                  order1 = order[min1]
106                  counts1 = 1
107                  outputfile.write("%s%dX\t" % (keyword, min1))
108              if min2 < 0:
109                  index2 = -min2 - 1
110                  order2 = nodeorder[index2]
111                  counts2 = nodecounts[index2]
112                  outputfile.write(nodeID[index2] + "\t")
113                  nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2])
114              else:
115                  order2 = order[min2]
116                  counts2 = 1
117                  outputfile.write("%s%dX\t" % (keyword, min2))
118              outputfile.write(str(1.0 - nodedist[nodeindex]))
119              outputfile.write("\n")
120              counts = counts1 + counts2
121              nodecounts[nodeindex] = counts
122              nodeorder[nodeindex] = (counts1 * order1 + counts2 * order2) / counts
123      # Now set up order based on the tree structure
124      index = _treesort(order, nodeorder, nodecounts, tree)
125      return index
126
127
128 -class Record(object):
129      """Store gene expression data.
130
131      A Record stores the gene expression data and related information contained
132      in a data file following the file format defined for Michael Eisen's
133      Cluster/TreeView program.
134
135      Attributes:
136       - data:     a matrix containing the gene expression data
137       - mask:     a matrix containing only 1's and 0's, denoting which values
138         are present (1) or missing (0). If all elements of mask are
139         one (no missing data), then mask is set to None.
140       - geneid:   a list containing a unique identifier for each gene
141         (e.g., ORF name)
142       - genename: a list containing an additional description for each gene
143         (e.g., gene name)
144       - gweight:  the weight to be used for each gene when calculating the
145         distance
146       - gorder:   an array of real numbers indicating the preferred order of the
147         genes in the output file
148       - expid:    a list containing a unique identifier for each experimental
149         condition
150       - eweight:  the weight to be used for each experimental condition when
151         calculating the distance
152       - eorder:   an array of real numbers indication the preferred order in the
153         output file of the experimental conditions
154       - uniqid:   the string that was used instead of UNIQID in the input file.
155
156      """
157
158 -    def __init__(self, handle=None):
159          """Read gene expression data from the file handle and return a Record.
160
161          The file should be in the format defined for Michael Eisen's
162          Cluster/TreeView program.
163          """
164          self.data = None
166          self.geneid = None
167          self.genename = None
168          self.gweight = None
169          self.gorder = None
170          self.expid = None
171          self.eweight = None
172          self.eorder = None
173          self.uniqid = None
174          if not handle:
175              return
177          n = len(line)
178          self.uniqid = line[0]
179          self.expid = []
180          cols = {0: "GENEID"}
181          for word in line[1:]:
182              if word == "NAME":
183                  cols[line.index(word)] = word
184                  self.genename = []
185              elif word == "GWEIGHT":
186                  cols[line.index(word)] = word
187                  self.gweight = []
188              elif word == "GORDER":
189                  cols[line.index(word)] = word
190                  self.gorder = []
191              else:
192                  self.expid.append(word)
193          self.geneid = []
194          self.data = []
197          for line in handle:
198              line = line.strip("\r\n").split("\t")
199              if len(line) != n:
200                  raise ValueError("Line with %d columns found (expected %d)" %
201                                   (len(line), n))
202              if line[0] == "EWEIGHT":
203                  i = max(cols) + 1
204                  self.eweight = numpy.array(line[i:], float)
205                  continue
206              if line[0] == "EORDER":
207                  i = max(cols) + 1
208                  self.eorder = numpy.array(line[i:], float)
209                  continue
210              rowdata = []
212              n = len(line)
213              for i in range(n):
214                  word = line[i]
215                  if i in cols:
216                      if cols[i] == "GENEID":
217                          self.geneid.append(word)
218                      if cols[i] == "NAME":
219                          self.genename.append(word)
220                      if cols[i] == "GWEIGHT":
221                          self.gweight.append(float(word))
222                      if cols[i] == "GORDER":
223                          self.gorder.append(float(word))
224                      continue
225                  if not word:
226                      rowdata.append(0.0)
229                  else:
230                      rowdata.append(float(word))
232              self.data.append(rowdata)
234          self.data = numpy.array(self.data)
237          else:
239          if self.gweight:
240              self.gweight = numpy.array(self.gweight)
241          if self.gorder:
242              self.gorder = numpy.array(self.gorder)
243
244 -    def treecluster(self, transpose=0, method='m', dist='e'):
245          """Apply hierarchical clustering and return a Tree object.
246
247          The pairwise single, complete, centroid, and average linkage
248          hierarchical clustering methods are available.
249
250          Arguments:
251           - transpose: if equal to 0, genes (rows) are clustered;
252             if equal to 1, microarrays (columns) are clustered.
253           - dist     : specifies the distance function to be used:
254
255             - dist=='e': Euclidean distance
256             - dist=='b': City Block distance
257             - dist=='c': Pearson correlation
258             - dist=='a': absolute value of the correlation
259             - dist=='u': uncentered correlation
260             - dist=='x': absolute uncentered correlation
261             - dist=='s': Spearman's rank correlation
262             - dist=='k': Kendall's tau
263
264           - method   : specifies which linkage method is used:
265
266             - method=='s': Single pairwise linkage
267             - method=='m': Complete (maximum) pairwise linkage (default)
269             - method=='a': Average pairwise linkage
270
272          the Tree object returned by this method.
273          """
274          if transpose == 0:
275              weight = self.eweight
276          else:
277              weight = self.gweight
278          return treecluster(self.data, self.mask, weight, transpose, method,
279                             dist)
280
281 -    def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e',
282                   initialid=None):
283          """Apply k-means or k-median clustering.
284
285          This method returns a tuple (clusterid, error, nfound).
286
287          Arguments:
288           - nclusters: number of clusters (the 'k' in k-means)
289           - transpose: if equal to 0, genes (rows) are clustered;
290             if equal to 1, microarrays (columns) are clustered.
291           - npass    : number of times the k-means clustering algorithm is
292             performed, each time with a different (random) initial condition.
293           - method   : specifies how the center of a cluster is found:
294
295             - method=='a': arithmetic mean
296             - method=='m': median
297
298           - dist     : specifies the distance function to be used:
299
300               - dist=='e': Euclidean distance
301               - dist=='b': City Block distance
302               - dist=='c': Pearson correlation
303               - dist=='a': absolute value of the correlation
304               - dist=='u': uncentered correlation
305               - dist=='x': absolute uncentered correlation
306               - dist=='s': Spearman's rank correlation
307               - dist=='k': Kendall's tau
308
309           - initialid: the initial clustering from which the algorithm should
310             start. If initialid is None, the routine carries out npass
311             repetitions of the EM algorithm, each time starting from a different
312             random initial clustering. If initialid is given, the routine
313             carries out the EM algorithm only once, starting from the given
314             initial clustering and without randomizing the  order in which items
315             are assigned to clusters (i.e., using the same order as in the data
316             matrix). In that case, the k-means algorithm is fully deterministic.
317
318          Return values:
319           - clusterid: array containing the number of the cluster to which each
320             gene/microarray was assigned in the best k-means clustering
321             solution that was found in the npass runs;
322           - error:     the within-cluster sum of distances for the returned
323             k-means clustering solution;
324           - nfound:    the number of times this solution was found.
325
326          """
327          if transpose == 0:
328              weight = self.eweight
329          else:
330              weight = self.gweight
331          return kcluster(self.data, nclusters, self.mask, weight, transpose,
332                          npass, method, dist, initialid)
333
334 -    def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02,
335                     niter=1, dist='e'):
336          """Calculate a self-organizing map on a rectangular grid.
337
338          The somcluster method returns a tuple (clusterid, celldata).
339
340          Arguments:
341           - transpose: if equal to 0, genes (rows) are clustered;
342             if equal to 1, microarrays (columns) are clustered.
343           - nxgrid   : the horizontal dimension of the rectangular SOM map
344           - nygrid   : the vertical dimension of the rectangular SOM map
345           - inittau  : the initial value of tau (the neighborbood function)
346           - niter    : the number of iterations
347           - dist     : specifies the distance function to be used:
348
349             - dist=='e': Euclidean distance
350             - dist=='b': City Block distance
351             - dist=='c': Pearson correlation
352             - dist=='a': absolute value of the correlation
353             - dist=='u': uncentered correlation
354             - dist=='x': absolute uncentered correlation
355             - dist=='s': Spearman's rank correlation
356             - dist=='k': Kendall's tau
357
358          Return values:
359           - clusterid: array with two columns, while the number of rows is equal
360             to the number of genes or the number of microarrays depending on
361             whether genes or microarrays are being clustered. Each row in
362             the array contains the x and y coordinates of the cell in the
363             rectangular SOM grid to which the gene or microarray was assigned.
364           - celldata:  an array with dimensions (nxgrid, nygrid, number of
365             microarrays) if genes are being clustered, or (nxgrid, nygrid,
366             number of genes) if microarrays are being clustered. Each element
367             [ix][iy] of this array is a 1D vector containing the gene
368             expression data for the centroid of the cluster in the SOM grid
369             cell with coordinates (ix, iy).
370
371          """
372          if transpose == 0:
373              weight = self.eweight
374          else:
375              weight = self.gweight
376          return somcluster(self.data, self.mask, weight, transpose,
377                            nxgrid, nygrid, inittau, niter, dist)
378
379 -    def clustercentroids(self, clusterid=None, method='a', transpose=0):
380          """Calculate the cluster centroids and return a tuple (cdata, cmask).
381
382          The centroid is defined as either the mean or the median over all
383          elements for each dimension.
384
385          Arguments:
386           - data     : nrows x ncolumns array containing the expression data
387           - mask     : nrows x ncolumns array of integers, showing which data
388             are missing. If mask[i][j]==0, then data[i][j] is missing.
389           - transpose: if equal to 0, gene (row) clusters are considered;
390             if equal to 1, microarray (column) clusters are considered.
391           - clusterid: array containing the cluster number for each gene or
392             microarray. The cluster number should be non-negative.
393           - method   : specifies how the centroid is calculated:
394
395             - method=='a': arithmetic mean over each dimension. (default)
396             - method=='m': median over each dimension.
397
398          Return values:
399           - cdata    : 2D array containing the cluster centroids. If transpose==0,
400             then the dimensions of cdata are nclusters x ncolumns. If
401             transpose==1, then the dimensions of cdata are nrows x nclusters.
402           - cmask    : 2D array of integers describing which elements in cdata,
403             if any, are missing.
404
405          """
406          return clustercentroids(self.data, self.mask, clusterid, method,
407                                  transpose)
408
409 -    def clusterdistance(self, index1=0, index2=0, method='a', dist='e',
410                          transpose=0):
411          """Calculate the distance between two clusters.
412
413          Arguments:
414           - index1   : 1D array identifying which genes/microarrays belong to the
415             first cluster. If the cluster contains only one gene, then
416             index1 can also be written as a single integer.
417           - index2   : 1D array identifying which genes/microarrays belong to the
418             second cluster. If the cluster contains only one gene, then
419             index2 can also be written as a single integer.
420           - transpose: if equal to 0, genes (rows) are clustered;
421             if equal to 1, microarrays (columns) are clustered.
422           - dist     : specifies the distance function to be used:
423
424             - dist=='e': Euclidean distance
425             - dist=='b': City Block distance
426             - dist=='c': Pearson correlation
427             - dist=='a': absolute value of the correlation
428             - dist=='u': uncentered correlation
429             - dist=='x': absolute uncentered correlation
430             - dist=='s': Spearman's rank correlation
431             - dist=='k': Kendall's tau
432
433           - method   : specifies how the distance between two clusters is defined:
434
435             - method=='a': the distance between the arithmetic means of the
436               two clusters
437             - method=='m': the distance between the medians of the two clusters
438             - method=='s': the smallest pairwise distance between members of
439               the two clusters
440             - method=='x': the largest pairwise distance between members of
441               the two clusters
442             - method=='v': average of the pairwise distances between members
443               of the clusters
444
445           - transpose: if equal to 0: clusters of genes (rows) are considered;
446             if equal to 1: clusters of microarrays (columns) are considered.
447
448          """
449          if transpose == 0:
450              weight = self.eweight
451          else:
452              weight = self.gweight
454                                 index1, index2, method, dist, transpose)
455
456 -    def distancematrix(self, transpose=0, dist='e'):
457          """Calculate the distance matrix and return it as a list of arrays.
458
459          Arguments:
460           - transpose: if equal to 0: calculate the distances between genes
461             (rows); if equal to 1: calculate the distances beteeen microarrays
462             (columns).
463           - dist     : specifies the distance function to be used:
464
465             - dist=='e': Euclidean distance
466             - dist=='b': City Block distance
467             - dist=='c': Pearson correlation
468             - dist=='a': absolute value of the correlation
469             - dist=='u': uncentered correlation
470             - dist=='x': absolute uncentered correlation
471             - dist=='s': Spearman's rank correlation
472             - dist=='k': Kendall's tau
473
474          Return value:
475
476          The distance matrix is returned as a list of 1D arrays containing the
477          distance matrix between the gene expression data. The number of columns
478          in each row is equal to the row number. Hence, the first row has zero
479          elements. An example of the return value is::
480
481              matrix = [[],
482                        array([1.]),
483                        array([7., 3.]),
484                        array([4., 2., 6.])]
485
486          This corresponds to the distance matrix::
487
488              [0., 1., 7., 4.]
489              [1., 0., 3., 2.]
490              [7., 3., 0., 6.]
491              [4., 2., 6., 0.]
492
493          """
494          if transpose == 0:
495              weight = self.eweight
496          else:
497              weight = self.gweight
498          return distancematrix(self.data, self.mask, weight, transpose, dist)
499
500 -    def save(self, jobname, geneclusters=None, expclusters=None):
501          """Save the clustering results.
502
503          The saved files follow the convention for the Java TreeView program,
504          which can therefore be used to view the clustering result.
505
506          Arguments:
507           - jobname:   The base name of the files to be saved. The filenames
508             are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical
509             clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag
510             for k-means clustering results.
511           - geneclusters=None:  For hierarchical clustering results,
512             geneclusters is a Tree object as returned by the treecluster
513             method. For k-means clustering results, geneclusters is a vector
514             containing ngenes integers, describing to which cluster a given
515             gene belongs. This vector can be calculated by kcluster.
516           - expclusters=None:  For hierarchical clustering results, expclusters
517             is a Tree object as returned by the treecluster method. For k-means
518             clustering results, expclusters is a vector containing nexps
519             integers, describing to which cluster a given experimental
520             condition belongs. This vector can be calculated by kcluster.
521
522          """
523          (ngenes, nexps) = numpy.shape(self.data)
524          if self.gorder is None:
525              gorder = numpy.arange(ngenes)
526          else:
527              gorder = self.gorder
528          if self.eorder is None:
529              eorder = numpy.arange(nexps)
530          else:
531              eorder = self.eorder
532          if geneclusters is not None and expclusters is not None and \
533             type(geneclusters) != type(expclusters):
534              raise ValueError("found one k-means and one hierarchical "
535                               "clustering solution in geneclusters and "
536                               "expclusters")
537          gid = 0
538          aid = 0
539          filename = jobname
540          postfix = ""
541          if isinstance(geneclusters, Tree):
542              # This is a hierarchical clustering result.
543              geneindex = _savetree(jobname, geneclusters, gorder, 0)
544              gid = 1
545          elif geneclusters is not None:
546              # This is a k-means clustering result.
547              filename = jobname + "_K"
548              k = max(geneclusters) + 1
549              kggfilename = "%s_K_G%d.kgg" % (jobname, k)
550              geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
551              postfix = "_G%d" % k
552          else:
553              geneindex = numpy.argsort(gorder)
554          if isinstance(expclusters, Tree):
555              # This is a hierarchical clustering result.
556              expindex = _savetree(jobname, expclusters, eorder, 1)
557              aid = 1
558          elif expclusters is not None:
559              # This is a k-means clustering result.
560              filename = jobname + "_K"
561              k = max(expclusters) + 1
562              kagfilename = "%s_K_A%d.kag" % (jobname, k)
563              expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
564              postfix += "_A%d" % k
565          else:
566              expindex = numpy.argsort(eorder)
567          filename = filename + postfix
568          self._savedata(filename, gid, aid, geneindex, expindex)
569
570 -    def _savekmeans(self, filename, clusterids, order, transpose):
571          # Save a k-means clustering solution
572          if transpose == 0:
573              label = self.uniqid
574              names = self.geneid
575          else:
576              label = "ARRAY"
577              names = self.expid
578          with open(filename, "w") as outputfile:
579              outputfile.write(label + "\tGROUP\n")
580              index = numpy.argsort(order)
581              n = len(names)
582              sortedindex = numpy.zeros(n, int)
583              counter = 0
584              cluster = 0
585              while counter < n:
586                  for j in index:
587                      if clusterids[j] == cluster:
588                          outputfile.write("%s\t%s\n" % (names[j], cluster))
589                          sortedindex[counter] = j
590                          counter += 1
591                  cluster += 1
592          return sortedindex
593
594 -    def _savedata(self, jobname, gid, aid, geneindex, expindex):
595          # Save the clustered data.
596          if self.genename is None:
597              genename = self.geneid
598          else:
599              genename = self.genename
600          (ngenes, nexps) = numpy.shape(self.data)
601          with open(jobname + '.cdt', 'w') as outputfile:
602              if self.mask is not None:
604              else:
605                  mask = numpy.ones((ngenes, nexps), int)
606              if self.gweight is not None:
607                  gweight = self.gweight
608              else:
609                  gweight = numpy.ones(ngenes)
610              if self.eweight is not None:
611                  eweight = self.eweight
612              else:
613                  eweight = numpy.ones(nexps)
614              if gid:
615                  outputfile.write('GID\t')
616              outputfile.write(self.uniqid)
617              outputfile.write('\tNAME\tGWEIGHT')
619              for j in expindex:
620                  outputfile.write('\t%s' % self.expid[j])
621              outputfile.write('\n')
622              if aid:
623                  outputfile.write("AID")
624                  if gid:
625                      outputfile.write('\t')
626                  outputfile.write("\t\t")
627                  for j in expindex:
628                      outputfile.write('\tARRY%dX' % j)
629                  outputfile.write('\n')
630              outputfile.write('EWEIGHT')
631              if gid:
632                  outputfile.write('\t')
633              outputfile.write('\t\t')
634              for j in expindex:
635                  outputfile.write('\t%f' % eweight[j])
636              outputfile.write('\n')
637              for i in geneindex:
638                  if gid:
639                      outputfile.write('GENE%dX\t' % i)
640                  outputfile.write("%s\t%s\t%f" %
641                                   (self.geneid[i], genename[i], gweight[i]))
642                  for j in expindex:
643                      outputfile.write('\t')
645                          outputfile.write(str(self.data[i, j]))
646                  outputfile.write('\n')
647
648
650      """Read gene expression data from the file handle and return a Record.
651
652      The file should be in the file format defined for Michael Eisen's
653      Cluster/TreeView program.
654      """
655      return Record(handle)
656
<!--
expandto(location.href);
// -->

```

 Generated by Epydoc 3.0.1 on Mon Jul 10 15:16:52 2017 http://epydoc.sourceforge.net