Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5  """Cluster Analysis. 
  6   
  7  The Bio.Cluster provides commonly used clustering algorithms and was 
  8  designed with the application to gene expression data in mind. However, 
  9  this module can also be used for cluster analysis of other types of data. 
 10   
 11  Bio.Cluster and the underlying C Clustering Library is described in 
 12  M. De Hoon et al. (2004) http://dx.doi.org/10.1093/bioinformatics/bth078 
 13  """ 
 14   
 15  import numpy 
 16   
 17  from Bio.Cluster.cluster import version, kcluster, kmedoids, treecluster 
 18  from Bio.Cluster.cluster import somcluster, median, mean, clusterdistance 
 19  from Bio.Cluster.cluster import clustercentroids, distancematrix, pca 
 20  from Bio.Cluster.cluster import Node, Tree 
 21   
 22   
23 -def _treesort(order, nodeorder, nodecounts, tree):
24 # Find the order of the nodes consistent with the hierarchical clustering 25 # tree, taking into account the preferred order of nodes. 26 nNodes = len(tree) 27 nElements = nNodes + 1 28 neworder = numpy.zeros(nElements) 29 clusterids = numpy.arange(nElements) 30 for i in range(nNodes): 31 i1 = tree[i].left 32 i2 = tree[i].right 33 if i1 < 0: 34 order1 = nodeorder[-i1 - 1] 35 count1 = nodecounts[-i1 - 1] 36 else: 37 order1 = order[i1] 38 count1 = 1 39 if i2 < 0: 40 order2 = nodeorder[-i2 - 1] 41 count2 = nodecounts[-i2 - 1] 42 else: 43 order2 = order[i2] 44 count2 = 1 45 # If order1 and order2 are equal, their order is determined 46 # by the order in which they were clustered 47 if i1 < i2: 48 if order1 < order2: 49 increase = count1 50 else: 51 increase = count2 52 for j in range(nElements): 53 clusterid = clusterids[j] 54 if clusterid == i1 and order1 >= order2: 55 neworder[j] += increase 56 if clusterid == i2 and order1 < order2: 57 neworder[j] += increase 58 if clusterid == i1 or clusterid == i2: 59 clusterids[j] = -i - 1 60 else: 61 if order1 <= order2: 62 increase = count1 63 else: 64 increase = count2 65 for j in range(nElements): 66 clusterid = clusterids[j] 67 if clusterid == i1 and order1 > order2: 68 neworder[j] += increase 69 if clusterid == i2 and order1 <= order2: 70 neworder[j] += increase 71 if clusterid == i1 or clusterid == i2: 72 clusterids[j] = -i - 1 73 return numpy.argsort(neworder)
74 75
76 -def _savetree(jobname, tree, order, transpose):
77 # Save the hierarchical clustering solution given by the tree, following 78 # the specified order, in a file whose name is based on jobname. 79 if transpose == 0: 80 extension = ".gtr" 81 keyword = "GENE" 82 else: 83 extension = ".atr" 84 keyword = "ARRY" 85 nnodes = len(tree) 86 with open(jobname + extension, "w") as outputfile: 87 nodeindex = 0 88 nodeID = [''] * nnodes 89 nodecounts = numpy.zeros(nnodes, int) 90 nodeorder = numpy.zeros(nnodes) 91 nodedist = numpy.array([node.distance for node in tree]) 92 for nodeindex in range(nnodes): 93 min1 = tree[nodeindex].left 94 min2 = tree[nodeindex].right 95 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 96 outputfile.write(nodeID[nodeindex]) 97 outputfile.write("\t") 98 if min1 < 0: 99 index1 = -min1 - 1 100 order1 = nodeorder[index1] 101 counts1 = nodecounts[index1] 102 outputfile.write(nodeID[index1] + "\t") 103 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 104 else: 105 order1 = order[min1] 106 counts1 = 1 107 outputfile.write("%s%dX\t" % (keyword, min1)) 108 if min2 < 0: 109 index2 = -min2 - 1 110 order2 = nodeorder[index2] 111 counts2 = nodecounts[index2] 112 outputfile.write(nodeID[index2] + "\t") 113 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 114 else: 115 order2 = order[min2] 116 counts2 = 1 117 outputfile.write("%s%dX\t" % (keyword, min2)) 118 outputfile.write(str(1.0 - nodedist[nodeindex])) 119 outputfile.write("\n") 120 counts = counts1 + counts2 121 nodecounts[nodeindex] = counts 122 nodeorder[nodeindex] = (counts1 * order1 + counts2 * order2) / counts 123 # Now set up order based on the tree structure 124 index = _treesort(order, nodeorder, nodecounts, tree) 125 return index
126 127
128 -class Record(object):
129 """Store gene expression data. 130 131 A Record stores the gene expression data and related information contained 132 in a data file following the file format defined for Michael Eisen's 133 Cluster/TreeView program. 134 135 Attributes: 136 - data: a matrix containing the gene expression data 137 - mask: a matrix containing only 1's and 0's, denoting which values 138 are present (1) or missing (0). If all elements of mask are 139 one (no missing data), then mask is set to None. 140 - geneid: a list containing a unique identifier for each gene 141 (e.g., ORF name) 142 - genename: a list containing an additional description for each gene 143 (e.g., gene name) 144 - gweight: the weight to be used for each gene when calculating the 145 distance 146 - gorder: an array of real numbers indicating the preferred order of the 147 genes in the output file 148 - expid: a list containing a unique identifier for each experimental 149 condition 150 - eweight: the weight to be used for each experimental condition when 151 calculating the distance 152 - eorder: an array of real numbers indication the preferred order in the 153 output file of the experimental conditions 154 - uniqid: the string that was used instead of UNIQID in the input file. 155 156 """ 157
158 - def __init__(self, handle=None):
159 """Read gene expression data from the file handle and return a Record. 160 161 The file should be in the format defined for Michael Eisen's 162 Cluster/TreeView program. 163 """ 164 self.data = None 165 self.mask = None 166 self.geneid = None 167 self.genename = None 168 self.gweight = None 169 self.gorder = None 170 self.expid = None 171 self.eweight = None 172 self.eorder = None 173 self.uniqid = None 174 if not handle: 175 return 176 line = handle.readline().strip("\r\n").split("\t") 177 n = len(line) 178 self.uniqid = line[0] 179 self.expid = [] 180 cols = {0: "GENEID"} 181 for word in line[1:]: 182 if word == "NAME": 183 cols[line.index(word)] = word 184 self.genename = [] 185 elif word == "GWEIGHT": 186 cols[line.index(word)] = word 187 self.gweight = [] 188 elif word == "GORDER": 189 cols[line.index(word)] = word 190 self.gorder = [] 191 else: 192 self.expid.append(word) 193 self.geneid = [] 194 self.data = [] 195 self.mask = [] 196 needmask = 0 197 for line in handle: 198 line = line.strip("\r\n").split("\t") 199 if len(line) != n: 200 raise ValueError("Line with %d columns found (expected %d)" % 201 (len(line), n)) 202 if line[0] == "EWEIGHT": 203 i = max(cols) + 1 204 self.eweight = numpy.array(line[i:], float) 205 continue 206 if line[0] == "EORDER": 207 i = max(cols) + 1 208 self.eorder = numpy.array(line[i:], float) 209 continue 210 rowdata = [] 211 rowmask = [] 212 n = len(line) 213 for i in range(n): 214 word = line[i] 215 if i in cols: 216 if cols[i] == "GENEID": 217 self.geneid.append(word) 218 if cols[i] == "NAME": 219 self.genename.append(word) 220 if cols[i] == "GWEIGHT": 221 self.gweight.append(float(word)) 222 if cols[i] == "GORDER": 223 self.gorder.append(float(word)) 224 continue 225 if not word: 226 rowdata.append(0.0) 227 rowmask.append(0) 228 needmask = 1 229 else: 230 rowdata.append(float(word)) 231 rowmask.append(1) 232 self.data.append(rowdata) 233 self.mask.append(rowmask) 234 self.data = numpy.array(self.data) 235 if needmask: 236 self.mask = numpy.array(self.mask, int) 237 else: 238 self.mask = None 239 if self.gweight: 240 self.gweight = numpy.array(self.gweight) 241 if self.gorder: 242 self.gorder = numpy.array(self.gorder)
243
244 - def treecluster(self, transpose=0, method='m', dist='e'):
245 """Apply hierarchical clustering and return a Tree object. 246 247 The pairwise single, complete, centroid, and average linkage 248 hierarchical clustering methods are available. 249 250 Arguments: 251 - transpose: if equal to 0, genes (rows) are clustered; 252 if equal to 1, microarrays (columns) are clustered. 253 - dist : specifies the distance function to be used: 254 255 - dist=='e': Euclidean distance 256 - dist=='b': City Block distance 257 - dist=='c': Pearson correlation 258 - dist=='a': absolute value of the correlation 259 - dist=='u': uncentered correlation 260 - dist=='x': absolute uncentered correlation 261 - dist=='s': Spearman's rank correlation 262 - dist=='k': Kendall's tau 263 264 - method : specifies which linkage method is used: 265 266 - method=='s': Single pairwise linkage 267 - method=='m': Complete (maximum) pairwise linkage (default) 268 - method=='c': Centroid linkage 269 - method=='a': Average pairwise linkage 270 271 See the description of the Tree class for more information about 272 the Tree object returned by this method. 273 """ 274 if transpose == 0: 275 weight = self.eweight 276 else: 277 weight = self.gweight 278 return treecluster(self.data, self.mask, weight, transpose, method, 279 dist)
280
281 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 282 initialid=None):
283 """Apply k-means or k-median clustering. 284 285 This method returns a tuple (clusterid, error, nfound). 286 287 Arguments: 288 - nclusters: number of clusters (the 'k' in k-means) 289 - transpose: if equal to 0, genes (rows) are clustered; 290 if equal to 1, microarrays (columns) are clustered. 291 - npass : number of times the k-means clustering algorithm is 292 performed, each time with a different (random) initial condition. 293 - method : specifies how the center of a cluster is found: 294 295 - method=='a': arithmetic mean 296 - method=='m': median 297 298 - dist : specifies the distance function to be used: 299 300 - dist=='e': Euclidean distance 301 - dist=='b': City Block distance 302 - dist=='c': Pearson correlation 303 - dist=='a': absolute value of the correlation 304 - dist=='u': uncentered correlation 305 - dist=='x': absolute uncentered correlation 306 - dist=='s': Spearman's rank correlation 307 - dist=='k': Kendall's tau 308 309 - initialid: the initial clustering from which the algorithm should 310 start. If initialid is None, the routine carries out npass 311 repetitions of the EM algorithm, each time starting from a different 312 random initial clustering. If initialid is given, the routine 313 carries out the EM algorithm only once, starting from the given 314 initial clustering and without randomizing the order in which items 315 are assigned to clusters (i.e., using the same order as in the data 316 matrix). In that case, the k-means algorithm is fully deterministic. 317 318 Return values: 319 - clusterid: array containing the number of the cluster to which each 320 gene/microarray was assigned in the best k-means clustering 321 solution that was found in the npass runs; 322 - error: the within-cluster sum of distances for the returned 323 k-means clustering solution; 324 - nfound: the number of times this solution was found. 325 326 """ 327 if transpose == 0: 328 weight = self.eweight 329 else: 330 weight = self.gweight 331 return kcluster(self.data, nclusters, self.mask, weight, transpose, 332 npass, method, dist, initialid)
333
334 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 335 niter=1, dist='e'):
336 """Calculate a self-organizing map on a rectangular grid. 337 338 The somcluster method returns a tuple (clusterid, celldata). 339 340 Arguments: 341 - transpose: if equal to 0, genes (rows) are clustered; 342 if equal to 1, microarrays (columns) are clustered. 343 - nxgrid : the horizontal dimension of the rectangular SOM map 344 - nygrid : the vertical dimension of the rectangular SOM map 345 - inittau : the initial value of tau (the neighborbood function) 346 - niter : the number of iterations 347 - dist : specifies the distance function to be used: 348 349 - dist=='e': Euclidean distance 350 - dist=='b': City Block distance 351 - dist=='c': Pearson correlation 352 - dist=='a': absolute value of the correlation 353 - dist=='u': uncentered correlation 354 - dist=='x': absolute uncentered correlation 355 - dist=='s': Spearman's rank correlation 356 - dist=='k': Kendall's tau 357 358 Return values: 359 - clusterid: array with two columns, while the number of rows is equal 360 to the number of genes or the number of microarrays depending on 361 whether genes or microarrays are being clustered. Each row in 362 the array contains the x and y coordinates of the cell in the 363 rectangular SOM grid to which the gene or microarray was assigned. 364 - celldata: an array with dimensions (nxgrid, nygrid, number of 365 microarrays) if genes are being clustered, or (nxgrid, nygrid, 366 number of genes) if microarrays are being clustered. Each element 367 [ix][iy] of this array is a 1D vector containing the gene 368 expression data for the centroid of the cluster in the SOM grid 369 cell with coordinates (ix, iy). 370 371 """ 372 if transpose == 0: 373 weight = self.eweight 374 else: 375 weight = self.gweight 376 return somcluster(self.data, self.mask, weight, transpose, 377 nxgrid, nygrid, inittau, niter, dist)
378
379 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
380 """Calculate the cluster centroids and return a tuple (cdata, cmask). 381 382 The centroid is defined as either the mean or the median over all 383 elements for each dimension. 384 385 Arguments: 386 - data : nrows x ncolumns array containing the expression data 387 - mask : nrows x ncolumns array of integers, showing which data 388 are missing. If mask[i][j]==0, then data[i][j] is missing. 389 - transpose: if equal to 0, gene (row) clusters are considered; 390 if equal to 1, microarray (column) clusters are considered. 391 - clusterid: array containing the cluster number for each gene or 392 microarray. The cluster number should be non-negative. 393 - method : specifies how the centroid is calculated: 394 395 - method=='a': arithmetic mean over each dimension. (default) 396 - method=='m': median over each dimension. 397 398 Return values: 399 - cdata : 2D array containing the cluster centroids. If transpose==0, 400 then the dimensions of cdata are nclusters x ncolumns. If 401 transpose==1, then the dimensions of cdata are nrows x nclusters. 402 - cmask : 2D array of integers describing which elements in cdata, 403 if any, are missing. 404 405 """ 406 return clustercentroids(self.data, self.mask, clusterid, method, 407 transpose)
408
409 - def clusterdistance(self, index1=0, index2=0, method='a', dist='e', 410 transpose=0):
411 """Calculate the distance between two clusters. 412 413 Arguments: 414 - index1 : 1D array identifying which genes/microarrays belong to the 415 first cluster. If the cluster contains only one gene, then 416 index1 can also be written as a single integer. 417 - index2 : 1D array identifying which genes/microarrays belong to the 418 second cluster. If the cluster contains only one gene, then 419 index2 can also be written as a single integer. 420 - transpose: if equal to 0, genes (rows) are clustered; 421 if equal to 1, microarrays (columns) are clustered. 422 - dist : specifies the distance function to be used: 423 424 - dist=='e': Euclidean distance 425 - dist=='b': City Block distance 426 - dist=='c': Pearson correlation 427 - dist=='a': absolute value of the correlation 428 - dist=='u': uncentered correlation 429 - dist=='x': absolute uncentered correlation 430 - dist=='s': Spearman's rank correlation 431 - dist=='k': Kendall's tau 432 433 - method : specifies how the distance between two clusters is defined: 434 435 - method=='a': the distance between the arithmetic means of the 436 two clusters 437 - method=='m': the distance between the medians of the two clusters 438 - method=='s': the smallest pairwise distance between members of 439 the two clusters 440 - method=='x': the largest pairwise distance between members of 441 the two clusters 442 - method=='v': average of the pairwise distances between members 443 of the clusters 444 445 - transpose: if equal to 0: clusters of genes (rows) are considered; 446 if equal to 1: clusters of microarrays (columns) are considered. 447 448 """ 449 if transpose == 0: 450 weight = self.eweight 451 else: 452 weight = self.gweight 453 return clusterdistance(self.data, self.mask, weight, 454 index1, index2, method, dist, transpose)
455
456 - def distancematrix(self, transpose=0, dist='e'):
457 """Calculate the distance matrix and return it as a list of arrays. 458 459 Arguments: 460 - transpose: if equal to 0: calculate the distances between genes 461 (rows); if equal to 1: calculate the distances beteeen microarrays 462 (columns). 463 - dist : specifies the distance function to be used: 464 465 - dist=='e': Euclidean distance 466 - dist=='b': City Block distance 467 - dist=='c': Pearson correlation 468 - dist=='a': absolute value of the correlation 469 - dist=='u': uncentered correlation 470 - dist=='x': absolute uncentered correlation 471 - dist=='s': Spearman's rank correlation 472 - dist=='k': Kendall's tau 473 474 Return value: 475 476 The distance matrix is returned as a list of 1D arrays containing the 477 distance matrix between the gene expression data. The number of columns 478 in each row is equal to the row number. Hence, the first row has zero 479 elements. An example of the return value is:: 480 481 matrix = [[], 482 array([1.]), 483 array([7., 3.]), 484 array([4., 2., 6.])] 485 486 This corresponds to the distance matrix:: 487 488 [0., 1., 7., 4.] 489 [1., 0., 3., 2.] 490 [7., 3., 0., 6.] 491 [4., 2., 6., 0.] 492 493 """ 494 if transpose == 0: 495 weight = self.eweight 496 else: 497 weight = self.gweight 498 return distancematrix(self.data, self.mask, weight, transpose, dist)
499
500 - def save(self, jobname, geneclusters=None, expclusters=None):
501 """Save the clustering results. 502 503 The saved files follow the convention for the Java TreeView program, 504 which can therefore be used to view the clustering result. 505 506 Arguments: 507 - jobname: The base name of the files to be saved. The filenames 508 are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical 509 clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag 510 for k-means clustering results. 511 - geneclusters=None: For hierarchical clustering results, 512 geneclusters is a Tree object as returned by the treecluster 513 method. For k-means clustering results, geneclusters is a vector 514 containing ngenes integers, describing to which cluster a given 515 gene belongs. This vector can be calculated by kcluster. 516 - expclusters=None: For hierarchical clustering results, expclusters 517 is a Tree object as returned by the treecluster method. For k-means 518 clustering results, expclusters is a vector containing nexps 519 integers, describing to which cluster a given experimental 520 condition belongs. This vector can be calculated by kcluster. 521 522 """ 523 (ngenes, nexps) = numpy.shape(self.data) 524 if self.gorder is None: 525 gorder = numpy.arange(ngenes) 526 else: 527 gorder = self.gorder 528 if self.eorder is None: 529 eorder = numpy.arange(nexps) 530 else: 531 eorder = self.eorder 532 if geneclusters is not None and expclusters is not None and \ 533 type(geneclusters) != type(expclusters): 534 raise ValueError("found one k-means and one hierarchical " 535 "clustering solution in geneclusters and " 536 "expclusters") 537 gid = 0 538 aid = 0 539 filename = jobname 540 postfix = "" 541 if isinstance(geneclusters, Tree): 542 # This is a hierarchical clustering result. 543 geneindex = _savetree(jobname, geneclusters, gorder, 0) 544 gid = 1 545 elif geneclusters is not None: 546 # This is a k-means clustering result. 547 filename = jobname + "_K" 548 k = max(geneclusters) + 1 549 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 550 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 551 postfix = "_G%d" % k 552 else: 553 geneindex = numpy.argsort(gorder) 554 if isinstance(expclusters, Tree): 555 # This is a hierarchical clustering result. 556 expindex = _savetree(jobname, expclusters, eorder, 1) 557 aid = 1 558 elif expclusters is not None: 559 # This is a k-means clustering result. 560 filename = jobname + "_K" 561 k = max(expclusters) + 1 562 kagfilename = "%s_K_A%d.kag" % (jobname, k) 563 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 564 postfix += "_A%d" % k 565 else: 566 expindex = numpy.argsort(eorder) 567 filename = filename + postfix 568 self._savedata(filename, gid, aid, geneindex, expindex)
569
570 - def _savekmeans(self, filename, clusterids, order, transpose):
571 # Save a k-means clustering solution 572 if transpose == 0: 573 label = self.uniqid 574 names = self.geneid 575 else: 576 label = "ARRAY" 577 names = self.expid 578 with open(filename, "w") as outputfile: 579 outputfile.write(label + "\tGROUP\n") 580 index = numpy.argsort(order) 581 n = len(names) 582 sortedindex = numpy.zeros(n, int) 583 counter = 0 584 cluster = 0 585 while counter < n: 586 for j in index: 587 if clusterids[j] == cluster: 588 outputfile.write("%s\t%s\n" % (names[j], cluster)) 589 sortedindex[counter] = j 590 counter += 1 591 cluster += 1 592 return sortedindex
593
594 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
595 # Save the clustered data. 596 if self.genename is None: 597 genename = self.geneid 598 else: 599 genename = self.genename 600 (ngenes, nexps) = numpy.shape(self.data) 601 with open(jobname + '.cdt', 'w') as outputfile: 602 if self.mask is not None: 603 mask = self.mask 604 else: 605 mask = numpy.ones((ngenes, nexps), int) 606 if self.gweight is not None: 607 gweight = self.gweight 608 else: 609 gweight = numpy.ones(ngenes) 610 if self.eweight is not None: 611 eweight = self.eweight 612 else: 613 eweight = numpy.ones(nexps) 614 if gid: 615 outputfile.write('GID\t') 616 outputfile.write(self.uniqid) 617 outputfile.write('\tNAME\tGWEIGHT') 618 # Now add headers for data columns. 619 for j in expindex: 620 outputfile.write('\t%s' % self.expid[j]) 621 outputfile.write('\n') 622 if aid: 623 outputfile.write("AID") 624 if gid: 625 outputfile.write('\t') 626 outputfile.write("\t\t") 627 for j in expindex: 628 outputfile.write('\tARRY%dX' % j) 629 outputfile.write('\n') 630 outputfile.write('EWEIGHT') 631 if gid: 632 outputfile.write('\t') 633 outputfile.write('\t\t') 634 for j in expindex: 635 outputfile.write('\t%f' % eweight[j]) 636 outputfile.write('\n') 637 for i in geneindex: 638 if gid: 639 outputfile.write('GENE%dX\t' % i) 640 outputfile.write("%s\t%s\t%f" % 641 (self.geneid[i], genename[i], gweight[i])) 642 for j in expindex: 643 outputfile.write('\t') 644 if mask[i, j]: 645 outputfile.write(str(self.data[i, j])) 646 outputfile.write('\n')
647 648
649 -def read(handle):
650 """Read gene expression data from the file handle and return a Record. 651 652 The file should be in the file format defined for Michael Eisen's 653 Cluster/TreeView program. 654 """ 655 return Record(handle)
656