Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  import numpy 
  7   
  8  from Bio.Cluster.cluster import * 
  9   
 10  __docformat__ = "restructuredtext en" 
 11   
 12   
13 -def _treesort(order, nodeorder, nodecounts, tree):
14 # Find the order of the nodes consistent with the hierarchical clustering 15 # tree, taking into account the preferred order of nodes. 16 nNodes = len(tree) 17 nElements = nNodes + 1 18 neworder = numpy.zeros(nElements) 19 clusterids = numpy.arange(nElements) 20 for i in range(nNodes): 21 i1 = tree[i].left 22 i2 = tree[i].right 23 if i1 < 0: 24 order1 = nodeorder[-i1 - 1] 25 count1 = nodecounts[-i1 - 1] 26 else: 27 order1 = order[i1] 28 count1 = 1 29 if i2 < 0: 30 order2 = nodeorder[-i2 - 1] 31 count2 = nodecounts[-i2 - 1] 32 else: 33 order2 = order[i2] 34 count2 = 1 35 # If order1 and order2 are equal, their order is determined 36 # by the order in which they were clustered 37 if i1 < i2: 38 if order1 < order2: 39 increase = count1 40 else: 41 increase = count2 42 for j in range(nElements): 43 clusterid = clusterids[j] 44 if clusterid == i1 and order1 >= order2: 45 neworder[j] += increase 46 if clusterid == i2 and order1 < order2: 47 neworder[j] += increase 48 if clusterid == i1 or clusterid == i2: 49 clusterids[j] = -i - 1 50 else: 51 if order1 <= order2: 52 increase = count1 53 else: 54 increase = count2 55 for j in range(nElements): 56 clusterid = clusterids[j] 57 if clusterid == i1 and order1 > order2: 58 neworder[j] += increase 59 if clusterid == i2 and order1 <= order2: 60 neworder[j] += increase 61 if clusterid == i1 or clusterid == i2: 62 clusterids[j] = -i - 1 63 return numpy.argsort(neworder)
64 65
66 -def _savetree(jobname, tree, order, transpose):
67 # Save the hierarchical clustering solution given by the tree, following 68 # the specified order, in a file whose name is based on jobname. 69 if transpose == 0: 70 extension = ".gtr" 71 keyword = "GENE" 72 else: 73 extension = ".atr" 74 keyword = "ARRY" 75 nnodes = len(tree) 76 with open(jobname + extension, "w") as outputfile: 77 nodeindex = 0 78 nodeID = [''] * nnodes 79 nodecounts = numpy.zeros(nnodes, int) 80 nodeorder = numpy.zeros(nnodes) 81 nodedist = numpy.array([node.distance for node in tree]) 82 for nodeindex in range(nnodes): 83 min1 = tree[nodeindex].left 84 min2 = tree[nodeindex].right 85 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 86 outputfile.write(nodeID[nodeindex]) 87 outputfile.write("\t") 88 if min1 < 0: 89 index1 = -min1 - 1 90 order1 = nodeorder[index1] 91 counts1 = nodecounts[index1] 92 outputfile.write(nodeID[index1] + "\t") 93 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 94 else: 95 order1 = order[min1] 96 counts1 = 1 97 outputfile.write("%s%dX\t" % (keyword, min1)) 98 if min2 < 0: 99 index2 = -min2 - 1 100 order2 = nodeorder[index2] 101 counts2 = nodecounts[index2] 102 outputfile.write(nodeID[index2] + "\t") 103 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 104 else: 105 order2 = order[min2] 106 counts2 = 1 107 outputfile.write("%s%dX\t" % (keyword, min2)) 108 outputfile.write(str(1.0 - nodedist[nodeindex])) 109 outputfile.write("\n") 110 counts = counts1 + counts2 111 nodecounts[nodeindex] = counts 112 nodeorder[nodeindex] = (counts1 * order1 + counts2 * order2) / counts 113 # Now set up order based on the tree structure 114 index = _treesort(order, nodeorder, nodecounts, tree) 115 return index
116 117
118 -class Record(object):
119 """Store gene expression data. 120 121 A Record stores the gene expression data and related information contained 122 in a data file following the file format defined for Michael Eisen's 123 Cluster/TreeView program. A Record has the following members: 124 125 - data: a matrix containing the gene expression data 126 - mask: a matrix containing only 1's and 0's, denoting which values 127 are present (1) or missing (0). If all elements of mask are 128 one (no missing data), then mask is set to None. 129 - geneid: a list containing a unique identifier for each gene 130 (e.g., ORF name) 131 - genename: a list containing an additional description for each gene 132 (e.g., gene name) 133 - gweight: the weight to be used for each gene when calculating the 134 distance 135 - gorder: an array of real numbers indicating the preferred order of the 136 genes in the output file 137 - expid: a list containing a unique identifier for each experimental 138 condition 139 - eweight: the weight to be used for each experimental condition when 140 calculating the distance 141 - eorder: an array of real numbers indication the preferred order in the 142 output file of the experimental conditions 143 - uniqid: the string that was used instead of UNIQID in the input file. 144 145 """ 146
147 - def __init__(self, handle=None):
148 """Read gene expression data from the file handle and return a Record. 149 150 The file should be in the format defined for Michael Eisen's 151 Cluster/TreeView program. 152 153 """ 154 self.data = None 155 self.mask = None 156 self.geneid = None 157 self.genename = None 158 self.gweight = None 159 self.gorder = None 160 self.expid = None 161 self.eweight = None 162 self.eorder = None 163 self.uniqid = None 164 if not handle: 165 return 166 line = handle.readline().strip("\r\n").split("\t") 167 n = len(line) 168 self.uniqid = line[0] 169 self.expid = [] 170 cols = {0: "GENEID"} 171 for word in line[1:]: 172 if word == "NAME": 173 cols[line.index(word)] = word 174 self.genename = [] 175 elif word == "GWEIGHT": 176 cols[line.index(word)] = word 177 self.gweight = [] 178 elif word == "GORDER": 179 cols[line.index(word)] = word 180 self.gorder = [] 181 else: 182 self.expid.append(word) 183 self.geneid = [] 184 self.data = [] 185 self.mask = [] 186 needmask = 0 187 for line in handle: 188 line = line.strip("\r\n").split("\t") 189 if len(line) != n: 190 raise ValueError("Line with %d columns found (expected %d)" % 191 (len(line), n)) 192 if line[0] == "EWEIGHT": 193 i = max(cols) + 1 194 self.eweight = numpy.array(line[i:], float) 195 continue 196 if line[0] == "EORDER": 197 i = max(cols) + 1 198 self.eorder = numpy.array(line[i:], float) 199 continue 200 rowdata = [] 201 rowmask = [] 202 n = len(line) 203 for i in range(n): 204 word = line[i] 205 if i in cols: 206 if cols[i] == "GENEID": 207 self.geneid.append(word) 208 if cols[i] == "NAME": 209 self.genename.append(word) 210 if cols[i] == "GWEIGHT": 211 self.gweight.append(float(word)) 212 if cols[i] == "GORDER": 213 self.gorder.append(float(word)) 214 continue 215 if not word: 216 rowdata.append(0.0) 217 rowmask.append(0) 218 needmask = 1 219 else: 220 rowdata.append(float(word)) 221 rowmask.append(1) 222 self.data.append(rowdata) 223 self.mask.append(rowmask) 224 self.data = numpy.array(self.data) 225 if needmask: 226 self.mask = numpy.array(self.mask, int) 227 else: 228 self.mask = None 229 if self.gweight: 230 self.gweight = numpy.array(self.gweight) 231 if self.gorder: 232 self.gorder = numpy.array(self.gorder)
233
234 - def treecluster(self, transpose=0, method='m', dist='e'):
235 """Apply hierarchical clustering and return a Tree object. 236 237 The pairwise single, complete, centroid, and average linkage hierarchical 238 clustering methods are available. 239 240 - transpose: if equal to 0, genes (rows) are clustered; 241 if equal to 1, microarrays (columns) are clustered. 242 - dist : specifies the distance function to be used: 243 244 - dist=='e': Euclidean distance 245 - dist=='b': City Block distance 246 - dist=='c': Pearson correlation 247 - dist=='a': absolute value of the correlation 248 - dist=='u': uncentered correlation 249 - dist=='x': absolute uncentered correlation 250 - dist=='s': Spearman's rank correlation 251 - dist=='k': Kendall's tau 252 253 - method : specifies which linkage method is used: 254 255 - method=='s': Single pairwise linkage 256 - method=='m': Complete (maximum) pairwise linkage (default) 257 - method=='c': Centroid linkage 258 - method=='a': Average pairwise linkage 259 260 See the description of the Tree class for more information about the Tree 261 object returned by this method. 262 263 """ 264 if transpose == 0: 265 weight = self.eweight 266 else: 267 weight = self.gweight 268 return treecluster(self.data, self.mask, weight, transpose, method, 269 dist)
270
271 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 272 initialid=None):
273 """Apply k-means or k-median clustering. 274 275 This method returns a tuple (clusterid, error, nfound). 276 277 - nclusters: number of clusters (the 'k' in k-means) 278 - transpose: if equal to 0, genes (rows) are clustered; 279 if equal to 1, microarrays (columns) are clustered. 280 - npass : number of times the k-means clustering algorithm is 281 performed, each time with a different (random) initial 282 condition. 283 - method : specifies how the center of a cluster is found: 284 method=='a': arithmetic mean 285 method=='m': median 286 - dist : specifies the distance function to be used: 287 288 - dist=='e': Euclidean distance 289 - dist=='b': City Block distance 290 - dist=='c': Pearson correlation 291 - dist=='a': absolute value of the correlation 292 - dist=='u': uncentered correlation 293 - dist=='x': absolute uncentered correlation 294 - dist=='s': Spearman's rank correlation 295 - dist=='k': Kendall's tau 296 297 - initialid: the initial clustering from which the algorithm should start. 298 If initialid is None, the routine carries out npass 299 repetitions of the EM algorithm, each time starting from a 300 different random initial clustering. If initialid is given, 301 the routine carries out the EM algorithm only once, starting 302 from the given initial clustering and without randomizing the 303 order in which items are assigned to clusters (i.e., using 304 the same order as in the data matrix). In that case, the 305 k-means algorithm is fully deterministic. 306 307 Return values: 308 - clusterid: array containing the number of the cluster to which each 309 gene/microarray was assigned in the best k-means clustering 310 solution that was found in the npass runs; 311 - error: the within-cluster sum of distances for the returned k-means 312 clustering solution; 313 - nfound: the number of times this solution was found. 314 315 """ 316 317 if transpose == 0: 318 weight = self.eweight 319 else: 320 weight = self.gweight 321 return kcluster(self.data, nclusters, self.mask, weight, transpose, 322 npass, method, dist, initialid)
323
324 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 325 niter=1, dist='e'):
326 """Calculate a self-organizing map on a rectangular grid. 327 328 The somcluster method returns a tuple (clusterid, celldata). 329 330 - transpose: if equal to 0, genes (rows) are clustered; 331 if equal to 1, microarrays (columns) are clustered. 332 - nxgrid : the horizontal dimension of the rectangular SOM map 333 - nygrid : the vertical dimension of the rectangular SOM map 334 - inittau : the initial value of tau (the neighborbood function) 335 - niter : the number of iterations 336 - dist : specifies the distance function to be used: 337 338 - dist=='e': Euclidean distance 339 - dist=='b': City Block distance 340 - dist=='c': Pearson correlation 341 - dist=='a': absolute value of the correlation 342 - dist=='u': uncentered correlation 343 - dist=='x': absolute uncentered correlation 344 - dist=='s': Spearman's rank correlation 345 - dist=='k': Kendall's tau 346 347 Return values: 348 349 - clusterid: array with two columns, while the number of rows is equal to 350 the number of genes or the number of microarrays depending on 351 whether genes or microarrays are being clustered. Each row in 352 the array contains the x and y coordinates of the cell in the 353 rectangular SOM grid to which the gene or microarray was 354 assigned. 355 - celldata: an array with dimensions (nxgrid, nygrid, number of 356 microarrays) if genes are being clustered, or (nxgrid, 357 nygrid, number of genes) if microarrays are being clustered. 358 Each element [ix][iy] of this array is a 1D vector containing 359 the gene expression data for the centroid of the cluster in 360 the SOM grid cell with coordinates (ix, iy). 361 362 """ 363 364 if transpose == 0: 365 weight = self.eweight 366 else: 367 weight = self.gweight 368 return somcluster(self.data, self.mask, weight, transpose, 369 nxgrid, nygrid, inittau, niter, dist)
370
371 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
372 """Calculate the cluster centroids and return a tuple (cdata, cmask). 373 374 The centroid is defined as either the mean or the median over all elements 375 for each dimension. 376 377 - data : nrows x ncolumns array containing the expression data 378 - mask : nrows x ncolumns array of integers, showing which data are 379 missing. If mask[i][j]==0, then data[i][j] is missing. 380 - transpose: if equal to 0, gene (row) clusters are considered; 381 if equal to 1, microarray (column) clusters are considered. 382 - clusterid: array containing the cluster number for each gene or 383 microarray. The cluster number should be non-negative. 384 - method : specifies how the centroid is calculated: 385 method=='a': arithmetic mean over each dimension. (default) 386 method=='m': median over each dimension. 387 388 Return values: 389 - cdata : 2D array containing the cluster centroids. If transpose==0, 390 then the dimensions of cdata are nclusters x ncolumns. If 391 transpose==1, then the dimensions of cdata are 392 nrows x nclusters. 393 - cmask : 2D array of integers describing which elements in cdata, 394 if any, are missing. 395 396 """ 397 return clustercentroids(self.data, self.mask, clusterid, method, 398 transpose)
399
400 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 401 transpose=0):
402 """Calculate the distance between two clusters. 403 404 - index1 : 1D array identifying which genes/microarrays belong to the 405 first cluster. If the cluster contains only one gene, then 406 index1 can also be written as a single integer. 407 - index2 : 1D array identifying which genes/microarrays belong to the 408 second cluster. If the cluster contains only one gene, then 409 index2 can also be written as a single integer. 410 - transpose: if equal to 0, genes (rows) are clustered; 411 if equal to 1, microarrays (columns) are clustered. 412 - dist : specifies the distance function to be used: 413 414 - dist=='e': Euclidean distance 415 - dist=='b': City Block distance 416 - dist=='c': Pearson correlation 417 - dist=='a': absolute value of the correlation 418 - dist=='u': uncentered correlation 419 - dist=='x': absolute uncentered correlation 420 - dist=='s': Spearman's rank correlation 421 - dist=='k': Kendall's tau 422 423 - method : specifies how the distance between two clusters is defined: 424 425 - method=='a': the distance between the arithmetic means of the 426 two clusters 427 - method=='m': the distance between the medians of the two 428 clusters 429 - method=='s': the smallest pairwise distance between members 430 of the two clusters 431 - method=='x': the largest pairwise distance between members of 432 the two clusters 433 - method=='v': average of the pairwise distances between 434 members of the clusters 435 436 - transpose: if equal to 0: clusters of genes (rows) are considered; 437 if equal to 1: clusters of microarrays (columns) are considered. 438 439 """ 440 441 if transpose == 0: 442 weight = self.eweight 443 else: 444 weight = self.gweight 445 return clusterdistance(self.data, self.mask, weight, 446 index1, index2, method, dist, transpose)
447
448 - def distancematrix(self, transpose=0, dist='e'):
449 """Calculate the distance matrix and return it as a list of arrays 450 451 - transpose: if equal to 0: calculate the distances between genes (rows); 452 if equal to 1: calculate the distances beteeen microarrays 453 (columns). 454 - dist : specifies the distance function to be used: 455 456 - dist=='e': Euclidean distance 457 - dist=='b': City Block distance 458 - dist=='c': Pearson correlation 459 - dist=='a': absolute value of the correlation 460 - dist=='u': uncentered correlation 461 - dist=='x': absolute uncentered correlation 462 - dist=='s': Spearman's rank correlation 463 - dist=='k': Kendall's tau 464 465 Return value: 466 The distance matrix is returned as a list of 1D arrays containing the 467 distance matrix between the gene expression data. The number of columns 468 in each row is equal to the row number. Hence, the first row has zero 469 elements. An example of the return value is:: 470 471 matrix = [[], 472 array([1.]), 473 array([7., 3.]), 474 array([4., 2., 6.])] 475 476 This corresponds to the distance matrix:: 477 478 [0., 1., 7., 4.] 479 [1., 0., 3., 2.] 480 [7., 3., 0., 6.] 481 [4., 2., 6., 0.] 482 483 """ 484 if transpose == 0: 485 weight = self.eweight 486 else: 487 weight = self.gweight 488 return distancematrix(self.data, self.mask, weight, transpose, dist)
489
490 - def save(self, jobname, geneclusters=None, expclusters=None):
491 """Save the clustering results. 492 493 The saved files follow the convention for the Java TreeView program, 494 which can therefore be used to view the clustering result. 495 496 Arguments: 497 498 - jobname: The base name of the files to be saved. The filenames are 499 jobname.cdt, jobname.gtr, and jobname.atr for 500 hierarchical clustering, and jobname-K*.cdt, 501 jobname-K*.kgg, jobname-K*.kag for k-means clustering 502 results. 503 - geneclusters=None: For hierarchical clustering results, geneclusters 504 is a Tree object as returned by the treecluster method. 505 For k-means clustering results, geneclusters is a vector 506 containing ngenes integers, describing to which cluster a 507 given gene belongs. This vector can be calculated by 508 kcluster. 509 - expclusters=None: For hierarchical clustering results, expclusters 510 is a Tree object as returned by the treecluster method. 511 For k-means clustering results, expclusters is a vector 512 containing nexps integers, describing to which cluster a 513 given experimental condition belongs. This vector can be 514 calculated by kcluster. 515 516 """ 517 (ngenes, nexps) = numpy.shape(self.data) 518 if self.gorder is None: 519 gorder = numpy.arange(ngenes) 520 else: 521 gorder = self.gorder 522 if self.eorder is None: 523 eorder = numpy.arange(nexps) 524 else: 525 eorder = self.eorder 526 if geneclusters is not None and expclusters is not None and \ 527 type(geneclusters) != type(expclusters): 528 raise ValueError("found one k-means and one hierarchical " 529 + "clustering solution in geneclusters and " 530 + "expclusters") 531 gid = 0 532 aid = 0 533 filename = jobname 534 postfix = "" 535 if isinstance(geneclusters, Tree): 536 # This is a hierarchical clustering result. 537 geneindex = _savetree(jobname, geneclusters, gorder, 0) 538 gid = 1 539 elif geneclusters is not None: 540 # This is a k-means clustering result. 541 filename = jobname + "_K" 542 k = max(geneclusters) + 1 543 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 544 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 545 postfix = "_G%d" % k 546 else: 547 geneindex = numpy.argsort(gorder) 548 if isinstance(expclusters, Tree): 549 # This is a hierarchical clustering result. 550 expindex = _savetree(jobname, expclusters, eorder, 1) 551 aid = 1 552 elif expclusters is not None: 553 # This is a k-means clustering result. 554 filename = jobname + "_K" 555 k = max(expclusters) + 1 556 kagfilename = "%s_K_A%d.kag" % (jobname, k) 557 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 558 postfix += "_A%d" % k 559 else: 560 expindex = numpy.argsort(eorder) 561 filename = filename + postfix 562 self._savedata(filename, gid, aid, geneindex, expindex)
563
564 - def _savekmeans(self, filename, clusterids, order, transpose):
565 # Save a k-means clustering solution 566 if transpose == 0: 567 label = self.uniqid 568 names = self.geneid 569 else: 570 label = "ARRAY" 571 names = self.expid 572 with open(filename, "w") as outputfile: 573 outputfile.write(label + "\tGROUP\n") 574 index = numpy.argsort(order) 575 n = len(names) 576 sortedindex = numpy.zeros(n, int) 577 counter = 0 578 cluster = 0 579 while counter < n: 580 for j in index: 581 if clusterids[j] == cluster: 582 outputfile.write("%s\t%s\n" % (names[j], cluster)) 583 sortedindex[counter] = j 584 counter += 1 585 cluster += 1 586 return sortedindex
587
588 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
589 # Save the clustered data. 590 if self.genename is None: 591 genename = self.geneid 592 else: 593 genename = self.genename 594 (ngenes, nexps) = numpy.shape(self.data) 595 with open(jobname + '.cdt', 'w') as outputfile: 596 if self.mask is not None: 597 mask = self.mask 598 else: 599 mask = numpy.ones((ngenes, nexps), int) 600 if self.gweight is not None: 601 gweight = self.gweight 602 else: 603 gweight = numpy.ones(ngenes) 604 if self.eweight is not None: 605 eweight = self.eweight 606 else: 607 eweight = numpy.ones(nexps) 608 if gid: 609 outputfile.write('GID\t') 610 outputfile.write(self.uniqid) 611 outputfile.write('\tNAME\tGWEIGHT') 612 # Now add headers for data columns. 613 for j in expindex: 614 outputfile.write('\t%s' % self.expid[j]) 615 outputfile.write('\n') 616 if aid: 617 outputfile.write("AID") 618 if gid: 619 outputfile.write('\t') 620 outputfile.write("\t\t") 621 for j in expindex: 622 outputfile.write('\tARRY%dX' % j) 623 outputfile.write('\n') 624 outputfile.write('EWEIGHT') 625 if gid: 626 outputfile.write('\t') 627 outputfile.write('\t\t') 628 for j in expindex: 629 outputfile.write('\t%f' % eweight[j]) 630 outputfile.write('\n') 631 for i in geneindex: 632 if gid: 633 outputfile.write('GENE%dX\t' % i) 634 outputfile.write("%s\t%s\t%f" % 635 (self.geneid[i], genename[i], gweight[i])) 636 for j in expindex: 637 outputfile.write('\t') 638 if mask[i, j]: 639 outputfile.write(str(self.data[i, j])) 640 outputfile.write('\n')
641 642
643 -def read(handle):
644 """Read gene expression data from the file handle and return a Record. 645 646 The file should be in the file format defined for Michael Eisen's 647 Cluster/TreeView program. 648 649 """ 650 return Record(handle)
651