Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  import numpy 
  7   
  8  from Bio.Cluster.cluster import * 
  9   
 10   
11 -def _treesort(order, nodeorder, nodecounts, tree):
12 # Find the order of the nodes consistent with the hierarchical clustering 13 # tree, taking into account the preferred order of nodes. 14 nNodes = len(tree) 15 nElements = nNodes + 1 16 neworder = numpy.zeros(nElements) 17 clusterids = numpy.arange(nElements) 18 for i in range(nNodes): 19 i1 = tree[i].left 20 i2 = tree[i].right 21 if i1 < 0: 22 order1 = nodeorder[-i1 - 1] 23 count1 = nodecounts[-i1 - 1] 24 else: 25 order1 = order[i1] 26 count1 = 1 27 if i2 < 0: 28 order2 = nodeorder[-i2 - 1] 29 count2 = nodecounts[-i2 - 1] 30 else: 31 order2 = order[i2] 32 count2 = 1 33 # If order1 and order2 are equal, their order is determined 34 # by the order in which they were clustered 35 if i1 < i2: 36 if order1 < order2: 37 increase = count1 38 else: 39 increase = count2 40 for j in range(nElements): 41 clusterid = clusterids[j] 42 if clusterid == i1 and order1 >= order2: 43 neworder[j] += increase 44 if clusterid == i2 and order1 < order2: 45 neworder[j] += increase 46 if clusterid == i1 or clusterid == i2: 47 clusterids[j] = -i - 1 48 else: 49 if order1 <= order2: 50 increase = count1 51 else: 52 increase = count2 53 for j in range(nElements): 54 clusterid = clusterids[j] 55 if clusterid == i1 and order1 > order2: 56 neworder[j] += increase 57 if clusterid == i2 and order1 <= order2: 58 neworder[j] += increase 59 if clusterid == i1 or clusterid == i2: 60 clusterids[j] = -i - 1 61 return numpy.argsort(neworder)
62 63
64 -def _savetree(jobname, tree, order, transpose):
65 # Save the hierarchical clustering solution given by the tree, following 66 # the specified order, in a file whose name is based on jobname. 67 if transpose == 0: 68 extension = ".gtr" 69 keyword = "GENE" 70 else: 71 extension = ".atr" 72 keyword = "ARRY" 73 nnodes = len(tree) 74 with open(jobname + extension, "w") as outputfile: 75 nodeindex = 0 76 nodeID = [''] * nnodes 77 nodecounts = numpy.zeros(nnodes, int) 78 nodeorder = numpy.zeros(nnodes) 79 nodedist = numpy.array([node.distance for node in tree]) 80 for nodeindex in range(nnodes): 81 min1 = tree[nodeindex].left 82 min2 = tree[nodeindex].right 83 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 84 outputfile.write(nodeID[nodeindex]) 85 outputfile.write("\t") 86 if min1 < 0: 87 index1 = -min1 - 1 88 order1 = nodeorder[index1] 89 counts1 = nodecounts[index1] 90 outputfile.write(nodeID[index1] + "\t") 91 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 92 else: 93 order1 = order[min1] 94 counts1 = 1 95 outputfile.write("%s%dX\t" % (keyword, min1)) 96 if min2 < 0: 97 index2 = -min2 - 1 98 order2 = nodeorder[index2] 99 counts2 = nodecounts[index2] 100 outputfile.write(nodeID[index2] + "\t") 101 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 102 else: 103 order2 = order[min2] 104 counts2 = 1 105 outputfile.write("%s%dX\t" % (keyword, min2)) 106 outputfile.write(str(1.0 - nodedist[nodeindex])) 107 outputfile.write("\n") 108 counts = counts1 + counts2 109 nodecounts[nodeindex] = counts 110 nodeorder[nodeindex] = (counts1 * order1 + counts2 * order2) / counts 111 # Now set up order based on the tree structure 112 index = _treesort(order, nodeorder, nodecounts, tree) 113 return index
114 115
116 -class Record(object):
117 """Store gene expression data. 118 119 A Record stores the gene expression data and related information contained 120 in a data file following the file format defined for Michael Eisen's 121 Cluster/TreeView program. A Record has the following members: 122 123 - data: a matrix containing the gene expression data 124 - mask: a matrix containing only 1's and 0's, denoting which values 125 are present (1) or missing (0). If all elements of mask are 126 one (no missing data), then mask is set to None. 127 - geneid: a list containing a unique identifier for each gene 128 (e.g., ORF name) 129 - genename: a list containing an additional description for each gene 130 (e.g., gene name) 131 - gweight: the weight to be used for each gene when calculating the 132 distance 133 - gorder: an array of real numbers indicating the preferred order of the 134 genes in the output file 135 - expid: a list containing a unique identifier for each experimental 136 condition 137 - eweight: the weight to be used for each experimental condition when 138 calculating the distance 139 - eorder: an array of real numbers indication the preferred order in the 140 output file of the experimental conditions 141 - uniqid: the string that was used instead of UNIQID in the input file. 142 143 """ 144
145 - def __init__(self, handle=None):
146 """Read gene expression data from the file handle and return a Record. 147 148 The file should be in the format defined for Michael Eisen's 149 Cluster/TreeView program. 150 151 """ 152 self.data = None 153 self.mask = None 154 self.geneid = None 155 self.genename = None 156 self.gweight = None 157 self.gorder = None 158 self.expid = None 159 self.eweight = None 160 self.eorder = None 161 self.uniqid = None 162 if not handle: 163 return 164 line = handle.readline().strip("\r\n").split("\t") 165 n = len(line) 166 self.uniqid = line[0] 167 self.expid = [] 168 cols = {0: "GENEID"} 169 for word in line[1:]: 170 if word == "NAME": 171 cols[line.index(word)] = word 172 self.genename = [] 173 elif word == "GWEIGHT": 174 cols[line.index(word)] = word 175 self.gweight = [] 176 elif word == "GORDER": 177 cols[line.index(word)] = word 178 self.gorder = [] 179 else: 180 self.expid.append(word) 181 self.geneid = [] 182 self.data = [] 183 self.mask = [] 184 needmask = 0 185 for line in handle: 186 line = line.strip("\r\n").split("\t") 187 if len(line) != n: 188 raise ValueError("Line with %d columns found (expected %d)" % 189 (len(line), n)) 190 if line[0] == "EWEIGHT": 191 i = max(cols) + 1 192 self.eweight = numpy.array(line[i:], float) 193 continue 194 if line[0] == "EORDER": 195 i = max(cols) + 1 196 self.eorder = numpy.array(line[i:], float) 197 continue 198 rowdata = [] 199 rowmask = [] 200 n = len(line) 201 for i in range(n): 202 word = line[i] 203 if i in cols: 204 if cols[i] == "GENEID": 205 self.geneid.append(word) 206 if cols[i] == "NAME": 207 self.genename.append(word) 208 if cols[i] == "GWEIGHT": 209 self.gweight.append(float(word)) 210 if cols[i] == "GORDER": 211 self.gorder.append(float(word)) 212 continue 213 if not word: 214 rowdata.append(0.0) 215 rowmask.append(0) 216 needmask = 1 217 else: 218 rowdata.append(float(word)) 219 rowmask.append(1) 220 self.data.append(rowdata) 221 self.mask.append(rowmask) 222 self.data = numpy.array(self.data) 223 if needmask: 224 self.mask = numpy.array(self.mask, int) 225 else: 226 self.mask = None 227 if self.gweight: 228 self.gweight = numpy.array(self.gweight) 229 if self.gorder: 230 self.gorder = numpy.array(self.gorder)
231
232 - def treecluster(self, transpose=0, method='m', dist='e'):
233 """Apply hierarchical clustering and return a Tree object. 234 235 The pairwise single, complete, centroid, and average linkage hierarchical 236 clustering methods are available. 237 238 - transpose: if equal to 0, genes (rows) are clustered; 239 if equal to 1, microarrays (columns) are clustered. 240 - dist : specifies the distance function to be used: 241 242 - dist=='e': Euclidean distance 243 - dist=='b': City Block distance 244 - dist=='c': Pearson correlation 245 - dist=='a': absolute value of the correlation 246 - dist=='u': uncentered correlation 247 - dist=='x': absolute uncentered correlation 248 - dist=='s': Spearman's rank correlation 249 - dist=='k': Kendall's tau 250 251 - method : specifies which linkage method is used: 252 253 - method=='s': Single pairwise linkage 254 - method=='m': Complete (maximum) pairwise linkage (default) 255 - method=='c': Centroid linkage 256 - method=='a': Average pairwise linkage 257 258 See the description of the Tree class for more information about the Tree 259 object returned by this method. 260 261 """ 262 if transpose == 0: 263 weight = self.eweight 264 else: 265 weight = self.gweight 266 return treecluster(self.data, self.mask, weight, transpose, method, 267 dist)
268
269 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 270 initialid=None):
271 """Apply k-means or k-median clustering. 272 273 This method returns a tuple (clusterid, error, nfound). 274 275 - nclusters: number of clusters (the 'k' in k-means) 276 - transpose: if equal to 0, genes (rows) are clustered; 277 if equal to 1, microarrays (columns) are clustered. 278 - npass : number of times the k-means clustering algorithm is 279 performed, each time with a different (random) initial 280 condition. 281 - method : specifies how the center of a cluster is found: 282 method=='a': arithmetic mean 283 method=='m': median 284 - dist : specifies the distance function to be used: 285 286 - dist=='e': Euclidean distance 287 - dist=='b': City Block distance 288 - dist=='c': Pearson correlation 289 - dist=='a': absolute value of the correlation 290 - dist=='u': uncentered correlation 291 - dist=='x': absolute uncentered correlation 292 - dist=='s': Spearman's rank correlation 293 - dist=='k': Kendall's tau 294 295 - initialid: the initial clustering from which the algorithm should start. 296 If initialid is None, the routine carries out npass 297 repetitions of the EM algorithm, each time starting from a 298 different random initial clustering. If initialid is given, 299 the routine carries out the EM algorithm only once, starting 300 from the given initial clustering and without randomizing the 301 order in which items are assigned to clusters (i.e., using 302 the same order as in the data matrix). In that case, the 303 k-means algorithm is fully deterministic. 304 305 Return values: 306 - clusterid: array containing the number of the cluster to which each 307 gene/microarray was assigned in the best k-means clustering 308 solution that was found in the npass runs; 309 - error: the within-cluster sum of distances for the returned k-means 310 clustering solution; 311 - nfound: the number of times this solution was found. 312 313 """ 314 315 if transpose == 0: 316 weight = self.eweight 317 else: 318 weight = self.gweight 319 return kcluster(self.data, nclusters, self.mask, weight, transpose, 320 npass, method, dist, initialid)
321
322 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 323 niter=1, dist='e'):
324 """Calculate a self-organizing map on a rectangular grid. 325 326 The somcluster method returns a tuple (clusterid, celldata). 327 328 - transpose: if equal to 0, genes (rows) are clustered; 329 if equal to 1, microarrays (columns) are clustered. 330 - nxgrid : the horizontal dimension of the rectangular SOM map 331 - nygrid : the vertical dimension of the rectangular SOM map 332 - inittau : the initial value of tau (the neighborbood function) 333 - niter : the number of iterations 334 - dist : specifies the distance function to be used: 335 336 - dist=='e': Euclidean distance 337 - dist=='b': City Block distance 338 - dist=='c': Pearson correlation 339 - dist=='a': absolute value of the correlation 340 - dist=='u': uncentered correlation 341 - dist=='x': absolute uncentered correlation 342 - dist=='s': Spearman's rank correlation 343 - dist=='k': Kendall's tau 344 345 Return values: 346 347 - clusterid: array with two columns, while the number of rows is equal to 348 the number of genes or the number of microarrays depending on 349 whether genes or microarrays are being clustered. Each row in 350 the array contains the x and y coordinates of the cell in the 351 rectangular SOM grid to which the gene or microarray was 352 assigned. 353 - celldata: an array with dimensions (nxgrid, nygrid, number of 354 microarrays) if genes are being clustered, or (nxgrid, 355 nygrid, number of genes) if microarrays are being clustered. 356 Each element [ix][iy] of this array is a 1D vector containing 357 the gene expression data for the centroid of the cluster in 358 the SOM grid cell with coordinates (ix, iy). 359 360 """ 361 362 if transpose == 0: 363 weight = self.eweight 364 else: 365 weight = self.gweight 366 return somcluster(self.data, self.mask, weight, transpose, 367 nxgrid, nygrid, inittau, niter, dist)
368
369 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
370 """Calculate the cluster centroids and return a tuple (cdata, cmask). 371 372 The centroid is defined as either the mean or the median over all elements 373 for each dimension. 374 375 - data : nrows x ncolumns array containing the expression data 376 - mask : nrows x ncolumns array of integers, showing which data are 377 missing. If mask[i][j]==0, then data[i][j] is missing. 378 - transpose: if equal to 0, gene (row) clusters are considered; 379 if equal to 1, microarray (column) clusters are considered. 380 - clusterid: array containing the cluster number for each gene or 381 microarray. The cluster number should be non-negative. 382 - method : specifies how the centroid is calculated: 383 method=='a': arithmetic mean over each dimension. (default) 384 method=='m': median over each dimension. 385 386 Return values: 387 - cdata : 2D array containing the cluster centroids. If transpose==0, 388 then the dimensions of cdata are nclusters x ncolumns. If 389 transpose==1, then the dimensions of cdata are 390 nrows x nclusters. 391 - cmask : 2D array of integers describing which elements in cdata, 392 if any, are missing. 393 394 """ 395 return clustercentroids(self.data, self.mask, clusterid, method, 396 transpose)
397
398 - def clusterdistance(self, index1=0, index2=0, method='a', dist='e', 399 transpose=0):
400 """Calculate the distance between two clusters. 401 402 - index1 : 1D array identifying which genes/microarrays belong to the 403 first cluster. If the cluster contains only one gene, then 404 index1 can also be written as a single integer. 405 - index2 : 1D array identifying which genes/microarrays belong to the 406 second cluster. If the cluster contains only one gene, then 407 index2 can also be written as a single integer. 408 - transpose: if equal to 0, genes (rows) are clustered; 409 if equal to 1, microarrays (columns) are clustered. 410 - dist : specifies the distance function to be used: 411 412 - dist=='e': Euclidean distance 413 - dist=='b': City Block distance 414 - dist=='c': Pearson correlation 415 - dist=='a': absolute value of the correlation 416 - dist=='u': uncentered correlation 417 - dist=='x': absolute uncentered correlation 418 - dist=='s': Spearman's rank correlation 419 - dist=='k': Kendall's tau 420 421 - method : specifies how the distance between two clusters is defined: 422 423 - method=='a': the distance between the arithmetic means of the 424 two clusters 425 - method=='m': the distance between the medians of the two 426 clusters 427 - method=='s': the smallest pairwise distance between members 428 of the two clusters 429 - method=='x': the largest pairwise distance between members of 430 the two clusters 431 - method=='v': average of the pairwise distances between 432 members of the clusters 433 434 - transpose: if equal to 0: clusters of genes (rows) are considered; 435 if equal to 1: clusters of microarrays (columns) are considered. 436 437 """ 438 439 if transpose == 0: 440 weight = self.eweight 441 else: 442 weight = self.gweight 443 return clusterdistance(self.data, self.mask, weight, 444 index1, index2, method, dist, transpose)
445
446 - def distancematrix(self, transpose=0, dist='e'):
447 """Calculate the distance matrix and return it as a list of arrays 448 449 - transpose: if equal to 0: calculate the distances between genes (rows); 450 if equal to 1: calculate the distances beteeen microarrays 451 (columns). 452 - dist : specifies the distance function to be used: 453 454 - dist=='e': Euclidean distance 455 - dist=='b': City Block distance 456 - dist=='c': Pearson correlation 457 - dist=='a': absolute value of the correlation 458 - dist=='u': uncentered correlation 459 - dist=='x': absolute uncentered correlation 460 - dist=='s': Spearman's rank correlation 461 - dist=='k': Kendall's tau 462 463 Return value: 464 The distance matrix is returned as a list of 1D arrays containing the 465 distance matrix between the gene expression data. The number of columns 466 in each row is equal to the row number. Hence, the first row has zero 467 elements. An example of the return value is:: 468 469 matrix = [[], 470 array([1.]), 471 array([7., 3.]), 472 array([4., 2., 6.])] 473 474 This corresponds to the distance matrix:: 475 476 [0., 1., 7., 4.] 477 [1., 0., 3., 2.] 478 [7., 3., 0., 6.] 479 [4., 2., 6., 0.] 480 481 """ 482 if transpose == 0: 483 weight = self.eweight 484 else: 485 weight = self.gweight 486 return distancematrix(self.data, self.mask, weight, transpose, dist)
487
488 - def save(self, jobname, geneclusters=None, expclusters=None):
489 """Save the clustering results. 490 491 The saved files follow the convention for the Java TreeView program, 492 which can therefore be used to view the clustering result. 493 494 Arguments: 495 496 - jobname: The base name of the files to be saved. The filenames are 497 jobname.cdt, jobname.gtr, and jobname.atr for 498 hierarchical clustering, and jobname-K*.cdt, 499 jobname-K*.kgg, jobname-K*.kag for k-means clustering 500 results. 501 - geneclusters=None: For hierarchical clustering results, geneclusters 502 is a Tree object as returned by the treecluster method. 503 For k-means clustering results, geneclusters is a vector 504 containing ngenes integers, describing to which cluster a 505 given gene belongs. This vector can be calculated by 506 kcluster. 507 - expclusters=None: For hierarchical clustering results, expclusters 508 is a Tree object as returned by the treecluster method. 509 For k-means clustering results, expclusters is a vector 510 containing nexps integers, describing to which cluster a 511 given experimental condition belongs. This vector can be 512 calculated by kcluster. 513 514 """ 515 (ngenes, nexps) = numpy.shape(self.data) 516 if self.gorder is None: 517 gorder = numpy.arange(ngenes) 518 else: 519 gorder = self.gorder 520 if self.eorder is None: 521 eorder = numpy.arange(nexps) 522 else: 523 eorder = self.eorder 524 if geneclusters is not None and expclusters is not None and \ 525 type(geneclusters) != type(expclusters): 526 raise ValueError("found one k-means and one hierarchical " 527 "clustering solution in geneclusters and " 528 "expclusters") 529 gid = 0 530 aid = 0 531 filename = jobname 532 postfix = "" 533 if isinstance(geneclusters, Tree): 534 # This is a hierarchical clustering result. 535 geneindex = _savetree(jobname, geneclusters, gorder, 0) 536 gid = 1 537 elif geneclusters is not None: 538 # This is a k-means clustering result. 539 filename = jobname + "_K" 540 k = max(geneclusters) + 1 541 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 542 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 543 postfix = "_G%d" % k 544 else: 545 geneindex = numpy.argsort(gorder) 546 if isinstance(expclusters, Tree): 547 # This is a hierarchical clustering result. 548 expindex = _savetree(jobname, expclusters, eorder, 1) 549 aid = 1 550 elif expclusters is not None: 551 # This is a k-means clustering result. 552 filename = jobname + "_K" 553 k = max(expclusters) + 1 554 kagfilename = "%s_K_A%d.kag" % (jobname, k) 555 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 556 postfix += "_A%d" % k 557 else: 558 expindex = numpy.argsort(eorder) 559 filename = filename + postfix 560 self._savedata(filename, gid, aid, geneindex, expindex)
561
562 - def _savekmeans(self, filename, clusterids, order, transpose):
563 # Save a k-means clustering solution 564 if transpose == 0: 565 label = self.uniqid 566 names = self.geneid 567 else: 568 label = "ARRAY" 569 names = self.expid 570 with open(filename, "w") as outputfile: 571 outputfile.write(label + "\tGROUP\n") 572 index = numpy.argsort(order) 573 n = len(names) 574 sortedindex = numpy.zeros(n, int) 575 counter = 0 576 cluster = 0 577 while counter < n: 578 for j in index: 579 if clusterids[j] == cluster: 580 outputfile.write("%s\t%s\n" % (names[j], cluster)) 581 sortedindex[counter] = j 582 counter += 1 583 cluster += 1 584 return sortedindex
585
586 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
587 # Save the clustered data. 588 if self.genename is None: 589 genename = self.geneid 590 else: 591 genename = self.genename 592 (ngenes, nexps) = numpy.shape(self.data) 593 with open(jobname + '.cdt', 'w') as outputfile: 594 if self.mask is not None: 595 mask = self.mask 596 else: 597 mask = numpy.ones((ngenes, nexps), int) 598 if self.gweight is not None: 599 gweight = self.gweight 600 else: 601 gweight = numpy.ones(ngenes) 602 if self.eweight is not None: 603 eweight = self.eweight 604 else: 605 eweight = numpy.ones(nexps) 606 if gid: 607 outputfile.write('GID\t') 608 outputfile.write(self.uniqid) 609 outputfile.write('\tNAME\tGWEIGHT') 610 # Now add headers for data columns. 611 for j in expindex: 612 outputfile.write('\t%s' % self.expid[j]) 613 outputfile.write('\n') 614 if aid: 615 outputfile.write("AID") 616 if gid: 617 outputfile.write('\t') 618 outputfile.write("\t\t") 619 for j in expindex: 620 outputfile.write('\tARRY%dX' % j) 621 outputfile.write('\n') 622 outputfile.write('EWEIGHT') 623 if gid: 624 outputfile.write('\t') 625 outputfile.write('\t\t') 626 for j in expindex: 627 outputfile.write('\t%f' % eweight[j]) 628 outputfile.write('\n') 629 for i in geneindex: 630 if gid: 631 outputfile.write('GENE%dX\t' % i) 632 outputfile.write("%s\t%s\t%f" % 633 (self.geneid[i], genename[i], gweight[i])) 634 for j in expindex: 635 outputfile.write('\t') 636 if mask[i, j]: 637 outputfile.write(str(self.data[i, j])) 638 outputfile.write('\n')
639 640
641 -def read(handle):
642 """Read gene expression data from the file handle and return a Record. 643 644 The file should be in the file format defined for Michael Eisen's 645 Cluster/TreeView program. 646 647 """ 648 return Record(handle)
649