Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  import numpy 
  7   
  8  from Bio.Cluster.cluster import * 
  9   
 10  __docformat__ = "restructuredtext en" 
 11   
12 -def _treesort(order, nodeorder, nodecounts, tree):
13 # Find the order of the nodes consistent with the hierarchical clustering 14 # tree, taking into account the preferred order of nodes. 15 nNodes = len(tree) 16 nElements = nNodes + 1 17 neworder = numpy.zeros(nElements) 18 clusterids = numpy.arange(nElements) 19 for i in range(nNodes): 20 i1 = tree[i].left 21 i2 = tree[i].right 22 if i1 < 0: 23 order1 = nodeorder[-i1-1] 24 count1 = nodecounts[-i1-1] 25 else: 26 order1 = order[i1] 27 count1 = 1 28 if i2 < 0: 29 order2 = nodeorder[-i2-1] 30 count2 = nodecounts[-i2-1] 31 else: 32 order2 = order[i2] 33 count2 = 1 34 # If order1 and order2 are equal, their order is determined 35 # by the order in which they were clustered 36 if i1 < i2: 37 if order1 < order2: 38 increase = count1 39 else: 40 increase = count2 41 for j in range(nElements): 42 clusterid = clusterids[j] 43 if clusterid == i1 and order1 >= order2: 44 neworder[j] += increase 45 if clusterid == i2 and order1 < order2: 46 neworder[j] += increase 47 if clusterid == i1 or clusterid == i2: 48 clusterids[j] = -i-1 49 else: 50 if order1 <= order2: 51 increase = count1 52 else: 53 increase = count2 54 for j in range(nElements): 55 clusterid = clusterids[j] 56 if clusterid == i1 and order1 > order2: 57 neworder[j] += increase 58 if clusterid == i2 and order1 <= order2: 59 neworder[j] += increase 60 if clusterid == i1 or clusterid == i2: 61 clusterids[j] = -i-1 62 return numpy.argsort(neworder)
63 64
65 -def _savetree(jobname, tree, order, transpose):
66 # Save the hierarchical clustering solution given by the tree, following 67 # the specified order, in a file whose name is based on jobname. 68 if transpose == 0: 69 extension = ".gtr" 70 keyword = "GENE" 71 else: 72 extension = ".atr" 73 keyword = "ARRY" 74 nnodes = len(tree) 75 with open(jobname+extension, "w") as outputfile: 76 nodeindex = 0 77 nodeID = [''] * nnodes 78 nodecounts = numpy.zeros(nnodes, int) 79 nodeorder = numpy.zeros(nnodes) 80 nodedist = numpy.array([node.distance for node in tree]) 81 for nodeindex in range(nnodes): 82 min1 = tree[nodeindex].left 83 min2 = tree[nodeindex].right 84 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 85 outputfile.write(nodeID[nodeindex]) 86 outputfile.write("\t") 87 if min1 < 0: 88 index1 = -min1-1 89 order1 = nodeorder[index1] 90 counts1 = nodecounts[index1] 91 outputfile.write(nodeID[index1]+"\t") 92 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 93 else: 94 order1 = order[min1] 95 counts1 = 1 96 outputfile.write("%s%dX\t" % (keyword, min1)) 97 if min2 < 0: 98 index2 = -min2-1 99 order2 = nodeorder[index2] 100 counts2 = nodecounts[index2] 101 outputfile.write(nodeID[index2]+"\t") 102 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 103 else: 104 order2 = order[min2] 105 counts2 = 1 106 outputfile.write("%s%dX\t" % (keyword, min2)) 107 outputfile.write(str(1.0-nodedist[nodeindex])) 108 outputfile.write("\n") 109 counts = counts1 + counts2 110 nodecounts[nodeindex] = counts 111 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts 112 # Now set up order based on the tree structure 113 index = _treesort(order, nodeorder, nodecounts, tree) 114 return index
115 116
117 -class Record(object):
118 """Store gene expression data. 119 120 A Record stores the gene expression data and related information contained 121 in a data file following the file format defined for Michael Eisen's 122 Cluster/TreeView program. A Record has the following members: 123 124 - data: a matrix containing the gene expression data 125 - mask: a matrix containing only 1's and 0's, denoting which values 126 are present (1) or missing (0). If all elements of mask are 127 one (no missing data), then mask is set to None. 128 - geneid: a list containing a unique identifier for each gene 129 (e.g., ORF name) 130 - genename: a list containing an additional description for each gene 131 (e.g., gene name) 132 - gweight: the weight to be used for each gene when calculating the 133 distance 134 - gorder: an array of real numbers indicating the preferred order of the 135 genes in the output file 136 - expid: a list containing a unique identifier for each experimental 137 condition 138 - eweight: the weight to be used for each experimental condition when 139 calculating the distance 140 - eorder: an array of real numbers indication the preferred order in the 141 output file of the experimental conditions 142 - uniqid: the string that was used instead of UNIQID in the input file. 143 144 """ 145
146 - def __init__(self, handle=None):
147 """Read gene expression data from the file handle and return a Record. 148 149 The file should be in the format defined for Michael Eisen's 150 Cluster/TreeView program. 151 152 """ 153 self.data = None 154 self.mask = None 155 self.geneid = None 156 self.genename = None 157 self.gweight = None 158 self.gorder = None 159 self.expid = None 160 self.eweight = None 161 self.eorder = None 162 self.uniqid = None 163 if not handle: 164 return 165 line = handle.readline().strip("\r\n").split("\t") 166 n = len(line) 167 self.uniqid = line[0] 168 self.expid = [] 169 cols = {0: "GENEID"} 170 for word in line[1:]: 171 if word == "NAME": 172 cols[line.index(word)] = word 173 self.genename = [] 174 elif word == "GWEIGHT": 175 cols[line.index(word)] = word 176 self.gweight = [] 177 elif word=="GORDER": 178 cols[line.index(word)] = word 179 self.gorder = [] 180 else: 181 self.expid.append(word) 182 self.geneid = [] 183 self.data = [] 184 self.mask = [] 185 needmask = 0 186 for line in handle: 187 line = line.strip("\r\n").split("\t") 188 if len(line) != n: 189 raise ValueError("Line with %d columns found (expected %d)" % 190 (len(line), n)) 191 if line[0] == "EWEIGHT": 192 i = max(cols) + 1 193 self.eweight = numpy.array(line[i:], float) 194 continue 195 if line[0] == "EORDER": 196 i = max(cols) + 1 197 self.eorder = numpy.array(line[i:], float) 198 continue 199 rowdata = [] 200 rowmask = [] 201 n = len(line) 202 for i in range(n): 203 word = line[i] 204 if i in cols: 205 if cols[i] == "GENEID": 206 self.geneid.append(word) 207 if cols[i] == "NAME": 208 self.genename.append(word) 209 if cols[i] == "GWEIGHT": 210 self.gweight.append(float(word)) 211 if cols[i] == "GORDER": 212 self.gorder.append(float(word)) 213 continue 214 if not word: 215 rowdata.append(0.0) 216 rowmask.append(0) 217 needmask = 1 218 else: 219 rowdata.append(float(word)) 220 rowmask.append(1) 221 self.data.append(rowdata) 222 self.mask.append(rowmask) 223 self.data = numpy.array(self.data) 224 if needmask: 225 self.mask = numpy.array(self.mask, int) 226 else: 227 self.mask = None 228 if self.gweight: 229 self.gweight = numpy.array(self.gweight) 230 if self.gorder: 231 self.gorder = numpy.array(self.gorder)
232
233 - def treecluster(self, transpose=0, method='m', dist='e'):
234 """Apply hierarchical clustering and return a Tree object. 235 236 The pairwise single, complete, centroid, and average linkage hierarchical 237 clustering methods are available. 238 239 - transpose: if equal to 0, genes (rows) are clustered; 240 if equal to 1, microarrays (columns) are clustered. 241 - dist : specifies the distance function to be used: 242 243 - dist=='e': Euclidean distance 244 - dist=='b': City Block distance 245 - dist=='c': Pearson correlation 246 - dist=='a': absolute value of the correlation 247 - dist=='u': uncentered correlation 248 - dist=='x': absolute uncentered correlation 249 - dist=='s': Spearman's rank correlation 250 - dist=='k': Kendall's tau 251 252 - method : specifies which linkage method is used: 253 254 - method=='s': Single pairwise linkage 255 - method=='m': Complete (maximum) pairwise linkage (default) 256 - method=='c': Centroid linkage 257 - method=='a': Average pairwise linkage 258 259 See the description of the Tree class for more information about the Tree 260 object returned by this method. 261 262 """ 263 if transpose == 0: 264 weight = self.eweight 265 else: 266 weight = self.gweight 267 return treecluster(self.data, self.mask, weight, transpose, method, 268 dist)
269
270 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 271 initialid=None):
272 """Apply k-means or k-median clustering. 273 274 This method returns a tuple (clusterid, error, nfound). 275 276 - nclusters: number of clusters (the 'k' in k-means) 277 - transpose: if equal to 0, genes (rows) are clustered; 278 if equal to 1, microarrays (columns) are clustered. 279 - npass : number of times the k-means clustering algorithm is 280 performed, each time with a different (random) initial 281 condition. 282 - method : specifies how the center of a cluster is found: 283 method=='a': arithmetic mean 284 method=='m': median 285 - dist : specifies the distance function to be used: 286 287 - dist=='e': Euclidean distance 288 - dist=='b': City Block distance 289 - dist=='c': Pearson correlation 290 - dist=='a': absolute value of the correlation 291 - dist=='u': uncentered correlation 292 - dist=='x': absolute uncentered correlation 293 - dist=='s': Spearman's rank correlation 294 - dist=='k': Kendall's tau 295 296 - initialid: the initial clustering from which the algorithm should start. 297 If initialid is None, the routine carries out npass 298 repetitions of the EM algorithm, each time starting from a 299 different random initial clustering. If initialid is given, 300 the routine carries out the EM algorithm only once, starting 301 from the given initial clustering and without randomizing the 302 order in which items are assigned to clusters (i.e., using 303 the same order as in the data matrix). In that case, the 304 k-means algorithm is fully deterministic. 305 306 Return values: 307 - clusterid: array containing the number of the cluster to which each 308 gene/microarray was assigned in the best k-means clustering 309 solution that was found in the npass runs; 310 - error: the within-cluster sum of distances for the returned k-means 311 clustering solution; 312 - nfound: the number of times this solution was found. 313 314 """ 315 316 if transpose == 0: 317 weight = self.eweight 318 else: 319 weight = self.gweight 320 return kcluster(self.data, nclusters, self.mask, weight, transpose, 321 npass, method, dist, initialid)
322
323 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 324 niter=1, dist='e'):
325 """Calculate a self-organizing map on a rectangular grid. 326 327 The somcluster method returns a tuple (clusterid, celldata). 328 329 - transpose: if equal to 0, genes (rows) are clustered; 330 if equal to 1, microarrays (columns) are clustered. 331 - nxgrid : the horizontal dimension of the rectangular SOM map 332 - nygrid : the vertical dimension of the rectangular SOM map 333 - inittau : the initial value of tau (the neighborbood function) 334 - niter : the number of iterations 335 - dist : specifies the distance function to be used: 336 337 - dist=='e': Euclidean distance 338 - dist=='b': City Block distance 339 - dist=='c': Pearson correlation 340 - dist=='a': absolute value of the correlation 341 - dist=='u': uncentered correlation 342 - dist=='x': absolute uncentered correlation 343 - dist=='s': Spearman's rank correlation 344 - dist=='k': Kendall's tau 345 346 Return values: 347 348 - clusterid: array with two columns, while the number of rows is equal to 349 the number of genes or the number of microarrays depending on 350 whether genes or microarrays are being clustered. Each row in 351 the array contains the x and y coordinates of the cell in the 352 rectangular SOM grid to which the gene or microarray was 353 assigned. 354 - celldata: an array with dimensions (nxgrid, nygrid, number of 355 microarrays) if genes are being clustered, or (nxgrid, 356 nygrid, number of genes) if microarrays are being clustered. 357 Each element [ix][iy] of this array is a 1D vector containing 358 the gene expression data for the centroid of the cluster in 359 the SOM grid cell with coordinates (ix, iy). 360 361 """ 362 363 if transpose == 0: 364 weight = self.eweight 365 else: 366 weight = self.gweight 367 return somcluster(self.data, self.mask, weight, transpose, 368 nxgrid, nygrid, inittau, niter, dist)
369
370 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
371 """Calculate the cluster centroids and return a tuple (cdata, cmask). 372 373 The centroid is defined as either the mean or the median over all elements 374 for each dimension. 375 376 - data : nrows x ncolumns array containing the expression data 377 - mask : nrows x ncolumns array of integers, showing which data are 378 missing. If mask[i][j]==0, then data[i][j] is missing. 379 - transpose: if equal to 0, gene (row) clusters are considered; 380 if equal to 1, microarray (column) clusters are considered. 381 - clusterid: array containing the cluster number for each gene or 382 microarray. The cluster number should be non-negative. 383 - method : specifies how the centroid is calculated: 384 method=='a': arithmetic mean over each dimension. (default) 385 method=='m': median over each dimension. 386 387 Return values: 388 - cdata : 2D array containing the cluster centroids. If transpose==0, 389 then the dimensions of cdata are nclusters x ncolumns. If 390 transpose==1, then the dimensions of cdata are 391 nrows x nclusters. 392 - cmask : 2D array of integers describing which elements in cdata, 393 if any, are missing. 394 395 """ 396 return clustercentroids(self.data, self.mask, clusterid, method, 397 transpose)
398
399 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 400 transpose=0):
401 """Calculate the distance between two clusters. 402 403 - index1 : 1D array identifying which genes/microarrays belong to the 404 first cluster. If the cluster contains only one gene, then 405 index1 can also be written as a single integer. 406 - index2 : 1D array identifying which genes/microarrays belong to the 407 second cluster. If the cluster contains only one gene, then 408 index2 can also be written as a single integer. 409 - transpose: if equal to 0, genes (rows) are clustered; 410 if equal to 1, microarrays (columns) are clustered. 411 - dist : specifies the distance function to be used: 412 413 - dist=='e': Euclidean distance 414 - dist=='b': City Block distance 415 - dist=='c': Pearson correlation 416 - dist=='a': absolute value of the correlation 417 - dist=='u': uncentered correlation 418 - dist=='x': absolute uncentered correlation 419 - dist=='s': Spearman's rank correlation 420 - dist=='k': Kendall's tau 421 422 - method : specifies how the distance between two clusters is defined: 423 424 - method=='a': the distance between the arithmetic means of the 425 two clusters 426 - method=='m': the distance between the medians of the two 427 clusters 428 - method=='s': the smallest pairwise distance between members 429 of the two clusters 430 - method=='x': the largest pairwise distance between members of 431 the two clusters 432 - method=='v': average of the pairwise distances between 433 members of the clusters 434 435 - transpose: if equal to 0: clusters of genes (rows) are considered; 436 if equal to 1: clusters of microarrays (columns) are considered. 437 438 """ 439 440 if transpose == 0: 441 weight = self.eweight 442 else: 443 weight = self.gweight 444 return clusterdistance(self.data, self.mask, weight, 445 index1, index2, method, dist, transpose)
446
447 - def distancematrix(self, transpose=0, dist='e'):
448 """Calculate the distance matrix and return it as a list of arrays 449 450 - transpose: if equal to 0: calculate the distances between genes (rows); 451 if equal to 1: calculate the distances beteeen microarrays 452 (columns). 453 - dist : specifies the distance function to be used: 454 455 - dist=='e': Euclidean distance 456 - dist=='b': City Block distance 457 - dist=='c': Pearson correlation 458 - dist=='a': absolute value of the correlation 459 - dist=='u': uncentered correlation 460 - dist=='x': absolute uncentered correlation 461 - dist=='s': Spearman's rank correlation 462 - dist=='k': Kendall's tau 463 464 Return value: 465 The distance matrix is returned as a list of 1D arrays containing the 466 distance matrix between the gene expression data. The number of columns 467 in each row is equal to the row number. Hence, the first row has zero 468 elements. An example of the return value is:: 469 470 matrix = [[], 471 array([1.]), 472 array([7., 3.]), 473 array([4., 2., 6.])] 474 475 This corresponds to the distance matrix:: 476 477 [0., 1., 7., 4.] 478 [1., 0., 3., 2.] 479 [7., 3., 0., 6.] 480 [4., 2., 6., 0.] 481 482 """ 483 if transpose == 0: 484 weight = self.eweight 485 else: 486 weight = self.gweight 487 return distancematrix(self.data, self.mask, weight, transpose, dist)
488
489 - def save(self, jobname, geneclusters=None, expclusters=None):
490 """Save the clustering results. 491 492 The saved files follow the convention for the Java TreeView program, 493 which can therefore be used to view the clustering result. 494 495 Arguments: 496 497 - jobname: The base name of the files to be saved. The filenames are 498 jobname.cdt, jobname.gtr, and jobname.atr for 499 hierarchical clustering, and jobname-K*.cdt, 500 jobname-K*.kgg, jobname-K*.kag for k-means clustering 501 results. 502 - geneclusters=None: For hierarchical clustering results, geneclusters 503 is a Tree object as returned by the treecluster method. 504 For k-means clustering results, geneclusters is a vector 505 containing ngenes integers, describing to which cluster a 506 given gene belongs. This vector can be calculated by 507 kcluster. 508 - expclusters=None: For hierarchical clustering results, expclusters 509 is a Tree object as returned by the treecluster method. 510 For k-means clustering results, expclusters is a vector 511 containing nexps integers, describing to which cluster a 512 given experimental condition belongs. This vector can be 513 calculated by kcluster. 514 515 """ 516 (ngenes, nexps) = numpy.shape(self.data) 517 if self.gorder is None: 518 gorder = numpy.arange(ngenes) 519 else: 520 gorder = self.gorder 521 if self.eorder is None: 522 eorder = numpy.arange(nexps) 523 else: 524 eorder = self.eorder 525 if geneclusters is not None and expclusters is not None and \ 526 type(geneclusters) != type(expclusters): 527 raise ValueError("found one k-means and one hierarchical " 528 + "clustering solution in geneclusters and " 529 + "expclusters") 530 gid = 0 531 aid = 0 532 filename = jobname 533 postfix = "" 534 if isinstance(geneclusters, Tree): 535 # This is a hierarchical clustering result. 536 geneindex = _savetree(jobname, geneclusters, gorder, 0) 537 gid = 1 538 elif geneclusters is not None: 539 # This is a k-means clustering result. 540 filename = jobname + "_K" 541 k = max(geneclusters) + 1 542 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 543 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 544 postfix = "_G%d" % k 545 else: 546 geneindex = numpy.argsort(gorder) 547 if isinstance(expclusters, Tree): 548 # This is a hierarchical clustering result. 549 expindex = _savetree(jobname, expclusters, eorder, 1) 550 aid = 1 551 elif expclusters is not None: 552 # This is a k-means clustering result. 553 filename = jobname + "_K" 554 k = max(expclusters) + 1 555 kagfilename = "%s_K_A%d.kag" % (jobname, k) 556 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 557 postfix += "_A%d" % k 558 else: 559 expindex = numpy.argsort(eorder) 560 filename = filename + postfix 561 self._savedata(filename, gid, aid, geneindex, expindex)
562
563 - def _savekmeans(self, filename, clusterids, order, transpose):
564 # Save a k-means clustering solution 565 if transpose == 0: 566 label = self.uniqid 567 names = self.geneid 568 else: 569 label = "ARRAY" 570 names = self.expid 571 with open(filename, "w") as outputfile: 572 outputfile.write(label + "\tGROUP\n") 573 index = numpy.argsort(order) 574 n = len(names) 575 sortedindex = numpy.zeros(n, int) 576 counter = 0 577 cluster = 0 578 while counter < n: 579 for j in index: 580 if clusterids[j] == cluster: 581 outputfile.write("%s\t%s\n" % (names[j], cluster)) 582 sortedindex[counter] = j 583 counter += 1 584 cluster += 1 585 return sortedindex
586
587 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
588 # Save the clustered data. 589 if self.genename is None: 590 genename = self.geneid 591 else: 592 genename = self.genename 593 (ngenes, nexps) = numpy.shape(self.data) 594 with open(jobname+'.cdt', 'w') as outputfile: 595 if self.mask is not None: 596 mask = self.mask 597 else: 598 mask = numpy.ones((ngenes, nexps), int) 599 if self.gweight is not None: 600 gweight = self.gweight 601 else: 602 gweight = numpy.ones(ngenes) 603 if self.eweight is not None: 604 eweight = self.eweight 605 else: 606 eweight = numpy.ones(nexps) 607 if gid: 608 outputfile.write('GID\t') 609 outputfile.write(self.uniqid) 610 outputfile.write('\tNAME\tGWEIGHT') 611 # Now add headers for data columns. 612 for j in expindex: 613 outputfile.write('\t%s' % self.expid[j]) 614 outputfile.write('\n') 615 if aid: 616 outputfile.write("AID") 617 if gid: 618 outputfile.write('\t') 619 outputfile.write("\t\t") 620 for j in expindex: 621 outputfile.write('\tARRY%dX' % j) 622 outputfile.write('\n') 623 outputfile.write('EWEIGHT') 624 if gid: 625 outputfile.write('\t') 626 outputfile.write('\t\t') 627 for j in expindex: 628 outputfile.write('\t%f' % eweight[j]) 629 outputfile.write('\n') 630 for i in geneindex: 631 if gid: 632 outputfile.write('GENE%dX\t' % i) 633 outputfile.write("%s\t%s\t%f" % 634 (self.geneid[i], genename[i], gweight[i])) 635 for j in expindex: 636 outputfile.write('\t') 637 if mask[i, j]: 638 outputfile.write(str(self.data[i, j])) 639 outputfile.write('\n')
640 641
642 -def read(handle):
643 """Read gene expression data from the file handle and return a Record. 644 645 The file should be in the file format defined for Michael Eisen's 646 Cluster/TreeView program. 647 648 """ 649 return Record(handle)
650