Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  import numpy 
  7   
  8  from Bio.Cluster.cluster import *  # noqa - lots coming from C code 
  9   
 10   
11 -def _treesort(order, nodeorder, nodecounts, tree):
12 # Find the order of the nodes consistent with the hierarchical clustering 13 # tree, taking into account the preferred order of nodes. 14 nNodes = len(tree) 15 nElements = nNodes + 1 16 neworder = numpy.zeros(nElements) 17 clusterids = numpy.arange(nElements) 18 for i in range(nNodes): 19 i1 = tree[i].left 20 i2 = tree[i].right 21 if i1 < 0: 22 order1 = nodeorder[-i1 - 1] 23 count1 = nodecounts[-i1 - 1] 24 else: 25 order1 = order[i1] 26 count1 = 1 27 if i2 < 0: 28 order2 = nodeorder[-i2 - 1] 29 count2 = nodecounts[-i2 - 1] 30 else: 31 order2 = order[i2] 32 count2 = 1 33 # If order1 and order2 are equal, their order is determined 34 # by the order in which they were clustered 35 if i1 < i2: 36 if order1 < order2: 37 increase = count1 38 else: 39 increase = count2 40 for j in range(nElements): 41 clusterid = clusterids[j] 42 if clusterid == i1 and order1 >= order2: 43 neworder[j] += increase 44 if clusterid == i2 and order1 < order2: 45 neworder[j] += increase 46 if clusterid == i1 or clusterid == i2: 47 clusterids[j] = -i - 1 48 else: 49 if order1 <= order2: 50 increase = count1 51 else: 52 increase = count2 53 for j in range(nElements): 54 clusterid = clusterids[j] 55 if clusterid == i1 and order1 > order2: 56 neworder[j] += increase 57 if clusterid == i2 and order1 <= order2: 58 neworder[j] += increase 59 if clusterid == i1 or clusterid == i2: 60 clusterids[j] = -i - 1 61 return numpy.argsort(neworder)
62 63
64 -def _savetree(jobname, tree, order, transpose):
65 # Save the hierarchical clustering solution given by the tree, following 66 # the specified order, in a file whose name is based on jobname. 67 if transpose == 0: 68 extension = ".gtr" 69 keyword = "GENE" 70 else: 71 extension = ".atr" 72 keyword = "ARRY" 73 nnodes = len(tree) 74 with open(jobname + extension, "w") as outputfile: 75 nodeindex = 0 76 nodeID = [''] * nnodes 77 nodecounts = numpy.zeros(nnodes, int) 78 nodeorder = numpy.zeros(nnodes) 79 nodedist = numpy.array([node.distance for node in tree]) 80 for nodeindex in range(nnodes): 81 min1 = tree[nodeindex].left 82 min2 = tree[nodeindex].right 83 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 84 outputfile.write(nodeID[nodeindex]) 85 outputfile.write("\t") 86 if min1 < 0: 87 index1 = -min1 - 1 88 order1 = nodeorder[index1] 89 counts1 = nodecounts[index1] 90 outputfile.write(nodeID[index1] + "\t") 91 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 92 else: 93 order1 = order[min1] 94 counts1 = 1 95 outputfile.write("%s%dX\t" % (keyword, min1)) 96 if min2 < 0: 97 index2 = -min2 - 1 98 order2 = nodeorder[index2] 99 counts2 = nodecounts[index2] 100 outputfile.write(nodeID[index2] + "\t") 101 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 102 else: 103 order2 = order[min2] 104 counts2 = 1 105 outputfile.write("%s%dX\t" % (keyword, min2)) 106 outputfile.write(str(1.0 - nodedist[nodeindex])) 107 outputfile.write("\n") 108 counts = counts1 + counts2 109 nodecounts[nodeindex] = counts 110 nodeorder[nodeindex] = (counts1 * order1 + counts2 * order2) / counts 111 # Now set up order based on the tree structure 112 index = _treesort(order, nodeorder, nodecounts, tree) 113 return index
114 115
116 -class Record(object):
117 """Store gene expression data. 118 119 A Record stores the gene expression data and related information contained 120 in a data file following the file format defined for Michael Eisen's 121 Cluster/TreeView program. A Record has the following members: 122 123 - data: a matrix containing the gene expression data 124 - mask: a matrix containing only 1's and 0's, denoting which values 125 are present (1) or missing (0). If all elements of mask are 126 one (no missing data), then mask is set to None. 127 - geneid: a list containing a unique identifier for each gene 128 (e.g., ORF name) 129 - genename: a list containing an additional description for each gene 130 (e.g., gene name) 131 - gweight: the weight to be used for each gene when calculating the 132 distance 133 - gorder: an array of real numbers indicating the preferred order of the 134 genes in the output file 135 - expid: a list containing a unique identifier for each experimental 136 condition 137 - eweight: the weight to be used for each experimental condition when 138 calculating the distance 139 - eorder: an array of real numbers indication the preferred order in the 140 output file of the experimental conditions 141 - uniqid: the string that was used instead of UNIQID in the input file. 142 143 """ 144
145 - def __init__(self, handle=None):
146 """Read gene expression data from the file handle and return a Record. 147 148 The file should be in the format defined for Michael Eisen's 149 Cluster/TreeView program. 150 """ 151 self.data = None 152 self.mask = None 153 self.geneid = None 154 self.genename = None 155 self.gweight = None 156 self.gorder = None 157 self.expid = None 158 self.eweight = None 159 self.eorder = None 160 self.uniqid = None 161 if not handle: 162 return 163 line = handle.readline().strip("\r\n").split("\t") 164 n = len(line) 165 self.uniqid = line[0] 166 self.expid = [] 167 cols = {0: "GENEID"} 168 for word in line[1:]: 169 if word == "NAME": 170 cols[line.index(word)] = word 171 self.genename = [] 172 elif word == "GWEIGHT": 173 cols[line.index(word)] = word 174 self.gweight = [] 175 elif word == "GORDER": 176 cols[line.index(word)] = word 177 self.gorder = [] 178 else: 179 self.expid.append(word) 180 self.geneid = [] 181 self.data = [] 182 self.mask = [] 183 needmask = 0 184 for line in handle: 185 line = line.strip("\r\n").split("\t") 186 if len(line) != n: 187 raise ValueError("Line with %d columns found (expected %d)" % 188 (len(line), n)) 189 if line[0] == "EWEIGHT": 190 i = max(cols) + 1 191 self.eweight = numpy.array(line[i:], float) 192 continue 193 if line[0] == "EORDER": 194 i = max(cols) + 1 195 self.eorder = numpy.array(line[i:], float) 196 continue 197 rowdata = [] 198 rowmask = [] 199 n = len(line) 200 for i in range(n): 201 word = line[i] 202 if i in cols: 203 if cols[i] == "GENEID": 204 self.geneid.append(word) 205 if cols[i] == "NAME": 206 self.genename.append(word) 207 if cols[i] == "GWEIGHT": 208 self.gweight.append(float(word)) 209 if cols[i] == "GORDER": 210 self.gorder.append(float(word)) 211 continue 212 if not word: 213 rowdata.append(0.0) 214 rowmask.append(0) 215 needmask = 1 216 else: 217 rowdata.append(float(word)) 218 rowmask.append(1) 219 self.data.append(rowdata) 220 self.mask.append(rowmask) 221 self.data = numpy.array(self.data) 222 if needmask: 223 self.mask = numpy.array(self.mask, int) 224 else: 225 self.mask = None 226 if self.gweight: 227 self.gweight = numpy.array(self.gweight) 228 if self.gorder: 229 self.gorder = numpy.array(self.gorder)
230
231 - def treecluster(self, transpose=0, method='m', dist='e'):
232 """Apply hierarchical clustering and return a Tree object. 233 234 The pairwise single, complete, centroid, and average linkage hierarchical 235 clustering methods are available. 236 237 - transpose: if equal to 0, genes (rows) are clustered; 238 if equal to 1, microarrays (columns) are clustered. 239 - dist : specifies the distance function to be used: 240 241 - dist=='e': Euclidean distance 242 - dist=='b': City Block distance 243 - dist=='c': Pearson correlation 244 - dist=='a': absolute value of the correlation 245 - dist=='u': uncentered correlation 246 - dist=='x': absolute uncentered correlation 247 - dist=='s': Spearman's rank correlation 248 - dist=='k': Kendall's tau 249 250 - method : specifies which linkage method is used: 251 252 - method=='s': Single pairwise linkage 253 - method=='m': Complete (maximum) pairwise linkage (default) 254 - method=='c': Centroid linkage 255 - method=='a': Average pairwise linkage 256 257 See the description of the Tree class for more information about the Tree 258 object returned by this method. 259 260 """ 261 if transpose == 0: 262 weight = self.eweight 263 else: 264 weight = self.gweight 265 return treecluster(self.data, self.mask, weight, transpose, method, 266 dist)
267
268 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 269 initialid=None):
270 """Apply k-means or k-median clustering. 271 272 This method returns a tuple (clusterid, error, nfound). 273 274 - nclusters: number of clusters (the 'k' in k-means) 275 - transpose: if equal to 0, genes (rows) are clustered; 276 if equal to 1, microarrays (columns) are clustered. 277 - npass : number of times the k-means clustering algorithm is 278 performed, each time with a different (random) initial 279 condition. 280 - method : specifies how the center of a cluster is found: 281 method=='a': arithmetic mean 282 method=='m': median 283 - dist : specifies the distance function to be used: 284 285 - dist=='e': Euclidean distance 286 - dist=='b': City Block distance 287 - dist=='c': Pearson correlation 288 - dist=='a': absolute value of the correlation 289 - dist=='u': uncentered correlation 290 - dist=='x': absolute uncentered correlation 291 - dist=='s': Spearman's rank correlation 292 - dist=='k': Kendall's tau 293 294 - initialid: the initial clustering from which the algorithm should start. 295 If initialid is None, the routine carries out npass 296 repetitions of the EM algorithm, each time starting from a 297 different random initial clustering. If initialid is given, 298 the routine carries out the EM algorithm only once, starting 299 from the given initial clustering and without randomizing the 300 order in which items are assigned to clusters (i.e., using 301 the same order as in the data matrix). In that case, the 302 k-means algorithm is fully deterministic. 303 304 Return values: 305 - clusterid: array containing the number of the cluster to which each 306 gene/microarray was assigned in the best k-means clustering 307 solution that was found in the npass runs; 308 - error: the within-cluster sum of distances for the returned k-means 309 clustering solution; 310 - nfound: the number of times this solution was found. 311 312 """ 313 if transpose == 0: 314 weight = self.eweight 315 else: 316 weight = self.gweight 317 return kcluster(self.data, nclusters, self.mask, weight, transpose, 318 npass, method, dist, initialid)
319
320 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 321 niter=1, dist='e'):
322 """Calculate a self-organizing map on a rectangular grid. 323 324 The somcluster method returns a tuple (clusterid, celldata). 325 326 - transpose: if equal to 0, genes (rows) are clustered; 327 if equal to 1, microarrays (columns) are clustered. 328 - nxgrid : the horizontal dimension of the rectangular SOM map 329 - nygrid : the vertical dimension of the rectangular SOM map 330 - inittau : the initial value of tau (the neighborbood function) 331 - niter : the number of iterations 332 - dist : specifies the distance function to be used: 333 334 - dist=='e': Euclidean distance 335 - dist=='b': City Block distance 336 - dist=='c': Pearson correlation 337 - dist=='a': absolute value of the correlation 338 - dist=='u': uncentered correlation 339 - dist=='x': absolute uncentered correlation 340 - dist=='s': Spearman's rank correlation 341 - dist=='k': Kendall's tau 342 343 Return values: 344 345 - clusterid: array with two columns, while the number of rows is equal to 346 the number of genes or the number of microarrays depending on 347 whether genes or microarrays are being clustered. Each row in 348 the array contains the x and y coordinates of the cell in the 349 rectangular SOM grid to which the gene or microarray was 350 assigned. 351 - celldata: an array with dimensions (nxgrid, nygrid, number of 352 microarrays) if genes are being clustered, or (nxgrid, 353 nygrid, number of genes) if microarrays are being clustered. 354 Each element [ix][iy] of this array is a 1D vector containing 355 the gene expression data for the centroid of the cluster in 356 the SOM grid cell with coordinates (ix, iy). 357 358 """ 359 if transpose == 0: 360 weight = self.eweight 361 else: 362 weight = self.gweight 363 return somcluster(self.data, self.mask, weight, transpose, 364 nxgrid, nygrid, inittau, niter, dist)
365
366 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
367 """Calculate the cluster centroids and return a tuple (cdata, cmask). 368 369 The centroid is defined as either the mean or the median over all elements 370 for each dimension. 371 372 - data : nrows x ncolumns array containing the expression data 373 - mask : nrows x ncolumns array of integers, showing which data are 374 missing. If mask[i][j]==0, then data[i][j] is missing. 375 - transpose: if equal to 0, gene (row) clusters are considered; 376 if equal to 1, microarray (column) clusters are considered. 377 - clusterid: array containing the cluster number for each gene or 378 microarray. The cluster number should be non-negative. 379 - method : specifies how the centroid is calculated: 380 method=='a': arithmetic mean over each dimension. (default) 381 method=='m': median over each dimension. 382 383 Return values: 384 - cdata : 2D array containing the cluster centroids. If transpose==0, 385 then the dimensions of cdata are nclusters x ncolumns. If 386 transpose==1, then the dimensions of cdata are 387 nrows x nclusters. 388 - cmask : 2D array of integers describing which elements in cdata, 389 if any, are missing. 390 391 """ 392 return clustercentroids(self.data, self.mask, clusterid, method, 393 transpose)
394
395 - def clusterdistance(self, index1=0, index2=0, method='a', dist='e', 396 transpose=0):
397 """Calculate the distance between two clusters. 398 399 - index1 : 1D array identifying which genes/microarrays belong to the 400 first cluster. If the cluster contains only one gene, then 401 index1 can also be written as a single integer. 402 - index2 : 1D array identifying which genes/microarrays belong to the 403 second cluster. If the cluster contains only one gene, then 404 index2 can also be written as a single integer. 405 - transpose: if equal to 0, genes (rows) are clustered; 406 if equal to 1, microarrays (columns) are clustered. 407 - dist : specifies the distance function to be used: 408 409 - dist=='e': Euclidean distance 410 - dist=='b': City Block distance 411 - dist=='c': Pearson correlation 412 - dist=='a': absolute value of the correlation 413 - dist=='u': uncentered correlation 414 - dist=='x': absolute uncentered correlation 415 - dist=='s': Spearman's rank correlation 416 - dist=='k': Kendall's tau 417 418 - method : specifies how the distance between two clusters is defined: 419 420 - method=='a': the distance between the arithmetic means of the 421 two clusters 422 - method=='m': the distance between the medians of the two 423 clusters 424 - method=='s': the smallest pairwise distance between members 425 of the two clusters 426 - method=='x': the largest pairwise distance between members of 427 the two clusters 428 - method=='v': average of the pairwise distances between 429 members of the clusters 430 431 - transpose: if equal to 0: clusters of genes (rows) are considered; 432 if equal to 1: clusters of microarrays (columns) are considered. 433 434 """ 435 if transpose == 0: 436 weight = self.eweight 437 else: 438 weight = self.gweight 439 return clusterdistance(self.data, self.mask, weight, 440 index1, index2, method, dist, transpose)
441
442 - def distancematrix(self, transpose=0, dist='e'):
443 """Calculate the distance matrix and return it as a list of arrays 444 445 - transpose: if equal to 0: calculate the distances between genes (rows); 446 if equal to 1: calculate the distances beteeen microarrays 447 (columns). 448 - dist : specifies the distance function to be used: 449 450 - dist=='e': Euclidean distance 451 - dist=='b': City Block distance 452 - dist=='c': Pearson correlation 453 - dist=='a': absolute value of the correlation 454 - dist=='u': uncentered correlation 455 - dist=='x': absolute uncentered correlation 456 - dist=='s': Spearman's rank correlation 457 - dist=='k': Kendall's tau 458 459 Return value: 460 The distance matrix is returned as a list of 1D arrays containing the 461 distance matrix between the gene expression data. The number of columns 462 in each row is equal to the row number. Hence, the first row has zero 463 elements. An example of the return value is:: 464 465 matrix = [[], 466 array([1.]), 467 array([7., 3.]), 468 array([4., 2., 6.])] 469 470 This corresponds to the distance matrix:: 471 472 [0., 1., 7., 4.] 473 [1., 0., 3., 2.] 474 [7., 3., 0., 6.] 475 [4., 2., 6., 0.] 476 477 """ 478 if transpose == 0: 479 weight = self.eweight 480 else: 481 weight = self.gweight 482 return distancematrix(self.data, self.mask, weight, transpose, dist)
483
484 - def save(self, jobname, geneclusters=None, expclusters=None):
485 """Save the clustering results. 486 487 The saved files follow the convention for the Java TreeView program, 488 which can therefore be used to view the clustering result. 489 490 Arguments: 491 492 - jobname: The base name of the files to be saved. The filenames are 493 jobname.cdt, jobname.gtr, and jobname.atr for 494 hierarchical clustering, and jobname-K*.cdt, 495 jobname-K*.kgg, jobname-K*.kag for k-means clustering 496 results. 497 - geneclusters=None: For hierarchical clustering results, geneclusters 498 is a Tree object as returned by the treecluster method. 499 For k-means clustering results, geneclusters is a vector 500 containing ngenes integers, describing to which cluster a 501 given gene belongs. This vector can be calculated by 502 kcluster. 503 - expclusters=None: For hierarchical clustering results, expclusters 504 is a Tree object as returned by the treecluster method. 505 For k-means clustering results, expclusters is a vector 506 containing nexps integers, describing to which cluster a 507 given experimental condition belongs. This vector can be 508 calculated by kcluster. 509 510 """ 511 (ngenes, nexps) = numpy.shape(self.data) 512 if self.gorder is None: 513 gorder = numpy.arange(ngenes) 514 else: 515 gorder = self.gorder 516 if self.eorder is None: 517 eorder = numpy.arange(nexps) 518 else: 519 eorder = self.eorder 520 if geneclusters is not None and expclusters is not None and \ 521 type(geneclusters) != type(expclusters): 522 raise ValueError("found one k-means and one hierarchical " 523 "clustering solution in geneclusters and " 524 "expclusters") 525 gid = 0 526 aid = 0 527 filename = jobname 528 postfix = "" 529 if isinstance(geneclusters, Tree): 530 # This is a hierarchical clustering result. 531 geneindex = _savetree(jobname, geneclusters, gorder, 0) 532 gid = 1 533 elif geneclusters is not None: 534 # This is a k-means clustering result. 535 filename = jobname + "_K" 536 k = max(geneclusters) + 1 537 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 538 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 539 postfix = "_G%d" % k 540 else: 541 geneindex = numpy.argsort(gorder) 542 if isinstance(expclusters, Tree): 543 # This is a hierarchical clustering result. 544 expindex = _savetree(jobname, expclusters, eorder, 1) 545 aid = 1 546 elif expclusters is not None: 547 # This is a k-means clustering result. 548 filename = jobname + "_K" 549 k = max(expclusters) + 1 550 kagfilename = "%s_K_A%d.kag" % (jobname, k) 551 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 552 postfix += "_A%d" % k 553 else: 554 expindex = numpy.argsort(eorder) 555 filename = filename + postfix 556 self._savedata(filename, gid, aid, geneindex, expindex)
557
558 - def _savekmeans(self, filename, clusterids, order, transpose):
559 # Save a k-means clustering solution 560 if transpose == 0: 561 label = self.uniqid 562 names = self.geneid 563 else: 564 label = "ARRAY" 565 names = self.expid 566 with open(filename, "w") as outputfile: 567 outputfile.write(label + "\tGROUP\n") 568 index = numpy.argsort(order) 569 n = len(names) 570 sortedindex = numpy.zeros(n, int) 571 counter = 0 572 cluster = 0 573 while counter < n: 574 for j in index: 575 if clusterids[j] == cluster: 576 outputfile.write("%s\t%s\n" % (names[j], cluster)) 577 sortedindex[counter] = j 578 counter += 1 579 cluster += 1 580 return sortedindex
581
582 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
583 # Save the clustered data. 584 if self.genename is None: 585 genename = self.geneid 586 else: 587 genename = self.genename 588 (ngenes, nexps) = numpy.shape(self.data) 589 with open(jobname + '.cdt', 'w') as outputfile: 590 if self.mask is not None: 591 mask = self.mask 592 else: 593 mask = numpy.ones((ngenes, nexps), int) 594 if self.gweight is not None: 595 gweight = self.gweight 596 else: 597 gweight = numpy.ones(ngenes) 598 if self.eweight is not None: 599 eweight = self.eweight 600 else: 601 eweight = numpy.ones(nexps) 602 if gid: 603 outputfile.write('GID\t') 604 outputfile.write(self.uniqid) 605 outputfile.write('\tNAME\tGWEIGHT') 606 # Now add headers for data columns. 607 for j in expindex: 608 outputfile.write('\t%s' % self.expid[j]) 609 outputfile.write('\n') 610 if aid: 611 outputfile.write("AID") 612 if gid: 613 outputfile.write('\t') 614 outputfile.write("\t\t") 615 for j in expindex: 616 outputfile.write('\tARRY%dX' % j) 617 outputfile.write('\n') 618 outputfile.write('EWEIGHT') 619 if gid: 620 outputfile.write('\t') 621 outputfile.write('\t\t') 622 for j in expindex: 623 outputfile.write('\t%f' % eweight[j]) 624 outputfile.write('\n') 625 for i in geneindex: 626 if gid: 627 outputfile.write('GENE%dX\t' % i) 628 outputfile.write("%s\t%s\t%f" % 629 (self.geneid[i], genename[i], gweight[i])) 630 for j in expindex: 631 outputfile.write('\t') 632 if mask[i, j]: 633 outputfile.write(str(self.data[i, j])) 634 outputfile.write('\n')
635 636
637 -def read(handle):
638 """Read gene expression data from the file handle and return a Record. 639 640 The file should be in the file format defined for Michael Eisen's 641 Cluster/TreeView program. 642 """ 643 return Record(handle)
644