Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5  """Cluster Analysis. 
  6   
  7  The Bio.Cluster provides commonly used clustering algorithms and was 
  8  designed with the application to gene expression data in mind. However, 
  9  this module can also be used for cluster analysis of other types of data. 
 10   
 11  Bio.Cluster and the underlying C Clustering Library is described in 
 12  M. de Hoon et al. (2004) https://doi.org/10.1093/bioinformatics/bth078 
 13  """ 
 14   
 15  import numpy 
 16   
 17  from Bio.Cluster.cluster import version, kcluster, kmedoids, treecluster 
 18  from Bio.Cluster.cluster import somcluster, clusterdistance 
 19  from Bio.Cluster.cluster import clustercentroids, distancematrix, pca 
 20  from Bio.Cluster.cluster import Node, Tree 
 21   
 22   
23 -def _savetree(jobname, tree, order, transpose):
24 # Save the hierarchical clustering solution given by the tree, following 25 # the specified order, in a file whose name is based on jobname. 26 if transpose == 0: 27 extension = ".gtr" 28 keyword = "GENE" 29 else: 30 extension = ".atr" 31 keyword = "ARRY" 32 index = tree.sort(order) 33 nnodes = len(tree) 34 with open(jobname + extension, "w") as outputfile: 35 nodeID = [''] * nnodes 36 nodedist = numpy.array([node.distance for node in tree]) 37 for nodeindex in range(nnodes): 38 min1 = tree[nodeindex].left 39 min2 = tree[nodeindex].right 40 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 41 outputfile.write(nodeID[nodeindex]) 42 outputfile.write("\t") 43 if min1 < 0: 44 index1 = -min1 - 1 45 outputfile.write(nodeID[index1] + "\t") 46 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 47 else: 48 outputfile.write("%s%dX\t" % (keyword, min1)) 49 if min2 < 0: 50 index2 = -min2 - 1 51 outputfile.write(nodeID[index2] + "\t") 52 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 53 else: 54 outputfile.write("%s%dX\t" % (keyword, min2)) 55 outputfile.write(str(1.0 - nodedist[nodeindex])) 56 outputfile.write("\n") 57 return index
58 59
60 -class Record(object):
61 """Store gene expression data. 62 63 A Record stores the gene expression data and related information contained 64 in a data file following the file format defined for Michael Eisen's 65 Cluster/TreeView program. 66 67 Attributes: 68 - data: a matrix containing the gene expression data 69 - mask: a matrix containing only 1's and 0's, denoting which values 70 are present (1) or missing (0). If all elements of mask are 71 one (no missing data), then mask is set to None. 72 - geneid: a list containing a unique identifier for each gene 73 (e.g., ORF name) 74 - genename: a list containing an additional description for each gene 75 (e.g., gene name) 76 - gweight: the weight to be used for each gene when calculating the 77 distance 78 - gorder: an array of real numbers indicating the preferred order of the 79 genes in the output file 80 - expid: a list containing a unique identifier for each experimental 81 condition 82 - eweight: the weight to be used for each experimental condition when 83 calculating the distance 84 - eorder: an array of real numbers indication the preferred order in the 85 output file of the experimental conditions 86 - uniqid: the string that was used instead of UNIQID in the input file. 87 88 """ 89
90 - def __init__(self, handle=None):
91 """Read gene expression data from the file handle and return a Record. 92 93 The file should be in the format defined for Michael Eisen's 94 Cluster/TreeView program. 95 """ 96 self.data = None 97 self.mask = None 98 self.geneid = None 99 self.genename = None 100 self.gweight = None 101 self.gorder = None 102 self.expid = None 103 self.eweight = None 104 self.eorder = None 105 self.uniqid = None 106 if not handle: 107 return 108 line = handle.readline().strip("\r\n").split("\t") 109 n = len(line) 110 self.uniqid = line[0] 111 self.expid = [] 112 cols = {0: "GENEID"} 113 for word in line[1:]: 114 if word == "NAME": 115 cols[line.index(word)] = word 116 self.genename = [] 117 elif word == "GWEIGHT": 118 cols[line.index(word)] = word 119 self.gweight = [] 120 elif word == "GORDER": 121 cols[line.index(word)] = word 122 self.gorder = [] 123 else: 124 self.expid.append(word) 125 self.geneid = [] 126 self.data = [] 127 self.mask = [] 128 needmask = 0 129 for line in handle: 130 line = line.strip("\r\n").split("\t") 131 if len(line) != n: 132 raise ValueError("Line with %d columns found (expected %d)" % 133 (len(line), n)) 134 if line[0] == "EWEIGHT": 135 i = max(cols) + 1 136 self.eweight = numpy.array(line[i:], float) 137 continue 138 if line[0] == "EORDER": 139 i = max(cols) + 1 140 self.eorder = numpy.array(line[i:], float) 141 continue 142 rowdata = [] 143 rowmask = [] 144 n = len(line) 145 for i in range(n): 146 word = line[i] 147 if i in cols: 148 if cols[i] == "GENEID": 149 self.geneid.append(word) 150 if cols[i] == "NAME": 151 self.genename.append(word) 152 if cols[i] == "GWEIGHT": 153 self.gweight.append(float(word)) 154 if cols[i] == "GORDER": 155 self.gorder.append(float(word)) 156 continue 157 if not word: 158 rowdata.append(0.0) 159 rowmask.append(0) 160 needmask = 1 161 else: 162 rowdata.append(float(word)) 163 rowmask.append(1) 164 self.data.append(rowdata) 165 self.mask.append(rowmask) 166 self.data = numpy.array(self.data) 167 if needmask: 168 self.mask = numpy.array(self.mask, int) 169 else: 170 self.mask = None 171 if self.gweight: 172 self.gweight = numpy.array(self.gweight) 173 if self.gorder: 174 self.gorder = numpy.array(self.gorder)
175
176 - def treecluster(self, transpose=0, method='m', dist='e'):
177 """Apply hierarchical clustering and return a Tree object. 178 179 The pairwise single, complete, centroid, and average linkage 180 hierarchical clustering methods are available. 181 182 Arguments: 183 - transpose: if equal to 0, genes (rows) are clustered; 184 if equal to 1, microarrays (columns) are clustered. 185 - dist : specifies the distance function to be used: 186 187 - dist=='e': Euclidean distance 188 - dist=='b': City Block distance 189 - dist=='c': Pearson correlation 190 - dist=='a': absolute value of the correlation 191 - dist=='u': uncentered correlation 192 - dist=='x': absolute uncentered correlation 193 - dist=='s': Spearman's rank correlation 194 - dist=='k': Kendall's tau 195 196 - method : specifies which linkage method is used: 197 198 - method=='s': Single pairwise linkage 199 - method=='m': Complete (maximum) pairwise linkage (default) 200 - method=='c': Centroid linkage 201 - method=='a': Average pairwise linkage 202 203 See the description of the Tree class for more information about 204 the Tree object returned by this method. 205 """ 206 if transpose == 0: 207 weight = self.eweight 208 else: 209 weight = self.gweight 210 return treecluster(self.data, self.mask, weight, transpose, method, 211 dist)
212
213 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 214 initialid=None):
215 """Apply k-means or k-median clustering. 216 217 This method returns a tuple (clusterid, error, nfound). 218 219 Arguments: 220 - nclusters: number of clusters (the 'k' in k-means) 221 - transpose: if equal to 0, genes (rows) are clustered; 222 if equal to 1, microarrays (columns) are clustered. 223 - npass : number of times the k-means clustering algorithm is 224 performed, each time with a different (random) initial condition. 225 - method : specifies how the center of a cluster is found: 226 227 - method=='a': arithmetic mean 228 - method=='m': median 229 230 - dist : specifies the distance function to be used: 231 232 - dist=='e': Euclidean distance 233 - dist=='b': City Block distance 234 - dist=='c': Pearson correlation 235 - dist=='a': absolute value of the correlation 236 - dist=='u': uncentered correlation 237 - dist=='x': absolute uncentered correlation 238 - dist=='s': Spearman's rank correlation 239 - dist=='k': Kendall's tau 240 241 - initialid: the initial clustering from which the algorithm should 242 start. If initialid is None, the routine carries out npass 243 repetitions of the EM algorithm, each time starting from a different 244 random initial clustering. If initialid is given, the routine 245 carries out the EM algorithm only once, starting from the given 246 initial clustering and without randomizing the order in which items 247 are assigned to clusters (i.e., using the same order as in the data 248 matrix). In that case, the k-means algorithm is fully deterministic. 249 250 Return values: 251 - clusterid: array containing the number of the cluster to which each 252 gene/microarray was assigned in the best k-means clustering 253 solution that was found in the npass runs; 254 - error: the within-cluster sum of distances for the returned 255 k-means clustering solution; 256 - nfound: the number of times this solution was found. 257 258 """ 259 if transpose == 0: 260 weight = self.eweight 261 else: 262 weight = self.gweight 263 return kcluster(self.data, nclusters, self.mask, weight, transpose, 264 npass, method, dist, initialid)
265
266 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 267 niter=1, dist='e'):
268 """Calculate a self-organizing map on a rectangular grid. 269 270 The somcluster method returns a tuple (clusterid, celldata). 271 272 Arguments: 273 - transpose: if equal to 0, genes (rows) are clustered; 274 if equal to 1, microarrays (columns) are clustered. 275 - nxgrid : the horizontal dimension of the rectangular SOM map 276 - nygrid : the vertical dimension of the rectangular SOM map 277 - inittau : the initial value of tau (the neighborbood function) 278 - niter : the number of iterations 279 - dist : specifies the distance function to be used: 280 281 - dist=='e': Euclidean distance 282 - dist=='b': City Block distance 283 - dist=='c': Pearson correlation 284 - dist=='a': absolute value of the correlation 285 - dist=='u': uncentered correlation 286 - dist=='x': absolute uncentered correlation 287 - dist=='s': Spearman's rank correlation 288 - dist=='k': Kendall's tau 289 290 Return values: 291 - clusterid: array with two columns, while the number of rows is equal 292 to the number of genes or the number of microarrays depending on 293 whether genes or microarrays are being clustered. Each row in 294 the array contains the x and y coordinates of the cell in the 295 rectangular SOM grid to which the gene or microarray was assigned. 296 - celldata: an array with dimensions (nxgrid, nygrid, number of 297 microarrays) if genes are being clustered, or (nxgrid, nygrid, 298 number of genes) if microarrays are being clustered. Each element 299 [ix][iy] of this array is a 1D vector containing the gene 300 expression data for the centroid of the cluster in the SOM grid 301 cell with coordinates (ix, iy). 302 303 """ 304 if transpose == 0: 305 weight = self.eweight 306 else: 307 weight = self.gweight 308 return somcluster(self.data, self.mask, weight, transpose, 309 nxgrid, nygrid, inittau, niter, dist)
310
311 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
312 """Calculate the cluster centroids and return a tuple (cdata, cmask). 313 314 The centroid is defined as either the mean or the median over all 315 elements for each dimension. 316 317 Arguments: 318 - data : nrows x ncolumns array containing the expression data 319 - mask : nrows x ncolumns array of integers, showing which data 320 are missing. If mask[i][j]==0, then data[i][j] is missing. 321 - transpose: if equal to 0, gene (row) clusters are considered; 322 if equal to 1, microarray (column) clusters are considered. 323 - clusterid: array containing the cluster number for each gene or 324 microarray. The cluster number should be non-negative. 325 - method : specifies how the centroid is calculated: 326 327 - method=='a': arithmetic mean over each dimension. (default) 328 - method=='m': median over each dimension. 329 330 Return values: 331 - cdata : 2D array containing the cluster centroids. If transpose==0, 332 then the dimensions of cdata are nclusters x ncolumns. If 333 transpose==1, then the dimensions of cdata are nrows x nclusters. 334 - cmask : 2D array of integers describing which elements in cdata, 335 if any, are missing. 336 337 """ 338 return clustercentroids(self.data, self.mask, clusterid, method, 339 transpose)
340
341 - def clusterdistance(self, index1=0, index2=0, method='a', dist='e', 342 transpose=0):
343 """Calculate the distance between two clusters. 344 345 Arguments: 346 - index1 : 1D array identifying which genes/microarrays belong to the 347 first cluster. If the cluster contains only one gene, then 348 index1 can also be written as a single integer. 349 - index2 : 1D array identifying which genes/microarrays belong to the 350 second cluster. If the cluster contains only one gene, then 351 index2 can also be written as a single integer. 352 - transpose: if equal to 0, genes (rows) are clustered; 353 if equal to 1, microarrays (columns) are clustered. 354 - dist : specifies the distance function to be used: 355 356 - dist=='e': Euclidean distance 357 - dist=='b': City Block distance 358 - dist=='c': Pearson correlation 359 - dist=='a': absolute value of the correlation 360 - dist=='u': uncentered correlation 361 - dist=='x': absolute uncentered correlation 362 - dist=='s': Spearman's rank correlation 363 - dist=='k': Kendall's tau 364 365 - method : specifies how the distance between two clusters is defined: 366 367 - method=='a': the distance between the arithmetic means of the 368 two clusters 369 - method=='m': the distance between the medians of the two clusters 370 - method=='s': the smallest pairwise distance between members of 371 the two clusters 372 - method=='x': the largest pairwise distance between members of 373 the two clusters 374 - method=='v': average of the pairwise distances between members 375 of the clusters 376 377 - transpose: if equal to 0: clusters of genes (rows) are considered; 378 if equal to 1: clusters of microarrays (columns) are considered. 379 380 """ 381 if transpose == 0: 382 weight = self.eweight 383 else: 384 weight = self.gweight 385 return clusterdistance(self.data, self.mask, weight, 386 index1, index2, method, dist, transpose)
387
388 - def distancematrix(self, transpose=0, dist='e'):
389 """Calculate the distance matrix and return it as a list of arrays. 390 391 Arguments: 392 - transpose: if equal to 0: calculate the distances between genes 393 (rows); if equal to 1: calculate the distances beteeen microarrays 394 (columns). 395 - dist : specifies the distance function to be used: 396 397 - dist=='e': Euclidean distance 398 - dist=='b': City Block distance 399 - dist=='c': Pearson correlation 400 - dist=='a': absolute value of the correlation 401 - dist=='u': uncentered correlation 402 - dist=='x': absolute uncentered correlation 403 - dist=='s': Spearman's rank correlation 404 - dist=='k': Kendall's tau 405 406 Return value: 407 408 The distance matrix is returned as a list of 1D arrays containing the 409 distance matrix between the gene expression data. The number of columns 410 in each row is equal to the row number. Hence, the first row has zero 411 elements. An example of the return value is:: 412 413 matrix = [[], 414 array([1.]), 415 array([7., 3.]), 416 array([4., 2., 6.])] 417 418 This corresponds to the distance matrix:: 419 420 [0., 1., 7., 4.] 421 [1., 0., 3., 2.] 422 [7., 3., 0., 6.] 423 [4., 2., 6., 0.] 424 425 """ 426 if transpose == 0: 427 weight = self.eweight 428 else: 429 weight = self.gweight 430 return distancematrix(self.data, self.mask, weight, transpose, dist)
431
432 - def save(self, jobname, geneclusters=None, expclusters=None):
433 """Save the clustering results. 434 435 The saved files follow the convention for the Java TreeView program, 436 which can therefore be used to view the clustering result. 437 438 Arguments: 439 - jobname: The base name of the files to be saved. The filenames 440 are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical 441 clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag 442 for k-means clustering results. 443 - geneclusters=None: For hierarchical clustering results, 444 geneclusters is a Tree object as returned by the treecluster 445 method. For k-means clustering results, geneclusters is a vector 446 containing ngenes integers, describing to which cluster a given 447 gene belongs. This vector can be calculated by kcluster. 448 - expclusters=None: For hierarchical clustering results, expclusters 449 is a Tree object as returned by the treecluster method. For k-means 450 clustering results, expclusters is a vector containing nexps 451 integers, describing to which cluster a given experimental 452 condition belongs. This vector can be calculated by kcluster. 453 454 """ 455 (ngenes, nexps) = numpy.shape(self.data) 456 if self.gorder is None: 457 gorder = numpy.arange(ngenes) 458 else: 459 gorder = self.gorder 460 if self.eorder is None: 461 eorder = numpy.arange(nexps) 462 else: 463 eorder = self.eorder 464 if geneclusters is not None and expclusters is not None and \ 465 type(geneclusters) != type(expclusters): 466 raise ValueError("found one k-means and one hierarchical " 467 "clustering solution in geneclusters and " 468 "expclusters") 469 gid = 0 470 aid = 0 471 filename = jobname 472 postfix = "" 473 if isinstance(geneclusters, Tree): 474 # This is a hierarchical clustering result. 475 geneindex = _savetree(jobname, geneclusters, gorder, 0) 476 gid = 1 477 elif geneclusters is not None: 478 # This is a k-means clustering result. 479 filename = jobname + "_K" 480 k = max(geneclusters) + 1 481 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 482 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 483 postfix = "_G%d" % k 484 else: 485 geneindex = numpy.argsort(gorder) 486 if isinstance(expclusters, Tree): 487 # This is a hierarchical clustering result. 488 expindex = _savetree(jobname, expclusters, eorder, 1) 489 aid = 1 490 elif expclusters is not None: 491 # This is a k-means clustering result. 492 filename = jobname + "_K" 493 k = max(expclusters) + 1 494 kagfilename = "%s_K_A%d.kag" % (jobname, k) 495 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 496 postfix += "_A%d" % k 497 else: 498 expindex = numpy.argsort(eorder) 499 filename = filename + postfix 500 self._savedata(filename, gid, aid, geneindex, expindex)
501
502 - def _savekmeans(self, filename, clusterids, order, transpose):
503 # Save a k-means clustering solution 504 if transpose == 0: 505 label = self.uniqid 506 names = self.geneid 507 else: 508 label = "ARRAY" 509 names = self.expid 510 with open(filename, "w") as outputfile: 511 outputfile.write(label + "\tGROUP\n") 512 index = numpy.argsort(order) 513 n = len(names) 514 sortedindex = numpy.zeros(n, int) 515 counter = 0 516 cluster = 0 517 while counter < n: 518 for j in index: 519 if clusterids[j] == cluster: 520 outputfile.write("%s\t%s\n" % (names[j], cluster)) 521 sortedindex[counter] = j 522 counter += 1 523 cluster += 1 524 return sortedindex
525
526 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
527 # Save the clustered data. 528 if self.genename is None: 529 genename = self.geneid 530 else: 531 genename = self.genename 532 (ngenes, nexps) = numpy.shape(self.data) 533 with open(jobname + '.cdt', 'w') as outputfile: 534 if self.mask is not None: 535 mask = self.mask 536 else: 537 mask = numpy.ones((ngenes, nexps), int) 538 if self.gweight is not None: 539 gweight = self.gweight 540 else: 541 gweight = numpy.ones(ngenes) 542 if self.eweight is not None: 543 eweight = self.eweight 544 else: 545 eweight = numpy.ones(nexps) 546 if gid: 547 outputfile.write('GID\t') 548 outputfile.write(self.uniqid) 549 outputfile.write('\tNAME\tGWEIGHT') 550 # Now add headers for data columns. 551 for j in expindex: 552 outputfile.write('\t%s' % self.expid[j]) 553 outputfile.write('\n') 554 if aid: 555 outputfile.write("AID") 556 if gid: 557 outputfile.write('\t') 558 outputfile.write("\t\t") 559 for j in expindex: 560 outputfile.write('\tARRY%dX' % j) 561 outputfile.write('\n') 562 outputfile.write('EWEIGHT') 563 if gid: 564 outputfile.write('\t') 565 outputfile.write('\t\t') 566 for j in expindex: 567 outputfile.write('\t%f' % eweight[j]) 568 outputfile.write('\n') 569 for i in geneindex: 570 if gid: 571 outputfile.write('GENE%dX\t' % i) 572 outputfile.write("%s\t%s\t%f" % 573 (self.geneid[i], genename[i], gweight[i])) 574 for j in expindex: 575 outputfile.write('\t') 576 if mask[i, j]: 577 outputfile.write(str(self.data[i, j])) 578 outputfile.write('\n')
579 580
581 -def read(handle):
582 """Read gene expression data from the file handle and return a Record. 583 584 The file should be in the file format defined for Michael Eisen's 585 Cluster/TreeView program. 586 """ 587 return Record(handle)
588