Package Bio :: Module pairwise2
[hide private]
[frames] | no frames]

Source Code for Module Bio.pairwise2

  1  # Copyright 2002 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This package implements pairwise sequence alignment using a dynamic 
  7  programming algorithm. 
  8   
  9  This provides functions to get global and local alignments between two 
 10  sequences.  A global alignment finds the best concordance between all 
 11  characters in two sequences.  A local alignment finds just the 
 12  subsequences that align the best. 
 13   
 14  When doing alignments, you can specify the match score and gap 
 15  penalties.  The match score indicates the compatibility between an 
 16  alignment of two characters in the sequences.  Highly compatible 
 17  characters should be given positive scores, and incompatible ones 
 18  should be given negative scores or 0.  The gap penalties should be 
 19  negative. 
 20   
 21  The names of the alignment functions in this module follow the 
 22  convention 
 23  <alignment type>XX 
 24  where <alignment type> is either "global" or "local" and XX is a 2 
 25  character code indicating the parameters it takes.  The first 
 26  character indicates the parameters for matches (and mismatches), and 
 27  the second indicates the parameters for gap penalties. 
 28   
 29  The match parameters are 
 30  CODE  DESCRIPTION 
 31  x     No parameters.  Identical characters have score of 1, otherwise 0. 
 32  m     A match score is the score of identical chars, otherwise mismatch score. 
 33  d     A dictionary returns the score of any pair of characters. 
 34  c     A callback function returns scores. 
 35   
 36  The gap penalty parameters are 
 37  CODE  DESCRIPTION 
 38  x     No gap penalties. 
 39  s     Same open and extend gap penalties for both sequences. 
 40  d     The sequences have different open and extend gap penalties. 
 41  c     A callback function returns the gap penalties. 
 42   
 43  All the different alignment functions are contained in an object 
 44  "align".  For example: 
 45   
 46      >>> from Bio import pairwise2 
 47      >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG") 
 48   
 49  will return a list of the alignments between the two strings.  The 
 50  parameters of the alignment function depends on the function called. 
 51  Some examples: 
 52   
 53      # Find the best global alignment between the two sequences. 
 54      # Identical characters are given 1 point.  No points are deducted 
 55      # for mismatches or gaps. 
 56      >>> for a in pairwise2.align.globalxx("ACCGT", "ACG"): 
 57      ...     print format_alignment(*a) 
 58      ACCGT 
 59      ||||| 
 60      AC-G- 
 61        Score=3 
 62      <BLANKLINE> 
 63      ACCGT 
 64      ||||| 
 65      A-CG- 
 66        Score=3 
 67      <BLANKLINE> 
 68   
 69      # Same thing as before, but with a local alignment. 
 70      >>> for a in pairwise2.align.localxx("ACCGT", "ACG"): 
 71      ...     print format_alignment(*a) 
 72      ACCGT 
 73      |||| 
 74      AC-G- 
 75        Score=3 
 76      <BLANKLINE> 
 77      ACCGT 
 78      |||| 
 79      A-CG- 
 80        Score=3 
 81      <BLANKLINE> 
 82   
 83      # Do a global alignment.  Identical characters are given 2 points, 
 84      # 1 point is deducted for each non-identical character. 
 85      >>> for a in pairwise2.align.globalmx("ACCGT", "ACG", 2, -1): 
 86      ...     print format_alignment(*a) 
 87      ACCGT 
 88      ||||| 
 89      AC-G- 
 90        Score=6 
 91      <BLANKLINE> 
 92      ACCGT 
 93      ||||| 
 94      A-CG- 
 95        Score=6 
 96      <BLANKLINE> 
 97   
 98      # Same as above, except now 0.5 points are deducted when opening a 
 99      # gap, and 0.1 points are deducted when extending it. 
100      >>> for a in pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1): 
101      ...     print format_alignment(*a) 
102      ACCGT 
103      ||||| 
104      AC-G- 
105        Score=5 
106      <BLANKLINE> 
107      ACCGT 
108      ||||| 
109      A-CG- 
110        Score=5 
111      <BLANKLINE> 
112   
113  The alignment function can also use known matrices already included in 
114  Biopython ( Bio.SubsMat -> MatrixInfo ). 
115   
116      >>> from Bio.SubsMat import MatrixInfo as matlist 
117      >>> matrix = matlist.blosum62 
118      >>> for a in pairwise2.align.globaldx("KEVLA", "EVL", matrix): 
119      ...     print format_alignment(*a) 
120      KEVLA 
121      ||||| 
122      -EVL- 
123        Score=13 
124      <BLANKLINE> 
125   
126  To see a description of the parameters for a function, please look at 
127  the docstring for the function via the help function, e.g. 
128  type help(pairwise2.align.localds) at the Python prompt. 
129  """ 
130  # The alignment functions take some undocumented keyword parameters: 
131  # - penalize_extend_when_opening: boolean 
132  #   Whether to count an extension penalty when opening a gap.  If 
133  #   false, a gap of 1 is only penalize an "open" penalty, otherwise it 
134  #   is penalized "open+extend". 
135  # - penalize_end_gaps: boolean 
136  #   Whether to count the gaps at the ends of an alignment.  By 
137  #   default, they are counted for global alignments but not for local 
138  #   ones. 
139  # - gap_char: string 
140  #   Which character to use as a gap character in the alignment 
141  #   returned.  By default, uses '-'. 
142  # - force_generic: boolean 
143  #   Always use the generic, non-cached, dynamic programming function. 
144  #   For debugging. 
145  # - score_only: boolean 
146  #   Only get the best score, don't recover any alignments.  The return 
147  #   value of the function is the score. 
148  # - one_alignment_only: boolean 
149  #   Only recover one alignment. 
150   
151  MAX_ALIGNMENTS = 1000   # maximum alignments recovered in traceback 
152   
153   
154 -class align(object):
155 """This class provides functions that do alignments.""" 156
157 - class alignment_function:
158 """This class is callable impersonates an alignment function. 159 The constructor takes the name of the function. This class 160 will decode the name of the function to figure out how to 161 interpret the parameters. 162 163 """ 164 # match code -> tuple of (parameters, docstring) 165 match2args = { 166 'x': ([], ''), 167 'm': (['match', 'mismatch'], 168 """match is the score to given to identical characters. mismatch is 169 the score given to non-identical ones."""), 170 'd': (['match_dict'], 171 """match_dict is a dictionary where the keys are tuples of pairs of 172 characters and the values are the scores, e.g. ("A", "C") : 2.5."""), 173 'c': (['match_fn'], 174 """match_fn is a callback function that takes two characters and 175 returns the score between them."""), 176 } 177 # penalty code -> tuple of (parameters, docstring) 178 penalty2args = { 179 'x': ([], ''), 180 's': (['open', 'extend'], 181 """open and extend are the gap penalties when a gap is opened and 182 extended. They should be negative."""), 183 'd': (['openA', 'extendA', 'openB', 'extendB'], 184 """openA and extendA are the gap penalties for sequenceA, and openB 185 and extendB for sequeneB. The penalties should be negative."""), 186 'c': (['gap_A_fn', 'gap_B_fn'], 187 """gap_A_fn and gap_B_fn are callback functions that takes 1) the 188 index where the gap is opened, and 2) the length of the gap. They 189 should return a gap penalty."""), 190 } 191
192 - def __init__(self, name):
193 # Check to make sure the name of the function is 194 # reasonable. 195 if name.startswith("global"): 196 if len(name) != 8: 197 raise AttributeError("function should be globalXX") 198 elif name.startswith("local"): 199 if len(name) != 7: 200 raise AttributeError("function should be localXX") 201 else: 202 raise AttributeError(name) 203 align_type, match_type, penalty_type = \ 204 name[:-2], name[-2], name[-1] 205 try: 206 match_args, match_doc = self.match2args[match_type] 207 except KeyError, x: 208 raise AttributeError("unknown match type %r" % match_type) 209 try: 210 penalty_args, penalty_doc = self.penalty2args[penalty_type] 211 except KeyError, x: 212 raise AttributeError("unknown penalty type %r" % penalty_type) 213 214 # Now get the names of the parameters to this function. 215 param_names = ['sequenceA', 'sequenceB'] 216 param_names.extend(match_args) 217 param_names.extend(penalty_args) 218 self.function_name = name 219 self.align_type = align_type 220 self.param_names = param_names 221 222 self.__name__ = self.function_name 223 # Set the doc string. 224 doc = "%s(%s) -> alignments\n" % ( 225 self.__name__, ', '.join(self.param_names)) 226 if match_doc: 227 doc += "\n%s\n" % match_doc 228 if penalty_doc: 229 doc += "\n%s\n" % penalty_doc 230 doc += ( 231 """\nalignments is a list of tuples (seqA, seqB, score, begin, end). 232 seqA and seqB are strings showing the alignment between the 233 sequences. score is the score of the alignment. begin and end 234 are indexes into seqA and seqB that indicate the where the 235 alignment occurs. 236 """) 237 self.__doc__ = doc
238
239 - def decode(self, *args, **keywds):
240 # Decode the arguments for the _align function. keywds 241 # will get passed to it, so translate the arguments to 242 # this function into forms appropriate for _align. 243 keywds = keywds.copy() 244 if len(args) != len(self.param_names): 245 raise TypeError("%s takes exactly %d argument (%d given)" 246 % (self.function_name, len(self.param_names), len(args))) 247 i = 0 248 while i < len(self.param_names): 249 if self.param_names[i] in [ 250 'sequenceA', 'sequenceB', 251 'gap_A_fn', 'gap_B_fn', 'match_fn']: 252 keywds[self.param_names[i]] = args[i] 253 i += 1 254 elif self.param_names[i] == 'match': 255 assert self.param_names[i+1] == 'mismatch' 256 match, mismatch = args[i], args[i+1] 257 keywds['match_fn'] = identity_match(match, mismatch) 258 i += 2 259 elif self.param_names[i] == 'match_dict': 260 keywds['match_fn'] = dictionary_match(args[i]) 261 i += 1 262 elif self.param_names[i] == 'open': 263 assert self.param_names[i+1] == 'extend' 264 open, extend = args[i], args[i+1] 265 pe = keywds.get('penalize_extend_when_opening', 0) 266 keywds['gap_A_fn'] = affine_penalty(open, extend, pe) 267 keywds['gap_B_fn'] = affine_penalty(open, extend, pe) 268 i += 2 269 elif self.param_names[i] == 'openA': 270 assert self.param_names[i+3] == 'extendB' 271 openA, extendA, openB, extendB = args[i:i+4] 272 pe = keywds.get('penalize_extend_when_opening', 0) 273 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe) 274 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe) 275 i += 4 276 else: 277 raise ValueError("unknown parameter %r" 278 % self.param_names[i]) 279 280 # Here are the default parameters for _align. Assign 281 # these to keywds, unless already specified. 282 pe = keywds.get('penalize_extend_when_opening', 0) 283 default_params = [ 284 ('match_fn', identity_match(1, 0)), 285 ('gap_A_fn', affine_penalty(0, 0, pe)), 286 ('gap_B_fn', affine_penalty(0, 0, pe)), 287 ('penalize_extend_when_opening', 0), 288 ('penalize_end_gaps', self.align_type == 'global'), 289 ('align_globally', self.align_type == 'global'), 290 ('gap_char', '-'), 291 ('force_generic', 0), 292 ('score_only', 0), 293 ('one_alignment_only', 0) 294 ] 295 for name, default in default_params: 296 keywds[name] = keywds.get(name, default) 297 return keywds
298
299 - def __call__(self, *args, **keywds):
300 keywds = self.decode(*args, **keywds) 301 return _align(**keywds)
302
303 - def __getattr__(self, attr):
304 return self.alignment_function(attr)
305 align = align() 306 307
308 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 309 penalize_extend_when_opening, penalize_end_gaps, 310 align_globally, gap_char, force_generic, score_only, 311 one_alignment_only):
312 if not sequenceA or not sequenceB: 313 return [] 314 315 if (not force_generic) and isinstance(gap_A_fn, affine_penalty) \ 316 and isinstance(gap_B_fn, affine_penalty): 317 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend 318 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend 319 x = _make_score_matrix_fast( 320 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 321 penalize_extend_when_opening, penalize_end_gaps, align_globally, 322 score_only) 323 else: 324 x = _make_score_matrix_generic( 325 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 326 penalize_extend_when_opening, penalize_end_gaps, align_globally, 327 score_only) 328 score_matrix, trace_matrix = x 329 330 #print "SCORE"; print_matrix(score_matrix) 331 #print "TRACEBACK"; print_matrix(trace_matrix) 332 333 # Look for the proper starting point. Get a list of all possible 334 # starting points. 335 starts = _find_start( 336 score_matrix, sequenceA, sequenceB, 337 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally) 338 # Find the highest score. 339 best_score = max([x[0] for x in starts]) 340 341 # If they only want the score, then return it. 342 if score_only: 343 return best_score 344 345 tolerance = 0 # XXX do anything with this? 346 # Now find all the positions within some tolerance of the best 347 # score. 348 i = 0 349 while i < len(starts): 350 score, pos = starts[i] 351 if rint(abs(score-best_score)) > rint(tolerance): 352 del starts[i] 353 else: 354 i += 1 355 356 # Recover the alignments and return them. 357 x = _recover_alignments( 358 sequenceA, sequenceB, starts, score_matrix, trace_matrix, 359 align_globally, penalize_end_gaps, gap_char, one_alignment_only) 360 return x
361 362
363 -def _make_score_matrix_generic( 364 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 365 penalize_extend_when_opening, penalize_end_gaps, align_globally, 366 score_only):
367 # This is an implementation of the Needleman-Wunsch dynamic 368 # programming algorithm for aligning sequences. 369 370 # Create the score and traceback matrices. These should be in the 371 # shape: 372 # sequenceA (down) x sequenceB (across) 373 lenA, lenB = len(sequenceA), len(sequenceB) 374 score_matrix, trace_matrix = [], [] 375 for i in range(lenA): 376 score_matrix.append([None] * lenB) 377 trace_matrix.append([[None]] * lenB) 378 379 # The top and left borders of the matrices are special cases 380 # because there are no previously aligned characters. To simplify 381 # the main loop, handle these separately. 382 for i in range(lenA): 383 # Align the first residue in sequenceB to the ith residue in 384 # sequence A. This is like opening up i gaps at the beginning 385 # of sequence B. 386 score = match_fn(sequenceA[i], sequenceB[0]) 387 if penalize_end_gaps: 388 score += gap_B_fn(0, i) 389 score_matrix[i][0] = score 390 for i in range(1, lenB): 391 score = match_fn(sequenceA[0], sequenceB[i]) 392 if penalize_end_gaps: 393 score += gap_A_fn(0, i) 394 score_matrix[0][i] = score 395 396 # Fill in the score matrix. Each position in the matrix 397 # represents an alignment between a character from sequenceA to 398 # one in sequence B. As I iterate through the matrix, find the 399 # alignment by choose the best of: 400 # 1) extending a previous alignment without gaps 401 # 2) adding a gap in sequenceA 402 # 3) adding a gap in sequenceB 403 for row in range(1, lenA): 404 for col in range(1, lenB): 405 # First, calculate the score that would occur by extending 406 # the alignment without gaps. 407 best_score = score_matrix[row-1][col-1] 408 best_score_rint = rint(best_score) 409 best_indexes = [(row-1, col-1)] 410 411 # Try to find a better score by opening gaps in sequenceA. 412 # Do this by checking alignments from each column in the 413 # previous row. Each column represents a different 414 # character to align from, and thus a different length 415 # gap. 416 for i in range(0, col-1): 417 score = score_matrix[row-1][i] + gap_A_fn(row, col-1-i) 418 score_rint = rint(score) 419 if score_rint == best_score_rint: 420 best_score, best_score_rint = score, score_rint 421 best_indexes.append((row-1, i)) 422 elif score_rint > best_score_rint: 423 best_score, best_score_rint = score, score_rint 424 best_indexes = [(row-1, i)] 425 426 # Try to find a better score by opening gaps in sequenceB. 427 for i in range(0, row-1): 428 score = score_matrix[i][col-1] + gap_B_fn(col, row-1-i) 429 score_rint = rint(score) 430 if score_rint == best_score_rint: 431 best_score, best_score_rint = score, score_rint 432 best_indexes.append((i, col-1)) 433 elif score_rint > best_score_rint: 434 best_score, best_score_rint = score, score_rint 435 best_indexes = [(i, col-1)] 436 437 score_matrix[row][col] = best_score + \ 438 match_fn(sequenceA[row], sequenceB[col]) 439 if not align_globally and score_matrix[row][col] < 0: 440 score_matrix[row][col] = 0 441 trace_matrix[row][col] = best_indexes 442 return score_matrix, trace_matrix
443 444
445 -def _make_score_matrix_fast( 446 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 447 penalize_extend_when_opening, penalize_end_gaps, 448 align_globally, score_only):
449 first_A_gap = calc_affine_penalty(1, open_A, extend_A, 450 penalize_extend_when_opening) 451 first_B_gap = calc_affine_penalty(1, open_B, extend_B, 452 penalize_extend_when_opening) 453 454 # Create the score and traceback matrices. These should be in the 455 # shape: 456 # sequenceA (down) x sequenceB (across) 457 lenA, lenB = len(sequenceA), len(sequenceB) 458 score_matrix, trace_matrix = [], [] 459 for i in range(lenA): 460 score_matrix.append([None] * lenB) 461 trace_matrix.append([[None]] * lenB) 462 463 # The top and left borders of the matrices are special cases 464 # because there are no previously aligned characters. To simplify 465 # the main loop, handle these separately. 466 for i in range(lenA): 467 # Align the first residue in sequenceB to the ith residue in 468 # sequence A. This is like opening up i gaps at the beginning 469 # of sequence B. 470 score = match_fn(sequenceA[i], sequenceB[0]) 471 if penalize_end_gaps: 472 score += calc_affine_penalty( 473 i, open_B, extend_B, penalize_extend_when_opening) 474 score_matrix[i][0] = score 475 for i in range(1, lenB): 476 score = match_fn(sequenceA[0], sequenceB[i]) 477 if penalize_end_gaps: 478 score += calc_affine_penalty( 479 i, open_A, extend_A, penalize_extend_when_opening) 480 score_matrix[0][i] = score 481 482 # In the generic algorithm, at each row and column in the score 483 # matrix, we had to scan all previous rows and columns to see 484 # whether opening a gap might yield a higher score. Here, since 485 # we know the penalties are affine, we can cache just the best 486 # score in the previous rows and columns. Instead of scanning 487 # through all the previous rows and cols, we can just look at the 488 # cache for the best one. Whenever the row or col increments, the 489 # best cached score just decreases by extending the gap longer. 490 491 # The best score and indexes for each row (goes down all columns). 492 # I don't need to store the last row because it's the end of the 493 # sequence. 494 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1) 495 # The best score and indexes for each column (goes across rows). 496 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1) 497 498 for i in range(lenA-1): 499 # Initialize each row to be the alignment of sequenceA[i] to 500 # sequenceB[0], plus opening a gap in sequenceA. 501 row_cache_score[i] = score_matrix[i][0] + first_A_gap 502 row_cache_index[i] = [(i, 0)] 503 for i in range(lenB-1): 504 col_cache_score[i] = score_matrix[0][i] + first_B_gap 505 col_cache_index[i] = [(0, i)] 506 507 # Fill in the score_matrix. 508 for row in range(1, lenA): 509 for col in range(1, lenB): 510 # Calculate the score that would occur by extending the 511 # alignment without gaps. 512 nogap_score = score_matrix[row-1][col-1] 513 514 # Check the score that would occur if there were a gap in 515 # sequence A. 516 if col > 1: 517 row_score = row_cache_score[row-1] 518 else: 519 row_score = nogap_score - 1 # Make sure it's not the best. 520 # Check the score that would occur if there were a gap in 521 # sequence B. 522 if row > 1: 523 col_score = col_cache_score[col-1] 524 else: 525 col_score = nogap_score - 1 526 527 best_score = max(nogap_score, row_score, col_score) 528 best_score_rint = rint(best_score) 529 best_index = [] 530 if best_score_rint == rint(nogap_score): 531 best_index.append((row-1, col-1)) 532 if best_score_rint == rint(row_score): 533 best_index.extend(row_cache_index[row-1]) 534 if best_score_rint == rint(col_score): 535 best_index.extend(col_cache_index[col-1]) 536 537 # Set the score and traceback matrices. 538 score = best_score + match_fn(sequenceA[row], sequenceB[col]) 539 if not align_globally and score < 0: 540 score_matrix[row][col] = 0 541 else: 542 score_matrix[row][col] = score 543 trace_matrix[row][col] = best_index 544 545 # Update the cached column scores. The best score for 546 # this can come from either extending the gap in the 547 # previous cached score, or opening a new gap from the 548 # most previously seen character. Compare the two scores 549 # and keep the best one. 550 open_score = score_matrix[row-1][col-1] + first_B_gap 551 extend_score = col_cache_score[col-1] + extend_B 552 open_score_rint, extend_score_rint = \ 553 rint(open_score), rint(extend_score) 554 if open_score_rint > extend_score_rint: 555 col_cache_score[col-1] = open_score 556 col_cache_index[col-1] = [(row-1, col-1)] 557 elif extend_score_rint > open_score_rint: 558 col_cache_score[col-1] = extend_score 559 else: 560 col_cache_score[col-1] = open_score 561 if (row-1, col-1) not in col_cache_index[col-1]: 562 col_cache_index[col-1] = col_cache_index[col-1] + \ 563 [(row-1, col-1)] 564 565 # Update the cached row scores. 566 open_score = score_matrix[row-1][col-1] + first_A_gap 567 extend_score = row_cache_score[row-1] + extend_A 568 open_score_rint, extend_score_rint = \ 569 rint(open_score), rint(extend_score) 570 if open_score_rint > extend_score_rint: 571 row_cache_score[row-1] = open_score 572 row_cache_index[row-1] = [(row-1, col-1)] 573 elif extend_score_rint > open_score_rint: 574 row_cache_score[row-1] = extend_score 575 else: 576 row_cache_score[row-1] = open_score 577 if (row-1, col-1) not in row_cache_index[row-1]: 578 row_cache_index[row-1] = row_cache_index[row-1] + \ 579 [(row-1, col-1)] 580 581 return score_matrix, trace_matrix
582 583
584 -def _recover_alignments(sequenceA, sequenceB, starts, 585 score_matrix, trace_matrix, align_globally, 586 penalize_end_gaps, gap_char, one_alignment_only):
587 # Recover the alignments by following the traceback matrix. This 588 # is a recursive procedure, but it's implemented here iteratively 589 # with a stack. 590 lenA, lenB = len(sequenceA), len(sequenceB) 591 tracebacks = [] # list of (seq1, seq2, score, begin, end) 592 in_process = [] # list of ([same as tracebacks], prev_pos, next_pos) 593 594 # sequenceA and sequenceB may be sequences, including strings, 595 # lists, or list-like objects. In order to preserve the type of 596 # the object, we need to use slices on the sequences instead of 597 # indexes. For example, sequenceA[row] may return a type that's 598 # not compatible with sequenceA, e.g. if sequenceA is a list and 599 # sequenceA[row] is a string. Thus, avoid using indexes and use 600 # slices, e.g. sequenceA[row:row+1]. Assume that client-defined 601 # sequence classes preserve these semantics. 602 603 # Initialize the in_process stack 604 for score, (row, col) in starts: 605 if align_globally: 606 begin, end = None, None 607 else: 608 begin, end = None, -max(lenA-row, lenB-col)+1 609 if not end: 610 end = None 611 # Initialize the in_process list with empty sequences of the 612 # same type as sequenceA. To do this, take empty slices of 613 # the sequences. 614 in_process.append( 615 (sequenceA[0:0], sequenceB[0:0], score, begin, end, 616 (lenA, lenB), (row, col))) 617 if one_alignment_only: 618 break 619 while in_process and len(tracebacks) < MAX_ALIGNMENTS: 620 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop() 621 prevA, prevB = prev_pos 622 if next_pos is None: 623 prevlen = len(seqA) 624 # add the rest of the sequences 625 seqA = sequenceA[:prevA] + seqA 626 seqB = sequenceB[:prevB] + seqB 627 # add the rest of the gaps 628 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char) 629 630 # Now make sure begin is set. 631 if begin is None: 632 if align_globally: 633 begin = 0 634 else: 635 begin = len(seqA) - prevlen 636 tracebacks.append((seqA, seqB, score, begin, end)) 637 else: 638 nextA, nextB = next_pos 639 nseqA, nseqB = prevA-nextA, prevB-nextB 640 maxseq = max(nseqA, nseqB) 641 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB 642 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA 643 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB 644 prev_pos = next_pos 645 # local alignment stops early if score falls < 0 646 if not align_globally and score_matrix[nextA][nextB] <= 0: 647 begin = max(prevA, prevB) 648 in_process.append( 649 (seqA, seqB, score, begin, end, prev_pos, None)) 650 else: 651 for next_pos in trace_matrix[nextA][nextB]: 652 in_process.append( 653 (seqA, seqB, score, begin, end, prev_pos, next_pos)) 654 if one_alignment_only: 655 break 656 657 return _clean_alignments(tracebacks)
658 659
660 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn, 661 penalize_end_gaps, align_globally):
662 # Return a list of (score, (row, col)) indicating every possible 663 # place to start the tracebacks. 664 if align_globally: 665 if penalize_end_gaps: 666 starts = _find_global_start( 667 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1) 668 else: 669 starts = _find_global_start( 670 sequenceA, sequenceB, score_matrix, None, None, 0) 671 else: 672 starts = _find_local_start(score_matrix) 673 return starts
674 675
676 -def _find_global_start(sequenceA, sequenceB, 677 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
678 # The whole sequence should be aligned, so return the positions at 679 # the end of either one of the sequences. 680 nrows, ncols = len(score_matrix), len(score_matrix[0]) 681 positions = [] 682 # Search all rows in the last column. 683 for row in range(nrows): 684 # Find the score, penalizing end gaps if necessary. 685 score = score_matrix[row][ncols-1] 686 if penalize_end_gaps: 687 score += gap_B_fn(ncols, nrows-row-1) 688 positions.append((score, (row, ncols-1))) 689 # Search all columns in the last row. 690 for col in range(ncols-1): 691 score = score_matrix[nrows-1][col] 692 if penalize_end_gaps: 693 score += gap_A_fn(nrows, ncols-col-1) 694 positions.append((score, (nrows-1, col))) 695 return positions
696 697
698 -def _find_local_start(score_matrix):
699 # Return every position in the matrix. 700 positions = [] 701 nrows, ncols = len(score_matrix), len(score_matrix[0]) 702 for row in range(nrows): 703 for col in range(ncols): 704 score = score_matrix[row][col] 705 positions.append((score, (row, col))) 706 return positions
707 708
709 -def _clean_alignments(alignments):
710 # Take a list of alignments and return a cleaned version. Remove 711 # duplicates, make sure begin and end are set correctly, remove 712 # empty alignments. 713 unique_alignments = [] 714 for align in alignments: 715 if align not in unique_alignments: 716 unique_alignments.append(align) 717 i = 0 718 while i < len(unique_alignments): 719 seqA, seqB, score, begin, end = unique_alignments[i] 720 # Make sure end is set reasonably. 721 if end is None: # global alignment 722 end = len(seqA) 723 elif end < 0: 724 end = end + len(seqA) 725 # If there's no alignment here, get rid of it. 726 if begin >= end: 727 del unique_alignments[i] 728 continue 729 unique_alignments[i] = seqA, seqB, score, begin, end 730 i += 1 731 return unique_alignments
732 733
734 -def _pad_until_equal(s1, s2, char):
735 # Add char to the end of s1 or s2 until they are equal length. 736 ls1, ls2 = len(s1), len(s2) 737 if ls1 < ls2: 738 s1 = _pad(s1, char, ls2-ls1) 739 elif ls2 < ls1: 740 s2 = _pad(s2, char, ls1-ls2) 741 return s1, s2
742 743
744 -def _lpad_until_equal(s1, s2, char):
745 # Add char to the beginning of s1 or s2 until they are equal 746 # length. 747 ls1, ls2 = len(s1), len(s2) 748 if ls1 < ls2: 749 s1 = _lpad(s1, char, ls2-ls1) 750 elif ls2 < ls1: 751 s2 = _lpad(s2, char, ls1-ls2) 752 return s1, s2
753 754
755 -def _pad(s, char, n):
756 # Append n chars to the end of s. 757 return s + char*n
758 759
760 -def _lpad(s, char, n):
761 # Prepend n chars to the beginning of s. 762 return char*n + s
763 764 _PRECISION = 1000 765 766
767 -def rint(x, precision=_PRECISION):
768 return int(x * precision + 0.5)
769 770
771 -class identity_match:
772 """identity_match([match][, mismatch]) -> match_fn 773 774 Create a match function for use in an alignment. match and 775 mismatch are the scores to give when two residues are equal or 776 unequal. By default, match is 1 and mismatch is 0. 777 778 """
779 - def __init__(self, match=1, mismatch=0):
780 self.match = match 781 self.mismatch = mismatch
782
783 - def __call__(self, charA, charB):
784 if charA == charB: 785 return self.match 786 return self.mismatch
787 788
789 -class dictionary_match:
790 """dictionary_match(score_dict[, symmetric]) -> match_fn 791 792 Create a match function for use in an alignment. score_dict is a 793 dictionary where the keys are tuples (residue 1, residue 2) and 794 the values are the match scores between those residues. symmetric 795 is a flag that indicates whether the scores are symmetric. If 796 true, then if (res 1, res 2) doesn't exist, I will use the score 797 at (res 2, res 1). 798 799 """
800 - def __init__(self, score_dict, symmetric=1):
801 self.score_dict = score_dict 802 self.symmetric = symmetric
803
804 - def __call__(self, charA, charB):
805 if self.symmetric and (charA, charB) not in self.score_dict: 806 # If the score dictionary is symmetric, then look up the 807 # score both ways. 808 charB, charA = charA, charB 809 return self.score_dict[(charA, charB)]
810 811
812 -class affine_penalty:
813 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn 814 815 Create a gap function for use in an alignment. 816 817 """
818 - def __init__(self, open, extend, penalize_extend_when_opening=0):
819 if open > 0 or extend > 0: 820 raise ValueError("Gap penalties should be non-positive.") 821 self.open, self.extend = open, extend 822 self.penalize_extend_when_opening = penalize_extend_when_opening
823
824 - def __call__(self, index, length):
825 return calc_affine_penalty( 826 length, self.open, self.extend, self.penalize_extend_when_opening)
827 828
829 -def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
830 if length <= 0: 831 return 0 832 penalty = open + extend * length 833 if not penalize_extend_when_opening: 834 penalty -= extend 835 return penalty
836 837 854 855
856 -def format_alignment(align1, align2, score, begin, end):
857 """format_alignment(align1, align2, score, begin, end) -> string 858 859 Format the alignment prettily into a string. 860 861 """ 862 s = [] 863 s.append("%s\n" % align1) 864 s.append("%s%s\n" % (" "*begin, "|"*(end-begin))) 865 s.append("%s\n" % align2) 866 s.append(" Score=%g\n" % score) 867 return ''.join(s)
868 869 870 # Try and load C implementations of functions. If I can't, 871 # then just ignore and use the pure python implementations. 872 try: 873 from cpairwise2 import rint, _make_score_matrix_fast 874 except ImportError: 875 pass 876 877
878 -def _test():
879 """Run the module's doctests (PRIVATE).""" 880 print "Running doctests..." 881 import doctest 882 doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL) 883 print "Done"
884 885 if __name__ == "__main__": 886 _test() 887