Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Extract information from alignment objects. 
  7   
  8  In order to try and avoid huge alignment objects with tons of functions, 
  9  functions which return summary type information about alignments should 
 10  be put into classes in this module. 
 11  """ 
 12   
 13  from __future__ import print_function 
 14   
 15  import math 
 16  import sys 
 17   
 18  from Bio import Alphabet 
 19  from Bio.Alphabet import IUPAC 
 20  from Bio.Seq import Seq 
 21  from Bio.SubsMat import FreqTable 
 22   
 23   
 24  # Expected random distributions for 20-letter protein, and 
 25  # for 4-letter nucleotide alphabets 
 26  Protein20Random = 0.05 
 27  Nucleotide4Random = 0.25 
 28   
 29   
30 -class SummaryInfo(object):
31 """Calculate summary info about the alignment. 32 33 This class should be used to caclculate information summarizing the 34 results of an alignment. This may either be straight consensus info 35 or more complicated things. 36 """
37 - def __init__(self, alignment):
38 """Initialize with the alignment to calculate information on. 39 40 ic_vector attribute. A dictionary. Keys: column numbers. Values: 41 """ 42 self.alignment = alignment 43 self.ic_vector = {}
44
45 - def dumb_consensus(self, threshold=.7, ambiguous="X", 46 consensus_alpha=None, require_multiple=0):
47 """Output a fast consensus sequence of the alignment. 48 49 This doesn't do anything fancy at all. It will just go through the 50 sequence residue by residue and count up the number of each type 51 of residue (ie. A or G or T or C for DNA) in all sequences in the 52 alignment. If the percentage of the most common residue type is 53 greater then the passed threshold, then we will add that residue type, 54 otherwise an ambiguous character will be added. 55 56 This could be made a lot fancier (ie. to take a substitution matrix 57 into account), but it just meant for a quick and dirty consensus. 58 59 Arguments: 60 - threshold - The threshold value that is required to add a particular 61 atom. 62 - ambiguous - The ambiguous character to be added when the threshold is 63 not reached. 64 - consensus_alpha - The alphabet to return for the consensus sequence. 65 If this is None, then we will try to guess the alphabet. 66 - require_multiple - If set as 1, this will require that more than 67 1 sequence be part of an alignment to put it in the consensus (ie. 68 not just 1 sequence and gaps). 69 """ 70 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 71 consensus = '' 72 73 # find the length of the consensus we are creating 74 con_len = self.alignment.get_alignment_length() 75 76 # go through each seq item 77 for n in range(con_len): 78 # keep track of the counts of the different atoms we get 79 atom_dict = {} 80 num_atoms = 0 81 82 for record in self.alignment: 83 # make sure we haven't run past the end of any sequences 84 # if they are of different lengths 85 if n < len(record.seq): 86 if record.seq[n] != '-' and record.seq[n] != '.': 87 if record.seq[n] not in atom_dict: 88 atom_dict[record.seq[n]] = 1 89 else: 90 atom_dict[record.seq[n]] += 1 91 92 num_atoms = num_atoms + 1 93 94 max_atoms = [] 95 max_size = 0 96 97 for atom in atom_dict: 98 if atom_dict[atom] > max_size: 99 max_atoms = [atom] 100 max_size = atom_dict[atom] 101 elif atom_dict[atom] == max_size: 102 max_atoms.append(atom) 103 104 if require_multiple and num_atoms == 1: 105 consensus += ambiguous 106 elif (len(max_atoms) == 1) and ((float(max_size) / 107 float(num_atoms)) >= threshold): 108 consensus += max_atoms[0] 109 else: 110 consensus += ambiguous 111 112 # we need to guess a consensus alphabet if one isn't specified 113 if consensus_alpha is None: 114 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 115 116 return Seq(consensus, consensus_alpha)
117
118 - def gap_consensus(self, threshold=.7, ambiguous="X", 119 consensus_alpha=None, require_multiple=0):
120 """Same as dumb_consensus(), but allows gap on the output. 121 122 Things to do: 123 - Let the user define that with only one gap, the result 124 character in consensus is gap. 125 - Let the user select gap character, now 126 it takes the same as input. 127 """ 128 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 129 consensus = '' 130 131 # find the length of the consensus we are creating 132 con_len = self.alignment.get_alignment_length() 133 134 # go through each seq item 135 for n in range(con_len): 136 # keep track of the counts of the different atoms we get 137 atom_dict = {} 138 num_atoms = 0 139 140 for record in self.alignment: 141 # make sure we haven't run past the end of any sequences 142 # if they are of different lengths 143 if n < len(record.seq): 144 if record.seq[n] not in atom_dict: 145 atom_dict[record.seq[n]] = 1 146 else: 147 atom_dict[record.seq[n]] += 1 148 149 num_atoms += 1 150 151 max_atoms = [] 152 max_size = 0 153 154 for atom in atom_dict: 155 if atom_dict[atom] > max_size: 156 max_atoms = [atom] 157 max_size = atom_dict[atom] 158 elif atom_dict[atom] == max_size: 159 max_atoms.append(atom) 160 161 if require_multiple and num_atoms == 1: 162 consensus += ambiguous 163 elif (len(max_atoms) == 1) and ((float(max_size) / 164 float(num_atoms)) >= threshold): 165 consensus += max_atoms[0] 166 else: 167 consensus += ambiguous 168 169 # we need to guess a consensus alphabet if one isn't specified 170 if consensus_alpha is None: 171 # TODO - Should we make this into a Gapped alphabet? 172 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 173 174 return Seq(consensus, consensus_alpha)
175
176 - def _guess_consensus_alphabet(self, ambiguous):
177 """Pick an (ungapped) alphabet for an alignment consesus sequence. 178 179 This just looks at the sequences we have, checks their type, and 180 returns as appropriate type which seems to make sense with the 181 sequences we've got. 182 """ 183 # Start with the (un-gapped version of) the alignment alphabet 184 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 185 186 # Now check its compatible with all the rest of the sequences 187 for record in self.alignment: 188 # Get the (un-gapped version of) the sequence's alphabet 189 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 190 if not isinstance(alt, a.__class__): 191 raise ValueError("Alignment contains a sequence with \ 192 an incompatible alphabet.") 193 194 # Check the ambiguous character we are going to use in the consensus 195 # is in the alphabet's list of valid letters (if defined). 196 if hasattr(a, "letters") and a.letters is not None \ 197 and ambiguous not in a.letters: 198 # We'll need to pick a more generic alphabet... 199 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 200 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 201 a = IUPAC.IUPACUnambiguousDNA() 202 else: 203 a = Alphabet.generic_dna 204 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 205 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 206 a = IUPAC.IUPACUnambiguousRNA() 207 else: 208 a = Alphabet.generic_rna 209 elif isinstance(a, IUPAC.IUPACProtein): 210 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 211 a = IUPAC.ExtendedIUPACProtein() 212 else: 213 a = Alphabet.generic_protein 214 else: 215 a = Alphabet.single_letter_alphabet 216 return a
217
218 - def replacement_dictionary(self, skip_chars=None):
219 """Generate a replacement dictionary to plug into a substitution matrix 220 221 This should look at an alignment, and be able to generate the number 222 of substitutions of different residues for each other in the 223 aligned object. 224 225 Will then return a dictionary with this information:: 226 227 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 228 229 This also treats weighted sequences. The following example shows how 230 we calculate the replacement dictionary. Given the following 231 multiple sequence alignment:: 232 233 GTATC 0.5 234 AT--C 0.8 235 CTGTC 1.0 236 237 For the first column we have:: 238 239 ('A', 'G') : 0.5 * 0.8 = 0.4 240 ('C', 'G') : 0.5 * 1.0 = 0.5 241 ('A', 'C') : 0.8 * 1.0 = 0.8 242 243 We then continue this for all of the columns in the alignment, summing 244 the information for each substitution in each column, until we end 245 up with the replacement dictionary. 246 247 Arguments: 248 - skip_chars - A list of characters to skip when creating the dictionary. 249 This defaults to an empty list. 250 251 For instance, you might have Xs (screened stuff) or Ns, and not want 252 to include the ambiguity characters in the dictionary. 253 """ 254 # get a starting dictionary based on the alphabet of the alignment 255 rep_dict, skip_items = self._get_base_replacements(skip_chars) 256 257 # iterate through each record 258 for rec_num1 in range(len(self.alignment)): 259 # iterate through each record from one beyond the current record 260 # to the end of the list of records 261 for rec_num2 in range(rec_num1 + 1, len(self.alignment)): 262 # for each pair of records, compare the sequences and add 263 # the pertinent info to the dictionary 264 rep_dict = self._pair_replacement( 265 self.alignment[rec_num1].seq, 266 self.alignment[rec_num2].seq, 267 self.alignment[rec_num1].annotations.get('weight', 1.0), 268 self.alignment[rec_num2].annotations.get('weight', 1.0), 269 rep_dict, skip_items) 270 271 return rep_dict
272
273 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 274 start_dict, ignore_chars):
275 """Compare two sequences and generate info on the replacements seen. 276 277 Arguments: 278 - seq1, seq2 - The two sequences to compare. 279 - weight1, weight2 - The relative weights of seq1 and seq2. 280 - start_dict - The dictionary containing the starting replacement 281 info that we will modify. 282 - ignore_chars - A list of characters to ignore when calculating 283 replacements (ie. '-'). 284 285 Returns: 286 - A replacment dictionary which is modified from initial_dict with 287 the information from the sequence comparison. 288 """ 289 # loop through each residue in the sequences 290 for residue_num in range(len(seq1)): 291 residue1 = seq1[residue_num] 292 try: 293 residue2 = seq2[residue_num] 294 # if seq2 is shorter, then we just stop looking at replacements 295 # and return the information 296 except IndexError: 297 return start_dict 298 299 # if the two residues are characters we want to count 300 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 301 try: 302 # add info about the replacement to the dictionary, 303 # modified by the sequence weights 304 start_dict[(residue1, residue2)] += weight1 * weight2 305 306 # if we get a key error, then we've got a problem with alphabets 307 except KeyError: 308 raise ValueError("Residues %s, %s not found in alphabet %s" 309 % (residue1, residue2, 310 self.alignment._alphabet)) 311 312 return start_dict
313
314 - def _get_all_letters(self):
315 """Returns a string containing the expected letters in the alignment.""" 316 all_letters = self.alignment._alphabet.letters 317 if all_letters is None or \ 318 (isinstance(self.alignment._alphabet, Alphabet.Gapped) and 319 all_letters == self.alignment._alphabet.gap_char): 320 # We are dealing with a generic alphabet class where the 321 # letters are not defined! We must build a list of the 322 # letters used... 323 set_letters = set() 324 for record in self.alignment: 325 # Note the built in set does not have a union_update 326 # which was provided by the sets module's Set 327 set_letters = set_letters.union(record.seq) 328 list_letters = sorted(set_letters) 329 all_letters = "".join(list_letters) 330 return all_letters
331
332 - def _get_base_replacements(self, skip_items=None):
333 """Get a zeroed dictionary of all possible letter combinations. 334 335 This looks at the type of alphabet and gets the letters for it. 336 It then creates a dictionary with all possible combinations of these 337 letters as keys (ie. ('A', 'G')) and sets the values as zero. 338 339 Returns: 340 - The base dictionary created 341 - A list of alphabet items to skip when filling the dictionary. 342 (Right now the only thing I can imagine in this list is gap 343 characters, but maybe X's or something else might be useful later. 344 This will also include any characters that are specified to be 345 skipped.) Defaults to an empty list. 346 """ 347 if skip_items is None: 348 skip_items = [] 349 base_dictionary = {} 350 all_letters = self._get_all_letters() 351 352 # if we have a gapped alphabet we need to find the gap character 353 # and drop it out 354 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 355 skip_items.append(self.alignment._alphabet.gap_char) 356 all_letters = all_letters.replace(self.alignment._alphabet.gap_char, '') 357 358 # now create the dictionary 359 for first_letter in all_letters: 360 for second_letter in all_letters: 361 if first_letter not in skip_items and \ 362 second_letter not in skip_items: 363 base_dictionary[(first_letter, second_letter)] = 0 364 365 return base_dictionary, skip_items
366
367 - def pos_specific_score_matrix(self, axis_seq=None, 368 chars_to_ignore=None):
369 """Create a position specific score matrix object for the alignment. 370 371 This creates a position specific score matrix (pssm) which is an 372 alternative method to look at a consensus sequence. 373 374 Arguments: 375 - chars_to_ignore - A list of all characters not to include in 376 the pssm. If the alignment alphabet declares a gap character, 377 then it will be excluded automatically. 378 - axis_seq - An optional argument specifying the sequence to 379 put on the axis of the PSSM. This should be a Seq object. If nothing 380 is specified, the consensus sequence, calculated with default 381 parameters, will be used. 382 383 Returns: 384 - A PSSM (position specific score matrix) object. 385 """ 386 # determine all of the letters we have to deal with 387 all_letters = self._get_all_letters() 388 assert all_letters 389 390 if chars_to_ignore is None: 391 chars_to_ignore = [] 392 if not isinstance(chars_to_ignore, list): 393 raise TypeError("chars_to_ignore should be a list.") 394 395 # if we have a gap char, add it to stuff to ignore 396 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 397 chars_to_ignore.append(self.alignment._alphabet.gap_char) 398 399 for char in chars_to_ignore: 400 all_letters = all_letters.replace(char, '') 401 402 if axis_seq: 403 left_seq = axis_seq 404 assert len(axis_seq) == self.alignment.get_alignment_length() 405 else: 406 left_seq = self.dumb_consensus() 407 408 pssm_info = [] 409 # now start looping through all of the sequences and getting info 410 for residue_num in range(len(left_seq)): 411 score_dict = self._get_base_letters(all_letters) 412 for record in self.alignment: 413 try: 414 this_residue = record.seq[residue_num] 415 # if we hit an index error we've run out of sequence and 416 # should not add new residues 417 except IndexError: 418 this_residue = None 419 420 if this_residue and this_residue not in chars_to_ignore: 421 weight = record.annotations.get('weight', 1.0) 422 try: 423 score_dict[this_residue] += weight 424 # if we get a KeyError then we have an alphabet problem 425 except KeyError: 426 raise ValueError("Residue %s not found in alphabet %s" 427 % (this_residue, 428 self.alignment._alphabet)) 429 430 pssm_info.append((left_seq[residue_num], 431 score_dict)) 432 433 return PSSM(pssm_info)
434
435 - def _get_base_letters(self, letters):
436 """Create a zeroed dictionary with all of the specified letters. 437 """ 438 base_info = {} 439 for letter in letters: 440 base_info[letter] = 0 441 442 return base_info
443
444 - def information_content(self, start=0, 445 end=None, 446 e_freq_table=None, log_base=2, 447 chars_to_ignore=None):
448 """Calculate the information content for each residue along an alignment. 449 450 Arguments: 451 - start, end - The starting an ending points to calculate the 452 information content. These points should be relative to the first 453 sequence in the alignment, starting at zero (ie. even if the 'real' 454 first position in the seq is 203 in the initial sequence, for 455 the info content, we need to use zero). This defaults to the entire 456 length of the first sequence. 457 - e_freq_table - A FreqTable object specifying the expected frequencies 458 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 459 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 460 included, since these should not have expected frequencies. 461 - log_base - The base of the logathrim to use in calculating the 462 information content. This defaults to 2 so the info is in bits. 463 - chars_to_ignore - A listing of characters which should be ignored 464 in calculating the info content. Defaults to none. 465 466 Returns: 467 - A number representing the info content for the specified region. 468 469 Please see the Biopython manual for more information on how information 470 content is calculated. 471 """ 472 # if no end was specified, then we default to the end of the sequence 473 if end is None: 474 end = len(self.alignment[0].seq) 475 if chars_to_ignore is None: 476 chars_to_ignore = [] 477 478 if start < 0 or end > len(self.alignment[0].seq): 479 raise ValueError("Start (%s) and end (%s) are not in the \ 480 range %s to %s" 481 % (start, end, 0, len(self.alignment[0].seq))) 482 # determine random expected frequencies, if necessary 483 random_expected = None 484 if not e_freq_table: 485 # TODO - What about ambiguous alphabets? 486 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 487 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 488 random_expected = Protein20Random 489 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 490 random_expected = Nucleotide4Random 491 else: 492 errstr = "Error in alphabet: not Nucleotide or Protein, " 493 errstr += "supply expected frequencies" 494 raise ValueError(errstr) 495 del base_alpha 496 elif not isinstance(e_freq_table, FreqTable.FreqTable): 497 raise ValueError("e_freq_table should be a FreqTable object") 498 499 # determine all of the letters we have to deal with 500 all_letters = self._get_all_letters() 501 for char in chars_to_ignore: 502 all_letters = all_letters.replace(char, '') 503 504 info_content = {} 505 for residue_num in range(start, end): 506 freq_dict = self._get_letter_freqs(residue_num, 507 self.alignment, 508 all_letters, chars_to_ignore) 509 # print freq_dict, 510 column_score = self._get_column_info_content(freq_dict, 511 e_freq_table, 512 log_base, 513 random_expected) 514 515 info_content[residue_num] = column_score 516 # sum up the score 517 total_info = sum(info_content.values()) 518 # fill in the ic_vector member: holds IC for each column 519 for i in info_content: 520 self.ic_vector[i] = info_content[i] 521 return total_info
522
523 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
524 """Determine the frequency of specific letters in the alignment. 525 526 Arguments: 527 - residue_num - The number of the column we are getting frequencies 528 from. 529 - all_records - All of the SeqRecords in the alignment. 530 - letters - The letters we are interested in getting the frequency 531 for. 532 - to_ignore - Letters we are specifically supposed to ignore. 533 534 This will calculate the frequencies of each of the specified letters 535 in the alignment at the given frequency, and return this as a 536 dictionary where the keys are the letters and the values are the 537 frequencies. 538 """ 539 freq_info = self._get_base_letters(letters) 540 541 total_count = 0 542 # collect the count info into the dictionary for all the records 543 for record in all_records: 544 try: 545 if record.seq[residue_num] not in to_ignore: 546 weight = record.annotations.get('weight', 1.0) 547 freq_info[record.seq[residue_num]] += weight 548 total_count += weight 549 # getting a key error means we've got a problem with the alphabet 550 except KeyError: 551 raise ValueError("Residue %s not found in alphabet %s" 552 % (record.seq[residue_num], 553 self.alignment._alphabet)) 554 555 if total_count == 0: 556 # This column must be entirely ignored characters 557 for letter in freq_info: 558 assert freq_info[letter] == 0 559 # TODO - Map this to NA or NaN? 560 else: 561 # now convert the counts into frequencies 562 for letter in freq_info: 563 freq_info[letter] = freq_info[letter] / total_count 564 565 return freq_info
566
567 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 568 random_expected):
569 """Calculate the information content for a column. 570 571 Arguments: 572 - obs_freq - The frequencies observed for each letter in the column. 573 - e_freq_table - An optional argument specifying the expected 574 frequencies for each letter. This is a SubsMat.FreqTable instance. 575 - log_base - The base of the logathrim to use in calculating the 576 info content. 577 """ 578 try: 579 gap_char = self.alignment._alphabet.gap_char 580 except AttributeError: 581 # The alphabet doesn't declare a gap - there could be none 582 # in the sequence... or just a vague alphabet. 583 gap_char = "-" # Safe? 584 585 if e_freq_table: 586 if not isinstance(e_freq_table, FreqTable.FreqTable): 587 raise ValueError("e_freq_table should be a FreqTable object") 588 # check the expected freq information to make sure it is good 589 for key in obs_freq: 590 if (key != gap_char and key not in e_freq_table): 591 raise ValueError("Expected frequency letters %s " 592 "do not match observed %s" 593 % (list(e_freq_table), 594 list(obs_freq) - [gap_char])) 595 596 total_info = 0.0 597 598 for letter in obs_freq: 599 inner_log = 0.0 600 # if we have expected frequencies, modify the log value by them 601 # gap characters do not have expected frequencies, so they 602 # should just be the observed frequency. 603 if letter != gap_char: 604 if e_freq_table: 605 inner_log = obs_freq[letter] / e_freq_table[letter] 606 else: 607 inner_log = obs_freq[letter] / random_expected 608 # if the observed frequency is zero, we don't add any info to the 609 # total information content 610 if inner_log > 0: 611 letter_info = (obs_freq[letter] * 612 math.log(inner_log) / math.log(log_base)) 613 total_info += letter_info 614 return total_info
615
616 - def get_column(self, col):
617 # TODO - Deprecate this and implement slicing? 618 return self.alignment[:, col]
619 620
621 -class PSSM(object):
622 """Represent a position specific score matrix. 623 624 This class is meant to make it easy to access the info within a PSSM 625 and also make it easy to print out the information in a nice table. 626 627 Let's say you had an alignment like this:: 628 629 GTATC 630 AT--C 631 CTGTC 632 633 The position specific score matrix (when printed) looks like:: 634 635 G A T C 636 G 1 1 0 1 637 T 0 0 3 0 638 A 1 1 0 0 639 T 0 0 2 0 640 C 0 0 0 3 641 642 You can access a single element of the PSSM using the following:: 643 644 your_pssm[sequence_number][residue_count_name] 645 646 For instance, to get the 'T' residue for the second element in the 647 above alignment you would need to do: 648 649 your_pssm[1]['T'] 650 """
651 - def __init__(self, pssm):
652 """Initialize with pssm data to represent. 653 654 The pssm passed should be a list with the following structure: 655 656 list[0] - The letter of the residue being represented (for instance, 657 from the example above, the first few list[0]s would be GTAT... 658 list[1] - A dictionary with the letter substitutions and counts. 659 """ 660 self.pssm = pssm
661
662 - def __getitem__(self, pos):
663 return self.pssm[pos][1]
664
665 - def __str__(self):
666 out = " " 667 all_residues = sorted(self.pssm[0][1]) 668 669 # first print out the top header 670 for res in all_residues: 671 out += " %s" % res 672 out += "\n" 673 674 # for each item, write out the substitutions 675 for item in self.pssm: 676 out += "%s " % item[0] 677 for res in all_residues: 678 out += " %.1f" % item[1][res] 679 680 out += "\n" 681 return out
682
683 - def get_residue(self, pos):
684 """Return the residue letter at the specified position. 685 """ 686 return self.pssm[pos][0]
687 688 699