Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Extract information from alignment objects. 
  7   
  8  In order to try and avoid huge alignment objects with tons of functions, 
  9  functions which return summary type information about alignments should 
 10  be put into classes in this module. 
 11  """ 
 12   
 13  from __future__ import print_function 
 14   
 15  import math 
 16  import sys 
 17   
 18  from Bio import Alphabet 
 19  from Bio.Alphabet import IUPAC 
 20  from Bio.Seq import Seq 
 21  from Bio.SubsMat import FreqTable 
 22   
 23  __docformat__ = "restructuredtext en" 
 24   
 25  # Expected random distributions for 20-letter protein, and 
 26  # for 4-letter nucleotide alphabets 
 27  Protein20Random = 0.05 
 28  Nucleotide4Random = 0.25 
 29   
 30   
31 -class SummaryInfo(object):
32 """Calculate summary info about the alignment. 33 34 This class should be used to caclculate information summarizing the 35 results of an alignment. This may either be straight consensus info 36 or more complicated things. 37 """
38 - def __init__(self, alignment):
39 """Initialize with the alignment to calculate information on. 40 41 ic_vector attribute. A dictionary. Keys: column numbers. Values: 42 """ 43 self.alignment = alignment 44 self.ic_vector = {}
45
46 - def dumb_consensus(self, threshold=.7, ambiguous="X", 47 consensus_alpha=None, require_multiple=0):
48 """Output a fast consensus sequence of the alignment. 49 50 This doesn't do anything fancy at all. It will just go through the 51 sequence residue by residue and count up the number of each type 52 of residue (ie. A or G or T or C for DNA) in all sequences in the 53 alignment. If the percentage of the most common residue type is 54 greater then the passed threshold, then we will add that residue type, 55 otherwise an ambiguous character will be added. 56 57 This could be made a lot fancier (ie. to take a substitution matrix 58 into account), but it just meant for a quick and dirty consensus. 59 60 Arguments: 61 - threshold - The threshold value that is required to add a particular 62 atom. 63 - ambiguous - The ambiguous character to be added when the threshold is 64 not reached. 65 - consensus_alpha - The alphabet to return for the consensus sequence. 66 If this is None, then we will try to guess the alphabet. 67 - require_multiple - If set as 1, this will require that more than 68 1 sequence be part of an alignment to put it in the consensus (ie. 69 not just 1 sequence and gaps). 70 """ 71 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 72 consensus = '' 73 74 # find the length of the consensus we are creating 75 con_len = self.alignment.get_alignment_length() 76 77 # go through each seq item 78 for n in range(con_len): 79 # keep track of the counts of the different atoms we get 80 atom_dict = {} 81 num_atoms = 0 82 83 for record in self.alignment: 84 # make sure we haven't run past the end of any sequences 85 # if they are of different lengths 86 if n < len(record.seq): 87 if record.seq[n] != '-' and record.seq[n] != '.': 88 if record.seq[n] not in atom_dict: 89 atom_dict[record.seq[n]] = 1 90 else: 91 atom_dict[record.seq[n]] += 1 92 93 num_atoms = num_atoms + 1 94 95 max_atoms = [] 96 max_size = 0 97 98 for atom in atom_dict: 99 if atom_dict[atom] > max_size: 100 max_atoms = [atom] 101 max_size = atom_dict[atom] 102 elif atom_dict[atom] == max_size: 103 max_atoms.append(atom) 104 105 if require_multiple and num_atoms == 1: 106 consensus += ambiguous 107 elif (len(max_atoms) == 1) and ((float(max_size) / float(num_atoms)) 108 >= threshold): 109 consensus += max_atoms[0] 110 else: 111 consensus += ambiguous 112 113 # we need to guess a consensus alphabet if one isn't specified 114 if consensus_alpha is None: 115 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 116 117 return Seq(consensus, consensus_alpha)
118
119 - def gap_consensus(self, threshold=.7, ambiguous="X", 120 consensus_alpha=None, require_multiple=0):
121 """Same as dumb_consensus(), but allows gap on the output. 122 123 Things to do: 124 - Let the user define that with only one gap, the result 125 character in consensus is gap. 126 - Let the user select gap character, now 127 it takes the same as input. 128 """ 129 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 130 consensus = '' 131 132 # find the length of the consensus we are creating 133 con_len = self.alignment.get_alignment_length() 134 135 # go through each seq item 136 for n in range(con_len): 137 # keep track of the counts of the different atoms we get 138 atom_dict = {} 139 num_atoms = 0 140 141 for record in self.alignment: 142 # make sure we haven't run past the end of any sequences 143 # if they are of different lengths 144 if n < len(record.seq): 145 if record.seq[n] not in atom_dict: 146 atom_dict[record.seq[n]] = 1 147 else: 148 atom_dict[record.seq[n]] += 1 149 150 num_atoms += 1 151 152 max_atoms = [] 153 max_size = 0 154 155 for atom in atom_dict: 156 if atom_dict[atom] > max_size: 157 max_atoms = [atom] 158 max_size = atom_dict[atom] 159 elif atom_dict[atom] == max_size: 160 max_atoms.append(atom) 161 162 if require_multiple and num_atoms == 1: 163 consensus += ambiguous 164 elif (len(max_atoms) == 1) and ((float(max_size) / float(num_atoms)) 165 >= threshold): 166 consensus += max_atoms[0] 167 else: 168 consensus += ambiguous 169 170 # we need to guess a consensus alphabet if one isn't specified 171 if consensus_alpha is None: 172 # TODO - Should we make this into a Gapped alphabet? 173 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 174 175 return Seq(consensus, consensus_alpha)
176
177 - def _guess_consensus_alphabet(self, ambiguous):
178 """Pick an (ungapped) alphabet for an alignment consesus sequence. 179 180 This just looks at the sequences we have, checks their type, and 181 returns as appropriate type which seems to make sense with the 182 sequences we've got. 183 """ 184 # Start with the (un-gapped version of) the alignment alphabet 185 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 186 187 # Now check its compatible with all the rest of the sequences 188 for record in self.alignment: 189 # Get the (un-gapped version of) the sequence's alphabet 190 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 191 if not isinstance(alt, a.__class__): 192 raise ValueError("Alignment contains a sequence with \ 193 an incompatible alphabet.") 194 195 # Check the ambiguous character we are going to use in the consensus 196 # is in the alphabet's list of valid letters (if defined). 197 if hasattr(a, "letters") and a.letters is not None \ 198 and ambiguous not in a.letters: 199 # We'll need to pick a more generic alphabet... 200 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 201 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 202 a = IUPAC.IUPACUnambiguousDNA() 203 else: 204 a = Alphabet.generic_dna 205 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 206 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 207 a = IUPAC.IUPACUnambiguousRNA() 208 else: 209 a = Alphabet.generic_rna 210 elif isinstance(a, IUPAC.IUPACProtein): 211 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 212 a = IUPAC.ExtendedIUPACProtein() 213 else: 214 a = Alphabet.generic_protein 215 else: 216 a = Alphabet.single_letter_alphabet 217 return a
218
219 - def replacement_dictionary(self, skip_chars=None):
220 """Generate a replacement dictionary to plug into a substitution matrix 221 222 This should look at an alignment, and be able to generate the number 223 of substitutions of different residues for each other in the 224 aligned object. 225 226 Will then return a dictionary with this information:: 227 228 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 229 230 This also treats weighted sequences. The following example shows how 231 we calculate the replacement dictionary. Given the following 232 multiple sequence alignment:: 233 234 GTATC 0.5 235 AT--C 0.8 236 CTGTC 1.0 237 238 For the first column we have:: 239 240 ('A', 'G') : 0.5 * 0.8 = 0.4 241 ('C', 'G') : 0.5 * 1.0 = 0.5 242 ('A', 'C') : 0.8 * 1.0 = 0.8 243 244 We then continue this for all of the columns in the alignment, summing 245 the information for each substitution in each column, until we end 246 up with the replacement dictionary. 247 248 Arguments: 249 - skip_chars - A list of characters to skip when creating the dictionary. 250 This defaults to an empty list. 251 252 For instance, you might have Xs (screened stuff) or Ns, and not want 253 to include the ambiguity characters in the dictionary. 254 """ 255 # get a starting dictionary based on the alphabet of the alignment 256 rep_dict, skip_items = self._get_base_replacements(skip_chars) 257 258 # iterate through each record 259 for rec_num1 in range(len(self.alignment)): 260 # iterate through each record from one beyond the current record 261 # to the end of the list of records 262 for rec_num2 in range(rec_num1 + 1, len(self.alignment)): 263 # for each pair of records, compare the sequences and add 264 # the pertinent info to the dictionary 265 rep_dict = self._pair_replacement( 266 self.alignment[rec_num1].seq, 267 self.alignment[rec_num2].seq, 268 self.alignment[rec_num1].annotations.get('weight', 1.0), 269 self.alignment[rec_num2].annotations.get('weight', 1.0), 270 rep_dict, skip_items) 271 272 return rep_dict
273
274 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 275 start_dict, ignore_chars):
276 """Compare two sequences and generate info on the replacements seen. 277 278 Arguments: 279 - seq1, seq2 - The two sequences to compare. 280 - weight1, weight2 - The relative weights of seq1 and seq2. 281 - start_dict - The dictionary containing the starting replacement 282 info that we will modify. 283 - ignore_chars - A list of characters to ignore when calculating 284 replacements (ie. '-'). 285 286 Returns: 287 - A replacment dictionary which is modified from initial_dict with 288 the information from the sequence comparison. 289 """ 290 # loop through each residue in the sequences 291 for residue_num in range(len(seq1)): 292 residue1 = seq1[residue_num] 293 try: 294 residue2 = seq2[residue_num] 295 # if seq2 is shorter, then we just stop looking at replacements 296 # and return the information 297 except IndexError: 298 return start_dict 299 300 # if the two residues are characters we want to count 301 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 302 try: 303 # add info about the replacement to the dictionary, 304 # modified by the sequence weights 305 start_dict[(residue1, residue2)] += weight1 * weight2 306 307 # if we get a key error, then we've got a problem with alphabets 308 except KeyError: 309 raise ValueError("Residues %s, %s not found in alphabet %s" 310 % (residue1, residue2, 311 self.alignment._alphabet)) 312 313 return start_dict
314
315 - def _get_all_letters(self):
316 """Returns a string containing the expected letters in the alignment.""" 317 all_letters = self.alignment._alphabet.letters 318 if all_letters is None \ 319 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) 320 and all_letters == self.alignment._alphabet.gap_char): 321 # We are dealing with a generic alphabet class where the 322 # letters are not defined! We must build a list of the 323 # letters used... 324 set_letters = set() 325 for record in self.alignment: 326 # Note the built in set does not have a union_update 327 # which was provided by the sets module's Set 328 set_letters = set_letters.union(record.seq) 329 list_letters = sorted(set_letters) 330 all_letters = "".join(list_letters) 331 return all_letters
332
333 - def _get_base_replacements(self, skip_items=None):
334 """Get a zeroed dictionary of all possible letter combinations. 335 336 This looks at the type of alphabet and gets the letters for it. 337 It then creates a dictionary with all possible combinations of these 338 letters as keys (ie. ('A', 'G')) and sets the values as zero. 339 340 Returns: 341 - The base dictionary created 342 - A list of alphabet items to skip when filling the dictionary. 343 (Right now the only thing I can imagine in this list is gap 344 characters, but maybe X's or something else might be useful later. 345 This will also include any characters that are specified to be 346 skipped.) Defaults to an empty list. 347 """ 348 if skip_items is None: 349 skip_items = [] 350 base_dictionary = {} 351 all_letters = self._get_all_letters() 352 353 # if we have a gapped alphabet we need to find the gap character 354 # and drop it out 355 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 356 skip_items.append(self.alignment._alphabet.gap_char) 357 all_letters = all_letters.replace(self.alignment._alphabet.gap_char, '') 358 359 # now create the dictionary 360 for first_letter in all_letters: 361 for second_letter in all_letters: 362 if first_letter not in skip_items and \ 363 second_letter not in skip_items: 364 base_dictionary[(first_letter, second_letter)] = 0 365 366 return base_dictionary, skip_items
367
368 - def pos_specific_score_matrix(self, axis_seq=None, 369 chars_to_ignore=None):
370 """Create a position specific score matrix object for the alignment. 371 372 This creates a position specific score matrix (pssm) which is an 373 alternative method to look at a consensus sequence. 374 375 Arguments: 376 - chars_to_ignore - A list of all characters not to include in 377 the pssm. If the alignment alphabet declares a gap character, 378 then it will be excluded automatically. 379 - axis_seq - An optional argument specifying the sequence to 380 put on the axis of the PSSM. This should be a Seq object. If nothing 381 is specified, the consensus sequence, calculated with default 382 parameters, will be used. 383 384 Returns: 385 - A PSSM (position specific score matrix) object. 386 """ 387 # determine all of the letters we have to deal with 388 all_letters = self._get_all_letters() 389 assert all_letters 390 391 if chars_to_ignore is None: 392 chars_to_ignore = [] 393 if not isinstance(chars_to_ignore, list): 394 raise TypeError("chars_to_ignore should be a list.") 395 396 # if we have a gap char, add it to stuff to ignore 397 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 398 chars_to_ignore.append(self.alignment._alphabet.gap_char) 399 400 for char in chars_to_ignore: 401 all_letters = all_letters.replace(char, '') 402 403 if axis_seq: 404 left_seq = axis_seq 405 assert len(axis_seq) == self.alignment.get_alignment_length() 406 else: 407 left_seq = self.dumb_consensus() 408 409 pssm_info = [] 410 # now start looping through all of the sequences and getting info 411 for residue_num in range(len(left_seq)): 412 score_dict = self._get_base_letters(all_letters) 413 for record in self.alignment: 414 try: 415 this_residue = record.seq[residue_num] 416 # if we hit an index error we've run out of sequence and 417 # should not add new residues 418 except IndexError: 419 this_residue = None 420 421 if this_residue and this_residue not in chars_to_ignore: 422 weight = record.annotations.get('weight', 1.0) 423 try: 424 score_dict[this_residue] += weight 425 # if we get a KeyError then we have an alphabet problem 426 except KeyError: 427 raise ValueError("Residue %s not found in alphabet %s" 428 % (this_residue, 429 self.alignment._alphabet)) 430 431 pssm_info.append((left_seq[residue_num], 432 score_dict)) 433 434 return PSSM(pssm_info)
435
436 - def _get_base_letters(self, letters):
437 """Create a zeroed dictionary with all of the specified letters. 438 """ 439 base_info = {} 440 for letter in letters: 441 base_info[letter] = 0 442 443 return base_info
444
445 - def information_content(self, start=0, 446 end=None, 447 e_freq_table=None, log_base=2, 448 chars_to_ignore=None):
449 """Calculate the information content for each residue along an alignment. 450 451 Arguments: 452 - start, end - The starting an ending points to calculate the 453 information content. These points should be relative to the first 454 sequence in the alignment, starting at zero (ie. even if the 'real' 455 first position in the seq is 203 in the initial sequence, for 456 the info content, we need to use zero). This defaults to the entire 457 length of the first sequence. 458 - e_freq_table - A FreqTable object specifying the expected frequencies 459 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 460 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 461 included, since these should not have expected frequencies. 462 - log_base - The base of the logathrim to use in calculating the 463 information content. This defaults to 2 so the info is in bits. 464 - chars_to_ignore - A listing of characters which should be ignored 465 in calculating the info content. Defaults to none. 466 467 Returns: 468 - A number representing the info content for the specified region. 469 470 Please see the Biopython manual for more information on how information 471 content is calculated. 472 """ 473 # if no end was specified, then we default to the end of the sequence 474 if end is None: 475 end = len(self.alignment[0].seq) 476 if chars_to_ignore is None: 477 chars_to_ignore = [] 478 479 if start < 0 or end > len(self.alignment[0].seq): 480 raise ValueError("Start (%s) and end (%s) are not in the \ 481 range %s to %s" 482 % (start, end, 0, len(self.alignment[0].seq))) 483 # determine random expected frequencies, if necessary 484 random_expected = None 485 if not e_freq_table: 486 # TODO - What about ambiguous alphabets? 487 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 488 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 489 random_expected = Protein20Random 490 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 491 random_expected = Nucleotide4Random 492 else: 493 errstr = "Error in alphabet: not Nucleotide or Protein, " 494 errstr += "supply expected frequencies" 495 raise ValueError(errstr) 496 del base_alpha 497 elif not isinstance(e_freq_table, FreqTable.FreqTable): 498 raise ValueError("e_freq_table should be a FreqTable object") 499 500 # determine all of the letters we have to deal with 501 all_letters = self._get_all_letters() 502 for char in chars_to_ignore: 503 all_letters = all_letters.replace(char, '') 504 505 info_content = {} 506 for residue_num in range(start, end): 507 freq_dict = self._get_letter_freqs(residue_num, 508 self.alignment, 509 all_letters, chars_to_ignore) 510 # print freq_dict, 511 column_score = self._get_column_info_content(freq_dict, 512 e_freq_table, 513 log_base, 514 random_expected) 515 516 info_content[residue_num] = column_score 517 # sum up the score 518 total_info = sum(info_content.values()) 519 # fill in the ic_vector member: holds IC for each column 520 for i in info_content: 521 self.ic_vector[i] = info_content[i] 522 return total_info
523
524 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
525 """Determine the frequency of specific letters in the alignment. 526 527 Arguments: 528 - residue_num - The number of the column we are getting frequencies 529 from. 530 - all_records - All of the SeqRecords in the alignment. 531 - letters - The letters we are interested in getting the frequency 532 for. 533 - to_ignore - Letters we are specifically supposed to ignore. 534 535 This will calculate the frequencies of each of the specified letters 536 in the alignment at the given frequency, and return this as a 537 dictionary where the keys are the letters and the values are the 538 frequencies. 539 """ 540 freq_info = self._get_base_letters(letters) 541 542 total_count = 0 543 # collect the count info into the dictionary for all the records 544 for record in all_records: 545 try: 546 if record.seq[residue_num] not in to_ignore: 547 weight = record.annotations.get('weight', 1.0) 548 freq_info[record.seq[residue_num]] += weight 549 total_count += weight 550 # getting a key error means we've got a problem with the alphabet 551 except KeyError: 552 raise ValueError("Residue %s not found in alphabet %s" 553 % (record.seq[residue_num], 554 self.alignment._alphabet)) 555 556 if total_count == 0: 557 # This column must be entirely ignored characters 558 for letter in freq_info: 559 assert freq_info[letter] == 0 560 # TODO - Map this to NA or NaN? 561 else: 562 # now convert the counts into frequencies 563 for letter in freq_info: 564 freq_info[letter] = freq_info[letter] / total_count 565 566 return freq_info
567
568 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 569 random_expected):
570 """Calculate the information content for a column. 571 572 Arguments: 573 - obs_freq - The frequencies observed for each letter in the column. 574 - e_freq_table - An optional argument specifying the expected 575 frequencies for each letter. This is a SubsMat.FreqTable instance. 576 - log_base - The base of the logathrim to use in calculating the 577 info content. 578 """ 579 try: 580 gap_char = self.alignment._alphabet.gap_char 581 except AttributeError: 582 # The alphabet doesn't declare a gap - there could be none 583 # in the sequence... or just a vague alphabet. 584 gap_char = "-" # Safe? 585 586 if e_freq_table: 587 if not isinstance(e_freq_table, FreqTable.FreqTable): 588 raise ValueError("e_freq_table should be a FreqTable object") 589 # check the expected freq information to make sure it is good 590 for key in obs_freq: 591 if (key != gap_char and key not in e_freq_table): 592 raise ValueError("Expected frequency letters %s " 593 "do not match observed %s" 594 % (list(e_freq_table), 595 list(obs_freq) - [gap_char])) 596 597 total_info = 0.0 598 599 for letter in obs_freq: 600 inner_log = 0.0 601 # if we have expected frequencies, modify the log value by them 602 # gap characters do not have expected frequencies, so they 603 # should just be the observed frequency. 604 if letter != gap_char: 605 if e_freq_table: 606 inner_log = obs_freq[letter] / e_freq_table[letter] 607 else: 608 inner_log = obs_freq[letter] / random_expected 609 # if the observed frequency is zero, we don't add any info to the 610 # total information content 611 if inner_log > 0: 612 letter_info = (obs_freq[letter] * 613 math.log(inner_log) / math.log(log_base)) 614 total_info += letter_info 615 return total_info
616
617 - def get_column(self, col):
618 # TODO - Deprecate this and implement slicing? 619 return self.alignment[:, col]
620 621
622 -class PSSM(object):
623 """Represent a position specific score matrix. 624 625 This class is meant to make it easy to access the info within a PSSM 626 and also make it easy to print out the information in a nice table. 627 628 Let's say you had an alignment like this:: 629 630 GTATC 631 AT--C 632 CTGTC 633 634 The position specific score matrix (when printed) looks like:: 635 636 G A T C 637 G 1 1 0 1 638 T 0 0 3 0 639 A 1 1 0 0 640 T 0 0 2 0 641 C 0 0 0 3 642 643 You can access a single element of the PSSM using the following:: 644 645 your_pssm[sequence_number][residue_count_name] 646 647 For instance, to get the 'T' residue for the second element in the 648 above alignment you would need to do: 649 650 your_pssm[1]['T'] 651 """
652 - def __init__(self, pssm):
653 """Initialize with pssm data to represent. 654 655 The pssm passed should be a list with the following structure: 656 657 list[0] - The letter of the residue being represented (for instance, 658 from the example above, the first few list[0]s would be GTAT... 659 list[1] - A dictionary with the letter substitutions and counts. 660 """ 661 self.pssm = pssm
662
663 - def __getitem__(self, pos):
664 return self.pssm[pos][1]
665
666 - def __str__(self):
667 out = " " 668 all_residues = sorted(self.pssm[0][1]) 669 670 # first print out the top header 671 for res in all_residues: 672 out += " %s" % res 673 out += "\n" 674 675 # for each item, write out the substitutions 676 for item in self.pssm: 677 out += "%s " % item[0] 678 for res in all_residues: 679 out += " %.1f" % item[1][res] 680 681 out += "\n" 682 return out
683
684 - def get_residue(self, pos):
685 """Return the residue letter at the specified position. 686 """ 687 return self.pssm[pos][0]
688 689 700 701 if __name__ == "__main__": 702 print("Quick test") 703 from Bio import AlignIO 704 from Bio.Align.Generic import Alignment 705 706 filename = "../../Tests/GFF/multi.fna" 707 format = "fasta" 708 expected = FreqTable.FreqTable({"A": 0.25, "G": 0.25, "T": 0.25, "C": 0.25}, 709 FreqTable.FREQ, 710 IUPAC.unambiguous_dna) 711 712 alignment = AlignIO.read(open(filename), format) 713 for record in alignment: 714 print(record.seq) 715 print("=" * alignment.get_alignment_length()) 716 717 summary = SummaryInfo(alignment) 718 consensus = summary.dumb_consensus(ambiguous="N") 719 print(consensus) 720 consensus = summary.gap_consensus(ambiguous="N") 721 print(consensus) 722 print("") 723 print(summary.pos_specific_score_matrix(chars_to_ignore=['-'], 724 axis_seq=consensus)) 725 print("") 726 # Have a generic alphabet, without a declared gap char, so must tell 727 # provide the frequencies and chars to ignore explicitly. 728 print(summary.information_content(e_freq_table=expected, 729 chars_to_ignore=['-'])) 730 print("") 731 print("Trying a protein sequence with gaps and stops") 732 733 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 734 a = Alignment(alpha) 735 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 736 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 737 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 738 print(a) 739 print("=" * a.get_alignment_length()) 740 741 s = SummaryInfo(a) 742 c = s.dumb_consensus(ambiguous="X") 743 print(c) 744 c = s.gap_consensus(ambiguous="X") 745 print(c) 746 print("") 747 print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)) 748 749 print(s.information_content(chars_to_ignore=['-', '*'])) 750 751 print("Done") 752