Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Extract information from alignment objects. 
  7   
  8  In order to try and avoid huge alignment objects with tons of functions, 
  9  functions which return summary type information about alignments should 
 10  be put into classes in this module. 
 11   
 12  classes: 
 13  o SummaryInfo 
 14  o PSSM 
 15  """ 
 16   
 17  from __future__ import print_function 
 18   
 19  import math 
 20  import sys 
 21   
 22  from Bio import Alphabet 
 23  from Bio.Alphabet import IUPAC 
 24  from Bio.Seq import Seq 
 25  from Bio.SubsMat import FreqTable 
 26   
 27  # Expected random distributions for 20-letter protein, and 
 28  # for 4-letter nucleotide alphabets 
 29  Protein20Random = 0.05 
 30  Nucleotide4Random = 0.25 
 31   
 32   
33 -class SummaryInfo(object):
34 """Calculate summary info about the alignment. 35 36 This class should be used to caclculate information summarizing the 37 results of an alignment. This may either be straight consensus info 38 or more complicated things. 39 """
40 - def __init__(self, alignment):
41 """Initialize with the alignment to calculate information on. 42 ic_vector attribute. A dictionary. Keys: column numbers. Values: 43 """ 44 self.alignment = alignment 45 self.ic_vector = {}
46
47 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 48 consensus_alpha = None, require_multiple = 0):
49 """Output a fast consensus sequence of the alignment. 50 51 This doesn't do anything fancy at all. It will just go through the 52 sequence residue by residue and count up the number of each type 53 of residue (ie. A or G or T or C for DNA) in all sequences in the 54 alignment. If the percentage of the most common residue type is 55 greater then the passed threshold, then we will add that residue type, 56 otherwise an ambiguous character will be added. 57 58 This could be made a lot fancier (ie. to take a substitution matrix 59 into account), but it just meant for a quick and dirty consensus. 60 61 Arguments: 62 o threshold - The threshold value that is required to add a particular 63 atom. 64 o ambiguous - The ambiguous character to be added when the threshold is 65 not reached. 66 o consensus_alpha - The alphabet to return for the consensus sequence. 67 If this is None, then we will try to guess the alphabet. 68 o require_multiple - If set as 1, this will require that more than 69 1 sequence be part of an alignment to put it in the consensus (ie. 70 not just 1 sequence and gaps). 71 """ 72 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 73 consensus = '' 74 75 # find the length of the consensus we are creating 76 con_len = self.alignment.get_alignment_length() 77 78 # go through each seq item 79 for n in range(con_len): 80 # keep track of the counts of the different atoms we get 81 atom_dict = {} 82 num_atoms = 0 83 84 for record in self.alignment._records: 85 # make sure we haven't run past the end of any sequences 86 # if they are of different lengths 87 if n < len(record.seq): 88 if record.seq[n] != '-' and record.seq[n] != '.': 89 if record.seq[n] not in atom_dict: 90 atom_dict[record.seq[n]] = 1 91 else: 92 atom_dict[record.seq[n]] += 1 93 94 num_atoms = num_atoms + 1 95 96 max_atoms = [] 97 max_size = 0 98 99 for atom in atom_dict: 100 if atom_dict[atom] > max_size: 101 max_atoms = [atom] 102 max_size = atom_dict[atom] 103 elif atom_dict[atom] == max_size: 104 max_atoms.append(atom) 105 106 if require_multiple and num_atoms == 1: 107 consensus += ambiguous 108 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 109 >= threshold): 110 consensus += max_atoms[0] 111 else: 112 consensus += ambiguous 113 114 # we need to guess a consensus alphabet if one isn't specified 115 if consensus_alpha is None: 116 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 117 118 return Seq(consensus, consensus_alpha)
119
120 - def gap_consensus(self, threshold = .7, ambiguous = "X", 121 consensus_alpha = None, require_multiple = 0):
122 """Same as dumb_consensus(), but allows gap on the output. 123 124 Things to do: Let the user define that with only one gap, the result 125 character in consensus is gap. Let the user select gap character, now 126 it takes the same is input. 127 """ 128 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 129 consensus = '' 130 131 # find the length of the consensus we are creating 132 con_len = self.alignment.get_alignment_length() 133 134 # go through each seq item 135 for n in range(con_len): 136 # keep track of the counts of the different atoms we get 137 atom_dict = {} 138 num_atoms = 0 139 140 for record in self.alignment._records: 141 # make sure we haven't run past the end of any sequences 142 # if they are of different lengths 143 if n < len(record.seq): 144 if record.seq[n] not in atom_dict: 145 atom_dict[record.seq[n]] = 1 146 else: 147 atom_dict[record.seq[n]] += 1 148 149 num_atoms += 1 150 151 max_atoms = [] 152 max_size = 0 153 154 for atom in atom_dict: 155 if atom_dict[atom] > max_size: 156 max_atoms = [atom] 157 max_size = atom_dict[atom] 158 elif atom_dict[atom] == max_size: 159 max_atoms.append(atom) 160 161 if require_multiple and num_atoms == 1: 162 consensus += ambiguous 163 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 164 >= threshold): 165 consensus += max_atoms[0] 166 else: 167 consensus += ambiguous 168 169 # we need to guess a consensus alphabet if one isn't specified 170 if consensus_alpha is None: 171 #TODO - Should we make this into a Gapped alphabet? 172 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 173 174 return Seq(consensus, consensus_alpha)
175
176 - def _guess_consensus_alphabet(self, ambiguous):
177 """Pick an (ungapped) alphabet for an alignment consesus sequence. 178 179 This just looks at the sequences we have, checks their type, and 180 returns as appropriate type which seems to make sense with the 181 sequences we've got. 182 """ 183 #Start with the (un-gapped version of) the alignment alphabet 184 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 185 186 #Now check its compatible with all the rest of the sequences 187 for record in self.alignment: 188 #Get the (un-gapped version of) the sequence's alphabet 189 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 190 if not isinstance(alt, a.__class__): 191 raise ValueError("Alignment contains a sequence with \ 192 an incompatible alphabet.") 193 194 #Check the ambiguous character we are going to use in the consensus 195 #is in the alphabet's list of valid letters (if defined). 196 if hasattr(a, "letters") and a.letters is not None \ 197 and ambiguous not in a.letters: 198 #We'll need to pick a more generic alphabet... 199 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 200 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 201 a = IUPAC.IUPACUnambiguousDNA() 202 else: 203 a = Alphabet.generic_dna 204 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 205 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 206 a = IUPAC.IUPACUnambiguousRNA() 207 else: 208 a = Alphabet.generic_rna 209 elif isinstance(a, IUPAC.IUPACProtein): 210 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 211 a = IUPAC.ExtendedIUPACProtein() 212 else: 213 a = Alphabet.generic_protein 214 else: 215 a = Alphabet.single_letter_alphabet 216 return a
217
218 - def replacement_dictionary(self, skip_chars = []):
219 """Generate a replacement dictionary to plug into a substitution matrix 220 221 This should look at an alignment, and be able to generate the number 222 of substitutions of different residues for each other in the 223 aligned object. 224 225 Will then return a dictionary with this information: 226 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 227 228 This also treats weighted sequences. The following example shows how 229 we calculate the replacement dictionary. Given the following 230 multiple sequence alignments: 231 232 GTATC 0.5 233 AT--C 0.8 234 CTGTC 1.0 235 236 For the first column we have: 237 ('A', 'G') : 0.5 * 0.8 = 0.4 238 ('C', 'G') : 0.5 * 1.0 = 0.5 239 ('A', 'C') : 0.8 * 1.0 = 0.8 240 241 We then continue this for all of the columns in the alignment, summing 242 the information for each substitution in each column, until we end 243 up with the replacement dictionary. 244 245 Arguments: 246 o skip_chars - A list of characters to skip when creating the dictionary. 247 For instance, you might have Xs (screened stuff) or Ns, and not want 248 to include the ambiguity characters in the dictionary. 249 """ 250 # get a starting dictionary based on the alphabet of the alignment 251 rep_dict, skip_items = self._get_base_replacements(skip_chars) 252 253 # iterate through each record 254 for rec_num1 in range(len(self.alignment._records)): 255 # iterate through each record from one beyond the current record 256 # to the end of the list of records 257 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 258 # for each pair of records, compare the sequences and add 259 # the pertinent info to the dictionary 260 rep_dict = self._pair_replacement( 261 self.alignment._records[rec_num1].seq, 262 self.alignment._records[rec_num2].seq, 263 self.alignment._records[rec_num1].annotations.get('weight', 1.0), 264 self.alignment._records[rec_num2].annotations.get('weight', 1.0), 265 rep_dict, skip_items) 266 267 return rep_dict
268
269 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 270 start_dict, ignore_chars):
271 """Compare two sequences and generate info on the replacements seen. 272 273 Arguments: 274 o seq1, seq2 - The two sequences to compare. 275 o weight1, weight2 - The relative weights of seq1 and seq2. 276 o start_dict - The dictionary containing the starting replacement 277 info that we will modify. 278 o ignore_chars - A list of characters to ignore when calculating 279 replacements (ie. '-'). 280 281 Returns: 282 o A replacment dictionary which is modified from initial_dict with 283 the information from the sequence comparison. 284 """ 285 # loop through each residue in the sequences 286 for residue_num in range(len(seq1)): 287 residue1 = seq1[residue_num] 288 try: 289 residue2 = seq2[residue_num] 290 # if seq2 is shorter, then we just stop looking at replacements 291 # and return the information 292 except IndexError: 293 return start_dict 294 295 # if the two residues are characters we want to count 296 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 297 try: 298 # add info about the replacement to the dictionary, 299 # modified by the sequence weights 300 start_dict[(residue1, residue2)] += weight1 * weight2 301 302 # if we get a key error, then we've got a problem with alphabets 303 except KeyError: 304 raise ValueError("Residues %s, %s not found in alphabet %s" 305 % (residue1, residue2, 306 self.alignment._alphabet)) 307 308 return start_dict
309
310 - def _get_all_letters(self):
311 """Returns a string containing the expected letters in the alignment.""" 312 all_letters = self.alignment._alphabet.letters 313 if all_letters is None \ 314 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) 315 and all_letters == self.alignment._alphabet.gap_char): 316 #We are dealing with a generic alphabet class where the 317 #letters are not defined! We must build a list of the 318 #letters used... 319 set_letters = set() 320 for record in self.alignment: 321 #Note the built in set does not have a union_update 322 #which was provided by the sets module's Set 323 set_letters = set_letters.union(record.seq) 324 list_letters = sorted(set_letters) 325 all_letters = "".join(list_letters) 326 return all_letters
327
328 - def _get_base_replacements(self, skip_items = []):
329 """Get a zeroed dictionary of all possible letter combinations. 330 331 This looks at the type of alphabet and gets the letters for it. 332 It then creates a dictionary with all possible combinations of these 333 letters as keys (ie. ('A', 'G')) and sets the values as zero. 334 335 Returns: 336 o The base dictionary created 337 o A list of alphabet items to skip when filling the dictionary.Right 338 now the only thing I can imagine in this list is gap characters, but 339 maybe X's or something else might be useful later. This will also 340 include any characters that are specified to be skipped. 341 """ 342 base_dictionary = {} 343 all_letters = self._get_all_letters() 344 345 # if we have a gapped alphabet we need to find the gap character 346 # and drop it out 347 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 348 skip_items.append(self.alignment._alphabet.gap_char) 349 all_letters = all_letters.replace(self.alignment._alphabet.gap_char, '') 350 351 # now create the dictionary 352 for first_letter in all_letters: 353 for second_letter in all_letters: 354 if first_letter not in skip_items and \ 355 second_letter not in skip_items: 356 base_dictionary[(first_letter, second_letter)] = 0 357 358 return base_dictionary, skip_items
359
360 - def pos_specific_score_matrix(self, axis_seq = None, 361 chars_to_ignore = []):
362 """Create a position specific score matrix object for the alignment. 363 364 This creates a position specific score matrix (pssm) which is an 365 alternative method to look at a consensus sequence. 366 367 Arguments: 368 o chars_to_ignore - A listing of all characters not to include in 369 the pssm. If the alignment alphabet declares a gap character, 370 then it will be excluded automatically. 371 o axis_seq - An optional argument specifying the sequence to 372 put on the axis of the PSSM. This should be a Seq object. If nothing 373 is specified, the consensus sequence, calculated with default 374 parameters, will be used. 375 376 Returns: 377 o A PSSM (position specific score matrix) object. 378 """ 379 # determine all of the letters we have to deal with 380 all_letters = self._get_all_letters() 381 assert all_letters 382 383 if not isinstance(chars_to_ignore, list): 384 raise TypeError("chars_to_ignore should be a list.") 385 386 # if we have a gap char, add it to stuff to ignore 387 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 388 chars_to_ignore.append(self.alignment._alphabet.gap_char) 389 390 for char in chars_to_ignore: 391 all_letters = all_letters.replace(char, '') 392 393 if axis_seq: 394 left_seq = axis_seq 395 assert len(axis_seq) == self.alignment.get_alignment_length() 396 else: 397 left_seq = self.dumb_consensus() 398 399 pssm_info = [] 400 # now start looping through all of the sequences and getting info 401 for residue_num in range(len(left_seq)): 402 score_dict = self._get_base_letters(all_letters) 403 for record in self.alignment._records: 404 try: 405 this_residue = record.seq[residue_num] 406 # if we hit an index error we've run out of sequence and 407 # should not add new residues 408 except IndexError: 409 this_residue = None 410 411 if this_residue and this_residue not in chars_to_ignore: 412 weight = record.annotations.get('weight', 1.0) 413 try: 414 score_dict[this_residue] += weight 415 # if we get a KeyError then we have an alphabet problem 416 except KeyError: 417 raise ValueError("Residue %s not found in alphabet %s" 418 % (this_residue, 419 self.alignment._alphabet)) 420 421 pssm_info.append((left_seq[residue_num], 422 score_dict)) 423 424 return PSSM(pssm_info)
425
426 - def _get_base_letters(self, letters):
427 """Create a zeroed dictionary with all of the specified letters. 428 """ 429 base_info = {} 430 for letter in letters: 431 base_info[letter] = 0 432 433 return base_info
434
435 - def information_content(self, start = 0, 436 end = None, 437 e_freq_table = None, log_base = 2, 438 chars_to_ignore = []):
439 """Calculate the information content for each residue along an alignment. 440 441 Arguments: 442 o start, end - The starting an ending points to calculate the 443 information content. These points should be relative to the first 444 sequence in the alignment, starting at zero (ie. even if the 'real' 445 first position in the seq is 203 in the initial sequence, for 446 the info content, we need to use zero). This defaults to the entire 447 length of the first sequence. 448 o e_freq_table - A FreqTable object specifying the expected frequencies 449 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 450 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 451 included, since these should not have expected frequencies. 452 o log_base - The base of the logathrim to use in calculating the 453 information content. This defaults to 2 so the info is in bits. 454 o chars_to_ignore - A listing of characterw which should be ignored 455 in calculating the info content. 456 457 Returns: 458 o A number representing the info content for the specified region. 459 460 Please see the Biopython manual for more information on how information 461 content is calculated. 462 """ 463 # if no end was specified, then we default to the end of the sequence 464 if end is None: 465 end = len(self.alignment._records[0].seq) 466 467 if start < 0 or end > len(self.alignment._records[0].seq): 468 raise ValueError("Start (%s) and end (%s) are not in the \ 469 range %s to %s" 470 % (start, end, 0, len(self.alignment._records[0].seq))) 471 # determine random expected frequencies, if necessary 472 random_expected = None 473 if not e_freq_table: 474 #TODO - What about ambiguous alphabets? 475 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 476 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 477 random_expected = Protein20Random 478 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 479 random_expected = Nucleotide4Random 480 else: 481 errstr = "Error in alphabet: not Nucleotide or Protein, " 482 errstr += "supply expected frequencies" 483 raise ValueError(errstr) 484 del base_alpha 485 elif not isinstance(e_freq_table, FreqTable.FreqTable): 486 raise ValueError("e_freq_table should be a FreqTable object") 487 488 # determine all of the letters we have to deal with 489 all_letters = self._get_all_letters() 490 for char in chars_to_ignore: 491 all_letters = all_letters.replace(char, '') 492 493 info_content = {} 494 for residue_num in range(start, end): 495 freq_dict = self._get_letter_freqs(residue_num, 496 self.alignment._records, 497 all_letters, chars_to_ignore) 498 # print freq_dict, 499 column_score = self._get_column_info_content(freq_dict, 500 e_freq_table, 501 log_base, 502 random_expected) 503 504 info_content[residue_num] = column_score 505 # sum up the score 506 total_info = sum(info_content.values()) 507 # fill in the ic_vector member: holds IC for each column 508 for i in info_content: 509 self.ic_vector[i] = info_content[i] 510 return total_info
511
512 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
513 """Determine the frequency of specific letters in the alignment. 514 515 Arguments: 516 o residue_num - The number of the column we are getting frequencies 517 from. 518 o all_records - All of the SeqRecords in the alignment. 519 o letters - The letters we are interested in getting the frequency 520 for. 521 o to_ignore - Letters we are specifically supposed to ignore. 522 523 This will calculate the frequencies of each of the specified letters 524 in the alignment at the given frequency, and return this as a 525 dictionary where the keys are the letters and the values are the 526 frequencies. 527 """ 528 freq_info = self._get_base_letters(letters) 529 530 total_count = 0 531 # collect the count info into the dictionary for all the records 532 for record in all_records: 533 try: 534 if record.seq[residue_num] not in to_ignore: 535 weight = record.annotations.get('weight', 1.0) 536 freq_info[record.seq[residue_num]] += weight 537 total_count += weight 538 # getting a key error means we've got a problem with the alphabet 539 except KeyError: 540 raise ValueError("Residue %s not found in alphabet %s" 541 % (record.seq[residue_num], 542 self.alignment._alphabet)) 543 544 if total_count == 0: 545 # This column must be entirely ignored characters 546 for letter in freq_info: 547 assert freq_info[letter] == 0 548 #TODO - Map this to NA or NaN? 549 else: 550 # now convert the counts into frequencies 551 for letter in freq_info: 552 freq_info[letter] = freq_info[letter] / total_count 553 554 return freq_info
555
556 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 557 random_expected):
558 """Calculate the information content for a column. 559 560 Arguments: 561 o obs_freq - The frequencies observed for each letter in the column. 562 o e_freq_table - An optional argument specifying the expected 563 frequencies for each letter. This is a SubsMat.FreqTable instance. 564 o log_base - The base of the logathrim to use in calculating the 565 info content. 566 """ 567 try: 568 gap_char = self.alignment._alphabet.gap_char 569 except AttributeError: 570 #The alphabet doesn't declare a gap - there could be none 571 #in the sequence... or just a vague alphabet. 572 gap_char = "-" # Safe? 573 574 if e_freq_table: 575 if not isinstance(e_freq_table, FreqTable.FreqTable): 576 raise ValueError("e_freq_table should be a FreqTable object") 577 # check the expected freq information to make sure it is good 578 for key in obs_freq: 579 if (key != gap_char and key not in e_freq_table): 580 raise ValueError("Expected frequency letters %s " 581 "do not match observed %s" 582 % (list(e_freq_table.keys()), 583 list(obs_freq.keys()) - [gap_char])) 584 585 total_info = 0.0 586 587 for letter in obs_freq: 588 inner_log = 0.0 589 # if we have expected frequencies, modify the log value by them 590 # gap characters do not have expected frequencies, so they 591 # should just be the observed frequency. 592 if letter != gap_char: 593 if e_freq_table: 594 inner_log = obs_freq[letter] / e_freq_table[letter] 595 else: 596 inner_log = obs_freq[letter] / random_expected 597 # if the observed frequency is zero, we don't add any info to the 598 # total information content 599 if inner_log > 0: 600 letter_info = (obs_freq[letter] * 601 math.log(inner_log) / math.log(log_base)) 602 total_info += letter_info 603 return total_info
604
605 - def get_column(self, col):
606 return self.alignment.get_column(col)
607 608
609 -class PSSM(object):
610 """Represent a position specific score matrix. 611 612 This class is meant to make it easy to access the info within a PSSM 613 and also make it easy to print out the information in a nice table. 614 615 Let's say you had an alignment like this: 616 GTATC 617 AT--C 618 CTGTC 619 620 The position specific score matrix (when printed) looks like: 621 622 G A T C 623 G 1 1 0 1 624 T 0 0 3 0 625 A 1 1 0 0 626 T 0 0 2 0 627 C 0 0 0 3 628 629 You can access a single element of the PSSM using the following: 630 631 your_pssm[sequence_number][residue_count_name] 632 633 For instance, to get the 'T' residue for the second element in the 634 above alignment you would need to do: 635 636 your_pssm[1]['T'] 637 """
638 - def __init__(self, pssm):
639 """Initialize with pssm data to represent. 640 641 The pssm passed should be a list with the following structure: 642 643 list[0] - The letter of the residue being represented (for instance, 644 from the example above, the first few list[0]s would be GTAT... 645 list[1] - A dictionary with the letter substitutions and counts. 646 """ 647 self.pssm = pssm
648
649 - def __getitem__(self, pos):
650 return self.pssm[pos][1]
651
652 - def __str__(self):
653 out = " " 654 all_residues = sorted(self.pssm[0][1]) 655 656 # first print out the top header 657 for res in all_residues: 658 out += " %s" % res 659 out += "\n" 660 661 # for each item, write out the substitutions 662 for item in self.pssm: 663 out += "%s " % item[0] 664 for res in all_residues: 665 out += " %.1f" % item[1][res] 666 667 out += "\n" 668 return out
669
670 - def get_residue(self, pos):
671 """Return the residue letter at the specified position. 672 """ 673 return self.pssm[pos][0]
674 675 686 687 if __name__ == "__main__": 688 print("Quick test") 689 from Bio import AlignIO 690 from Bio.Align.Generic import Alignment 691 692 filename = "../../Tests/GFF/multi.fna" 693 format = "fasta" 694 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, 695 FreqTable.FREQ, 696 IUPAC.unambiguous_dna) 697 698 alignment = AlignIO.read(open(filename), format) 699 for record in alignment: 700 print(str(record.seq)) 701 print("="*alignment.get_alignment_length()) 702 703 summary = SummaryInfo(alignment) 704 consensus = summary.dumb_consensus(ambiguous="N") 705 print(consensus) 706 consensus = summary.gap_consensus(ambiguous="N") 707 print(consensus) 708 print("") 709 print(summary.pos_specific_score_matrix(chars_to_ignore=['-'], 710 axis_seq=consensus)) 711 print("") 712 #Have a generic alphabet, without a declared gap char, so must tell 713 #provide the frequencies and chars to ignore explicitly. 714 print(summary.information_content(e_freq_table=expected, 715 chars_to_ignore=['-'])) 716 print("") 717 print("Trying a protein sequence with gaps and stops") 718 719 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 720 a = Alignment(alpha) 721 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 722 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 723 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 724 print(a) 725 print("="*a.get_alignment_length()) 726 727 s = SummaryInfo(a) 728 c = s.dumb_consensus(ambiguous="X") 729 print(c) 730 c = s.gap_consensus(ambiguous="X") 731 print(c) 732 print("") 733 print(s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)) 734 735 print(s.information_content(chars_to_ignore=['-', '*'])) 736 737 print("Done") 738