Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Extract information from alignment objects. 
  7   
  8  In order to try and avoid huge alignment objects with tons of functions, 
  9  functions which return summary type information about alignments should 
 10  be put into classes in this module. 
 11  """ 
 12   
 13  from __future__ import print_function 
 14   
 15  import math 
 16  import sys 
 17   
 18  from Bio import Alphabet 
 19  from Bio.Alphabet import IUPAC 
 20  from Bio.Seq import Seq 
 21  from Bio.SubsMat import FreqTable 
 22   
 23   
 24  # Expected random distributions for 20-letter protein, and 
 25  # for 4-letter nucleotide alphabets 
 26  Protein20Random = 0.05 
 27  Nucleotide4Random = 0.25 
 28   
 29   
30 -class SummaryInfo(object):
31 """Calculate summary info about the alignment. 32 33 This class should be used to caclculate information summarizing the 34 results of an alignment. This may either be straight consensus info 35 or more complicated things. 36 """ 37
38 - def __init__(self, alignment):
39 """Initialize with the alignment to calculate information on. 40 41 ic_vector attribute. A list of ic content for each column number. 42 """ 43 self.alignment = alignment 44 self.ic_vector = []
45
46 - def dumb_consensus(self, threshold=.7, ambiguous="X", 47 consensus_alpha=None, require_multiple=0):
48 """Output a fast consensus sequence of the alignment. 49 50 This doesn't do anything fancy at all. It will just go through the 51 sequence residue by residue and count up the number of each type 52 of residue (ie. A or G or T or C for DNA) in all sequences in the 53 alignment. If the percentage of the most common residue type is 54 greater then the passed threshold, then we will add that residue type, 55 otherwise an ambiguous character will be added. 56 57 This could be made a lot fancier (ie. to take a substitution matrix 58 into account), but it just meant for a quick and dirty consensus. 59 60 Arguments: 61 - threshold - The threshold value that is required to add a particular 62 atom. 63 - ambiguous - The ambiguous character to be added when the threshold is 64 not reached. 65 - consensus_alpha - The alphabet to return for the consensus sequence. 66 If this is None, then we will try to guess the alphabet. 67 - require_multiple - If set as 1, this will require that more than 68 1 sequence be part of an alignment to put it in the consensus (ie. 69 not just 1 sequence and gaps). 70 71 """ 72 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 73 consensus = '' 74 75 # find the length of the consensus we are creating 76 con_len = self.alignment.get_alignment_length() 77 78 # go through each seq item 79 for n in range(con_len): 80 # keep track of the counts of the different atoms we get 81 atom_dict = {} 82 num_atoms = 0 83 84 for record in self.alignment: 85 # make sure we haven't run past the end of any sequences 86 # if they are of different lengths 87 if n < len(record.seq): 88 if record.seq[n] != '-' and record.seq[n] != '.': 89 if record.seq[n] not in atom_dict: 90 atom_dict[record.seq[n]] = 1 91 else: 92 atom_dict[record.seq[n]] += 1 93 94 num_atoms = num_atoms + 1 95 96 max_atoms = [] 97 max_size = 0 98 99 for atom in atom_dict: 100 if atom_dict[atom] > max_size: 101 max_atoms = [atom] 102 max_size = atom_dict[atom] 103 elif atom_dict[atom] == max_size: 104 max_atoms.append(atom) 105 106 if require_multiple and num_atoms == 1: 107 consensus += ambiguous 108 elif (len(max_atoms) == 1) and ((float(max_size) / 109 float(num_atoms)) >= threshold): 110 consensus += max_atoms[0] 111 else: 112 consensus += ambiguous 113 114 # we need to guess a consensus alphabet if one isn't specified 115 if consensus_alpha is None: 116 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 117 118 return Seq(consensus, consensus_alpha)
119
120 - def gap_consensus(self, threshold=.7, ambiguous="X", 121 consensus_alpha=None, require_multiple=0):
122 """Output a fast consensus sequence of the alignment, allowing gaps. 123 124 Same as dumb_consensus(), but allows gap on the output. 125 126 Things to do: 127 - Let the user define that with only one gap, the result 128 character in consensus is gap. 129 - Let the user select gap character, now 130 it takes the same as input. 131 132 """ 133 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 134 consensus = '' 135 136 # find the length of the consensus we are creating 137 con_len = self.alignment.get_alignment_length() 138 139 # go through each seq item 140 for n in range(con_len): 141 # keep track of the counts of the different atoms we get 142 atom_dict = {} 143 num_atoms = 0 144 145 for record in self.alignment: 146 # make sure we haven't run past the end of any sequences 147 # if they are of different lengths 148 if n < len(record.seq): 149 if record.seq[n] not in atom_dict: 150 atom_dict[record.seq[n]] = 1 151 else: 152 atom_dict[record.seq[n]] += 1 153 154 num_atoms += 1 155 156 max_atoms = [] 157 max_size = 0 158 159 for atom in atom_dict: 160 if atom_dict[atom] > max_size: 161 max_atoms = [atom] 162 max_size = atom_dict[atom] 163 elif atom_dict[atom] == max_size: 164 max_atoms.append(atom) 165 166 if require_multiple and num_atoms == 1: 167 consensus += ambiguous 168 elif (len(max_atoms) == 1) and ((float(max_size) / 169 float(num_atoms)) >= threshold): 170 consensus += max_atoms[0] 171 else: 172 consensus += ambiguous 173 174 # we need to guess a consensus alphabet if one isn't specified 175 if consensus_alpha is None: 176 # TODO - Should we make this into a Gapped alphabet? 177 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 178 179 return Seq(consensus, consensus_alpha)
180
181 - def _guess_consensus_alphabet(self, ambiguous):
182 """Pick an (ungapped) alphabet for an alignment consesus sequence. 183 184 This just looks at the sequences we have, checks their type, and 185 returns as appropriate type which seems to make sense with the 186 sequences we've got. 187 """ 188 # Start with the (un-gapped version of) the alignment alphabet 189 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 190 191 # Now check its compatible with all the rest of the sequences 192 for record in self.alignment: 193 # Get the (un-gapped version of) the sequence's alphabet 194 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 195 if not isinstance(alt, a.__class__): 196 raise ValueError("Alignment contains a sequence with \ 197 an incompatible alphabet.") 198 199 # Check the ambiguous character we are going to use in the consensus 200 # is in the alphabet's list of valid letters (if defined). 201 if hasattr(a, "letters") and a.letters is not None \ 202 and ambiguous not in a.letters: 203 # We'll need to pick a more generic alphabet... 204 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 205 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 206 a = IUPAC.IUPACUnambiguousDNA() 207 else: 208 a = Alphabet.generic_dna 209 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 210 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 211 a = IUPAC.IUPACUnambiguousRNA() 212 else: 213 a = Alphabet.generic_rna 214 elif isinstance(a, IUPAC.IUPACProtein): 215 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 216 a = IUPAC.ExtendedIUPACProtein() 217 else: 218 a = Alphabet.generic_protein 219 else: 220 a = Alphabet.single_letter_alphabet 221 return a
222
223 - def replacement_dictionary(self, skip_chars=None):
224 """Generate a replacement dictionary to plug into a substitution matrix. 225 226 This should look at an alignment, and be able to generate the number 227 of substitutions of different residues for each other in the 228 aligned object. 229 230 Will then return a dictionary with this information:: 231 232 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 233 234 This also treats weighted sequences. The following example shows how 235 we calculate the replacement dictionary. Given the following 236 multiple sequence alignment:: 237 238 GTATC 0.5 239 AT--C 0.8 240 CTGTC 1.0 241 242 For the first column we have:: 243 244 ('A', 'G') : 0.5 * 0.8 = 0.4 245 ('C', 'G') : 0.5 * 1.0 = 0.5 246 ('A', 'C') : 0.8 * 1.0 = 0.8 247 248 We then continue this for all of the columns in the alignment, summing 249 the information for each substitution in each column, until we end 250 up with the replacement dictionary. 251 252 Arguments: 253 - skip_chars - A list of characters to skip when creating the dictionary. 254 This defaults to an empty list. 255 256 For instance, you might have Xs (screened stuff) or Ns, and not want 257 to include the ambiguity characters in the dictionary. 258 """ 259 # get a starting dictionary based on the alphabet of the alignment 260 rep_dict, skip_items = self._get_base_replacements(skip_chars) 261 262 # iterate through each record 263 for rec_num1 in range(len(self.alignment)): 264 # iterate through each record from one beyond the current record 265 # to the end of the list of records 266 for rec_num2 in range(rec_num1 + 1, len(self.alignment)): 267 # for each pair of records, compare the sequences and add 268 # the pertinent info to the dictionary 269 rep_dict = self._pair_replacement( 270 self.alignment[rec_num1].seq, 271 self.alignment[rec_num2].seq, 272 self.alignment[rec_num1].annotations.get('weight', 1.0), 273 self.alignment[rec_num2].annotations.get('weight', 1.0), 274 rep_dict, skip_items) 275 276 return rep_dict
277
278 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 279 start_dict, ignore_chars):
280 """Compare two sequences and generate info on the replacements seen. 281 282 Arguments: 283 - seq1, seq2 - The two sequences to compare. 284 - weight1, weight2 - The relative weights of seq1 and seq2. 285 - start_dict - The dictionary containing the starting replacement 286 info that we will modify. 287 - ignore_chars - A list of characters to ignore when calculating 288 replacements (ie. '-'). 289 290 Returns: 291 - A replacment dictionary which is modified from initial_dict with 292 the information from the sequence comparison. 293 294 """ 295 # loop through each residue in the sequences 296 for residue_num in range(len(seq1)): 297 residue1 = seq1[residue_num] 298 try: 299 residue2 = seq2[residue_num] 300 # if seq2 is shorter, then we just stop looking at replacements 301 # and return the information 302 except IndexError: 303 return start_dict 304 305 # if the two residues are characters we want to count 306 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 307 try: 308 # add info about the replacement to the dictionary, 309 # modified by the sequence weights 310 start_dict[(residue1, residue2)] += weight1 * weight2 311 312 # if we get a key error, then we've got a problem with 313 # alphabets 314 except KeyError: 315 raise ValueError("Residues %s, %s not found in alphabet %s" 316 % (residue1, residue2, 317 self.alignment._alphabet)) 318 319 return start_dict
320
321 - def _get_all_letters(self):
322 """Return a string containing the expected letters in the alignment.""" 323 all_letters = self.alignment._alphabet.letters 324 if all_letters is None or \ 325 (isinstance(self.alignment._alphabet, Alphabet.Gapped) and 326 all_letters == self.alignment._alphabet.gap_char): 327 # We are dealing with a generic alphabet class where the 328 # letters are not defined! We must build a list of the 329 # letters used... 330 set_letters = set() 331 for record in self.alignment: 332 # Note the built in set does not have a union_update 333 # which was provided by the sets module's Set 334 set_letters = set_letters.union(record.seq) 335 list_letters = sorted(set_letters) 336 all_letters = "".join(list_letters) 337 return all_letters
338
339 - def _get_base_replacements(self, skip_items=None):
340 """Get a zeroed dictionary of all possible letter combinations. 341 342 This looks at the type of alphabet and gets the letters for it. 343 It then creates a dictionary with all possible combinations of these 344 letters as keys (ie. ('A', 'G')) and sets the values as zero. 345 346 Returns: 347 - The base dictionary created 348 - A list of alphabet items to skip when filling the dictionary. 349 (Right now the only thing I can imagine in this list is gap 350 characters, but maybe X's or something else might be useful later. 351 This will also include any characters that are specified to be 352 skipped.) Defaults to an empty list. 353 354 """ 355 if skip_items is None: 356 skip_items = [] 357 base_dictionary = {} 358 all_letters = self._get_all_letters() 359 360 # if we have a gapped alphabet we need to find the gap character 361 # and drop it out 362 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 363 skip_items.append(self.alignment._alphabet.gap_char) 364 all_letters = all_letters.replace( 365 self.alignment._alphabet.gap_char, '') 366 367 # now create the dictionary 368 for first_letter in all_letters: 369 for second_letter in all_letters: 370 if first_letter not in skip_items and \ 371 second_letter not in skip_items: 372 base_dictionary[(first_letter, second_letter)] = 0 373 374 return base_dictionary, skip_items
375
376 - def pos_specific_score_matrix(self, axis_seq=None, 377 chars_to_ignore=None):
378 """Create a position specific score matrix object for the alignment. 379 380 This creates a position specific score matrix (pssm) which is an 381 alternative method to look at a consensus sequence. 382 383 Arguments: 384 - chars_to_ignore - A list of all characters not to include in 385 the pssm. If the alignment alphabet declares a gap character, 386 then it will be excluded automatically. 387 - axis_seq - An optional argument specifying the sequence to 388 put on the axis of the PSSM. This should be a Seq object. If nothing 389 is specified, the consensus sequence, calculated with default 390 parameters, will be used. 391 392 Returns: 393 - A PSSM (position specific score matrix) object. 394 395 """ 396 # determine all of the letters we have to deal with 397 all_letters = self._get_all_letters() 398 assert all_letters 399 400 if chars_to_ignore is None: 401 chars_to_ignore = [] 402 if not isinstance(chars_to_ignore, list): 403 raise TypeError("chars_to_ignore should be a list.") 404 405 # if we have a gap char, add it to stuff to ignore 406 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 407 chars_to_ignore.append(self.alignment._alphabet.gap_char) 408 409 for char in chars_to_ignore: 410 all_letters = all_letters.replace(char, '') 411 412 if axis_seq: 413 left_seq = axis_seq 414 assert len(axis_seq) == self.alignment.get_alignment_length() 415 else: 416 left_seq = self.dumb_consensus() 417 418 pssm_info = [] 419 # now start looping through all of the sequences and getting info 420 for residue_num in range(len(left_seq)): 421 score_dict = self._get_base_letters(all_letters) 422 for record in self.alignment: 423 try: 424 this_residue = record.seq[residue_num] 425 # if we hit an index error we've run out of sequence and 426 # should not add new residues 427 except IndexError: 428 this_residue = None 429 430 if this_residue and this_residue not in chars_to_ignore: 431 weight = record.annotations.get('weight', 1.0) 432 try: 433 score_dict[this_residue] += weight 434 # if we get a KeyError then we have an alphabet problem 435 except KeyError: 436 raise ValueError("Residue %s not found in alphabet %s" 437 % (this_residue, 438 self.alignment._alphabet)) 439 440 pssm_info.append((left_seq[residue_num], 441 score_dict)) 442 443 return PSSM(pssm_info)
444
445 - def _get_base_letters(self, letters):
446 """Create a zeroed dictionary with all of the specified letters (PRIVATE).""" 447 base_info = {} 448 for letter in letters: 449 base_info[letter] = 0 450 451 return base_info
452
453 - def _get_gap_char(self):
454 """Return the gap character used in the alignment (PRIVATE).""" 455 try: 456 gap_char = self.alignment._alphabet.gap_char 457 except AttributeError: 458 # The alphabet doesn't declare a gap - there could be none 459 # in the sequence... or just a vague alphabet. 460 gap_char = "-" # Safe? 461 462 return gap_char
463
464 - def information_content(self, start=0, 465 end=None, 466 e_freq_table=None, log_base=2, 467 chars_to_ignore=None, pseudo_count=0):
468 """Calculate the information content for each residue along an alignment. 469 470 Arguments: 471 - start, end - The starting an ending points to calculate the 472 information content. These points should be relative to the first 473 sequence in the alignment, starting at zero (ie. even if the 'real' 474 first position in the seq is 203 in the initial sequence, for 475 the info content, we need to use zero). This defaults to the entire 476 length of the first sequence. 477 - e_freq_table - A FreqTable object specifying the expected frequencies 478 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 479 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 480 included, since these should not have expected frequencies. 481 - log_base - The base of the logathrim to use in calculating the 482 information content. This defaults to 2 so the info is in bits. 483 - chars_to_ignore - A listing of characters which should be ignored 484 in calculating the info content. Defaults to none. 485 486 Returns: 487 - A number representing the info content for the specified region. 488 489 Please see the Biopython manual for more information on how information 490 content is calculated. 491 492 """ 493 # if no end was specified, then we default to the end of the sequence 494 if end is None: 495 end = len(self.alignment[0].seq) 496 if chars_to_ignore is None: 497 chars_to_ignore = [] 498 499 if start < 0 or end > len(self.alignment[0].seq): 500 raise ValueError("Start (%s) and end (%s) are not in the \ 501 range %s to %s" 502 % (start, end, 0, len(self.alignment[0].seq))) 503 # determine random expected frequencies, if necessary 504 random_expected = None 505 if not e_freq_table: 506 # TODO - What about ambiguous alphabets? 507 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 508 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 509 random_expected = Protein20Random 510 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 511 random_expected = Nucleotide4Random 512 else: 513 errstr = "Error in alphabet: not Nucleotide or Protein, " 514 errstr += "supply expected frequencies" 515 raise ValueError(errstr) 516 del base_alpha 517 elif not isinstance(e_freq_table, FreqTable.FreqTable): 518 raise ValueError("e_freq_table should be a FreqTable object") 519 520 # determine all of the letters we have to deal with 521 all_letters = self._get_all_letters() 522 for char in chars_to_ignore: 523 all_letters = all_letters.replace(char, '') 524 525 info_content = {} 526 for residue_num in range(start, end): 527 freq_dict = self._get_letter_freqs(residue_num, 528 self.alignment, 529 all_letters, 530 chars_to_ignore, 531 pseudo_count, 532 e_freq_table, 533 random_expected) 534 # print freq_dict, 535 column_score = self._get_column_info_content(freq_dict, 536 e_freq_table, 537 log_base, 538 random_expected) 539 info_content[residue_num] = column_score 540 # sum up the score 541 total_info = sum(info_content.values()) 542 # fill in the ic_vector member: holds IC for each column 543 # reset ic_vector to empty list at each call 544 self.ic_vector = [] 545 for (i, k) in enumerate(info_content): 546 self.ic_vector.append(info_content[i + start]) 547 return total_info
548
549 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore, 550 pseudo_count=0, e_freq_table=None, random_expected=None):
551 """Determine the frequency of specific letters in the alignment. 552 553 Arguments: 554 - residue_num - The number of the column we are getting frequencies 555 from. 556 - all_records - All of the SeqRecords in the alignment. 557 - letters - The letters we are interested in getting the frequency 558 for. 559 - to_ignore - Letters we are specifically supposed to ignore. 560 - pseudo_count - Optional argument specifying the Pseudo count (k) 561 to add in order to prevent a frequency of 0 for a letter. 562 - e_freq_table - An optional argument specifying the expected 563 frequencies for each letter. This is a SubsMat.FreqTable instance. 564 - random_expected - Optional argument that specify the frequency to use 565 when e_freq_table is not defined. 566 567 This will calculate the frequencies of each of the specified letters 568 in the alignment at the given frequency, and return this as a 569 dictionary where the keys are the letters and the values are the 570 frequencies. Pseudo count can be added to prevent a null frequency 571 """ 572 freq_info = self._get_base_letters(letters) 573 574 total_count = 0 575 576 gap_char = self._get_gap_char() 577 578 if pseudo_count < 0: 579 raise ValueError("Positive value required for " 580 "pseudo_count, %s provided" % (pseudo_count)) 581 582 # collect the count info into the dictionary for all the records 583 for record in all_records: 584 try: 585 if record.seq[residue_num] not in to_ignore: 586 weight = record.annotations.get('weight', 1.0) 587 freq_info[record.seq[residue_num]] += weight 588 total_count += weight 589 # getting a key error means we've got a problem with the alphabet 590 except KeyError: 591 raise ValueError("Residue %s not found in alphabet %s" 592 % (record.seq[residue_num], 593 self.alignment._alphabet)) 594 595 if e_freq_table: 596 if not isinstance(e_freq_table, FreqTable.FreqTable): 597 raise ValueError("e_freq_table should be a FreqTable object") 598 599 # check if all the residus in freq_info are in e_freq_table 600 for key in freq_info: 601 if (key != gap_char and key not in e_freq_table): 602 raise ValueError("letters in current column %s " 603 "and not in expected frequency table %s" 604 % (list(freq_info) - [gap_char], 605 list(e_freq_table))) 606 607 if total_count == 0: 608 # This column must be entirely ignored characters 609 for letter in freq_info: 610 assert freq_info[letter] == 0 611 # TODO - Map this to NA or NaN? 612 else: 613 # now convert the counts into frequencies 614 for letter in freq_info: 615 if pseudo_count and (random_expected or e_freq_table): 616 # use either the expected random freq or the 617 if e_freq_table: 618 ajust_freq = e_freq_table[letter] 619 else: 620 ajust_freq = random_expected 621 622 ajusted_letter_count = freq_info[ 623 letter] + ajust_freq * pseudo_count 624 ajusted_total = total_count + pseudo_count 625 freq_info[letter] = ajusted_letter_count / ajusted_total 626 627 else: 628 freq_info[letter] = freq_info[letter] / total_count 629 630 return freq_info
631
632 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 633 random_expected):
634 """Calculate the information content for a column. 635 636 Arguments: 637 - obs_freq - The frequencies observed for each letter in the column. 638 - e_freq_table - An optional argument specifying the expected 639 frequencies for each letter. This is a SubsMat.FreqTable instance. 640 - log_base - The base of the logathrim to use in calculating the 641 info content. 642 643 """ 644 gap_char = self._get_gap_char() 645 646 if e_freq_table: 647 if not isinstance(e_freq_table, FreqTable.FreqTable): 648 raise ValueError("e_freq_table should be a FreqTable object") 649 # check the expected freq information to make sure it is good 650 for key in obs_freq: 651 if (key != gap_char and key not in e_freq_table): 652 raise ValueError("Expected frequency letters %s " 653 "do not match observed %s" 654 % (list(e_freq_table), 655 list(obs_freq) - [gap_char])) 656 657 total_info = 0.0 658 659 for letter in obs_freq: 660 inner_log = 0.0 661 # if we have expected frequencies, modify the log value by them 662 # gap characters do not have expected frequencies, so they 663 # should just be the observed frequency. 664 if letter != gap_char: 665 if e_freq_table: 666 inner_log = obs_freq[letter] / e_freq_table[letter] 667 else: 668 inner_log = obs_freq[letter] / random_expected 669 # if the observed frequency is zero, we don't add any info to the 670 # total information content 671 if inner_log > 0: 672 letter_info = (obs_freq[letter] * 673 math.log(inner_log) / math.log(log_base)) 674 total_info += letter_info 675 return total_info
676
677 - def get_column(self, col):
678 # TODO - Deprecate this and implement slicing? 679 return self.alignment[:, col]
680 681
682 -class PSSM(object):
683 """Represent a position specific score matrix. 684 685 This class is meant to make it easy to access the info within a PSSM 686 and also make it easy to print out the information in a nice table. 687 688 Let's say you had an alignment like this:: 689 690 GTATC 691 AT--C 692 CTGTC 693 694 The position specific score matrix (when printed) looks like:: 695 696 G A T C 697 G 1 1 0 1 698 T 0 0 3 0 699 A 1 1 0 0 700 T 0 0 2 0 701 C 0 0 0 3 702 703 You can access a single element of the PSSM using the following:: 704 705 your_pssm[sequence_number][residue_count_name] 706 707 For instance, to get the 'T' residue for the second element in the 708 above alignment you would need to do: 709 710 your_pssm[1]['T'] 711 """ 712
713 - def __init__(self, pssm):
714 """Initialize with pssm data to represent. 715 716 The pssm passed should be a list with the following structure: 717 718 list[0] - The letter of the residue being represented (for instance, 719 from the example above, the first few list[0]s would be GTAT... 720 list[1] - A dictionary with the letter substitutions and counts. 721 """ 722 self.pssm = pssm
723
724 - def __getitem__(self, pos):
725 return self.pssm[pos][1]
726
727 - def __str__(self):
728 out = " " 729 all_residues = sorted(self.pssm[0][1]) 730 731 # first print out the top header 732 for res in all_residues: 733 out += " %s" % res 734 out += "\n" 735 736 # for each item, write out the substitutions 737 for item in self.pssm: 738 out += "%s " % item[0] 739 for res in all_residues: 740 out += " %.1f" % item[1][res] 741 742 out += "\n" 743 return out
744
745 - def get_residue(self, pos):
746 """Return the residue letter at the specified position.""" 747 return self.pssm[pos][0]
748 749 758