Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Extract information from alignment objects. 
  7   
  8  In order to try and avoid huge alignment objects with tons of functions, 
  9  functions which return summary type information about alignments should 
 10  be put into classes in this module. 
 11  """ 
 12   
 13  from __future__ import print_function 
 14   
 15  import math 
 16  import sys 
 17   
 18  from Bio import Alphabet 
 19  from Bio.Alphabet import IUPAC 
 20  from Bio.Seq import Seq 
 21  from Bio.SubsMat import FreqTable 
 22   
 23   
 24  # Expected random distributions for 20-letter protein, and 
 25  # for 4-letter nucleotide alphabets 
 26  Protein20Random = 0.05 
 27  Nucleotide4Random = 0.25 
 28   
 29   
30 -class SummaryInfo(object):
31 """Calculate summary info about the alignment. 32 33 This class should be used to caclculate information summarizing the 34 results of an alignment. This may either be straight consensus info 35 or more complicated things. 36 """ 37
38 - def __init__(self, alignment):
39 """Initialize with the alignment to calculate information on. 40 41 ic_vector attribute. A list of ic content for each column number. 42 """ 43 self.alignment = alignment 44 self.ic_vector = []
45
46 - def dumb_consensus(self, threshold=.7, ambiguous="X", 47 consensus_alpha=None, require_multiple=0):
48 """Output a fast consensus sequence of the alignment. 49 50 This doesn't do anything fancy at all. It will just go through the 51 sequence residue by residue and count up the number of each type 52 of residue (ie. A or G or T or C for DNA) in all sequences in the 53 alignment. If the percentage of the most common residue type is 54 greater then the passed threshold, then we will add that residue type, 55 otherwise an ambiguous character will be added. 56 57 This could be made a lot fancier (ie. to take a substitution matrix 58 into account), but it just meant for a quick and dirty consensus. 59 60 Arguments: 61 - threshold - The threshold value that is required to add a particular 62 atom. 63 - ambiguous - The ambiguous character to be added when the threshold is 64 not reached. 65 - consensus_alpha - The alphabet to return for the consensus sequence. 66 If this is None, then we will try to guess the alphabet. 67 - require_multiple - If set as 1, this will require that more than 68 1 sequence be part of an alignment to put it in the consensus (ie. 69 not just 1 sequence and gaps). 70 """ 71 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 72 consensus = '' 73 74 # find the length of the consensus we are creating 75 con_len = self.alignment.get_alignment_length() 76 77 # go through each seq item 78 for n in range(con_len): 79 # keep track of the counts of the different atoms we get 80 atom_dict = {} 81 num_atoms = 0 82 83 for record in self.alignment: 84 # make sure we haven't run past the end of any sequences 85 # if they are of different lengths 86 if n < len(record.seq): 87 if record.seq[n] != '-' and record.seq[n] != '.': 88 if record.seq[n] not in atom_dict: 89 atom_dict[record.seq[n]] = 1 90 else: 91 atom_dict[record.seq[n]] += 1 92 93 num_atoms = num_atoms + 1 94 95 max_atoms = [] 96 max_size = 0 97 98 for atom in atom_dict: 99 if atom_dict[atom] > max_size: 100 max_atoms = [atom] 101 max_size = atom_dict[atom] 102 elif atom_dict[atom] == max_size: 103 max_atoms.append(atom) 104 105 if require_multiple and num_atoms == 1: 106 consensus += ambiguous 107 elif (len(max_atoms) == 1) and ((float(max_size) / 108 float(num_atoms)) >= threshold): 109 consensus += max_atoms[0] 110 else: 111 consensus += ambiguous 112 113 # we need to guess a consensus alphabet if one isn't specified 114 if consensus_alpha is None: 115 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 116 117 return Seq(consensus, consensus_alpha)
118
119 - def gap_consensus(self, threshold=.7, ambiguous="X", 120 consensus_alpha=None, require_multiple=0):
121 """Same as dumb_consensus(), but allows gap on the output. 122 123 Things to do: 124 - Let the user define that with only one gap, the result 125 character in consensus is gap. 126 - Let the user select gap character, now 127 it takes the same as input. 128 """ 129 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 130 consensus = '' 131 132 # find the length of the consensus we are creating 133 con_len = self.alignment.get_alignment_length() 134 135 # go through each seq item 136 for n in range(con_len): 137 # keep track of the counts of the different atoms we get 138 atom_dict = {} 139 num_atoms = 0 140 141 for record in self.alignment: 142 # make sure we haven't run past the end of any sequences 143 # if they are of different lengths 144 if n < len(record.seq): 145 if record.seq[n] not in atom_dict: 146 atom_dict[record.seq[n]] = 1 147 else: 148 atom_dict[record.seq[n]] += 1 149 150 num_atoms += 1 151 152 max_atoms = [] 153 max_size = 0 154 155 for atom in atom_dict: 156 if atom_dict[atom] > max_size: 157 max_atoms = [atom] 158 max_size = atom_dict[atom] 159 elif atom_dict[atom] == max_size: 160 max_atoms.append(atom) 161 162 if require_multiple and num_atoms == 1: 163 consensus += ambiguous 164 elif (len(max_atoms) == 1) and ((float(max_size) / 165 float(num_atoms)) >= threshold): 166 consensus += max_atoms[0] 167 else: 168 consensus += ambiguous 169 170 # we need to guess a consensus alphabet if one isn't specified 171 if consensus_alpha is None: 172 # TODO - Should we make this into a Gapped alphabet? 173 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 174 175 return Seq(consensus, consensus_alpha)
176
177 - def _guess_consensus_alphabet(self, ambiguous):
178 """Pick an (ungapped) alphabet for an alignment consesus sequence. 179 180 This just looks at the sequences we have, checks their type, and 181 returns as appropriate type which seems to make sense with the 182 sequences we've got. 183 """ 184 # Start with the (un-gapped version of) the alignment alphabet 185 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 186 187 # Now check its compatible with all the rest of the sequences 188 for record in self.alignment: 189 # Get the (un-gapped version of) the sequence's alphabet 190 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 191 if not isinstance(alt, a.__class__): 192 raise ValueError("Alignment contains a sequence with \ 193 an incompatible alphabet.") 194 195 # Check the ambiguous character we are going to use in the consensus 196 # is in the alphabet's list of valid letters (if defined). 197 if hasattr(a, "letters") and a.letters is not None \ 198 and ambiguous not in a.letters: 199 # We'll need to pick a more generic alphabet... 200 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 201 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 202 a = IUPAC.IUPACUnambiguousDNA() 203 else: 204 a = Alphabet.generic_dna 205 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 206 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 207 a = IUPAC.IUPACUnambiguousRNA() 208 else: 209 a = Alphabet.generic_rna 210 elif isinstance(a, IUPAC.IUPACProtein): 211 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 212 a = IUPAC.ExtendedIUPACProtein() 213 else: 214 a = Alphabet.generic_protein 215 else: 216 a = Alphabet.single_letter_alphabet 217 return a
218
219 - def replacement_dictionary(self, skip_chars=None):
220 """Generate a replacement dictionary to plug into a substitution matrix 221 222 This should look at an alignment, and be able to generate the number 223 of substitutions of different residues for each other in the 224 aligned object. 225 226 Will then return a dictionary with this information:: 227 228 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 229 230 This also treats weighted sequences. The following example shows how 231 we calculate the replacement dictionary. Given the following 232 multiple sequence alignment:: 233 234 GTATC 0.5 235 AT--C 0.8 236 CTGTC 1.0 237 238 For the first column we have:: 239 240 ('A', 'G') : 0.5 * 0.8 = 0.4 241 ('C', 'G') : 0.5 * 1.0 = 0.5 242 ('A', 'C') : 0.8 * 1.0 = 0.8 243 244 We then continue this for all of the columns in the alignment, summing 245 the information for each substitution in each column, until we end 246 up with the replacement dictionary. 247 248 Arguments: 249 - skip_chars - A list of characters to skip when creating the dictionary. 250 This defaults to an empty list. 251 252 For instance, you might have Xs (screened stuff) or Ns, and not want 253 to include the ambiguity characters in the dictionary. 254 """ 255 # get a starting dictionary based on the alphabet of the alignment 256 rep_dict, skip_items = self._get_base_replacements(skip_chars) 257 258 # iterate through each record 259 for rec_num1 in range(len(self.alignment)): 260 # iterate through each record from one beyond the current record 261 # to the end of the list of records 262 for rec_num2 in range(rec_num1 + 1, len(self.alignment)): 263 # for each pair of records, compare the sequences and add 264 # the pertinent info to the dictionary 265 rep_dict = self._pair_replacement( 266 self.alignment[rec_num1].seq, 267 self.alignment[rec_num2].seq, 268 self.alignment[rec_num1].annotations.get('weight', 1.0), 269 self.alignment[rec_num2].annotations.get('weight', 1.0), 270 rep_dict, skip_items) 271 272 return rep_dict
273
274 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 275 start_dict, ignore_chars):
276 """Compare two sequences and generate info on the replacements seen. 277 278 Arguments: 279 - seq1, seq2 - The two sequences to compare. 280 - weight1, weight2 - The relative weights of seq1 and seq2. 281 - start_dict - The dictionary containing the starting replacement 282 info that we will modify. 283 - ignore_chars - A list of characters to ignore when calculating 284 replacements (ie. '-'). 285 286 Returns: 287 - A replacment dictionary which is modified from initial_dict with 288 the information from the sequence comparison. 289 """ 290 # loop through each residue in the sequences 291 for residue_num in range(len(seq1)): 292 residue1 = seq1[residue_num] 293 try: 294 residue2 = seq2[residue_num] 295 # if seq2 is shorter, then we just stop looking at replacements 296 # and return the information 297 except IndexError: 298 return start_dict 299 300 # if the two residues are characters we want to count 301 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 302 try: 303 # add info about the replacement to the dictionary, 304 # modified by the sequence weights 305 start_dict[(residue1, residue2)] += weight1 * weight2 306 307 # if we get a key error, then we've got a problem with 308 # alphabets 309 except KeyError: 310 raise ValueError("Residues %s, %s not found in alphabet %s" 311 % (residue1, residue2, 312 self.alignment._alphabet)) 313 314 return start_dict
315
316 - def _get_all_letters(self):
317 """Returns a string containing the expected letters in the alignment.""" 318 all_letters = self.alignment._alphabet.letters 319 if all_letters is None or \ 320 (isinstance(self.alignment._alphabet, Alphabet.Gapped) and 321 all_letters == self.alignment._alphabet.gap_char): 322 # We are dealing with a generic alphabet class where the 323 # letters are not defined! We must build a list of the 324 # letters used... 325 set_letters = set() 326 for record in self.alignment: 327 # Note the built in set does not have a union_update 328 # which was provided by the sets module's Set 329 set_letters = set_letters.union(record.seq) 330 list_letters = sorted(set_letters) 331 all_letters = "".join(list_letters) 332 return all_letters
333
334 - def _get_base_replacements(self, skip_items=None):
335 """Get a zeroed dictionary of all possible letter combinations. 336 337 This looks at the type of alphabet and gets the letters for it. 338 It then creates a dictionary with all possible combinations of these 339 letters as keys (ie. ('A', 'G')) and sets the values as zero. 340 341 Returns: 342 - The base dictionary created 343 - A list of alphabet items to skip when filling the dictionary. 344 (Right now the only thing I can imagine in this list is gap 345 characters, but maybe X's or something else might be useful later. 346 This will also include any characters that are specified to be 347 skipped.) Defaults to an empty list. 348 """ 349 if skip_items is None: 350 skip_items = [] 351 base_dictionary = {} 352 all_letters = self._get_all_letters() 353 354 # if we have a gapped alphabet we need to find the gap character 355 # and drop it out 356 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 357 skip_items.append(self.alignment._alphabet.gap_char) 358 all_letters = all_letters.replace( 359 self.alignment._alphabet.gap_char, '') 360 361 # now create the dictionary 362 for first_letter in all_letters: 363 for second_letter in all_letters: 364 if first_letter not in skip_items and \ 365 second_letter not in skip_items: 366 base_dictionary[(first_letter, second_letter)] = 0 367 368 return base_dictionary, skip_items
369
370 - def pos_specific_score_matrix(self, axis_seq=None, 371 chars_to_ignore=None):
372 """Create a position specific score matrix object for the alignment. 373 374 This creates a position specific score matrix (pssm) which is an 375 alternative method to look at a consensus sequence. 376 377 Arguments: 378 - chars_to_ignore - A list of all characters not to include in 379 the pssm. If the alignment alphabet declares a gap character, 380 then it will be excluded automatically. 381 - axis_seq - An optional argument specifying the sequence to 382 put on the axis of the PSSM. This should be a Seq object. If nothing 383 is specified, the consensus sequence, calculated with default 384 parameters, will be used. 385 386 Returns: 387 - A PSSM (position specific score matrix) object. 388 """ 389 # determine all of the letters we have to deal with 390 all_letters = self._get_all_letters() 391 assert all_letters 392 393 if chars_to_ignore is None: 394 chars_to_ignore = [] 395 if not isinstance(chars_to_ignore, list): 396 raise TypeError("chars_to_ignore should be a list.") 397 398 # if we have a gap char, add it to stuff to ignore 399 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 400 chars_to_ignore.append(self.alignment._alphabet.gap_char) 401 402 for char in chars_to_ignore: 403 all_letters = all_letters.replace(char, '') 404 405 if axis_seq: 406 left_seq = axis_seq 407 assert len(axis_seq) == self.alignment.get_alignment_length() 408 else: 409 left_seq = self.dumb_consensus() 410 411 pssm_info = [] 412 # now start looping through all of the sequences and getting info 413 for residue_num in range(len(left_seq)): 414 score_dict = self._get_base_letters(all_letters) 415 for record in self.alignment: 416 try: 417 this_residue = record.seq[residue_num] 418 # if we hit an index error we've run out of sequence and 419 # should not add new residues 420 except IndexError: 421 this_residue = None 422 423 if this_residue and this_residue not in chars_to_ignore: 424 weight = record.annotations.get('weight', 1.0) 425 try: 426 score_dict[this_residue] += weight 427 # if we get a KeyError then we have an alphabet problem 428 except KeyError: 429 raise ValueError("Residue %s not found in alphabet %s" 430 % (this_residue, 431 self.alignment._alphabet)) 432 433 pssm_info.append((left_seq[residue_num], 434 score_dict)) 435 436 return PSSM(pssm_info)
437
438 - def _get_base_letters(self, letters):
439 """Create a zeroed dictionary with all of the specified letters. 440 """ 441 base_info = {} 442 for letter in letters: 443 base_info[letter] = 0 444 445 return base_info
446
447 - def _get_gap_char(self):
448 """Return the gap character used in the alignment 449 """ 450 try: 451 gap_char = self.alignment._alphabet.gap_char 452 except AttributeError: 453 # The alphabet doesn't declare a gap - there could be none 454 # in the sequence... or just a vague alphabet. 455 gap_char = "-" # Safe? 456 457 return gap_char
458
459 - def information_content(self, start=0, 460 end=None, 461 e_freq_table=None, log_base=2, 462 chars_to_ignore=None, pseudo_count=0):
463 """Calculate the information content for each residue along an alignment. 464 465 Arguments: 466 - start, end - The starting an ending points to calculate the 467 information content. These points should be relative to the first 468 sequence in the alignment, starting at zero (ie. even if the 'real' 469 first position in the seq is 203 in the initial sequence, for 470 the info content, we need to use zero). This defaults to the entire 471 length of the first sequence. 472 - e_freq_table - A FreqTable object specifying the expected frequencies 473 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 474 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 475 included, since these should not have expected frequencies. 476 - log_base - The base of the logathrim to use in calculating the 477 information content. This defaults to 2 so the info is in bits. 478 - chars_to_ignore - A listing of characters which should be ignored 479 in calculating the info content. Defaults to none. 480 481 Returns: 482 - A number representing the info content for the specified region. 483 484 Please see the Biopython manual for more information on how information 485 content is calculated. 486 """ 487 # if no end was specified, then we default to the end of the sequence 488 if end is None: 489 end = len(self.alignment[0].seq) 490 if chars_to_ignore is None: 491 chars_to_ignore = [] 492 493 if start < 0 or end > len(self.alignment[0].seq): 494 raise ValueError("Start (%s) and end (%s) are not in the \ 495 range %s to %s" 496 % (start, end, 0, len(self.alignment[0].seq))) 497 # determine random expected frequencies, if necessary 498 random_expected = None 499 if not e_freq_table: 500 # TODO - What about ambiguous alphabets? 501 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 502 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 503 random_expected = Protein20Random 504 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 505 random_expected = Nucleotide4Random 506 else: 507 errstr = "Error in alphabet: not Nucleotide or Protein, " 508 errstr += "supply expected frequencies" 509 raise ValueError(errstr) 510 del base_alpha 511 elif not isinstance(e_freq_table, FreqTable.FreqTable): 512 raise ValueError("e_freq_table should be a FreqTable object") 513 514 # determine all of the letters we have to deal with 515 all_letters = self._get_all_letters() 516 for char in chars_to_ignore: 517 all_letters = all_letters.replace(char, '') 518 519 info_content = {} 520 for residue_num in range(start, end): 521 freq_dict = self._get_letter_freqs(residue_num, 522 self.alignment, 523 all_letters, 524 chars_to_ignore, 525 pseudo_count, 526 e_freq_table, 527 random_expected) 528 # print freq_dict, 529 column_score = self._get_column_info_content(freq_dict, 530 e_freq_table, 531 log_base, 532 random_expected) 533 info_content[residue_num] = column_score 534 # sum up the score 535 total_info = sum(info_content.values()) 536 # fill in the ic_vector member: holds IC for each column 537 # reset ic_vector to empty list at each call 538 self.ic_vector = [] 539 for (i, k) in enumerate(info_content): 540 self.ic_vector.append(info_content[i + start]) 541 return total_info
542
543 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore, 544 pseudo_count=0, e_freq_table=None, random_expected=None):
545 """Determine the frequency of specific letters in the alignment. 546 547 Arguments: 548 - residue_num - The number of the column we are getting frequencies 549 from. 550 - all_records - All of the SeqRecords in the alignment. 551 - letters - The letters we are interested in getting the frequency 552 for. 553 - to_ignore - Letters we are specifically supposed to ignore. 554 - pseudo_count - Optional argument specifying the Pseudo count (k) 555 to add in order to prevent a frequency of 0 for a letter. 556 - e_freq_table - An optional argument specifying the expected 557 frequencies for each letter. This is a SubsMat.FreqTable instance. 558 - random_expected - Optional argument that specify the frequency to use 559 when e_freq_table is not defined. 560 561 This will calculate the frequencies of each of the specified letters 562 in the alignment at the given frequency, and return this as a 563 dictionary where the keys are the letters and the values are the 564 frequencies. Pseudo count can be added to prevent a null frequency 565 """ 566 freq_info = self._get_base_letters(letters) 567 568 total_count = 0 569 570 gap_char = self._get_gap_char() 571 572 if pseudo_count < 0: 573 raise ValueError("Positive value required for " 574 "pseudo_count, %s provided" % (pseudo_count)) 575 576 # collect the count info into the dictionary for all the records 577 for record in all_records: 578 try: 579 if record.seq[residue_num] not in to_ignore: 580 weight = record.annotations.get('weight', 1.0) 581 freq_info[record.seq[residue_num]] += weight 582 total_count += weight 583 # getting a key error means we've got a problem with the alphabet 584 except KeyError: 585 raise ValueError("Residue %s not found in alphabet %s" 586 % (record.seq[residue_num], 587 self.alignment._alphabet)) 588 589 if e_freq_table: 590 if not isinstance(e_freq_table, FreqTable.FreqTable): 591 raise ValueError("e_freq_table should be a FreqTable object") 592 593 # check if all the residus in freq_info are in e_freq_table 594 for key in freq_info: 595 if (key != gap_char and key not in e_freq_table): 596 raise ValueError("letters in current column %s " 597 "and not in expected frequency table %s" 598 % (list(freq_info) - [gap_char], 599 list(e_freq_table))) 600 601 if total_count == 0: 602 # This column must be entirely ignored characters 603 for letter in freq_info: 604 assert freq_info[letter] == 0 605 # TODO - Map this to NA or NaN? 606 else: 607 # now convert the counts into frequencies 608 for letter in freq_info: 609 if pseudo_count and (random_expected or e_freq_table): 610 # use either the expected random freq or the 611 if e_freq_table: 612 ajust_freq = e_freq_table[letter] 613 else: 614 ajust_freq = random_expected 615 616 ajusted_letter_count = freq_info[ 617 letter] + ajust_freq * pseudo_count 618 ajusted_total = total_count + pseudo_count 619 freq_info[letter] = ajusted_letter_count / ajusted_total 620 621 else: 622 freq_info[letter] = freq_info[letter] / total_count 623 624 return freq_info
625
626 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 627 random_expected):
628 """Calculate the information content for a column. 629 630 Arguments: 631 - obs_freq - The frequencies observed for each letter in the column. 632 - e_freq_table - An optional argument specifying the expected 633 frequencies for each letter. This is a SubsMat.FreqTable instance. 634 - log_base - The base of the logathrim to use in calculating the 635 info content. 636 """ 637 gap_char = self._get_gap_char() 638 639 if e_freq_table: 640 if not isinstance(e_freq_table, FreqTable.FreqTable): 641 raise ValueError("e_freq_table should be a FreqTable object") 642 # check the expected freq information to make sure it is good 643 for key in obs_freq: 644 if (key != gap_char and key not in e_freq_table): 645 raise ValueError("Expected frequency letters %s " 646 "do not match observed %s" 647 % (list(e_freq_table), 648 list(obs_freq) - [gap_char])) 649 650 total_info = 0.0 651 652 for letter in obs_freq: 653 inner_log = 0.0 654 # if we have expected frequencies, modify the log value by them 655 # gap characters do not have expected frequencies, so they 656 # should just be the observed frequency. 657 if letter != gap_char: 658 if e_freq_table: 659 inner_log = obs_freq[letter] / e_freq_table[letter] 660 else: 661 inner_log = obs_freq[letter] / random_expected 662 # if the observed frequency is zero, we don't add any info to the 663 # total information content 664 if inner_log > 0: 665 letter_info = (obs_freq[letter] * 666 math.log(inner_log) / math.log(log_base)) 667 total_info += letter_info 668 return total_info
669
670 - def get_column(self, col):
671 # TODO - Deprecate this and implement slicing? 672 return self.alignment[:, col]
673 674
675 -class PSSM(object):
676 """Represent a position specific score matrix. 677 678 This class is meant to make it easy to access the info within a PSSM 679 and also make it easy to print out the information in a nice table. 680 681 Let's say you had an alignment like this:: 682 683 GTATC 684 AT--C 685 CTGTC 686 687 The position specific score matrix (when printed) looks like:: 688 689 G A T C 690 G 1 1 0 1 691 T 0 0 3 0 692 A 1 1 0 0 693 T 0 0 2 0 694 C 0 0 0 3 695 696 You can access a single element of the PSSM using the following:: 697 698 your_pssm[sequence_number][residue_count_name] 699 700 For instance, to get the 'T' residue for the second element in the 701 above alignment you would need to do: 702 703 your_pssm[1]['T'] 704 """ 705
706 - def __init__(self, pssm):
707 """Initialize with pssm data to represent. 708 709 The pssm passed should be a list with the following structure: 710 711 list[0] - The letter of the residue being represented (for instance, 712 from the example above, the first few list[0]s would be GTAT... 713 list[1] - A dictionary with the letter substitutions and counts. 714 """ 715 self.pssm = pssm
716
717 - def __getitem__(self, pos):
718 return self.pssm[pos][1]
719
720 - def __str__(self):
721 out = " " 722 all_residues = sorted(self.pssm[0][1]) 723 724 # first print out the top header 725 for res in all_residues: 726 out += " %s" % res 727 out += "\n" 728 729 # for each item, write out the substitutions 730 for item in self.pssm: 731 out += "%s " % item[0] 732 for res in all_residues: 733 out += " %.1f" % item[1][res] 734 735 out += "\n" 736 return out
737
738 - def get_residue(self, pos):
739 """Return the residue letter at the specified position. 740 """ 741 return self.pssm[pos][0]
742 743 753