1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13 import math
14 import sys
15
16
17 from Bio import Alphabet
18 from Bio.Alphabet import IUPAC
19 from Bio.Seq import Seq
20 from Bio.SubsMat import FreqTable
21
22
23
24 Protein20Random = 0.05
25 Nucleotide4Random = 0.25
26
27
29 """Calculate summary info about the alignment.
30
31 This class should be used to caclculate information summarizing the
32 results of an alignment. This may either be straight consensus info
33 or more complicated things.
34 """
36 """Initialize with the alignment to calculate information on.
37 ic_vector attribute. A dictionary. Keys: column numbers. Values:
38 """
39 self.alignment = alignment
40 self.ic_vector = {}
41
42 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
43 consensus_alpha = None, require_multiple = 0):
44 """Output a fast consensus sequence of the alignment.
45
46 This doesn't do anything fancy at all. It will just go through the
47 sequence residue by residue and count up the number of each type
48 of residue (ie. A or G or T or C for DNA) in all sequences in the
49 alignment. If the percentage of the most common residue type is
50 greater then the passed threshold, then we will add that residue type,
51 otherwise an ambiguous character will be added.
52
53 This could be made a lot fancier (ie. to take a substitution matrix
54 into account), but it just meant for a quick and dirty consensus.
55
56 Arguments:
57 o threshold - The threshold value that is required to add a particular
58 atom.
59 o ambiguous - The ambiguous character to be added when the threshold is
60 not reached.
61 o consensus_alpha - The alphabet to return for the consensus sequence.
62 If this is None, then we will try to guess the alphabet.
63 o require_multiple - If set as 1, this will require that more than
64 1 sequence be part of an alignment to put it in the consensus (ie.
65 not just 1 sequence and gaps).
66 """
67
68 consensus = ''
69
70
71 con_len = self.alignment.get_alignment_length()
72
73
74 for n in range(con_len):
75
76 atom_dict = {}
77 num_atoms = 0
78
79 for record in self.alignment._records:
80
81
82 if n < len(record.seq):
83 if record.seq[n] != '-' and record.seq[n] != '.':
84 if record.seq[n] not in atom_dict:
85 atom_dict[record.seq[n]] = 1
86 else:
87 atom_dict[record.seq[n]] += 1
88
89 num_atoms = num_atoms + 1
90
91 max_atoms = []
92 max_size = 0
93
94 for atom in atom_dict:
95 if atom_dict[atom] > max_size:
96 max_atoms = [atom]
97 max_size = atom_dict[atom]
98 elif atom_dict[atom] == max_size:
99 max_atoms.append(atom)
100
101 if require_multiple and num_atoms == 1:
102 consensus += ambiguous
103 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
104 >= threshold):
105 consensus += max_atoms[0]
106 else:
107 consensus += ambiguous
108
109
110 if consensus_alpha is None:
111 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
112
113 return Seq(consensus, consensus_alpha)
114
115 - def gap_consensus(self, threshold = .7, ambiguous = "X",
116 consensus_alpha = None, require_multiple = 0):
117 """Same as dumb_consensus(), but allows gap on the output.
118
119 Things to do: Let the user define that with only one gap, the result
120 character in consensus is gap. Let the user select gap character, now
121 it takes the same is input.
122 """
123
124 consensus = ''
125
126
127 con_len = self.alignment.get_alignment_length()
128
129
130 for n in range(con_len):
131
132 atom_dict = {}
133 num_atoms = 0
134
135 for record in self.alignment._records:
136
137
138 if n < len(record.seq):
139 if record.seq[n] not in atom_dict:
140 atom_dict[record.seq[n]] = 1
141 else:
142 atom_dict[record.seq[n]] += 1
143
144 num_atoms += 1
145
146 max_atoms = []
147 max_size = 0
148
149 for atom in atom_dict:
150 if atom_dict[atom] > max_size:
151 max_atoms = [atom]
152 max_size = atom_dict[atom]
153 elif atom_dict[atom] == max_size:
154 max_atoms.append(atom)
155
156 if require_multiple and num_atoms == 1:
157 consensus += ambiguous
158 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
159 >= threshold):
160 consensus += max_atoms[0]
161 else:
162 consensus += ambiguous
163
164
165 if consensus_alpha is None:
166
167 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
168
169 return Seq(consensus, consensus_alpha)
170
212
214 """Generate a replacement dictionary to plug into a substitution matrix
215
216 This should look at an alignment, and be able to generate the number
217 of substitutions of different residues for each other in the
218 aligned object.
219
220 Will then return a dictionary with this information:
221 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
222
223 This also treats weighted sequences. The following example shows how
224 we calculate the replacement dictionary. Given the following
225 multiple sequence alignments:
226
227 GTATC 0.5
228 AT--C 0.8
229 CTGTC 1.0
230
231 For the first column we have:
232 ('A', 'G') : 0.5 * 0.8 = 0.4
233 ('C', 'G') : 0.5 * 1.0 = 0.5
234 ('A', 'C') : 0.8 * 1.0 = 0.8
235
236 We then continue this for all of the columns in the alignment, summing
237 the information for each substitution in each column, until we end
238 up with the replacement dictionary.
239
240 Arguments:
241 o skip_chars - A list of characters to skip when creating the dictionary.
242 For instance, you might have Xs (screened stuff) or Ns, and not want
243 to include the ambiguity characters in the dictionary.
244 """
245
246 rep_dict, skip_items = self._get_base_replacements(skip_chars)
247
248
249 for rec_num1 in range(len(self.alignment._records)):
250
251
252 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
253
254
255 rep_dict = self._pair_replacement(
256 self.alignment._records[rec_num1].seq,
257 self.alignment._records[rec_num2].seq,
258 self.alignment._records[rec_num1].annotations.get('weight',1.0),
259 self.alignment._records[rec_num2].annotations.get('weight',1.0),
260 rep_dict, skip_items)
261
262 return rep_dict
263
264 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
265 start_dict, ignore_chars):
266 """Compare two sequences and generate info on the replacements seen.
267
268 Arguments:
269 o seq1, seq2 - The two sequences to compare.
270 o weight1, weight2 - The relative weights of seq1 and seq2.
271 o start_dict - The dictionary containing the starting replacement
272 info that we will modify.
273 o ignore_chars - A list of characters to ignore when calculating
274 replacements (ie. '-').
275
276 Returns:
277 o A replacment dictionary which is modified from initial_dict with
278 the information from the sequence comparison.
279 """
280
281 for residue_num in range(len(seq1)):
282 residue1 = seq1[residue_num]
283 try:
284 residue2 = seq2[residue_num]
285
286
287 except IndexError:
288 return start_dict
289
290
291 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
292 try:
293
294
295 start_dict[(residue1, residue2)] += weight1 * weight2
296
297
298 except KeyError:
299 raise ValueError("Residues %s, %s not found in alphabet %s"
300 % (residue1, residue2,
301 self.alignment._alphabet))
302
303 return start_dict
304
306 """Returns a string containing the expected letters in the alignment."""
307 all_letters = self.alignment._alphabet.letters
308 if all_letters is None \
309 or (isinstance(self.alignment._alphabet, Alphabet.Gapped)
310 and all_letters == self.alignment._alphabet.gap_char):
311
312
313
314 set_letters = set()
315 for record in self.alignment:
316
317
318 set_letters = set_letters.union(record.seq)
319 list_letters = list(set_letters)
320 list_letters.sort()
321 all_letters = "".join(list_letters)
322 return all_letters
323
325 """Get a zeroed dictionary of all possible letter combinations.
326
327 This looks at the type of alphabet and gets the letters for it.
328 It then creates a dictionary with all possible combinations of these
329 letters as keys (ie. ('A', 'G')) and sets the values as zero.
330
331 Returns:
332 o The base dictionary created
333 o A list of alphabet items to skip when filling the dictionary.Right
334 now the only thing I can imagine in this list is gap characters, but
335 maybe X's or something else might be useful later. This will also
336 include any characters that are specified to be skipped.
337 """
338 base_dictionary = {}
339 all_letters = self._get_all_letters()
340
341
342
343 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
344 skip_items.append(self.alignment._alphabet.gap_char)
345 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'')
346
347
348 for first_letter in all_letters:
349 for second_letter in all_letters:
350 if first_letter not in skip_items and \
351 second_letter not in skip_items:
352 base_dictionary[(first_letter, second_letter)] = 0
353
354 return base_dictionary, skip_items
355
358 """Create a position specific score matrix object for the alignment.
359
360 This creates a position specific score matrix (pssm) which is an
361 alternative method to look at a consensus sequence.
362
363 Arguments:
364 o chars_to_ignore - A listing of all characters not to include in
365 the pssm. If the alignment alphabet declares a gap character,
366 then it will be excluded automatically.
367 o axis_seq - An optional argument specifying the sequence to
368 put on the axis of the PSSM. This should be a Seq object. If nothing
369 is specified, the consensus sequence, calculated with default
370 parameters, will be used.
371
372 Returns:
373 o A PSSM (position specific score matrix) object.
374 """
375
376 all_letters = self._get_all_letters()
377 assert all_letters
378
379 if not isinstance(chars_to_ignore, list):
380 raise TypeError("chars_to_ignore should be a list.")
381
382
383 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
384 chars_to_ignore.append(self.alignment._alphabet.gap_char)
385
386 for char in chars_to_ignore:
387 all_letters = all_letters.replace(char, '')
388
389 if axis_seq:
390 left_seq = axis_seq
391 assert len(axis_seq) == self.alignment.get_alignment_length()
392 else:
393 left_seq = self.dumb_consensus()
394
395 pssm_info = []
396
397 for residue_num in range(len(left_seq)):
398 score_dict = self._get_base_letters(all_letters)
399 for record in self.alignment._records:
400 try:
401 this_residue = record.seq[residue_num]
402
403
404 except IndexError:
405 this_residue = None
406
407 if this_residue and this_residue not in chars_to_ignore:
408 weight = record.annotations.get('weight', 1.0)
409 try:
410 score_dict[this_residue] += weight
411
412 except KeyError:
413 raise ValueError("Residue %s not found in alphabet %s"
414 % (this_residue,
415 self.alignment._alphabet))
416
417 pssm_info.append((left_seq[residue_num],
418 score_dict))
419
420 return PSSM(pssm_info)
421
423 """Create a zeroed dictionary with all of the specified letters.
424 """
425 base_info = {}
426 for letter in letters:
427 base_info[letter] = 0
428
429 return base_info
430
431 - def information_content(self, start = 0,
432 end = None,
433 e_freq_table = None, log_base = 2,
434 chars_to_ignore = []):
435 """Calculate the information content for each residue along an alignment.
436
437 Arguments:
438 o start, end - The starting an ending points to calculate the
439 information content. These points should be relative to the first
440 sequence in the alignment, starting at zero (ie. even if the 'real'
441 first position in the seq is 203 in the initial sequence, for
442 the info content, we need to use zero). This defaults to the entire
443 length of the first sequence.
444 o e_freq_table - A FreqTable object specifying the expected frequencies
445 for each letter in the alphabet we are using (e.g. {'G' : 0.4,
446 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
447 included, since these should not have expected frequencies.
448 o log_base - The base of the logathrim to use in calculating the
449 information content. This defaults to 2 so the info is in bits.
450 o chars_to_ignore - A listing of characterw which should be ignored
451 in calculating the info content.
452
453 Returns:
454 o A number representing the info content for the specified region.
455
456 Please see the Biopython manual for more information on how information
457 content is calculated.
458 """
459
460 if end is None:
461 end = len(self.alignment._records[0].seq)
462
463 if start < 0 or end > len(self.alignment._records[0].seq):
464 raise ValueError("Start (%s) and end (%s) are not in the \
465 range %s to %s"
466 % (start, end, 0, len(self.alignment._records[0].seq)))
467
468 random_expected = None
469 if not e_freq_table:
470
471 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
472 if isinstance(base_alpha, Alphabet.ProteinAlphabet):
473 random_expected = Protein20Random
474 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet):
475 random_expected = Nucleotide4Random
476 else:
477 errstr = "Error in alphabet: not Nucleotide or Protein, "
478 errstr += "supply expected frequencies"
479 raise ValueError(errstr)
480 del base_alpha
481 elif not isinstance(e_freq_table, FreqTable.FreqTable):
482 raise ValueError("e_freq_table should be a FreqTable object")
483
484
485 all_letters = self._get_all_letters()
486 for char in chars_to_ignore:
487 all_letters = all_letters.replace(char, '')
488
489 info_content = {}
490 for residue_num in range(start, end):
491 freq_dict = self._get_letter_freqs(residue_num,
492 self.alignment._records,
493 all_letters, chars_to_ignore)
494
495 column_score = self._get_column_info_content(freq_dict,
496 e_freq_table,
497 log_base,
498 random_expected)
499
500 info_content[residue_num] = column_score
501
502 total_info = sum(info_content.itervalues())
503
504 for i in info_content:
505 self.ic_vector[i] = info_content[i]
506 return total_info
507
509 """Determine the frequency of specific letters in the alignment.
510
511 Arguments:
512 o residue_num - The number of the column we are getting frequencies
513 from.
514 o all_records - All of the SeqRecords in the alignment.
515 o letters - The letters we are interested in getting the frequency
516 for.
517 o to_ignore - Letters we are specifically supposed to ignore.
518
519 This will calculate the frequencies of each of the specified letters
520 in the alignment at the given frequency, and return this as a
521 dictionary where the keys are the letters and the values are the
522 frequencies.
523 """
524 freq_info = self._get_base_letters(letters)
525
526 total_count = 0
527
528 for record in all_records:
529 try:
530 if record.seq[residue_num] not in to_ignore:
531 weight = record.annotations.get('weight',1.0)
532 freq_info[record.seq[residue_num]] += weight
533 total_count += weight
534
535 except KeyError:
536 raise ValueError("Residue %s not found in alphabet %s"
537 % (record.seq[residue_num],
538 self.alignment._alphabet))
539
540 if total_count == 0:
541
542 for letter in freq_info:
543 assert freq_info[letter] == 0
544
545 else:
546
547 for letter in freq_info:
548 freq_info[letter] = freq_info[letter] / total_count
549
550 return freq_info
551
552 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
553 random_expected):
554 """Calculate the information content for a column.
555
556 Arguments:
557 o obs_freq - The frequencies observed for each letter in the column.
558 o e_freq_table - An optional argument specifying the expected
559 frequencies for each letter. This is a SubsMat.FreqTable instance.
560 o log_base - The base of the logathrim to use in calculating the
561 info content.
562 """
563 try:
564 gap_char = self.alignment._alphabet.gap_char
565 except AttributeError:
566
567
568 gap_char = "-"
569
570 if e_freq_table:
571 if not isinstance(e_freq_table, FreqTable.FreqTable):
572 raise ValueError("e_freq_table should be a FreqTable object")
573
574 for key in obs_freq:
575 if (key != gap_char and key not in e_freq_table):
576 raise ValueError("Expected frequency letters %s "
577 "do not match observed %s"
578 % (e_freq_table.keys(),
579 obs_freq.keys() - [gap_char]))
580
581 total_info = 0.0
582
583 for letter in obs_freq:
584 inner_log = 0.0
585
586
587
588 if letter != gap_char:
589 if e_freq_table:
590 inner_log = obs_freq[letter] / e_freq_table[letter]
591 else:
592 inner_log = obs_freq[letter] / random_expected
593
594
595 if inner_log > 0:
596 letter_info = (obs_freq[letter] *
597 math.log(inner_log) / math.log(log_base))
598 total_info += letter_info
599 return total_info
600
603
604
606 """Represent a position specific score matrix.
607
608 This class is meant to make it easy to access the info within a PSSM
609 and also make it easy to print out the information in a nice table.
610
611 Let's say you had an alignment like this:
612 GTATC
613 AT--C
614 CTGTC
615
616 The position specific score matrix (when printed) looks like:
617
618 G A T C
619 G 1 1 0 1
620 T 0 0 3 0
621 A 1 1 0 0
622 T 0 0 2 0
623 C 0 0 0 3
624
625 You can access a single element of the PSSM using the following:
626
627 your_pssm[sequence_number][residue_count_name]
628
629 For instance, to get the 'T' residue for the second element in the
630 above alignment you would need to do:
631
632 your_pssm[1]['T']
633 """
635 """Initialize with pssm data to represent.
636
637 The pssm passed should be a list with the following structure:
638
639 list[0] - The letter of the residue being represented (for instance,
640 from the example above, the first few list[0]s would be GTAT...
641 list[1] - A dictionary with the letter substitutions and counts.
642 """
643 self.pssm = pssm
644
646 return self.pssm[pos][1]
647
649 out = " "
650 all_residues = self.pssm[0][1].keys()
651 all_residues.sort()
652
653
654 for res in all_residues:
655 out += " %s" % res
656 out += "\n"
657
658
659 for item in self.pssm:
660 out += "%s " % item[0]
661 for res in all_residues:
662 out += " %.1f" % item[1][res]
663
664 out += "\n"
665 return out
666
668 """Return the residue letter at the specified position.
669 """
670 return self.pssm[pos][0]
671
672
673 -def print_info_content(summary_info,fout=None,rep_record=0):
674 """ Three column output: position, aa in representative sequence,
675 ic_vector value"""
676 fout = fout or sys.stdout
677 if not summary_info.ic_vector:
678 summary_info.information_content()
679 rep_sequence = summary_info.alignment._records[rep_record].seq
680 positions = summary_info.ic_vector.keys()
681 positions.sort()
682 for pos in positions:
683 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
684 summary_info.ic_vector[pos]))
685
686 if __name__ == "__main__":
687 print "Quick test"
688 from Bio import AlignIO
689 from Bio.Align.Generic import Alignment
690
691 filename = "../../Tests/GFF/multi.fna"
692 format = "fasta"
693 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
694 FreqTable.FREQ,
695 IUPAC.unambiguous_dna)
696
697 alignment = AlignIO.read(open(filename), format)
698 for record in alignment:
699 print str(record.seq)
700 print "="*alignment.get_alignment_length()
701
702 summary = SummaryInfo(alignment)
703 consensus = summary.dumb_consensus(ambiguous="N")
704 print consensus
705 consensus = summary.gap_consensus(ambiguous="N")
706 print consensus
707 print
708 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
709 axis_seq=consensus)
710 print
711
712
713 print summary.information_content(e_freq_table=expected,
714 chars_to_ignore=['-'])
715 print
716 print "Trying a protein sequence with gaps and stops"
717
718 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
719 a = Alignment(alpha)
720 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
721 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
722 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
723 print a
724 print "="*a.get_alignment_length()
725
726 s = SummaryInfo(a)
727 c = s.dumb_consensus(ambiguous="X")
728 print c
729 c = s.gap_consensus(ambiguous="X")
730 print c
731 print
732 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
733
734 print s.information_content(chars_to_ignore=['-', '*'])
735
736 print "Done"
737