1 import math
2 from CodonUsageIndices import SharpEcoliIndex
3 from Bio import SeqIO
4
5
6 CodonsDict = {'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0,
7 'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0,
8 'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0,
9 'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0,
10 'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0,
11 'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0,
12 'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0,
13 'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0,
14 'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0,
15 'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0,
16 'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0,
17 'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0,
18 'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0}
19
20
21
22 SynonymousCodons = {
23 'CYS': ['TGT', 'TGC'],
24 'ASP': ['GAT', 'GAC'],
25 'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
26 'GLN': ['CAA', 'CAG'],
27 'MET': ['ATG'],
28 'ASN': ['AAC', 'AAT'],
29 'PRO': ['CCT', 'CCG', 'CCA', 'CCC'],
30 'LYS': ['AAG', 'AAA'],
31 'STOP': ['TAG', 'TGA', 'TAA'],
32 'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
33 'PHE': ['TTT', 'TTC'],
34 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
35 'GLY': ['GGT', 'GGG', 'GGA', 'GGC'],
36 'ILE': ['ATC', 'ATA', 'ATT'],
37 'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'],
38 'HIS': ['CAT', 'CAC'],
39 'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'],
40 'TRP': ['TGG'],
41 'VAL': ['GTA', 'GTC', 'GTG', 'GTT'],
42 'GLU': ['GAG', 'GAA'],
43 'TYR': ['TAT', 'TAC']
44 }
45
46
48 """A codon adaptation index (CAI) implementation.
49
50 Implements the codon adaptation index (CAI) described by Sharp and
51 Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
52
53 NOTE - This implementation does not currently cope with alternative genetic
54 codes: only the synonymous codons in the standard table are considered.
55 """
56
58 self.index = {}
59 self.codon_count = {}
60
61
63 """Sets up an index to be used when calculating CAI for a gene.
64 Just pass a dictionary similar to the SharpEcoliIndex in the
65 CodonUsageIndices module.
66 """
67 self.index = index
68
70 """Generate a codon usage index from a FASTA file of CDS sequences.
71
72 Takes a location of a Fasta file containing CDS sequences
73 (which must all have a whole number of codons) and generates a codon
74 usage index.
75 """
76
77
78 if self.index != {} or self.codon_count != {}:
79 raise ValueError("an index has already been set or a codon count has been done. cannot overwrite either.")
80
81
82 self._count_codons(fasta_file)
83
84
85
86 for aa in SynonymousCodons:
87 total = 0.0
88 rcsu = []
89 codons = SynonymousCodons[aa]
90
91 for codon in codons:
92 total += self.codon_count[codon]
93
94
95 for codon in codons:
96 denominator = float(total) / len(codons)
97 rcsu.append(self.codon_count[codon] / denominator)
98
99
100 rcsu_max = max(rcsu)
101 for i in range(len(codons)):
102 self.index[codons[i]] = rcsu[i] / rcsu_max
103
105 """Calculate the CAI (float) for the provided DNA sequence (string).
106
107 This method uses the Index (either the one you set or the one you generated)
108 and returns the CAI for the DNA sequence.
109 """
110 cai_value, cai_length = 0, 0
111
112
113 if self.index == {}:
114 self.set_cai_index(SharpEcoliIndex)
115
116 if dna_sequence.islower():
117 dna_sequence = dna_sequence.upper()
118
119 for i in range(0, len(dna_sequence), 3):
120 codon = dna_sequence[i:i+3]
121 if codon in self.index:
122 if codon not in ['ATG', 'TGG']:
123 cai_value += math.log(self.index[codon])
124 cai_length += 1
125 elif codon not in ['TGA', 'TAA', 'TAG']:
126 raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index))
127
128 return math.exp(cai_value / (cai_length - 1.0))
129
131 handle = open(fasta_file, 'r')
132
133
134 self.codon_count = CodonsDict.copy()
135
136
137 for cur_record in SeqIO.parse(handle, "fasta"):
138
139 if str(cur_record.seq).islower():
140 dna_sequence = str(cur_record.seq).upper()
141 else:
142 dna_sequence = str(cur_record.seq)
143 for i in range(0, len(dna_sequence), 3):
144 codon = dna_sequence[i:i+3]
145 if codon in self.codon_count:
146 self.codon_count[codon] += 1
147 else:
148 raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
149 handle.close()
150
151
153 """Prints out the index used."""
154 for i in sorted(self.index):
155 print "%s\t%.3f" % (i, self.index[i])
156