Package Bio :: Package SeqUtils :: Module CodonUsage
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.CodonUsage

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  from __future__ import print_function 
  7   
  8  import math 
  9  from .CodonUsageIndices import SharpEcoliIndex 
 10  from Bio import SeqIO  # To parse a FASTA file 
 11   
 12   
 13  CodonsDict = {'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0, 
 14  'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0, 
 15  'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0, 
 16  'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0, 
 17  'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0, 
 18  'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0, 
 19  'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0, 
 20  'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0, 
 21  'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0, 
 22  'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0, 
 23  'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0, 
 24  'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0, 
 25  'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0} 
 26   
 27   
 28  # this dictionary shows which codons encode the same AA 
 29  SynonymousCodons = { 
 30      'CYS': ['TGT', 'TGC'], 
 31      'ASP': ['GAT', 'GAC'], 
 32      'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'], 
 33      'GLN': ['CAA', 'CAG'], 
 34      'MET': ['ATG'], 
 35      'ASN': ['AAC', 'AAT'], 
 36      'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 
 37      'LYS': ['AAG', 'AAA'], 
 38      'STOP': ['TAG', 'TGA', 'TAA'], 
 39      'THR': ['ACC', 'ACA', 'ACG', 'ACT'], 
 40      'PHE': ['TTT', 'TTC'], 
 41      'ALA': ['GCA', 'GCC', 'GCG', 'GCT'], 
 42      'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 
 43      'ILE': ['ATC', 'ATA', 'ATT'], 
 44      'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 
 45      'HIS': ['CAT', 'CAC'], 
 46      'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 
 47      'TRP': ['TGG'], 
 48      'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 
 49      'GLU': ['GAG', 'GAA'], 
 50      'TYR': ['TAT', 'TAC'] 
 51  } 
 52   
 53   
54 -class CodonAdaptationIndex(object):
55 """A codon adaptation index (CAI) implementation. 56 57 Implements the codon adaptation index (CAI) described by Sharp and 58 Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95). 59 60 NOTE - This implementation does not currently cope with alternative genetic 61 codes: only the synonymous codons in the standard table are considered. 62 """ 63
64 - def __init__(self):
65 self.index = {} 66 self.codon_count = {}
67 68 # use this method with predefined CAI index
69 - def set_cai_index(self, index):
70 """Sets up an index to be used when calculating CAI for a gene. 71 72 Just pass a dictionary similar to the SharpEcoliIndex in the 73 CodonUsageIndices module. 74 """ 75 self.index = index
76
77 - def generate_index(self, fasta_file):
78 """Generate a codon usage index from a FASTA file of CDS sequences. 79 80 Takes a location of a Fasta file containing CDS sequences 81 (which must all have a whole number of codons) and generates a codon 82 usage index. 83 """ 84 85 # first make sure we're not overwriting an existing index: 86 if self.index != {} or self.codon_count != {}: 87 raise ValueError("an index has already been set or a codon count has been done. cannot overwrite either.") 88 89 # count codon occurrences in the file. 90 self._count_codons(fasta_file) 91 92 # now to calculate the index we first need to sum the number of times 93 # synonymous codons were used all together. 94 for aa in SynonymousCodons: 95 total = 0.0 96 rcsu = [] # RCSU values are CodonCount/((1/num of synonymous codons) * sum of all synonymous codons) 97 codons = SynonymousCodons[aa] 98 99 for codon in codons: 100 total += self.codon_count[codon] 101 102 # calculate the RSCU value for each of the codons 103 for codon in codons: 104 denominator = float(total) / len(codons) 105 rcsu.append(self.codon_count[codon] / denominator) 106 107 # now generate the index W=RCSUi/RCSUmax: 108 rcsu_max = max(rcsu) 109 for codon_index, codon in enumerate(codons): 110 self.index[codon] = rcsu[codon_index] / rcsu_max
111
112 - def cai_for_gene(self, dna_sequence):
113 """Calculate the CAI (float) for the provided DNA sequence (string). 114 115 This method uses the Index (either the one you set or the one you generated) 116 and returns the CAI for the DNA sequence. 117 """ 118 cai_value, cai_length = 0, 0 119 120 # if no index is set or generated, the default SharpEcoliIndex will be used. 121 if self.index == {}: 122 self.set_cai_index(SharpEcoliIndex) 123 124 if dna_sequence.islower(): 125 dna_sequence = dna_sequence.upper() 126 127 for i in range(0, len(dna_sequence), 3): 128 codon = dna_sequence[i:i + 3] 129 if codon in self.index: 130 if codon not in ['ATG', 'TGG']: # these two codons are always one, exclude them 131 cai_value += math.log(self.index[codon]) 132 cai_length += 1 133 elif codon not in ['TGA', 'TAA', 'TAG']: # some indices may not include stop codons 134 raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index)) 135 136 return math.exp(cai_value / (cai_length - 1.0))
137
138 - def _count_codons(self, fasta_file):
139 with open(fasta_file, 'r') as handle: 140 141 # make the codon dictionary local 142 self.codon_count = CodonsDict.copy() 143 144 # iterate over sequence and count all the codons in the FastaFile. 145 for cur_record in SeqIO.parse(handle, "fasta"): 146 # make sure the sequence is lower case 147 if str(cur_record.seq).islower(): 148 dna_sequence = str(cur_record.seq).upper() 149 else: 150 dna_sequence = str(cur_record.seq) 151 for i in range(0, len(dna_sequence), 3): 152 codon = dna_sequence[i:i + 3] 153 if codon in self.codon_count: 154 self.codon_count[codon] += 1 155 else: 156 raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
157 158 # this just gives the index when the objects is printed.
159 - def print_index(self):
160 """Prints out the index used.""" 161 for i in sorted(self.index): 162 print("%s\t%.3f" % (i, self.index[i]))
163