Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example, 
  7   
  8  X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9  print X.count_amino_acids() 
 10  print X.get_amino_acids_percent() 
 11  print X.molecular_weight() 
 12  print X.aromaticity() 
 13  print X.instability_index() 
 14  print X.flexibility() 
 15  print X.isoelectric_point() 
 16  print X.secondary_structure_fraction() 
 17  print X.protein_scale(ProtParamData.kd, 9, 0.4) 
 18  """ 
 19   
 20  import sys 
 21  import ProtParamData  # Local 
 22  import IsoelectricPoint  # Local 
 23  from Bio.Seq import Seq 
 24  from Bio.Alphabet import IUPAC 
 25  from Bio.Data import IUPACData 
 26   
 27   
28 -class ProteinAnalysis(object):
29 """Class containing methods for protein analysis. 30 31 The constructor takes two arguments. 32 The first is the protein sequence as a string, which is then converted to a 33 sequence object using the Bio.Seq module. This is done just to make sure 34 the sequence is a protein sequence and not anything else. 35 36 The second argument is optional. If set to True, the weight of the amino 37 acids will be calculated using their monoisotopic mass (the weight of the 38 most abundant isotopes for each element), instead of the average molecular 39 mass (the averaged weight of all stable isotopes for each element). 40 If set to false (the default value) or left out, the IUPAC average 41 molecular mass will be used for the calculation. 42 43 """
44 - def __init__(self, prot_sequence, monoisotopic=False):
45 if prot_sequence.islower(): 46 self.sequence = Seq(prot_sequence.upper(), IUPAC.protein) 47 else: 48 self.sequence = Seq(prot_sequence, IUPAC.protein) 49 self.amino_acids_content = None 50 self.amino_acids_percent = None 51 self.length = len(self.sequence) 52 self.monoisotopic = monoisotopic
53
54 - def count_amino_acids(self):
55 """Count standard amino acids, returns a dict. 56 57 Counts the number times each amino acid is in the protein 58 sequence. Returns a dictionary {AminoAcid:Number}. 59 60 The return value is cached in self.amino_acids_content. 61 It is not recalculated upon subsequent calls. 62 """ 63 if self.amino_acids_content is None: 64 prot_dic = dict([(k, 0) for k in IUPACData.protein_letters]) 65 for aa in prot_dic: 66 prot_dic[aa] = self.sequence.count(aa) 67 68 self.amino_acids_content = prot_dic 69 70 return self.amino_acids_content
71
72 - def get_amino_acids_percent(self):
73 """Calculate the amino acid content in percentages. 74 75 The same as count_amino_acids only returns the Number in percentage of 76 entire sequence. Returns a dictionary of {AminoAcid:percentage}. 77 78 The return value is cached in self.amino_acids_percent. 79 80 input is the dictionary self.amino_acids_content. 81 output is a dictionary with amino acids as keys. 82 """ 83 if self.amino_acids_percent is None: 84 aa_counts = self.count_amino_acids() 85 86 percentages = {} 87 for aa in aa_counts: 88 percentages[aa] = aa_counts[aa] / float(self.length) 89 90 self.amino_acids_percent = percentages 91 92 return self.amino_acids_percent
93
94 - def molecular_weight(self):
95 """Calculate MW from Protein sequence""" 96 # make local dictionary for speed 97 if self.monoisotopic: 98 water = 18.01 99 iupac_weights = IUPACData.monoisotopic_protein_weights 100 else: 101 iupac_weights = IUPACData.protein_weights 102 water = 18.02 103 104 aa_weights = {} 105 for i in iupac_weights: 106 # remove a molecule of water from the amino acid weight 107 aa_weights[i] = iupac_weights[i] - water 108 109 total_weight = water # add just one water molecule for the whole sequence 110 for aa in self.sequence: 111 total_weight += aa_weights[aa] 112 113 return total_weight
114
115 - def aromaticity(self):
116 """Calculate the aromaticity according to Lobry, 1994. 117 118 Calculates the aromaticity value of a protein according to Lobry, 1994. 119 It is simply the relative frequency of Phe+Trp+Tyr. 120 """ 121 aromatic_aas = 'YWF' 122 aa_percentages = self.get_amino_acids_percent() 123 124 aromaticity = sum([aa_percentages[aa] for aa in aromatic_aas]) 125 126 return aromaticity
127
128 - def instability_index(self):
129 """Calculate the instability index according to Guruprasad et al 1990. 130 131 Implementation of the method of Guruprasad et al. 1990 to test a 132 protein for stability. Any value above 40 means the protein is unstable 133 (has a short half life). 134 135 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 136 Protein Engineering 4:155-161(1990). 137 """ 138 index = ProtParamData.DIWV 139 score = 0.0 140 141 for i in range(self.length - 1): 142 this, next = self.sequence[i:i+2] 143 dipeptide_value = index[this][next] 144 score += dipeptide_value 145 146 return (10.0 / self.length) * score
147
148 - def flexibility(self):
149 """Calculate the flexibility according to Vihinen, 1994. 150 151 No argument to change window size because parameters are specific for a 152 window=9. The parameters used are optimized for determining the flexibility. 153 """ 154 flexibilities = ProtParamData.Flex 155 window_size = 9 156 weights = [0.25, 0.4375, 0.625, 0.8125, 1] 157 scores = [] 158 159 for i in range(self.length - window_size): 160 subsequence = self.sequence[i:i+window_size] 161 score = 0.0 162 163 for j in range(window_size // 2): 164 front = subsequence[j] 165 back = subsequence[window_size - j - 1] 166 score += (flexibilities[front] + flexibilities[back]) * weights[j] 167 168 middle = subsequence[window_size // 2 + 1] 169 score += flexibilities[middle] 170 171 scores.append(score / 5.25) 172 173 return scores
174
175 - def gravy(self):
176 """Calculate the gravy according to Kyte and Doolittle.""" 177 total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence) 178 179 return total_gravy / self.length
180
181 - def _weight_list(self, window, edge):
182 """Makes a list of relative weight of the 183 window edges compared to the window center. The weights are linear. 184 it actually generates half a list. For a window of size 9 and edge 0.4 185 you get a list of [0.4, 0.55, 0.7, 0.85]. 186 """ 187 unit = 2 * (1.0 - edge) / (window - 1) 188 weights = [0.0] * (window // 2) 189 190 for i in range(window // 2): 191 weights[i] = edge + unit * i 192 193 return weights
194
195 - def protein_scale(self, param_dict, window, edge=1.0):
196 """Compute a profile by any amino acid scale. 197 198 An amino acid scale is defined by a numerical value assigned to each type of 199 amino acid. The most frequently used scales are the hydrophobicity or 200 hydrophilicity scales and the secondary structure conformational parameters 201 scales, but many other scales exist which are based on different chemical and 202 physical properties of the amino acids. You can set several parameters that 203 control the computation of a scale profile, such as the window size and the 204 window edge relative weight value. 205 206 WindowSize: The window size is the length 207 of the interval to use for the profile computation. For a window size n, we 208 use the i-(n-1)/2 neighboring residues on each side to compute 209 the score for residue i. The score for residue i is the sum of the scaled values 210 for these amino acids, optionally weighted according to their position in the 211 window. 212 213 Edge: The central amino acid of the window always has a weight of 1. 214 By default, the amino acids at the remaining window positions have the same 215 weight, but you can make the residue at the center of the window have a 216 larger weight than the others by setting the edge value for the residues at 217 the beginning and end of the interval to a value between 0 and 1. For 218 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 219 1.0, 0.7, 0.4. 220 221 The method returns a list of values which can be plotted to 222 view the change along a protein sequence. Many scales exist. Just add your 223 favorites to the ProtParamData modules. 224 225 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 226 """ 227 # generate the weights 228 # _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 229 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 230 # in the loop. 231 weights = self._weight_list(window, edge) 232 scores = [] 233 234 # the score in each Window is divided by the sum of weights 235 # (* 2 + 1) since the weight list is one sided: 236 sum_of_weights = sum(weights) * 2 + 1 237 238 for i in range(self.length - window + 1): 239 subsequence = self.sequence[i:i+window] 240 score = 0.0 241 242 for j in range(window // 2): 243 # walk from the outside of the Window towards the middle. 244 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid 245 try: 246 front = param_dict[subsequence[j]] 247 back = param_dict[subsequence[window - j - 1]] 248 score += weights[j] * front + weights[j] * back 249 except KeyError: 250 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 251 (subsequence[j], subsequence[window - j - 1])) 252 253 # Now add the middle value, which always has a weight of 1. 254 middle = subsequence[window // 2] 255 if middle in param_dict: 256 score += param_dict[middle] 257 else: 258 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (middle)) 259 260 scores.append(score / sum_of_weights) 261 262 return scores
263
264 - def isoelectric_point(self):
265 """Calculate the isoelectric point. 266 267 Uses the module IsoelectricPoint to calculate the pI of a protein. 268 """ 269 aa_content = self.count_amino_acids() 270 271 ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content) 272 return ie_point.pi()
273
275 """Calculate fraction of helix, turn and sheet. 276 277 Returns a list of the fraction of amino acids which tend 278 to be in Helix, Turn or Sheet. 279 280 Amino acids in helix: V, I, Y, F, W, L. 281 Amino acids in Turn: N, P, G, S. 282 Amino acids in sheet: E, M, A, L. 283 284 Returns a tuple of three integers (Helix, Turn, Sheet). 285 """ 286 aa_percentages = self.get_amino_acids_percent() 287 288 helix = sum([aa_percentages[r] for r in 'VIYFWL']) 289 turn = sum([aa_percentages[r] for r in 'NPGS']) 290 sheet = sum([aa_percentages[r] for r in 'EMAL']) 291 292 return helix, turn, sheet
293