Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example:: 
  7   
  8      X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9      print(X.count_amino_acids()) 
 10      print(X.get_amino_acids_percent()) 
 11      print(X.molecular_weight()) 
 12      print(X.aromaticity()) 
 13      print(X.instability_index()) 
 14      print(X.flexibility()) 
 15      print(X.isoelectric_point()) 
 16      print(X.secondary_structure_fraction()) 
 17      print(X.protein_scale(ProtParamData.kd, 9, 0.4)) 
 18   
 19  """ 
 20   
 21  from __future__ import print_function 
 22   
 23  import sys 
 24  from Bio.SeqUtils import ProtParamData  # Local 
 25  from Bio.SeqUtils import IsoelectricPoint  # Local 
 26  from Bio.Seq import Seq 
 27  from Bio.Alphabet import IUPAC 
 28  from Bio.Data import IUPACData 
 29  from Bio.SeqUtils import molecular_weight 
 30   
 31   
32 -class ProteinAnalysis(object):
33 """Class containing methods for protein analysis. 34 35 The constructor takes two arguments. 36 The first is the protein sequence as a string, which is then converted to a 37 sequence object using the Bio.Seq module. This is done just to make sure 38 the sequence is a protein sequence and not anything else. 39 40 The second argument is optional. If set to True, the weight of the amino 41 acids will be calculated using their monoisotopic mass (the weight of the 42 most abundant isotopes for each element), instead of the average molecular 43 mass (the averaged weight of all stable isotopes for each element). 44 If set to false (the default value) or left out, the IUPAC average 45 molecular mass will be used for the calculation. 46 47 """ 48
49 - def __init__(self, prot_sequence, monoisotopic=False):
50 if prot_sequence.islower(): 51 self.sequence = Seq(prot_sequence.upper(), IUPAC.protein) 52 else: 53 self.sequence = Seq(prot_sequence, IUPAC.protein) 54 self.amino_acids_content = None 55 self.amino_acids_percent = None 56 self.length = len(self.sequence) 57 self.monoisotopic = monoisotopic
58
59 - def count_amino_acids(self):
60 """Count standard amino acids, returns a dict. 61 62 Counts the number times each amino acid is in the protein 63 sequence. Returns a dictionary {AminoAcid:Number}. 64 65 The return value is cached in self.amino_acids_content. 66 It is not recalculated upon subsequent calls. 67 """ 68 if self.amino_acids_content is None: 69 prot_dic = dict((k, 0) for k in IUPACData.protein_letters) 70 for aa in prot_dic: 71 prot_dic[aa] = self.sequence.count(aa) 72 73 self.amino_acids_content = prot_dic 74 75 return self.amino_acids_content
76
77 - def get_amino_acids_percent(self):
78 """Calculate the amino acid content in percentages. 79 80 The same as count_amino_acids only returns the Number in percentage of 81 entire sequence. Returns a dictionary of {AminoAcid:percentage}. 82 83 The return value is cached in self.amino_acids_percent. 84 85 input is the dictionary self.amino_acids_content. 86 output is a dictionary with amino acids as keys. 87 """ 88 if self.amino_acids_percent is None: 89 aa_counts = self.count_amino_acids() 90 91 percentages = {} 92 for aa in aa_counts: 93 percentages[aa] = aa_counts[aa] / float(self.length) 94 95 self.amino_acids_percent = percentages 96 97 return self.amino_acids_percent
98
99 - def molecular_weight(self):
100 """Calculate MW from Protein sequence""" 101 return molecular_weight(self.sequence, monoisotopic=self.monoisotopic)
102
103 - def aromaticity(self):
104 """Calculate the aromaticity according to Lobry, 1994. 105 106 Calculates the aromaticity value of a protein according to Lobry, 1994. 107 It is simply the relative frequency of Phe+Trp+Tyr. 108 """ 109 aromatic_aas = 'YWF' 110 aa_percentages = self.get_amino_acids_percent() 111 112 aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas) 113 114 return aromaticity
115
116 - def instability_index(self):
117 """Calculate the instability index according to Guruprasad et al 1990. 118 119 Implementation of the method of Guruprasad et al. 1990 to test a 120 protein for stability. Any value above 40 means the protein is unstable 121 (has a short half life). 122 123 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 124 Protein Engineering 4:155-161(1990). 125 """ 126 index = ProtParamData.DIWV 127 score = 0.0 128 129 for i in range(self.length - 1): 130 this, next = self.sequence[i:i + 2] 131 dipeptide_value = index[this][next] 132 score += dipeptide_value 133 134 return (10.0 / self.length) * score
135
136 - def flexibility(self):
137 """Calculate the flexibility according to Vihinen, 1994. 138 139 No argument to change window size because parameters are specific for a 140 window=9. The parameters used are optimized for determining the flexibility. 141 """ 142 flexibilities = ProtParamData.Flex 143 window_size = 9 144 weights = [0.25, 0.4375, 0.625, 0.8125, 1] 145 scores = [] 146 147 for i in range(self.length - window_size): 148 subsequence = self.sequence[i:i + window_size] 149 score = 0.0 150 151 for j in range(window_size // 2): 152 front = subsequence[j] 153 back = subsequence[window_size - j - 1] 154 score += (flexibilities[front] + flexibilities[back]) * weights[j] 155 156 middle = subsequence[window_size // 2 + 1] 157 score += flexibilities[middle] 158 159 scores.append(score / 5.25) 160 161 return scores
162
163 - def gravy(self):
164 """Calculate the gravy according to Kyte and Doolittle.""" 165 total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence) 166 167 return total_gravy / self.length
168
169 - def _weight_list(self, window, edge):
170 """Makes a list of relative weight of the 171 window edges compared to the window center. The weights are linear. 172 it actually generates half a list. For a window of size 9 and edge 0.4 173 you get a list of [0.4, 0.55, 0.7, 0.85]. 174 """ 175 unit = 2 * (1.0 - edge) / (window - 1) 176 weights = [0.0] * (window // 2) 177 178 for i in range(window // 2): 179 weights[i] = edge + unit * i 180 181 return weights
182
183 - def protein_scale(self, param_dict, window, edge=1.0):
184 """Compute a profile by any amino acid scale. 185 186 An amino acid scale is defined by a numerical value assigned to each type of 187 amino acid. The most frequently used scales are the hydrophobicity or 188 hydrophilicity scales and the secondary structure conformational parameters 189 scales, but many other scales exist which are based on different chemical and 190 physical properties of the amino acids. You can set several parameters that 191 control the computation of a scale profile, such as the window size and the 192 window edge relative weight value. 193 194 WindowSize: The window size is the length 195 of the interval to use for the profile computation. For a window size n, we 196 use the i-(n-1)/2 neighboring residues on each side to compute 197 the score for residue i. The score for residue i is the sum of the scaled values 198 for these amino acids, optionally weighted according to their position in the 199 window. 200 201 Edge: The central amino acid of the window always has a weight of 1. 202 By default, the amino acids at the remaining window positions have the same 203 weight, but you can make the residue at the center of the window have a 204 larger weight than the others by setting the edge value for the residues at 205 the beginning and end of the interval to a value between 0 and 1. For 206 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 207 1.0, 0.7, 0.4. 208 209 The method returns a list of values which can be plotted to 210 view the change along a protein sequence. Many scales exist. Just add your 211 favorites to the ProtParamData modules. 212 213 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 214 """ 215 # generate the weights 216 # _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 217 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 218 # in the loop. 219 weights = self._weight_list(window, edge) 220 scores = [] 221 222 # the score in each Window is divided by the sum of weights 223 # (* 2 + 1) since the weight list is one sided: 224 sum_of_weights = sum(weights) * 2 + 1 225 226 for i in range(self.length - window + 1): 227 subsequence = self.sequence[i:i + window] 228 score = 0.0 229 230 for j in range(window // 2): 231 # walk from the outside of the Window towards the middle. 232 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid 233 try: 234 front = param_dict[subsequence[j]] 235 back = param_dict[subsequence[window - j - 1]] 236 score += weights[j] * front + weights[j] * back 237 except KeyError: 238 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 239 (subsequence[j], subsequence[window - j - 1])) 240 241 # Now add the middle value, which always has a weight of 1. 242 middle = subsequence[window // 2] 243 if middle in param_dict: 244 score += param_dict[middle] 245 else: 246 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (middle)) 247 248 scores.append(score / sum_of_weights) 249 250 return scores
251
252 - def isoelectric_point(self):
253 """Calculate the isoelectric point. 254 255 Uses the module IsoelectricPoint to calculate the pI of a protein. 256 """ 257 aa_content = self.count_amino_acids() 258 259 ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content) 260 return ie_point.pi()
261
263 """Calculate fraction of helix, turn and sheet. 264 265 Returns a list of the fraction of amino acids which tend 266 to be in Helix, Turn or Sheet. 267 268 Amino acids in helix: V, I, Y, F, W, L. 269 Amino acids in Turn: N, P, G, S. 270 Amino acids in sheet: E, M, A, L. 271 272 Returns a tuple of three floats (Helix, Turn, Sheet). 273 """ 274 aa_percentages = self.get_amino_acids_percent() 275 276 helix = sum(aa_percentages[r] for r in 'VIYFWL') 277 turn = sum(aa_percentages[r] for r in 'NPGS') 278 sheet = sum(aa_percentages[r] for r in 'EMAL') 279 280 return helix, turn, sheet
281