Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example:: 
  7   
  8      X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9      print(X.count_amino_acids()) 
 10      print(X.get_amino_acids_percent()) 
 11      print(X.molecular_weight()) 
 12      print(X.aromaticity()) 
 13      print(X.instability_index()) 
 14      print(X.flexibility()) 
 15      print(X.isoelectric_point()) 
 16      print(X.secondary_structure_fraction()) 
 17      print(X.protein_scale(ProtParamData.kd, 9, 0.4)) 
 18   
 19  """ 
 20   
 21  from __future__ import print_function 
 22   
 23  import sys 
 24  from Bio.SeqUtils import ProtParamData  # Local 
 25  from Bio.SeqUtils import IsoelectricPoint  # Local 
 26  from Bio.Seq import Seq 
 27  from Bio.Alphabet import IUPAC 
 28  from Bio.Data import IUPACData 
 29  from Bio.SeqUtils import molecular_weight 
 30   
 31  __docformat__ = "restructuredtext en" 
 32   
 33   
34 -class ProteinAnalysis(object):
35 """Class containing methods for protein analysis. 36 37 The constructor takes two arguments. 38 The first is the protein sequence as a string, which is then converted to a 39 sequence object using the Bio.Seq module. This is done just to make sure 40 the sequence is a protein sequence and not anything else. 41 42 The second argument is optional. If set to True, the weight of the amino 43 acids will be calculated using their monoisotopic mass (the weight of the 44 most abundant isotopes for each element), instead of the average molecular 45 mass (the averaged weight of all stable isotopes for each element). 46 If set to false (the default value) or left out, the IUPAC average 47 molecular mass will be used for the calculation. 48 49 """
50 - def __init__(self, prot_sequence, monoisotopic=False):
51 if prot_sequence.islower(): 52 self.sequence = Seq(prot_sequence.upper(), IUPAC.protein) 53 else: 54 self.sequence = Seq(prot_sequence, IUPAC.protein) 55 self.amino_acids_content = None 56 self.amino_acids_percent = None 57 self.length = len(self.sequence) 58 self.monoisotopic = monoisotopic
59
60 - def count_amino_acids(self):
61 """Count standard amino acids, returns a dict. 62 63 Counts the number times each amino acid is in the protein 64 sequence. Returns a dictionary {AminoAcid:Number}. 65 66 The return value is cached in self.amino_acids_content. 67 It is not recalculated upon subsequent calls. 68 """ 69 if self.amino_acids_content is None: 70 prot_dic = dict((k, 0) for k in IUPACData.protein_letters) 71 for aa in prot_dic: 72 prot_dic[aa] = self.sequence.count(aa) 73 74 self.amino_acids_content = prot_dic 75 76 return self.amino_acids_content
77
78 - def get_amino_acids_percent(self):
79 """Calculate the amino acid content in percentages. 80 81 The same as count_amino_acids only returns the Number in percentage of 82 entire sequence. Returns a dictionary of {AminoAcid:percentage}. 83 84 The return value is cached in self.amino_acids_percent. 85 86 input is the dictionary self.amino_acids_content. 87 output is a dictionary with amino acids as keys. 88 """ 89 if self.amino_acids_percent is None: 90 aa_counts = self.count_amino_acids() 91 92 percentages = {} 93 for aa in aa_counts: 94 percentages[aa] = aa_counts[aa] / float(self.length) 95 96 self.amino_acids_percent = percentages 97 98 return self.amino_acids_percent
99
100 - def molecular_weight(self):
101 """Calculate MW from Protein sequence""" 102 return molecular_weight(self.sequence, monoisotopic=self.monoisotopic)
103
104 - def aromaticity(self):
105 """Calculate the aromaticity according to Lobry, 1994. 106 107 Calculates the aromaticity value of a protein according to Lobry, 1994. 108 It is simply the relative frequency of Phe+Trp+Tyr. 109 """ 110 aromatic_aas = 'YWF' 111 aa_percentages = self.get_amino_acids_percent() 112 113 aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas) 114 115 return aromaticity
116
117 - def instability_index(self):
118 """Calculate the instability index according to Guruprasad et al 1990. 119 120 Implementation of the method of Guruprasad et al. 1990 to test a 121 protein for stability. Any value above 40 means the protein is unstable 122 (has a short half life). 123 124 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 125 Protein Engineering 4:155-161(1990). 126 """ 127 index = ProtParamData.DIWV 128 score = 0.0 129 130 for i in range(self.length - 1): 131 this, next = self.sequence[i:i + 2] 132 dipeptide_value = index[this][next] 133 score += dipeptide_value 134 135 return (10.0 / self.length) * score
136
137 - def flexibility(self):
138 """Calculate the flexibility according to Vihinen, 1994. 139 140 No argument to change window size because parameters are specific for a 141 window=9. The parameters used are optimized for determining the flexibility. 142 """ 143 flexibilities = ProtParamData.Flex 144 window_size = 9 145 weights = [0.25, 0.4375, 0.625, 0.8125, 1] 146 scores = [] 147 148 for i in range(self.length - window_size): 149 subsequence = self.sequence[i:i + window_size] 150 score = 0.0 151 152 for j in range(window_size // 2): 153 front = subsequence[j] 154 back = subsequence[window_size - j - 1] 155 score += (flexibilities[front] + flexibilities[back]) * weights[j] 156 157 middle = subsequence[window_size // 2 + 1] 158 score += flexibilities[middle] 159 160 scores.append(score / 5.25) 161 162 return scores
163
164 - def gravy(self):
165 """Calculate the gravy according to Kyte and Doolittle.""" 166 total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence) 167 168 return total_gravy / self.length
169
170 - def _weight_list(self, window, edge):
171 """Makes a list of relative weight of the 172 window edges compared to the window center. The weights are linear. 173 it actually generates half a list. For a window of size 9 and edge 0.4 174 you get a list of [0.4, 0.55, 0.7, 0.85]. 175 """ 176 unit = 2 * (1.0 - edge) / (window - 1) 177 weights = [0.0] * (window // 2) 178 179 for i in range(window // 2): 180 weights[i] = edge + unit * i 181 182 return weights
183
184 - def protein_scale(self, param_dict, window, edge=1.0):
185 """Compute a profile by any amino acid scale. 186 187 An amino acid scale is defined by a numerical value assigned to each type of 188 amino acid. The most frequently used scales are the hydrophobicity or 189 hydrophilicity scales and the secondary structure conformational parameters 190 scales, but many other scales exist which are based on different chemical and 191 physical properties of the amino acids. You can set several parameters that 192 control the computation of a scale profile, such as the window size and the 193 window edge relative weight value. 194 195 WindowSize: The window size is the length 196 of the interval to use for the profile computation. For a window size n, we 197 use the i-(n-1)/2 neighboring residues on each side to compute 198 the score for residue i. The score for residue i is the sum of the scaled values 199 for these amino acids, optionally weighted according to their position in the 200 window. 201 202 Edge: The central amino acid of the window always has a weight of 1. 203 By default, the amino acids at the remaining window positions have the same 204 weight, but you can make the residue at the center of the window have a 205 larger weight than the others by setting the edge value for the residues at 206 the beginning and end of the interval to a value between 0 and 1. For 207 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 208 1.0, 0.7, 0.4. 209 210 The method returns a list of values which can be plotted to 211 view the change along a protein sequence. Many scales exist. Just add your 212 favorites to the ProtParamData modules. 213 214 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 215 """ 216 # generate the weights 217 # _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 218 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 219 # in the loop. 220 weights = self._weight_list(window, edge) 221 scores = [] 222 223 # the score in each Window is divided by the sum of weights 224 # (* 2 + 1) since the weight list is one sided: 225 sum_of_weights = sum(weights) * 2 + 1 226 227 for i in range(self.length - window + 1): 228 subsequence = self.sequence[i:i + window] 229 score = 0.0 230 231 for j in range(window // 2): 232 # walk from the outside of the Window towards the middle. 233 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid 234 try: 235 front = param_dict[subsequence[j]] 236 back = param_dict[subsequence[window - j - 1]] 237 score += weights[j] * front + weights[j] * back 238 except KeyError: 239 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 240 (subsequence[j], subsequence[window - j - 1])) 241 242 # Now add the middle value, which always has a weight of 1. 243 middle = subsequence[window // 2] 244 if middle in param_dict: 245 score += param_dict[middle] 246 else: 247 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (middle)) 248 249 scores.append(score / sum_of_weights) 250 251 return scores
252
253 - def isoelectric_point(self):
254 """Calculate the isoelectric point. 255 256 Uses the module IsoelectricPoint to calculate the pI of a protein. 257 """ 258 aa_content = self.count_amino_acids() 259 260 ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content) 261 return ie_point.pi()
262
264 """Calculate fraction of helix, turn and sheet. 265 266 Returns a list of the fraction of amino acids which tend 267 to be in Helix, Turn or Sheet. 268 269 Amino acids in helix: V, I, Y, F, W, L. 270 Amino acids in Turn: N, P, G, S. 271 Amino acids in sheet: E, M, A, L. 272 273 Returns a tuple of three integers (Helix, Turn, Sheet). 274 """ 275 aa_percentages = self.get_amino_acids_percent() 276 277 helix = sum(aa_percentages[r] for r in 'VIYFWL') 278 turn = sum(aa_percentages[r] for r in 'NPGS') 279 sheet = sum(aa_percentages[r] for r in 'EMAL') 280 281 return helix, turn, sheet
282