Package Bio :: Package SeqUtils :: Module ProtParam
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.ProtParam

  1  # Copyright Yair Benita Y.Benita@pharm.uu.nl 
  2  # Biopython (http://biopython.org) license applies 
  3   
  4  """Simple protein analysis. 
  5   
  6  Example:: 
  7   
  8      X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV") 
  9      print(X.count_amino_acids()) 
 10      print(X.get_amino_acids_percent()) 
 11      print(X.molecular_weight()) 
 12      print(X.aromaticity()) 
 13      print(X.instability_index()) 
 14      print(X.flexibility()) 
 15      print(X.isoelectric_point()) 
 16      print(X.secondary_structure_fraction()) 
 17      print(X.protein_scale(ProtParamData.kd, 9, 0.4)) 
 18   
 19  """ 
 20   
 21  from __future__ import print_function 
 22   
 23  import sys 
 24  from Bio.SeqUtils import ProtParamData  # Local 
 25  from Bio.SeqUtils import IsoelectricPoint  # Local 
 26  from Bio.Seq import Seq 
 27  from Bio.Alphabet import IUPAC 
 28  from Bio.Data import IUPACData 
 29  from Bio.SeqUtils import molecular_weight 
 30   
 31   
32 -class ProteinAnalysis(object):
33 """Class containing methods for protein analysis. 34 35 The constructor takes two arguments. 36 The first is the protein sequence as a string, which is then converted to a 37 sequence object using the Bio.Seq module. This is done just to make sure 38 the sequence is a protein sequence and not anything else. 39 40 The second argument is optional. If set to True, the weight of the amino 41 acids will be calculated using their monoisotopic mass (the weight of the 42 most abundant isotopes for each element), instead of the average molecular 43 mass (the averaged weight of all stable isotopes for each element). 44 If set to false (the default value) or left out, the IUPAC average 45 molecular mass will be used for the calculation. 46 47 """
48 - def __init__(self, prot_sequence, monoisotopic=False):
49 if prot_sequence.islower(): 50 self.sequence = Seq(prot_sequence.upper(), IUPAC.protein) 51 else: 52 self.sequence = Seq(prot_sequence, IUPAC.protein) 53 self.amino_acids_content = None 54 self.amino_acids_percent = None 55 self.length = len(self.sequence) 56 self.monoisotopic = monoisotopic
57
58 - def count_amino_acids(self):
59 """Count standard amino acids, returns a dict. 60 61 Counts the number times each amino acid is in the protein 62 sequence. Returns a dictionary {AminoAcid:Number}. 63 64 The return value is cached in self.amino_acids_content. 65 It is not recalculated upon subsequent calls. 66 """ 67 if self.amino_acids_content is None: 68 prot_dic = dict((k, 0) for k in IUPACData.protein_letters) 69 for aa in prot_dic: 70 prot_dic[aa] = self.sequence.count(aa) 71 72 self.amino_acids_content = prot_dic 73 74 return self.amino_acids_content
75
76 - def get_amino_acids_percent(self):
77 """Calculate the amino acid content in percentages. 78 79 The same as count_amino_acids only returns the Number in percentage of 80 entire sequence. Returns a dictionary of {AminoAcid:percentage}. 81 82 The return value is cached in self.amino_acids_percent. 83 84 input is the dictionary self.amino_acids_content. 85 output is a dictionary with amino acids as keys. 86 """ 87 if self.amino_acids_percent is None: 88 aa_counts = self.count_amino_acids() 89 90 percentages = {} 91 for aa in aa_counts: 92 percentages[aa] = aa_counts[aa] / float(self.length) 93 94 self.amino_acids_percent = percentages 95 96 return self.amino_acids_percent
97
98 - def molecular_weight(self):
99 """Calculate MW from Protein sequence""" 100 return molecular_weight(self.sequence, monoisotopic=self.monoisotopic)
101
102 - def aromaticity(self):
103 """Calculate the aromaticity according to Lobry, 1994. 104 105 Calculates the aromaticity value of a protein according to Lobry, 1994. 106 It is simply the relative frequency of Phe+Trp+Tyr. 107 """ 108 aromatic_aas = 'YWF' 109 aa_percentages = self.get_amino_acids_percent() 110 111 aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas) 112 113 return aromaticity
114
115 - def instability_index(self):
116 """Calculate the instability index according to Guruprasad et al 1990. 117 118 Implementation of the method of Guruprasad et al. 1990 to test a 119 protein for stability. Any value above 40 means the protein is unstable 120 (has a short half life). 121 122 See: Guruprasad K., Reddy B.V.B., Pandit M.W. 123 Protein Engineering 4:155-161(1990). 124 """ 125 index = ProtParamData.DIWV 126 score = 0.0 127 128 for i in range(self.length - 1): 129 this, next = self.sequence[i:i + 2] 130 dipeptide_value = index[this][next] 131 score += dipeptide_value 132 133 return (10.0 / self.length) * score
134
135 - def flexibility(self):
136 """Calculate the flexibility according to Vihinen, 1994. 137 138 No argument to change window size because parameters are specific for a 139 window=9. The parameters used are optimized for determining the flexibility. 140 """ 141 flexibilities = ProtParamData.Flex 142 window_size = 9 143 weights = [0.25, 0.4375, 0.625, 0.8125, 1] 144 scores = [] 145 146 for i in range(self.length - window_size): 147 subsequence = self.sequence[i:i + window_size] 148 score = 0.0 149 150 for j in range(window_size // 2): 151 front = subsequence[j] 152 back = subsequence[window_size - j - 1] 153 score += (flexibilities[front] + flexibilities[back]) * weights[j] 154 155 middle = subsequence[window_size // 2 + 1] 156 score += flexibilities[middle] 157 158 scores.append(score / 5.25) 159 160 return scores
161
162 - def gravy(self):
163 """Calculate the gravy according to Kyte and Doolittle.""" 164 total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence) 165 166 return total_gravy / self.length
167
168 - def _weight_list(self, window, edge):
169 """Makes a list of relative weight of the 170 window edges compared to the window center. The weights are linear. 171 it actually generates half a list. For a window of size 9 and edge 0.4 172 you get a list of [0.4, 0.55, 0.7, 0.85]. 173 """ 174 unit = 2 * (1.0 - edge) / (window - 1) 175 weights = [0.0] * (window // 2) 176 177 for i in range(window // 2): 178 weights[i] = edge + unit * i 179 180 return weights
181
182 - def protein_scale(self, param_dict, window, edge=1.0):
183 """Compute a profile by any amino acid scale. 184 185 An amino acid scale is defined by a numerical value assigned to each type of 186 amino acid. The most frequently used scales are the hydrophobicity or 187 hydrophilicity scales and the secondary structure conformational parameters 188 scales, but many other scales exist which are based on different chemical and 189 physical properties of the amino acids. You can set several parameters that 190 control the computation of a scale profile, such as the window size and the 191 window edge relative weight value. 192 193 WindowSize: The window size is the length 194 of the interval to use for the profile computation. For a window size n, we 195 use the i-(n-1)/2 neighboring residues on each side to compute 196 the score for residue i. The score for residue i is the sum of the scaled values 197 for these amino acids, optionally weighted according to their position in the 198 window. 199 200 Edge: The central amino acid of the window always has a weight of 1. 201 By default, the amino acids at the remaining window positions have the same 202 weight, but you can make the residue at the center of the window have a 203 larger weight than the others by setting the edge value for the residues at 204 the beginning and end of the interval to a value between 0 and 1. For 205 instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7, 206 1.0, 0.7, 0.4. 207 208 The method returns a list of values which can be plotted to 209 view the change along a protein sequence. Many scales exist. Just add your 210 favorites to the ProtParamData modules. 211 212 Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl 213 """ 214 # generate the weights 215 # _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4] 216 # what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done 217 # in the loop. 218 weights = self._weight_list(window, edge) 219 scores = [] 220 221 # the score in each Window is divided by the sum of weights 222 # (* 2 + 1) since the weight list is one sided: 223 sum_of_weights = sum(weights) * 2 + 1 224 225 for i in range(self.length - window + 1): 226 subsequence = self.sequence[i:i + window] 227 score = 0.0 228 229 for j in range(window // 2): 230 # walk from the outside of the Window towards the middle. 231 # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid 232 try: 233 front = param_dict[subsequence[j]] 234 back = param_dict[subsequence[window - j - 1]] 235 score += weights[j] * front + weights[j] * back 236 except KeyError: 237 sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' % 238 (subsequence[j], subsequence[window - j - 1])) 239 240 # Now add the middle value, which always has a weight of 1. 241 middle = subsequence[window // 2] 242 if middle in param_dict: 243 score += param_dict[middle] 244 else: 245 sys.stderr.write('warning: %s is not a standard amino acid.\n' % (middle)) 246 247 scores.append(score / sum_of_weights) 248 249 return scores
250
251 - def isoelectric_point(self):
252 """Calculate the isoelectric point. 253 254 Uses the module IsoelectricPoint to calculate the pI of a protein. 255 """ 256 aa_content = self.count_amino_acids() 257 258 ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content) 259 return ie_point.pi()
260
262 """Calculate fraction of helix, turn and sheet. 263 264 Returns a list of the fraction of amino acids which tend 265 to be in Helix, Turn or Sheet. 266 267 Amino acids in helix: V, I, Y, F, W, L. 268 Amino acids in Turn: N, P, G, S. 269 Amino acids in sheet: E, M, A, L. 270 271 Returns a tuple of three integers (Helix, Turn, Sheet). 272 """ 273 aa_percentages = self.get_amino_acids_percent() 274 275 helix = sum(aa_percentages[r] for r in 'VIYFWL') 276 turn = sum(aa_percentages[r] for r in 'NPGS') 277 sheet = sum(aa_percentages[r] for r in 'EMAL') 278 279 return helix, turn, sheet
280