Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames | no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Parser for the prosite dat file from Prosite at ExPASy. 
  9   
 10  See https://www.expasy.org/prosite/ 
 11   
 12  Tested with: 
 13   - Release 20.43, 10-Feb-2009 
 14   - Release 2017_03 of 15-Mar-2017. 
 15   
 16  Functions: 
 17   - read                  Reads a Prosite file containing one Prosite record 
 18   - parse                 Iterates over records in a Prosite file. 
 19   
 20  Classes: 
 21   - Record                Holds Prosite data. 
 22   
 23  """ 
 24   
 25   
def parse(handle):
    """Iterate over the records in a Prosite file.

    Use this function for Prosite files that hold several records; the
    records are yielded one at a time, in file order.

    Arguments:
     - handle - handle to the file.

    """
    # Prime the loop with the first record, then keep fetching until the
    # private reader returns nothing.
    record = __read(handle)
    while record:
        yield record
        record = __read(handle)
41 42
def read(handle):
    """Read a single Prosite record.

    Use this function for Prosite files that hold exactly one record.

    Arguments:
     - handle - handle to the file.

    Raises ValueError if any data is left on the handle after the first
    record has been read.

    """
    record = __read(handle)
    # Nothing should be left on the handle once the one record is consumed.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
59 60
class Record(object):
    """Holds information from a Prosite record.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines).  (strings)
     - postprocessing List of strings (from PP lines).

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    Every comment attribute except cc_site is an empty string until a value
    is assigned to it.

    The following are all lists of tuples (swiss-prot accession,
    swiss-prot name).

    DATA BANK REFERENCES:
     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Initialize the class."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # Give the remaining documented comment attributes a default so that
        # reading them never raises AttributeError on a record that lacks
        # them.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []


# Everything below is private to this module.

def __read(handle):
    """Read the next Prosite record from the handle (PRIVATE).

    Lines are consumed from ``handle`` up to and including the "//" line
    that ends a record.  A "//" line seen before any ID line is skipped; it
    ends the copyright statement at the top of the file.

    Arguments:
     - handle - handle to the file.

    Returns a Record, or None if the handle is exhausted before any ID line
    has been seen.

    Raises ValueError if a line cannot be understood, or if the handle runs
    out part-way through a record (no closing "//" line).

    """
    import re

    # NR qualifiers whose value looks like "hits(seqs)".
    nr_hit_attributes = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers whose value is stored on the record unchanged.
    cc_attributes = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    # DR type flags and the record list each one feeds.
    dr_attributes = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }

    record = None
    for line in handle:
        # Two-character line code, then the data starting at column 6.
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            # e.g. from January 2017,
            # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
            # Older files had brackets round the date descriptions and used MMM-YYYY
            dates = value.rstrip('.').split("; ")
            if dates[0].endswith((' (CREATED)', ' CREATED')):
                # Remove last word
                record.created = dates[0].rsplit(" ", 1)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
            if dates[1].endswith((' (DATA UPDATE)', ' DATA UPDATE')):
                # Remove last two words
                record.data_update = dates[1].rsplit(" ", 2)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
            if dates[2].endswith((' (INFO UPDATE)', ' INFO UPDATE')):
                # Remove last two words
                record.info_update = dates[2].rsplit(" ", 2)[0]
            else:
                raise ValueError("I don't understand date line\n%s" % line)
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may continue over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in nr_hit_attributes:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        # ValueError, like every other parse error raised here.
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_attributes[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == 'Automatic scaling':
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if col.count("=") == 0:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_attributes:
                    setattr(record, cc_attributes[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in dr_attributes:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, dr_attributes[flag]).append((acc, name))
        elif keyword == '3D':
            for pdb_id in value.split():
                record.pdb_structs.append(pdb_id.rstrip(';'))
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            break
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # The handle ran out before a closing "//" line was seen.
        if record is not None:
            # A record was started but never finished: report the truncation
            # instead of silently discarding what was parsed.
            raise ValueError("Unexpected end of stream.")
        return None
    return record
313