Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames] | no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Parser for the prosite dat file from Prosite at ExPASy. 
  9   
 10  See http://www.expasy.ch/prosite/ 
 11   
 12  Tested with: 
 13   - Release 20.43, 10-Feb-2009 
 14   - Release 2017_03 of 15-Mar-2017. 
 15   
 16  Functions: 
 17   - read                  Reads a Prosite file containing one Prosite record 
 18   - parse                 Iterates over records in a Prosite file. 
 19   
 20  Classes: 
 21   - Record                Holds Prosite data. 
 22   
 23  """ 
 24   
 25   
def parse(handle):
    """Iterate over the records of a multi-record Prosite file.

    Arguments:
     - handle   - handle to the file.

    Yields one Record per entry, stopping as soon as the underlying
    reader returns a false value (i.e. no further record was found).
    """
    record = __read(handle)
    while record:
        yield record
        record = __read(handle)
41 42
def read(handle):
    """Read a Prosite file that holds exactly one record.

    Arguments:
     - handle   - handle to the file.

    Returns whatever the underlying reader produced for the first record.
    Raises ValueError if any data is left on the handle afterwards.
    """
    record = __read(handle)
    # Anything still readable after the first record means the file
    # contained more than the single record this function accepts.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
59 60
class Record(object):
    """Holds information from a Prosite record.

    Main attributes:
     - name           ID of the record.  e.g. ADH_ZINC
     - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
     - accession      e.g. PS00387
     - created        Date the entry was created.  (MMM-YYYY for releases
       before January 2017, DD-MMM-YYYY since January 2017)
     - data_update    Date the 'primary' data was last updated.
     - info_update    Date data other than 'primary' data was last updated.
     - pdoc           ID of the PROSITE DOCumentation.
     - description    Free-format description.
     - pattern        The PROSITE pattern.  See docs.
     - matrix         List of strings that describes a matrix entry.
     - rules          List of rule definitions (from RU lines).  (strings)
     - prorules       List of prorules (from PR lines). (strings)
     - postprocessing List of strings taken from PP lines.

    NUMERICAL RESULTS:
     - nr_sp_release  SwissProt release.
     - nr_sp_seqs     Number of seqs in that release of Swiss-Prot. (int)
     - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
     - nr_positive    True positives.  tuple of (hits, seqs)
     - nr_unknown     Could be positives.  tuple of (hits, seqs)
     - nr_false_pos   False positives.  tuple of (hits, seqs)
     - nr_false_neg   False negatives.  (int)
     - nr_partial     False negatives, because they are fragments. (int)

    COMMENTS:
     - cc_taxo_range  Taxonomic range.  See docs for format
     - cc_max_repeat  Maximum number of repetitions in a protein
     - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
     - cc_skip_flag   Can this entry be ignored?
     - cc_matrix_type
     - cc_scaling_db
     - cc_author
     - cc_ft_key
     - cc_ft_desc
     - cc_version     version number (introduced in release 19.0)

    The following are all lists of tuples (swiss-prot accession, swiss-prot name).

    DATA BANK REFERENCES:
     - dr_positive
     - dr_false_neg
     - dr_false_pos
     - dr_potential   Potential hits, but fingerprint region not yet available.
     - dr_unknown     Could possibly belong
     - pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Create an empty record with every documented attribute present."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These qualifiers are optional in the data, so give them the same
        # empty-string default as the other free-text CC attributes; without
        # this, reading them on a record that lacked the qualifier raised
        # AttributeError even though the class docstring lists them.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
150 151 152 # Everything below this point is a private function. 153
# Lookup tables used by __read for the "qualifier -> Record attribute" cases.

# DT line: (attribute, accepted suffixes, number of trailing words to drop).
# Older files bracket the label, e.g. "APR-1990 (CREATED)".
_DT_FIELDS = (
    ('created', (' (CREATED)', ' CREATED'), 1),
    ('data_update', (' (DATA UPDATE)', ' DATA UPDATE'), 2),
    ('info_update', (' (INFO UPDATE)', ' INFO UPDATE'), 2),
)

# NR qualifiers whose value has the form "hits(seqs)".
_NR_HIT_ATTRIBUTES = {
    '/TOTAL': 'nr_total',
    '/POSITIVE': 'nr_positive',
    '/UNKNOWN': 'nr_unknown',
    '/FALSE_POS': 'nr_false_pos',
}

# CC qualifiers whose value is stored verbatim as a string.
_CC_TEXT_ATTRIBUTES = {
    '/TAXO-RANGE': 'cc_taxo_range',
    '/MAX-REPEAT': 'cc_max_repeat',
    '/SKIP-FLAG': 'cc_skip_flag',
    '/MATRIX_TYPE': 'cc_matrix_type',
    '/SCALING_DB': 'cc_scaling_db',
    '/AUTHOR': 'cc_author',
    '/FT_KEY': 'cc_ft_key',
    '/FT_DESC': 'cc_ft_desc',
    '/VERSION': 'cc_version',
}

# DR type flag -> name of the Record list that collects (accession, name).
_DR_FLAG_ATTRIBUTES = {
    'T': 'dr_positive',
    'F': 'dr_false_pos',
    'N': 'dr_false_neg',
    'P': 'dr_potential',
    '?': 'dr_unknown',
}


def __read(handle):
    """Read the next Prosite record from handle.

    Arguments:
     - handle   - iterable yielding the lines of a Prosite file.

    Lines are consumed up to and including the '//' line that terminates
    the record.  The two-character line code selects how the text starting
    at column 6 is interpreted.

    Returns the populated Record, or None when the input is exhausted
    before a terminated record is found.

    Raises ValueError for an unknown line code, a malformed ID or DT line,
    an unknown qualifier or type flag, or broken numerical data.
    """
    import re
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            # e.g. from January 2017,
            # DT   01-APR-1990 CREATED; 01-APR-1990 DATA UPDATE; 01-APR-1990 INFO UPDATE.
            # Older files had brackets round the date descriptions and used MMM-YYYY
            dates = value.rstrip('.').split("; ")
            if len(dates) < len(_DT_FIELDS):
                # Report a short line the same way as any other malformed
                # date line instead of letting an IndexError escape.
                raise ValueError("I don't understand date line\n%s" % line)
            for entry, (attribute, suffixes, words) in zip(dates, _DT_FIELDS):
                if not entry.endswith(suffixes):
                    raise ValueError("I don't understand date line\n%s" % line)
                # Strip the trailing label words, keeping only the date.
                setattr(record, attribute, entry.rsplit(" ", words)[0])
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may span several PA lines; concatenate them.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in _NR_HIT_ATTRIBUTES:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        # ValueError (a subclass of the Exception raised
                        # previously) keeps this consistent with the other
                        # parse errors.
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    setattr(record, _NR_HIT_ATTRIBUTES[qual],
                            tuple(map(int, m.groups())))
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == 'Automatic scaling':
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if "=" not in col:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in _CC_TEXT_ATTRIBUTES:
                    setattr(record, _CC_TEXT_ATTRIBUTES[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                attribute = _DR_FLAG_ATTRIBUTES.get(flag)
                if attribute is None:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, attribute).append((acc, name))
        elif keyword == '3D':
            for entry in value.split():
                record.pdb_structs.append(entry.rstrip(';'))
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record:
                return record
            # No ID line seen yet, so this terminator closed the copyright
            # statement at the top of the file; keep scanning.
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # Input exhausted without a terminating '//'.
    # NOTE(review): a record that was started but never terminated is
    # discarded here rather than reported - confirm this is intended.
    return None
312