Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames] | [no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  This module provides code to work with the prosite dat file from 
 10  Prosite. 
 11  http://www.expasy.ch/prosite/ 
 12   
 13  Tested with: 
 14  Release 20.43, 10-Feb-2009 
 15   
 16   
 17  Functions: 
 18   
 19      - read                  Reads a Prosite file containing one Prosite record 
 20      - parse                 Iterates over records in a Prosite file. 
 21   
 22  Classes: 
 23   
 24      - Record                Holds Prosite data. 
 25  """ 
 26   
 27   
def parse(handle):
    """Iterate over the records of a Prosite file.

    Use this function for Prosite files that contain multiple records.
    It is a generator: each record is read from the handle only when
    the caller asks for it, and iteration stops as soon as no further
    record can be read.

    handle   - handle to the file.
    """
    record = __read(handle)
    while record:
        yield record
        record = __read(handle)
40 41
def read(handle):
    """Read a single Prosite record.

    Use this function for Prosite files that contain exactly one
    record.  A ValueError is raised if any data is left on the handle
    once the record has been read.

    handle   - handle to the file.
    """
    record = __read(handle)
    # Whatever is still unread at this point belongs to a further record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
56 57
class Record(object):
    """Holds information from a Prosite record.

    Every member listed here is initialised when the record is created,
    so it can be read even if the corresponding line or qualifier was
    absent from the parsed entry.

    Members:

        - name           ID of the record.  e.g. ADH_ZINC
        - type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
        - accession      e.g. PS00387
        - created        Date the entry was created.  (MMM-YYYY)
        - data_update    Date the 'primary' data was last updated.
        - info_update    Date data other than 'primary' data was last updated.
        - pdoc           ID of the PROSITE DOCumentation.

        - description    Free-format description.
        - pattern        The PROSITE pattern.  See docs.
        - matrix         List of strings that describes a matrix entry.
        - rules          List of rule definitions (from RU lines).  (strings)
        - prorules       List of prorules (from PR lines).  (strings)
        - postprocessing List of entries from PP lines.  (strings)

    NUMERICAL RESULTS

        - nr_sp_release  SwissProt release.
        - nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
        - nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
        - nr_positive    True positives.  tuple of (hits, seqs)
        - nr_unknown     Could be positives.  tuple of (hits, seqs)
        - nr_false_pos   False positives.  tuple of (hits, seqs)
        - nr_false_neg   False negatives.  (int)
        - nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS

        - cc_taxo_range  Taxonomic range.  See docs for format
        - cc_max_repeat  Maximum number of repetitions in a protein
        - cc_site        Interesting site.  list of tuples (pattern pos, desc.)
        - cc_skip_flag   Can this entry be ignored?
        - cc_matrix_type
        - cc_scaling_db
        - cc_author
        - cc_ft_key
        - cc_ft_desc
        - cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES

    The dr_* members are all lists of tuples
    (swiss-prot accession, swiss-prot name).

        - dr_positive
        - dr_false_neg
        - dr_false_pos
        - dr_potential   Potential hits, but fingerprint region not yet available.
        - dr_unknown     Could possibly belong
        - pdb_structs    List of PDB entries.  (strings)

    """

    def __init__(self):
        """Create an empty record with every member set to its default."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These qualifiers are optional in CC lines; give them the same empty
        # default as the other cc_* strings so that reading them on a record
        # lacking the qualifier does not raise AttributeError.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
# Everything below this point is private to the module.
# NR qualifiers whose value has the form "hits(seqs)", mapped to the Record
# attribute that receives the (hits, seqs) tuple.
_NR_HIT_ATTRIBUTES = {
    '/TOTAL': 'nr_total',
    '/POSITIVE': 'nr_positive',
    '/UNKNOWN': 'nr_unknown',
    '/FALSE_POS': 'nr_false_pos',
}

# CC qualifiers whose value is stored unchanged, mapped to the Record
# attribute that receives it.
_CC_TEXT_ATTRIBUTES = {
    '/TAXO-RANGE': 'cc_taxo_range',
    '/MAX-REPEAT': 'cc_max_repeat',
    '/SKIP-FLAG': 'cc_skip_flag',
    '/MATRIX_TYPE': 'cc_matrix_type',
    '/SCALING_DB': 'cc_scaling_db',
    '/AUTHOR': 'cc_author',
    '/FT_KEY': 'cc_ft_key',
    '/FT_DESC': 'cc_ft_desc',
    '/VERSION': 'cc_version',
}

# DR type flags, mapped to the Record list that collects the reference.
_DR_FLAG_ATTRIBUTES = {
    'T': 'dr_positive',
    'F': 'dr_false_pos',
    'N': 'dr_false_neg',
    'P': 'dr_potential',
    '?': 'dr_unknown',
}

# Tags that close the three fields of a DT line, in the order they appear.
_DT_TAGS = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')


def __read(handle):
    """Read the next Prosite record from the handle.

    Lines are consumed up to and including the '//' line that closes the
    record.  Each line is split into its two-character line code and the
    data that starts at the sixth character.  A '//' line met before any
    ID line closes the copyright statement and is skipped.

    handle   - handle to the file.

    Returns a Record, or None if the handle is exhausted before a new
    record was started.

    Raises ValueError if a line cannot be interpreted, or if the handle
    is exhausted in the middle of a record.
    """
    import re

    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')    # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            if len(dates) < len(_DT_TAGS):
                # Report a short line the same way as a mistagged one rather
                # than failing with IndexError.
                raise ValueError("I don't understand date line\n%s" % line)
            stamps = []
            for date, tag in zip(dates, _DT_TAGS):
                if not date.endswith(tag):
                    raise ValueError("I don't understand date line\n%s" % line)
                # Cut the tag off as a suffix.  str.rstrip(tag) would treat
                # the tag as a set of characters and could eat into the date.
                stamps.append(date[:-len(tag)].rstrip())
            record.created, record.data_update, record.info_update = stamps
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may be continued over several PA lines.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in _NR_HIT_ATTRIBUTES:
                    m = re.match(r'(\d+)\((\d+)\)', data)
                    if not m:
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, _NR_HIT_ATTRIBUTES[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col.startswith('Automatic scaling'):
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.  (Should I keep it?)
                    continue
                if "=" not in col:
                    # Missing qualifier!  Can we recover gracefully?
                    # For example, from Bug 2403, in PS50293 have:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in _CC_TEXT_ATTRIBUTES:
                    setattr(record, _CC_TEXT_ATTRIBUTES[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag not in _DR_FLAG_ATTRIBUTES:
                    raise ValueError("I don't understand type flag %s" % flag)
                getattr(record, _DR_FLAG_ATTRIBUTES[flag]).append((acc, name))
        elif keyword == '3D':
            record.pdb_structs.extend(entry.rstrip(';')
                                      for entry in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record is None:
                # Then this was the copyright statement
                continue
            break
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    else:
        # The handle ran out without a closing '//'.  That is the normal end
        # of the file when no record is in progress; otherwise the record is
        # truncated and must not be dropped silently.
        if record is not None:
            raise ValueError("Unexpected end of stream.")
        return None
    return record
301