Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14  Record             Holds SwissProt data. 
 15  Reference          Holds reference data from a SwissProt record. 
 16   
 17  Functions: 
 18  read               Read one SwissProt record 
 19  parse              Read multiple SwissProt records 
 20   
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  from Bio._py3k import _as_string 
 26   
 27   
28 -class Record(object):
29 """Holds information from a SwissProt record. 30 31 Members: 32 entry_name Name of this entry, e.g. RL1_ECOLI. 33 data_class Either 'STANDARD' or 'PRELIMINARY'. 34 molecule_type Type of molecule, 'PRT', 35 sequence_length Number of residues. 36 37 accessions List of the accession numbers, e.g. ['P00321'] 38 created A tuple of (date, release). 39 sequence_update A tuple of (date, release). 40 annotation_update A tuple of (date, release). 41 42 description Free-format description. 43 gene_name Gene name. See userman.txt for description. 44 organism The source of the sequence. 45 organelle The origin of the sequence. 46 organism_classification The taxonomy classification. List of strings. 47 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 48 taxonomy_id A list of NCBI taxonomy id's. 49 host_organism A list of names of the hosts of a virus, if any. 50 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 51 references List of Reference objects. 52 comments List of strings. 53 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 54 keywords List of the keywords. 55 features List of tuples (key name, from, to, description). 56 from and to can be either integers for the residue 57 numbers, '<', '>', or '?' 58 59 seqinfo tuple of (length, molecular weight, CRC32 value) 60 sequence The sequence. 61 62 """
63 - def __init__(self):
64 self.entry_name = None 65 self.data_class = None 66 self.molecule_type = None 67 self.sequence_length = None 68 69 self.accessions = [] 70 self.created = None 71 self.sequence_update = None 72 self.annotation_update = None 73 74 self.description = [] 75 self.gene_name = '' 76 self.organism = [] 77 self.organelle = '' 78 self.organism_classification = [] 79 self.taxonomy_id = [] 80 self.host_organism = [] 81 self.host_taxonomy_id = [] 82 self.references = [] 83 self.comments = [] 84 self.cross_references = [] 85 self.keywords = [] 86 self.features = [] 87 88 self.seqinfo = None 89 self.sequence = ''
90 91
92 -class Reference(object):
93 """Holds information from one reference in a SwissProt entry. 94 95 Members: 96 number Number of reference in an entry. 97 positions Describes extent of work. List of strings. 98 comments Comments. List of (token, text). 99 references References. List of (dbname, identifier). 100 authors The authors of the work. 101 title Title of the work. 102 location A citation for the work. 103 104 """
105 - def __init__(self):
106 self.number = None 107 self.positions = [] 108 self.comments = [] 109 self.references = [] 110 self.authors = [] 111 self.title = [] 112 self.location = []
113 114
115 -def parse(handle):
116 while True: 117 record = _read(handle) 118 if not record: 119 return 120 yield record
121 122
123 -def read(handle):
124 record = _read(handle) 125 if not record: 126 raise ValueError("No SwissProt record found") 127 # We should have reached the end of the record by now 128 remainder = handle.read() 129 if remainder: 130 raise ValueError("More than one SwissProt record found") 131 return record
132 133 134 # Everything below is considered private 135 136
137 -def _read(handle):
138 record = None 139 unread = "" 140 for line in handle: 141 #This is for Python 3 to cope with a binary handle (byte strings), 142 #or a text handle (unicode strings): 143 line = _as_string(line) 144 key, value = line[:2], line[5:].rstrip() 145 if unread: 146 value = unread + " " + value 147 unread = "" 148 if key == '**': 149 #See Bug 2353, some files from the EBI have extra lines 150 #starting "**" (two asterisks/stars). They appear 151 #to be unofficial automated annotations. e.g. 152 #** 153 #** ################# INTERNAL SECTION ################## 154 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 155 pass 156 elif key == 'ID': 157 record = Record() 158 _read_id(record, line) 159 _sequence_lines = [] 160 elif key == 'AC': 161 accessions = [word for word in value.rstrip(";").split("; ")] 162 record.accessions.extend(accessions) 163 elif key == 'DT': 164 _read_dt(record, line) 165 elif key == 'DE': 166 record.description.append(value.strip()) 167 elif key == 'GN': 168 if record.gene_name: 169 record.gene_name += " " 170 record.gene_name += value 171 elif key == 'OS': 172 record.organism.append(value) 173 elif key == 'OG': 174 record.organelle += line[5:] 175 elif key == 'OC': 176 cols = [col for col in value.rstrip(";.").split("; ")] 177 record.organism_classification.extend(cols) 178 elif key == 'OX': 179 _read_ox(record, line) 180 elif key == 'OH': 181 _read_oh(record, line) 182 elif key == 'RN': 183 reference = Reference() 184 _read_rn(reference, value) 185 record.references.append(reference) 186 elif key == 'RP': 187 assert record.references, "RP: missing RN" 188 record.references[-1].positions.append(value) 189 elif key == 'RC': 190 assert record.references, "RC: missing RN" 191 reference = record.references[-1] 192 unread = _read_rc(reference, value) 193 elif key == 'RX': 194 assert record.references, "RX: missing RN" 195 reference = record.references[-1] 196 _read_rx(reference, value) 197 elif key == 'RL': 198 assert record.references, "RL: missing RN" 199 reference = record.references[-1] 200 reference.location.append(value) 201 # In UniProt release 1.12 of 6/21/04, there is a new RG 202 # (Reference Group) line, which references a group instead of 203 # an author. Each block must have at least 1 RA or RG line. 204 elif key == 'RA': 205 assert record.references, "RA: missing RN" 206 reference = record.references[-1] 207 reference.authors.append(value) 208 elif key == 'RG': 209 assert record.references, "RG: missing RN" 210 reference = record.references[-1] 211 reference.authors.append(value) 212 elif key == "RT": 213 assert record.references, "RT: missing RN" 214 reference = record.references[-1] 215 reference.title.append(value) 216 elif key == 'CC': 217 _read_cc(record, line) 218 elif key == 'DR': 219 _read_dr(record, value) 220 elif key == 'PE': 221 #TODO - Record this information? 222 pass 223 elif key == 'KW': 224 cols = value.rstrip(";.").split('; ') 225 record.keywords.extend(cols) 226 elif key == 'FT': 227 _read_ft(record, line) 228 elif key == 'SQ': 229 cols = value.split() 230 assert len(cols) == 7, "I don't understand SQ line %s" % line 231 # Do more checking here? 232 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 233 elif key == ' ': 234 _sequence_lines.append(value.replace(" ", "").rstrip()) 235 elif key == '//': 236 # Join multiline data into one string 237 record.description = " ".join(record.description) 238 record.organism = " ".join(record.organism) 239 record.organelle = record.organelle.rstrip() 240 for reference in record.references: 241 reference.authors = " ".join(reference.authors).rstrip(";") 242 reference.title = " ".join(reference.title).rstrip(";") 243 if reference.title.startswith('"') and reference.title.endswith('"'): 244 reference.title = reference.title[1:-1] # remove quotes 245 reference.location = " ".join(reference.location) 246 record.sequence = "".join(_sequence_lines) 247 return record 248 else: 249 raise ValueError("Unknown keyword '%s' found" % key) 250 if record: 251 raise ValueError("Unexpected end of stream.")
252 253
254 -def _read_id(record, line):
255 cols = line[5:].split() 256 #Prior to release 51, included with MoleculeType: 257 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 258 # 259 #Newer files lack the MoleculeType: 260 #ID EntryName DataClass; SequenceLength AA. 261 if len(cols) == 5: 262 record.entry_name = cols[0] 263 record.data_class = cols[1].rstrip(";") 264 record.molecule_type = cols[2].rstrip(";") 265 record.sequence_length = int(cols[3]) 266 elif len(cols) == 4: 267 record.entry_name = cols[0] 268 record.data_class = cols[1].rstrip(";") 269 record.molecule_type = None 270 record.sequence_length = int(cols[2]) 271 else: 272 raise ValueError("ID line has unrecognised format:\n" + line) 273 # check if the data class is one of the allowed values 274 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 275 if record.data_class not in allowed: 276 raise ValueError("Unrecognized data class %s in line\n%s" % 277 (record.data_class, line)) 278 # molecule_type should be 'PRT' for PRoTein 279 # Note that has been removed in recent releases (set to None) 280 if record.molecule_type not in (None, 'PRT'): 281 raise ValueError("Unrecognized molecule type %s in line\n%s" % 282 (record.molecule_type, line))
283 284
285 -def _read_dt(record, line):
286 value = line[5:] 287 uprline = value.upper() 288 cols = value.rstrip().split() 289 if 'CREATED' in uprline \ 290 or 'LAST SEQUENCE UPDATE' in uprline \ 291 or 'LAST ANNOTATION UPDATE' in uprline: 292 # Old style DT line 293 # ================= 294 # e.g. 295 # DT 01-FEB-1995 (Rel. 31, Created) 296 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 297 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 298 # 299 # or: 300 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 301 # ... 302 303 # find where the version information will be located 304 # This is needed for when you have cases like IPI where 305 # the release verison is in a different spot: 306 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 307 uprcols = uprline.split() 308 rel_index = -1 309 for index in range(len(uprcols)): 310 if 'REL.' in uprcols[index]: 311 rel_index = index 312 assert rel_index >= 0, \ 313 "Could not find Rel. in DT line: %s" % line 314 version_index = rel_index + 1 315 # get the version information 316 str_version = cols[version_index].rstrip(",") 317 # no version number 318 if str_version == '': 319 version = 0 320 # dot versioned 321 elif '.' in str_version: 322 version = str_version 323 # integer versioned 324 else: 325 version = int(str_version) 326 date = cols[0] 327 328 if 'CREATED' in uprline: 329 record.created = date, version 330 elif 'LAST SEQUENCE UPDATE' in uprline: 331 record.sequence_update = date, version 332 elif 'LAST ANNOTATION UPDATE' in uprline: 333 record.annotation_update = date, version 334 else: 335 assert False, "Shouldn't reach this line!" 336 elif 'INTEGRATED INTO' in uprline \ 337 or 'SEQUENCE VERSION' in uprline \ 338 or 'ENTRY VERSION' in uprline: 339 # New style DT line 340 # ================= 341 # As of UniProt Knowledgebase release 7.0 (including 342 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 343 # format of the DT lines and the version information 344 # in them was changed - the release number was dropped. 345 # 346 # For more information see bug 1948 and 347 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 348 # 349 # e.g. 350 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 351 # DT 15-OCT-2001, sequence version 3. 352 # DT 01-APR-2004, entry version 14. 353 # 354 #This is a new style DT line... 355 356 # The date should be in string cols[1] 357 # Get the version number if there is one. 358 # For the three DT lines above: 0, 3, 14 359 try: 360 version = int(cols[-1]) 361 except ValueError: 362 version = 0 363 date = cols[0].rstrip(",") 364 365 # Re-use the historical property names, even though 366 # the meaning has changed slighty: 367 if "INTEGRATED" in uprline: 368 record.created = date, version 369 elif 'SEQUENCE VERSION' in uprline: 370 record.sequence_update = date, version 371 elif 'ENTRY VERSION' in uprline: 372 record.annotation_update = date, version 373 else: 374 assert False, "Shouldn't reach this line!" 375 else: 376 raise ValueError("I don't understand the date line %s" % line)
377 378
379 -def _read_ox(record, line):
380 # The OX line is in the format: 381 # OX DESCRIPTION=ID[, ID]...; 382 # If there are too many id's to fit onto a line, then the ID's 383 # continue directly onto the next line, e.g. 384 # OX DESCRIPTION=ID[, ID]... 385 # OX ID[, ID]...; 386 # Currently, the description is always "NCBI_TaxID". 387 # To parse this, I need to check to see whether I'm at the 388 # first line. If I am, grab the description and make sure 389 # it's an NCBI ID. Then, grab all the id's. 390 if record.taxonomy_id: 391 ids = line[5:].rstrip().rstrip(";") 392 else: 393 descr, ids = line[5:].rstrip().rstrip(";").split("=") 394 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 395 record.taxonomy_id.extend(ids.split(', '))
396 397
398 -def _read_oh(record, line):
399 # Line type OH (Organism Host) for viral hosts 400 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 401 line = line[16:].rstrip() 402 assert line[-1] == "." and line.count(";") == 1, line 403 taxid, name = line[:-1].split(";") 404 record.host_taxonomy_id.append(taxid.strip()) 405 record.host_organism.append(name.strip())
406 407
408 -def _read_rn(reference, rn):
409 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 410 reference.number = int(rn[1:-1])
411 412
413 -def _read_rc(reference, value):
414 cols = value.split(';') 415 if value[-1] == ';': 416 unread = "" 417 else: 418 cols, unread = cols[:-1], cols[-1] 419 for col in cols: 420 if not col: # last column will be the empty string 421 return 422 # The token is everything before the first '=' character. 423 i = col.find("=") 424 if i >= 0: 425 token, text = col[:i], col[i+1:] 426 comment = token.lstrip(), text 427 reference.comments.append(comment) 428 else: 429 comment = reference.comments[-1] 430 comment = "%s %s" % (comment, col) 431 reference.comments[-1] = comment 432 return unread
433 434
435 -def _read_rx(reference, value):
436 # The basic (older?) RX line is of the form: 437 # RX MEDLINE; 85132727. 438 # but there are variants of this that need to be dealt with (see below) 439 440 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 441 # have extraneous information in the RX line. Check for 442 # this and chop it out of the line. 443 # (noticed by katel@worldpath.net) 444 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 445 446 # RX lines can also be used of the form 447 # RX PubMed=9603189; 448 # reported by edvard@farmasi.uit.no 449 # and these can be more complicated like: 450 # RX MEDLINE=95385798; PubMed=7656980; 451 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 452 # We look for these cases first and deal with them 453 warn = False 454 if "=" in value: 455 cols = value.split("; ") 456 cols = [x.strip() for x in cols] 457 cols = [x for x in cols if x] 458 for col in cols: 459 x = col.split("=") 460 if len(x) != 2 or x == ("DOI", "DOI"): 461 warn = True 462 break 463 assert len(x) == 2, "I don't understand RX line %s" % value 464 reference.references.append((x[0], x[1].rstrip(";"))) 465 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 466 else: 467 cols = value.split("; ") 468 # normally we split into the three parts 469 if len(cols) != 2: 470 warn = True 471 else: 472 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 473 if warn: 474 import warnings 475 from Bio import BiopythonParserWarning 476 warnings.warn("Possibly corrupt RX line %r" % value, 477 BiopythonParserWarning)
478 479
480 -def _read_cc(record, line):
481 key, value = line[5:8], line[9:].rstrip() 482 if key == '-!-': # Make a new comment 483 record.comments.append(value) 484 elif key == ' ': # add to the previous comment 485 if not record.comments: 486 # TCMO_STRGA in Release 37 has comment with no topic 487 record.comments.append(value) 488 else: 489 record.comments[-1] += " " + value
490 491
492 -def _read_dr(record, value):
493 cols = value.rstrip(".").split('; ') 494 record.cross_references.append(tuple(cols))
495 496
497 -def _read_ft(record, line):
498 line = line[5:] # get rid of junk in front 499 name = line[0:8].rstrip() 500 try: 501 from_res = int(line[9:15]) 502 except ValueError: 503 from_res = line[9:15].lstrip() 504 try: 505 to_res = int(line[16:22]) 506 except ValueError: 507 to_res = line[16:22].lstrip() 508 #if there is a feature_id (FTId), store it away 509 if line[29:35] == r"/FTId=": 510 ft_id = line[35:70].rstrip()[:-1] 511 description = "" 512 else: 513 ft_id = "" 514 description = line[29:70].rstrip() 515 if not name: # is continuation of last one 516 assert not from_res and not to_res 517 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 518 del record.features[-1] 519 description = ("%s %s" % (old_description, description)).strip() 520 521 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 522 if name == "VARSPLIC": 523 # Remove unwanted spaces in sequences. 524 # During line carryover, the sequences in VARSPLIC can get mangled 525 # with unwanted spaces like: 526 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 527 # We want to check for this case and correct it as it happens. 528 descr_cols = description.split(" -> ") 529 if len(descr_cols) == 2: 530 first_seq, second_seq = descr_cols 531 extra_info = '' 532 # we might have more information at the end of the 533 # second sequence, which should be in parenthesis 534 extra_info_pos = second_seq.find(" (") 535 if extra_info_pos != -1: 536 extra_info = second_seq[extra_info_pos:] 537 second_seq = second_seq[:extra_info_pos] 538 # now clean spaces out of the first and second string 539 first_seq = first_seq.replace(" ", "") 540 second_seq = second_seq.replace(" ", "") 541 # reassemble the description 542 description = first_seq + " -> " + second_seq + extra_info 543 record.features.append((name, from_res, to_res, description, ft_id))
544 545 546 if __name__ == "__main__": 547 print("Quick self test...") 548 549 example_filename = "../../Tests/SwissProt/sp008" 550 551 import os 552 if not os.path.isfile(example_filename): 553 print("Missing test file %s" % example_filename) 554 else: 555 #Try parsing it! 556 557 with open(example_filename) as handle: 558 records = parse(handle) 559 for record in records: 560 print(record.entry_name) 561 print(",".join(record.accessions)) 562 print(record.keywords) 563 print(repr(record.organism)) 564 print(record.sequence[:20] + "...") 565