Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14   
 15      - Record             Holds SwissProt data. 
 16      - Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19   
 20      - read               Read one SwissProt record 
 21      - parse              Read multiple SwissProt records 
 22   
 23  """ 
 24   
 25  from __future__ import print_function 
 26   
 27  from Bio._py3k import _as_string 
 28   
 29  __docformat__ = "restructuredtext en" 
 30   
 31   
32 -class Record(object):
33 """Holds information from a SwissProt record. 34 35 Members: 36 37 - entry_name Name of this entry, e.g. RL1_ECOLI. 38 - data_class Either 'STANDARD' or 'PRELIMINARY'. 39 - molecule_type Type of molecule, 'PRT', 40 - sequence_length Number of residues. 41 42 - accessions List of the accession numbers, e.g. ['P00321'] 43 - created A tuple of (date, release). 44 - sequence_update A tuple of (date, release). 45 - annotation_update A tuple of (date, release). 46 47 - description Free-format description. 48 - gene_name Gene name. See userman.txt for description. 49 - organism The source of the sequence. 50 - organelle The origin of the sequence. 51 - organism_classification The taxonomy classification. List of strings. 52 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 53 - taxonomy_id A list of NCBI taxonomy id's. 54 - host_organism A list of names of the hosts of a virus, if any. 55 - host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 56 - references List of Reference objects. 57 - comments List of strings. 58 - cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 59 - keywords List of the keywords. 60 - features List of tuples (key name, from, to, description). 61 from and to can be either integers for the residue 62 numbers, '<', '>', or '?' 63 64 - seqinfo tuple of (length, molecular weight, CRC32 value) 65 - sequence The sequence. 66 67 """
68 - def __init__(self):
69 self.entry_name = None 70 self.data_class = None 71 self.molecule_type = None 72 self.sequence_length = None 73 74 self.accessions = [] 75 self.created = None 76 self.sequence_update = None 77 self.annotation_update = None 78 79 self.description = [] 80 self.gene_name = '' 81 self.organism = [] 82 self.organelle = '' 83 self.organism_classification = [] 84 self.taxonomy_id = [] 85 self.host_organism = [] 86 self.host_taxonomy_id = [] 87 self.references = [] 88 self.comments = [] 89 self.cross_references = [] 90 self.keywords = [] 91 self.features = [] 92 93 self.seqinfo = None 94 self.sequence = ''
95 96
97 -class Reference(object):
98 """Holds information from one reference in a SwissProt entry. 99 100 Members: 101 number Number of reference in an entry. 102 evidence Evidence code. List of strings. 103 positions Describes extent of work. List of strings. 104 comments Comments. List of (token, text). 105 references References. List of (dbname, identifier). 106 authors The authors of the work. 107 title Title of the work. 108 location A citation for the work. 109 110 """
111 - def __init__(self):
112 self.number = None 113 self.positions = [] 114 self.comments = [] 115 self.references = [] 116 self.authors = [] 117 self.title = [] 118 self.location = []
119 120
121 -def parse(handle):
122 while True: 123 record = _read(handle) 124 if not record: 125 return 126 yield record
127 128
129 -def read(handle):
130 record = _read(handle) 131 if not record: 132 raise ValueError("No SwissProt record found") 133 # We should have reached the end of the record by now 134 remainder = handle.read() 135 if remainder: 136 raise ValueError("More than one SwissProt record found") 137 return record
138 139 140 # Everything below is considered private 141 142
143 -def _read(handle):
144 record = None 145 unread = "" 146 for line in handle: 147 # This is for Python 3 to cope with a binary handle (byte strings), 148 # or a text handle (unicode strings): 149 line = _as_string(line) 150 key, value = line[:2], line[5:].rstrip() 151 if unread: 152 value = unread + " " + value 153 unread = "" 154 if key == '**': 155 # See Bug 2353, some files from the EBI have extra lines 156 # starting "**" (two asterisks/stars). They appear 157 # to be unofficial automated annotations. e.g. 158 # ** 159 # ** ################# INTERNAL SECTION ################## 160 # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 161 pass 162 elif key == 'ID': 163 record = Record() 164 _read_id(record, line) 165 _sequence_lines = [] 166 elif key == 'AC': 167 accessions = [word for word in value.rstrip(";").split("; ")] 168 record.accessions.extend(accessions) 169 elif key == 'DT': 170 _read_dt(record, line) 171 elif key == 'DE': 172 record.description.append(value.strip()) 173 elif key == 'GN': 174 if record.gene_name: 175 record.gene_name += " " 176 record.gene_name += value 177 elif key == 'OS': 178 record.organism.append(value) 179 elif key == 'OG': 180 record.organelle += line[5:] 181 elif key == 'OC': 182 cols = [col for col in value.rstrip(";.").split("; ")] 183 record.organism_classification.extend(cols) 184 elif key == 'OX': 185 _read_ox(record, line) 186 elif key == 'OH': 187 _read_oh(record, line) 188 elif key == 'RN': 189 reference = Reference() 190 _read_rn(reference, value) 191 record.references.append(reference) 192 elif key == 'RP': 193 assert record.references, "RP: missing RN" 194 record.references[-1].positions.append(value) 195 elif key == 'RC': 196 assert record.references, "RC: missing RN" 197 reference = record.references[-1] 198 unread = _read_rc(reference, value) 199 elif key == 'RX': 200 assert record.references, "RX: missing RN" 201 reference = record.references[-1] 202 _read_rx(reference, value) 203 elif key == 'RL': 204 assert record.references, "RL: missing RN" 205 reference = record.references[-1] 206 reference.location.append(value) 207 # In UniProt release 1.12 of 6/21/04, there is a new RG 208 # (Reference Group) line, which references a group instead of 209 # an author. Each block must have at least 1 RA or RG line. 210 elif key == 'RA': 211 assert record.references, "RA: missing RN" 212 reference = record.references[-1] 213 reference.authors.append(value) 214 elif key == 'RG': 215 assert record.references, "RG: missing RN" 216 reference = record.references[-1] 217 reference.authors.append(value) 218 elif key == "RT": 219 assert record.references, "RT: missing RN" 220 reference = record.references[-1] 221 reference.title.append(value) 222 elif key == 'CC': 223 _read_cc(record, line) 224 elif key == 'DR': 225 _read_dr(record, value) 226 elif key == 'PE': 227 # TODO - Record this information? 228 pass 229 elif key == 'KW': 230 _read_kw(record, value) 231 elif key == 'FT': 232 _read_ft(record, line) 233 elif key == 'SQ': 234 cols = value.split() 235 assert len(cols) == 7, "I don't understand SQ line %s" % line 236 # Do more checking here? 237 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 238 elif key == ' ': 239 _sequence_lines.append(value.replace(" ", "").rstrip()) 240 elif key == '//': 241 # Join multiline data into one string 242 record.description = " ".join(record.description) 243 record.organism = " ".join(record.organism) 244 record.organelle = record.organelle.rstrip() 245 for reference in record.references: 246 reference.authors = " ".join(reference.authors).rstrip(";") 247 reference.title = " ".join(reference.title).rstrip(";") 248 if reference.title.startswith('"') and reference.title.endswith('"'): 249 reference.title = reference.title[1:-1] # remove quotes 250 reference.location = " ".join(reference.location) 251 record.sequence = "".join(_sequence_lines) 252 return record 253 else: 254 raise ValueError("Unknown keyword '%s' found" % key) 255 if record: 256 raise ValueError("Unexpected end of stream.")
257 258
259 -def _read_id(record, line):
260 cols = line[5:].split() 261 # Prior to release 51, included with MoleculeType: 262 # ID EntryName DataClass; MoleculeType; SequenceLength AA. 263 # 264 # Newer files lack the MoleculeType: 265 # ID EntryName DataClass; SequenceLength AA. 266 if len(cols) == 5: 267 record.entry_name = cols[0] 268 record.data_class = cols[1].rstrip(";") 269 record.molecule_type = cols[2].rstrip(";") 270 record.sequence_length = int(cols[3]) 271 elif len(cols) == 4: 272 record.entry_name = cols[0] 273 record.data_class = cols[1].rstrip(";") 274 record.molecule_type = None 275 record.sequence_length = int(cols[2]) 276 else: 277 raise ValueError("ID line has unrecognised format:\n" + line) 278 # check if the data class is one of the allowed values 279 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 280 if record.data_class not in allowed: 281 raise ValueError("Unrecognized data class %s in line\n%s" % 282 (record.data_class, line)) 283 # molecule_type should be 'PRT' for PRoTein 284 # Note that has been removed in recent releases (set to None) 285 if record.molecule_type not in (None, 'PRT'): 286 raise ValueError("Unrecognized molecule type %s in line\n%s" % 287 (record.molecule_type, line))
288 289
290 -def _read_dt(record, line):
291 value = line[5:] 292 uprline = value.upper() 293 cols = value.rstrip().split() 294 if 'CREATED' in uprline \ 295 or 'LAST SEQUENCE UPDATE' in uprline \ 296 or 'LAST ANNOTATION UPDATE' in uprline: 297 # Old style DT line 298 # ================= 299 # e.g. 300 # DT 01-FEB-1995 (Rel. 31, Created) 301 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 302 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 303 # 304 # or: 305 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 306 # ... 307 308 # find where the version information will be located 309 # This is needed for when you have cases like IPI where 310 # the release verison is in a different spot: 311 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 312 uprcols = uprline.split() 313 rel_index = -1 314 for index in range(len(uprcols)): 315 if 'REL.' in uprcols[index]: 316 rel_index = index 317 assert rel_index >= 0, \ 318 "Could not find Rel. in DT line: %s" % line 319 version_index = rel_index + 1 320 # get the version information 321 str_version = cols[version_index].rstrip(",") 322 # no version number 323 if str_version == '': 324 version = 0 325 # dot versioned 326 elif '.' in str_version: 327 version = str_version 328 # integer versioned 329 else: 330 version = int(str_version) 331 date = cols[0] 332 333 if 'CREATED' in uprline: 334 record.created = date, version 335 elif 'LAST SEQUENCE UPDATE' in uprline: 336 record.sequence_update = date, version 337 elif 'LAST ANNOTATION UPDATE' in uprline: 338 record.annotation_update = date, version 339 else: 340 assert False, "Shouldn't reach this line!" 341 elif 'INTEGRATED INTO' in uprline \ 342 or 'SEQUENCE VERSION' in uprline \ 343 or 'ENTRY VERSION' in uprline: 344 # New style DT line 345 # ================= 346 # As of UniProt Knowledgebase release 7.0 (including 347 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 348 # format of the DT lines and the version information 349 # in them was changed - the release number was dropped. 350 # 351 # For more information see bug 1948 and 352 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 353 # 354 # e.g. 355 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 356 # DT 15-OCT-2001, sequence version 3. 357 # DT 01-APR-2004, entry version 14. 358 # 359 # This is a new style DT line... 360 361 # The date should be in string cols[1] 362 # Get the version number if there is one. 363 # For the three DT lines above: 0, 3, 14 364 try: 365 version = int(cols[-1]) 366 except ValueError: 367 version = 0 368 date = cols[0].rstrip(",") 369 370 # Re-use the historical property names, even though 371 # the meaning has changed slighty: 372 if "INTEGRATED" in uprline: 373 record.created = date, version 374 elif 'SEQUENCE VERSION' in uprline: 375 record.sequence_update = date, version 376 elif 'ENTRY VERSION' in uprline: 377 record.annotation_update = date, version 378 else: 379 assert False, "Shouldn't reach this line!" 380 else: 381 raise ValueError("I don't understand the date line %s" % line)
382 383
384 -def _read_ox(record, line):
385 # The OX line used to be in the simple format: 386 # OX DESCRIPTION=ID[, ID]...; 387 # If there are too many id's to fit onto a line, then the ID's 388 # continue directly onto the next line, e.g. 389 # OX DESCRIPTION=ID[, ID]... 390 # OX ID[, ID]...; 391 # Currently, the description is always "NCBI_TaxID". 392 # To parse this, I need to check to see whether I'm at the 393 # first line. If I am, grab the description and make sure 394 # it's an NCBI ID. Then, grab all the id's. 395 # 396 # As of the 2014-10-01 release, there may be an evidence code, e.g. 397 # OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1}; 398 # In the short term, we will ignore any evidence codes: 399 line = line.split('{')[0] 400 if record.taxonomy_id: 401 ids = line[5:].rstrip().rstrip(";") 402 else: 403 descr, ids = line[5:].rstrip().rstrip(";").split("=") 404 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 405 record.taxonomy_id.extend(ids.split(', '))
406 407
408 -def _read_oh(record, line):
409 # Line type OH (Organism Host) for viral hosts 410 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 411 line = line[16:].rstrip() 412 assert line[-1] == "." and line.count(";") == 1, line 413 taxid, name = line[:-1].split(";") 414 record.host_taxonomy_id.append(taxid.strip()) 415 record.host_organism.append(name.strip())
416 417
418 -def _read_rn(reference, rn):
419 # This used to be a very simple line with a reference number, e.g. 420 # RN [1] 421 # As of the 2014-10-01 release, there may be an evidence code, e.g. 422 # RN [1] {ECO:0000313|EMBL:AEX14553.1} 423 words = rn.split(None, 1) 424 number = words[0] 425 assert number.startswith('[') and number.endswith(']'), "Missing brackets %s" % number 426 reference.number = int(number[1:-1]) 427 if len(words) > 1: 428 evidence = words[1] 429 assert evidence.startswith('{') and evidence.endswith('}'), "Missing braces %s" % evidence 430 reference.evidence = evidence[1:-1].split('|')
431
432 -def _read_rc(reference, value):
433 cols = value.split(';') 434 if value[-1] == ';': 435 unread = "" 436 else: 437 cols, unread = cols[:-1], cols[-1] 438 for col in cols: 439 if not col: # last column will be the empty string 440 return 441 # The token is everything before the first '=' character. 442 i = col.find("=") 443 if i >= 0: 444 token, text = col[:i], col[i + 1:] 445 comment = token.lstrip(), text 446 reference.comments.append(comment) 447 else: 448 comment = reference.comments[-1] 449 comment = "%s %s" % (comment, col) 450 reference.comments[-1] = comment 451 return unread
452 453
454 -def _read_rx(reference, value):
455 # The basic (older?) RX line is of the form: 456 # RX MEDLINE; 85132727. 457 # but there are variants of this that need to be dealt with (see below) 458 459 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 460 # have extraneous information in the RX line. Check for 461 # this and chop it out of the line. 462 # (noticed by katel@worldpath.net) 463 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 464 465 # RX lines can also be used of the form 466 # RX PubMed=9603189; 467 # reported by edvard@farmasi.uit.no 468 # and these can be more complicated like: 469 # RX MEDLINE=95385798; PubMed=7656980; 470 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 471 # We look for these cases first and deal with them 472 warn = False 473 if "=" in value: 474 cols = value.split("; ") 475 cols = [x.strip() for x in cols] 476 cols = [x for x in cols if x] 477 for col in cols: 478 x = col.split("=") 479 if len(x) != 2 or x == ("DOI", "DOI"): 480 warn = True 481 break 482 assert len(x) == 2, "I don't understand RX line %s" % value 483 reference.references.append((x[0], x[1].rstrip(";"))) 484 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 485 else: 486 cols = value.split("; ") 487 # normally we split into the three parts 488 if len(cols) != 2: 489 warn = True 490 else: 491 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 492 if warn: 493 import warnings 494 from Bio import BiopythonParserWarning 495 warnings.warn("Possibly corrupt RX line %r" % value, 496 BiopythonParserWarning)
497 498
499 -def _read_cc(record, line):
500 key, value = line[5:8], line[9:].rstrip() 501 if key == '-!-': # Make a new comment 502 record.comments.append(value) 503 elif key == ' ': # add to the previous comment 504 if not record.comments: 505 # TCMO_STRGA in Release 37 has comment with no topic 506 record.comments.append(value) 507 else: 508 record.comments[-1] += " " + value
509 510
511 -def _read_dr(record, value):
512 cols = value.rstrip(".").split('; ') 513 record.cross_references.append(tuple(cols))
514 515
516 -def _read_kw(record, value):
517 # Old style - semi-colon separated, multi-line. e.g. Q13639.txt 518 # KW Alternative splicing; Cell membrane; Complete proteome; 519 # KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein; 520 # KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer; 521 # KW Transmembrane. 522 # 523 # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt 524 # KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1}; 525 # KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}. 526 # For now to match the XML parser, drop the evidence codes. 527 for value in value.rstrip(";.").split('; '): 528 if value.endswith("}"): 529 # Discard the evidence code 530 value = value.rsplit("{", 1)[0] 531 record.keywords.append(value.strip())
532 533
534 -def _read_ft(record, line):
535 line = line[5:] # get rid of junk in front 536 name = line[0:8].rstrip() 537 try: 538 from_res = int(line[9:15]) 539 except ValueError: 540 from_res = line[9:15].lstrip() 541 try: 542 to_res = int(line[16:22]) 543 except ValueError: 544 to_res = line[16:22].lstrip() 545 # if there is a feature_id (FTId), store it away 546 if line[29:35] == r"/FTId=": 547 ft_id = line[35:70].rstrip()[:-1] 548 description = "" 549 else: 550 ft_id = "" 551 description = line[29:70].rstrip() 552 if not name: # is continuation of last one 553 assert not from_res and not to_res 554 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 555 del record.features[-1] 556 description = ("%s %s" % (old_description, description)).strip() 557 558 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 559 if name == "VARSPLIC": 560 # Remove unwanted spaces in sequences. 561 # During line carryover, the sequences in VARSPLIC can get mangled 562 # with unwanted spaces like: 563 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 564 # We want to check for this case and correct it as it happens. 565 descr_cols = description.split(" -> ") 566 if len(descr_cols) == 2: 567 first_seq, second_seq = descr_cols 568 extra_info = '' 569 # we might have more information at the end of the 570 # second sequence, which should be in parenthesis 571 extra_info_pos = second_seq.find(" (") 572 if extra_info_pos != -1: 573 extra_info = second_seq[extra_info_pos:] 574 second_seq = second_seq[:extra_info_pos] 575 # now clean spaces out of the first and second string 576 first_seq = first_seq.replace(" ", "") 577 second_seq = second_seq.replace(" ", "") 578 # reassemble the description 579 description = first_seq + " -> " + second_seq + extra_info 580 record.features.append((name, from_res, to_res, description, ft_id))
581 582 583 if __name__ == "__main__": 584 print("Quick self test...") 585 586 example_filename = "../../Tests/SwissProt/sp008" 587 588 import os 589 if not os.path.isfile(example_filename): 590 print("Missing test file %s" % example_filename) 591 else: 592 # Try parsing it! 593 594 with open(example_filename) as handle: 595 records = parse(handle) 596 for record in records: 597 print(record.entry_name) 598 print(",".join(record.accessions)) 599 print(record.keywords) 600 print(repr(record.organism)) 601 print(record.sequence[:20] + "...") 602