Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14   
 15      - Record             Holds SwissProt data. 
 16      - Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19   
 20      - read               Read one SwissProt record 
 21      - parse              Read multiple SwissProt records 
 22   
 23  """ 
 24   
 25  from __future__ import print_function 
 26   
 27  from Bio._py3k import _as_string 
 28   
 29   
30 -class Record(object):
31 """Holds information from a SwissProt record. 32 33 Members: 34 35 - entry_name Name of this entry, e.g. RL1_ECOLI. 36 - data_class Either 'STANDARD' or 'PRELIMINARY'. 37 - molecule_type Type of molecule, 'PRT', 38 - sequence_length Number of residues. 39 40 - accessions List of the accession numbers, e.g. ['P00321'] 41 - created A tuple of (date, release). 42 - sequence_update A tuple of (date, release). 43 - annotation_update A tuple of (date, release). 44 45 - description Free-format description. 46 - gene_name Gene name. See userman.txt for description. 47 - organism The source of the sequence. 48 - organelle The origin of the sequence. 49 - organism_classification The taxonomy classification. List of strings. 50 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 51 - taxonomy_id A list of NCBI taxonomy id's. 52 - host_organism A list of names of the hosts of a virus, if any. 53 - host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 54 - references List of Reference objects. 55 - comments List of strings. 56 - cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 57 - keywords List of the keywords. 58 - features List of tuples (key name, from, to, description). 59 from and to can be either integers for the residue 60 numbers, '<', '>', or '?' 61 62 - seqinfo tuple of (length, molecular weight, CRC32 value) 63 - sequence The sequence. 64 65 """
66 - def __init__(self):
67 self.entry_name = None 68 self.data_class = None 69 self.molecule_type = None 70 self.sequence_length = None 71 72 self.accessions = [] 73 self.created = None 74 self.sequence_update = None 75 self.annotation_update = None 76 77 self.description = [] 78 self.gene_name = '' 79 self.organism = [] 80 self.organelle = '' 81 self.organism_classification = [] 82 self.taxonomy_id = [] 83 self.host_organism = [] 84 self.host_taxonomy_id = [] 85 self.references = [] 86 self.comments = [] 87 self.cross_references = [] 88 self.keywords = [] 89 self.features = [] 90 91 self.seqinfo = None 92 self.sequence = ''
93 94
95 -class Reference(object):
96 """Holds information from one reference in a SwissProt entry. 97 98 Members: 99 number Number of reference in an entry. 100 evidence Evidence code. List of strings. 101 positions Describes extent of work. List of strings. 102 comments Comments. List of (token, text). 103 references References. List of (dbname, identifier). 104 authors The authors of the work. 105 title Title of the work. 106 location A citation for the work. 107 108 """
109 - def __init__(self):
110 self.number = None 111 self.positions = [] 112 self.comments = [] 113 self.references = [] 114 self.authors = [] 115 self.title = [] 116 self.location = []
117 118
119 -def parse(handle):
120 while True: 121 record = _read(handle) 122 if not record: 123 return 124 yield record
125 126
127 -def read(handle):
128 record = _read(handle) 129 if not record: 130 raise ValueError("No SwissProt record found") 131 # We should have reached the end of the record by now 132 # Used to check with handle.read() but that breaks on Python 3.5 133 # due to http://bugs.python.org/issue26499 and could download 134 # lot of data needlessly if there were more records. 135 remainder = handle.readline() 136 if remainder: 137 raise ValueError("More than one SwissProt record found") 138 return record
139 140 141 # Everything below is considered private 142 143
144 -def _read(handle):
145 record = None 146 unread = "" 147 for line in handle: 148 # This is for Python 3 to cope with a binary handle (byte strings), 149 # or a text handle (unicode strings): 150 line = _as_string(line) 151 key, value = line[:2], line[5:].rstrip() 152 if unread: 153 value = unread + " " + value 154 unread = "" 155 if key == '**': 156 # See Bug 2353, some files from the EBI have extra lines 157 # starting "**" (two asterisks/stars). They appear 158 # to be unofficial automated annotations. e.g. 159 # ** 160 # ** ################# INTERNAL SECTION ################## 161 # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 162 pass 163 elif key == 'ID': 164 record = Record() 165 _read_id(record, line) 166 _sequence_lines = [] 167 elif key == 'AC': 168 accessions = [word for word in value.rstrip(";").split("; ")] 169 record.accessions.extend(accessions) 170 elif key == 'DT': 171 _read_dt(record, line) 172 elif key == 'DE': 173 record.description.append(value.strip()) 174 elif key == 'GN': 175 if record.gene_name: 176 record.gene_name += " " 177 record.gene_name += value 178 elif key == 'OS': 179 record.organism.append(value) 180 elif key == 'OG': 181 record.organelle += line[5:] 182 elif key == 'OC': 183 cols = [col for col in value.rstrip(";.").split("; ")] 184 record.organism_classification.extend(cols) 185 elif key == 'OX': 186 _read_ox(record, line) 187 elif key == 'OH': 188 _read_oh(record, line) 189 elif key == 'RN': 190 reference = Reference() 191 _read_rn(reference, value) 192 record.references.append(reference) 193 elif key == 'RP': 194 assert record.references, "RP: missing RN" 195 record.references[-1].positions.append(value) 196 elif key == 'RC': 197 assert record.references, "RC: missing RN" 198 reference = record.references[-1] 199 unread = _read_rc(reference, value) 200 elif key == 'RX': 201 assert record.references, "RX: missing RN" 202 reference = record.references[-1] 203 _read_rx(reference, value) 204 elif key == 'RL': 205 assert record.references, "RL: missing RN" 206 reference = record.references[-1] 207 reference.location.append(value) 208 # In UniProt release 1.12 of 6/21/04, there is a new RG 209 # (Reference Group) line, which references a group instead of 210 # an author. Each block must have at least 1 RA or RG line. 211 elif key == 'RA': 212 assert record.references, "RA: missing RN" 213 reference = record.references[-1] 214 reference.authors.append(value) 215 elif key == 'RG': 216 assert record.references, "RG: missing RN" 217 reference = record.references[-1] 218 reference.authors.append(value) 219 elif key == "RT": 220 assert record.references, "RT: missing RN" 221 reference = record.references[-1] 222 reference.title.append(value) 223 elif key == 'CC': 224 _read_cc(record, line) 225 elif key == 'DR': 226 _read_dr(record, value) 227 elif key == 'PE': 228 # TODO - Record this information? 229 pass 230 elif key == 'KW': 231 _read_kw(record, value) 232 elif key == 'FT': 233 _read_ft(record, line) 234 elif key == 'SQ': 235 cols = value.split() 236 assert len(cols) == 7, "I don't understand SQ line %s" % line 237 # Do more checking here? 238 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 239 elif key == ' ': 240 _sequence_lines.append(value.replace(" ", "").rstrip()) 241 elif key == '//': 242 # Join multiline data into one string 243 record.description = " ".join(record.description) 244 record.organism = " ".join(record.organism) 245 record.organelle = record.organelle.rstrip() 246 for reference in record.references: 247 reference.authors = " ".join(reference.authors).rstrip(";") 248 reference.title = " ".join(reference.title).rstrip(";") 249 if reference.title.startswith('"') and reference.title.endswith('"'): 250 reference.title = reference.title[1:-1] # remove quotes 251 reference.location = " ".join(reference.location) 252 record.sequence = "".join(_sequence_lines) 253 return record 254 else: 255 raise ValueError("Unknown keyword '%s' found" % key) 256 if record: 257 raise ValueError("Unexpected end of stream.")
258 259
260 -def _read_id(record, line):
261 cols = line[5:].split() 262 # Prior to release 51, included with MoleculeType: 263 # ID EntryName DataClass; MoleculeType; SequenceLength AA. 264 # 265 # Newer files lack the MoleculeType: 266 # ID EntryName DataClass; SequenceLength AA. 267 if len(cols) == 5: 268 record.entry_name = cols[0] 269 record.data_class = cols[1].rstrip(";") 270 record.molecule_type = cols[2].rstrip(";") 271 record.sequence_length = int(cols[3]) 272 elif len(cols) == 4: 273 record.entry_name = cols[0] 274 record.data_class = cols[1].rstrip(";") 275 record.molecule_type = None 276 record.sequence_length = int(cols[2]) 277 else: 278 raise ValueError("ID line has unrecognised format:\n" + line) 279 # check if the data class is one of the allowed values 280 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 281 if record.data_class not in allowed: 282 raise ValueError("Unrecognized data class %s in line\n%s" % 283 (record.data_class, line)) 284 # molecule_type should be 'PRT' for PRoTein 285 # Note that has been removed in recent releases (set to None) 286 if record.molecule_type not in (None, 'PRT'): 287 raise ValueError("Unrecognized molecule type %s in line\n%s" % 288 (record.molecule_type, line))
289 290
291 -def _read_dt(record, line):
292 value = line[5:] 293 uprline = value.upper() 294 cols = value.rstrip().split() 295 if 'CREATED' in uprline \ 296 or 'LAST SEQUENCE UPDATE' in uprline \ 297 or 'LAST ANNOTATION UPDATE' in uprline: 298 # Old style DT line 299 # ================= 300 # e.g. 301 # DT 01-FEB-1995 (Rel. 31, Created) 302 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 303 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 304 # 305 # or: 306 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 307 # ... 308 309 # find where the version information will be located 310 # This is needed for when you have cases like IPI where 311 # the release version is in a different spot: 312 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 313 uprcols = uprline.split() 314 rel_index = -1 315 for index in range(len(uprcols)): 316 if 'REL.' in uprcols[index]: 317 rel_index = index 318 assert rel_index >= 0, \ 319 "Could not find Rel. in DT line: %s" % line 320 version_index = rel_index + 1 321 # get the version information 322 str_version = cols[version_index].rstrip(",") 323 # no version number 324 if str_version == '': 325 version = 0 326 # dot versioned 327 elif '.' in str_version: 328 version = str_version 329 # integer versioned 330 else: 331 version = int(str_version) 332 date = cols[0] 333 334 if 'CREATED' in uprline: 335 record.created = date, version 336 elif 'LAST SEQUENCE UPDATE' in uprline: 337 record.sequence_update = date, version 338 elif 'LAST ANNOTATION UPDATE' in uprline: 339 record.annotation_update = date, version 340 else: 341 assert False, "Shouldn't reach this line!" 342 elif 'INTEGRATED INTO' in uprline \ 343 or 'SEQUENCE VERSION' in uprline \ 344 or 'ENTRY VERSION' in uprline: 345 # New style DT line 346 # ================= 347 # As of UniProt Knowledgebase release 7.0 (including 348 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 349 # format of the DT lines and the version information 350 # in them was changed - the release number was dropped. 351 # 352 # For more information see bug 1948 and 353 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 354 # 355 # e.g. 356 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 357 # DT 15-OCT-2001, sequence version 3. 358 # DT 01-APR-2004, entry version 14. 359 # 360 # This is a new style DT line... 361 362 # The date should be in string cols[1] 363 # Get the version number if there is one. 364 # For the three DT lines above: 0, 3, 14 365 try: 366 version = int(cols[-1]) 367 except ValueError: 368 version = 0 369 date = cols[0].rstrip(",") 370 371 # Re-use the historical property names, even though 372 # the meaning has changed slighty: 373 if "INTEGRATED" in uprline: 374 record.created = date, version 375 elif 'SEQUENCE VERSION' in uprline: 376 record.sequence_update = date, version 377 elif 'ENTRY VERSION' in uprline: 378 record.annotation_update = date, version 379 else: 380 assert False, "Shouldn't reach this line!" 381 else: 382 raise ValueError("I don't understand the date line %s" % line)
383 384
385 -def _read_ox(record, line):
386 # The OX line used to be in the simple format: 387 # OX DESCRIPTION=ID[, ID]...; 388 # If there are too many id's to fit onto a line, then the ID's 389 # continue directly onto the next line, e.g. 390 # OX DESCRIPTION=ID[, ID]... 391 # OX ID[, ID]...; 392 # Currently, the description is always "NCBI_TaxID". 393 # To parse this, I need to check to see whether I'm at the 394 # first line. If I am, grab the description and make sure 395 # it's an NCBI ID. Then, grab all the id's. 396 # 397 # As of the 2014-10-01 release, there may be an evidence code, e.g. 398 # OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1}; 399 # In the short term, we will ignore any evidence codes: 400 line = line.split('{')[0] 401 if record.taxonomy_id: 402 ids = line[5:].rstrip().rstrip(";") 403 else: 404 descr, ids = line[5:].rstrip().rstrip(";").split("=") 405 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 406 record.taxonomy_id.extend(ids.split(', '))
407 408
409 -def _read_oh(record, line):
410 # Line type OH (Organism Host) for viral hosts 411 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 412 line = line[16:].rstrip() 413 assert line[-1] == "." and line.count(";") == 1, line 414 taxid, name = line[:-1].split(";") 415 record.host_taxonomy_id.append(taxid.strip()) 416 record.host_organism.append(name.strip())
417 418
419 -def _read_rn(reference, rn):
420 # This used to be a very simple line with a reference number, e.g. 421 # RN [1] 422 # As of the 2014-10-01 release, there may be an evidence code, e.g. 423 # RN [1] {ECO:0000313|EMBL:AEX14553.1} 424 words = rn.split(None, 1) 425 number = words[0] 426 assert number.startswith('[') and number.endswith(']'), "Missing brackets %s" % number 427 reference.number = int(number[1:-1]) 428 if len(words) > 1: 429 evidence = words[1] 430 assert evidence.startswith('{') and evidence.endswith('}'), "Missing braces %s" % evidence 431 reference.evidence = evidence[1:-1].split('|')
432 433
434 -def _read_rc(reference, value):
435 cols = value.split(';') 436 if value[-1] == ';': 437 unread = "" 438 else: 439 cols, unread = cols[:-1], cols[-1] 440 for col in cols: 441 if not col: # last column will be the empty string 442 return 443 # The token is everything before the first '=' character. 444 i = col.find("=") 445 if i >= 0: 446 token, text = col[:i], col[i + 1:] 447 comment = token.lstrip(), text 448 reference.comments.append(comment) 449 else: 450 comment = reference.comments[-1] 451 comment = "%s %s" % (comment, col) 452 reference.comments[-1] = comment 453 return unread
454 455
456 -def _read_rx(reference, value):
457 # The basic (older?) RX line is of the form: 458 # RX MEDLINE; 85132727. 459 # but there are variants of this that need to be dealt with (see below) 460 461 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 462 # have extraneous information in the RX line. Check for 463 # this and chop it out of the line. 464 # (noticed by katel@worldpath.net) 465 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 466 467 # RX lines can also be used of the form 468 # RX PubMed=9603189; 469 # reported by edvard@farmasi.uit.no 470 # and these can be more complicated like: 471 # RX MEDLINE=95385798; PubMed=7656980; 472 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 473 # We look for these cases first and deal with them 474 warn = False 475 if "=" in value: 476 cols = value.split("; ") 477 cols = [x.strip() for x in cols] 478 cols = [x for x in cols if x] 479 for col in cols: 480 x = col.split("=") 481 if len(x) != 2 or x == ("DOI", "DOI"): 482 warn = True 483 break 484 assert len(x) == 2, "I don't understand RX line %s" % value 485 reference.references.append((x[0], x[1].rstrip(";"))) 486 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 487 else: 488 cols = value.split("; ") 489 # normally we split into the three parts 490 if len(cols) != 2: 491 warn = True 492 else: 493 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 494 if warn: 495 import warnings 496 from Bio import BiopythonParserWarning 497 warnings.warn("Possibly corrupt RX line %r" % value, 498 BiopythonParserWarning)
499 500
501 -def _read_cc(record, line):
502 key, value = line[5:8], line[9:].rstrip() 503 if key == '-!-': # Make a new comment 504 record.comments.append(value) 505 elif key == ' ': # add to the previous comment 506 if not record.comments: 507 # TCMO_STRGA in Release 37 has comment with no topic 508 record.comments.append(value) 509 else: 510 record.comments[-1] += " " + value
511 512
513 -def _read_dr(record, value):
514 cols = value.rstrip(".").split('; ') 515 record.cross_references.append(tuple(cols))
516 517
518 -def _read_kw(record, value):
519 # Old style - semi-colon separated, multi-line. e.g. Q13639.txt 520 # KW Alternative splicing; Cell membrane; Complete proteome; 521 # KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein; 522 # KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer; 523 # KW Transmembrane. 524 # 525 # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt 526 # KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1}; 527 # KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}. 528 # For now to match the XML parser, drop the evidence codes. 529 for value in value.rstrip(";.").split('; '): 530 if value.endswith("}"): 531 # Discard the evidence code 532 value = value.rsplit("{", 1)[0] 533 record.keywords.append(value.strip())
534 535
536 -def _read_ft(record, line):
537 line = line[5:] # get rid of junk in front 538 name = line[0:8].rstrip() 539 try: 540 from_res = int(line[9:15]) 541 except ValueError: 542 from_res = line[9:15].lstrip() 543 try: 544 to_res = int(line[16:22]) 545 except ValueError: 546 to_res = line[16:22].lstrip() 547 # if there is a feature_id (FTId), store it away 548 if line[29:35] == r"/FTId=": 549 ft_id = line[35:70].rstrip()[:-1] 550 description = "" 551 else: 552 ft_id = "" 553 description = line[29:70].rstrip() 554 if not name: # is continuation of last one 555 assert not from_res and not to_res 556 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 557 del record.features[-1] 558 description = ("%s %s" % (old_description, description)).strip() 559 560 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 561 if name == "VARSPLIC": 562 # Remove unwanted spaces in sequences. 563 # During line carryover, the sequences in VARSPLIC can get mangled 564 # with unwanted spaces like: 565 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 566 # We want to check for this case and correct it as it happens. 567 descr_cols = description.split(" -> ") 568 if len(descr_cols) == 2: 569 first_seq, second_seq = descr_cols 570 extra_info = '' 571 # we might have more information at the end of the 572 # second sequence, which should be in parenthesis 573 extra_info_pos = second_seq.find(" (") 574 if extra_info_pos != -1: 575 extra_info = second_seq[extra_info_pos:] 576 second_seq = second_seq[:extra_info_pos] 577 # now clean spaces out of the first and second string 578 first_seq = first_seq.replace(" ", "") 579 second_seq = second_seq.replace(" ", "") 580 # reassemble the description 581 description = first_seq + " -> " + second_seq + extra_info 582 record.features.append((name, from_res, to_res, description, ft_id))
583 584 585 if __name__ == "__main__": 586 print("Quick self test...") 587 588 example_filename = "../../Tests/SwissProt/sp008" 589 590 import os 591 if not os.path.isfile(example_filename): 592 print("Missing test file %s" % example_filename) 593 else: 594 # Try parsing it! 595 596 with open(example_filename) as handle: 597 records = parse(handle) 598 for record in records: 599 print(record.entry_name) 600 print(",".join(record.accessions)) 601 print(record.keywords) 602 print(repr(record.organism)) 603 print(record.sequence[:20] + "...") 604