Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14   
 15      - Record             Holds SwissProt data. 
 16      - Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19   
 20      - read               Read one SwissProt record 
 21      - parse              Read multiple SwissProt records 
 22   
 23  """ 
 24   
 25  from __future__ import print_function 
 26   
 27  from Bio._py3k import _as_string 
 28   
 29   
30 -class Record(object):
31 """Holds information from a SwissProt record. 32 33 Members: 34 35 - entry_name Name of this entry, e.g. RL1_ECOLI. 36 - data_class Either 'STANDARD' or 'PRELIMINARY'. 37 - molecule_type Type of molecule, 'PRT', 38 - sequence_length Number of residues. 39 40 - accessions List of the accession numbers, e.g. ['P00321'] 41 - created A tuple of (date, release). 42 - sequence_update A tuple of (date, release). 43 - annotation_update A tuple of (date, release). 44 45 - description Free-format description. 46 - gene_name Gene name. See userman.txt for description. 47 - organism The source of the sequence. 48 - organelle The origin of the sequence. 49 - organism_classification The taxonomy classification. List of strings. 50 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 51 - taxonomy_id A list of NCBI taxonomy id's. 52 - host_organism A list of names of the hosts of a virus, if any. 53 - host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 54 - references List of Reference objects. 55 - comments List of strings. 56 - cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 57 - keywords List of the keywords. 58 - features List of tuples (key name, from, to, description). 59 from and to can be either integers for the residue 60 numbers, '<', '>', or '?' 61 62 - seqinfo tuple of (length, molecular weight, CRC32 value) 63 - sequence The sequence. 64 65 Example: 66 67 >>> import Bio.SwissProt as sp 68 >>> example_filename = "SwissProt/sp008" 69 >>> with open(example_filename) as handle: 70 ... records = sp.parse(handle) 71 ... for record in records: 72 ... print(record.entry_name) 73 ... print(",".join(record.accessions)) 74 ... print(record.keywords) 75 ... print(repr(record.organism)) 76 ... print(record.sequence[:20] + "...") 77 ... 78 1A02_HUMAN 79 P01892,P06338,P30514,P30444,P30445,P30446,Q29680,Q29899,Q95352,Q29837,Q95380 80 ['MHC I', 'Transmembrane', 'Glycoprotein', 'Signal', 'Polymorphism', '3D-structure'] 81 'Homo sapiens (Human).' 82 MAVMAPRTLVLLLSGALALT... 83 84 """
85 - def __init__(self):
86 self.entry_name = None 87 self.data_class = None 88 self.molecule_type = None 89 self.sequence_length = None 90 91 self.accessions = [] 92 self.created = None 93 self.sequence_update = None 94 self.annotation_update = None 95 96 self.description = [] 97 self.gene_name = '' 98 self.organism = [] 99 self.organelle = '' 100 self.organism_classification = [] 101 self.taxonomy_id = [] 102 self.host_organism = [] 103 self.host_taxonomy_id = [] 104 self.references = [] 105 self.comments = [] 106 self.cross_references = [] 107 self.keywords = [] 108 self.features = [] 109 110 self.seqinfo = None 111 self.sequence = ''
112 113
114 -class Reference(object):
115 """Holds information from one reference in a SwissProt entry. 116 117 Members: 118 number Number of reference in an entry. 119 evidence Evidence code. List of strings. 120 positions Describes extent of work. List of strings. 121 comments Comments. List of (token, text). 122 references References. List of (dbname, identifier). 123 authors The authors of the work. 124 title Title of the work. 125 location A citation for the work. 126 127 """
128 - def __init__(self):
129 self.number = None 130 self.positions = [] 131 self.comments = [] 132 self.references = [] 133 self.authors = [] 134 self.title = [] 135 self.location = []
136 137
138 -def parse(handle):
139 while True: 140 record = _read(handle) 141 if not record: 142 return 143 yield record
144 145
146 -def read(handle):
147 record = _read(handle) 148 if not record: 149 raise ValueError("No SwissProt record found") 150 # We should have reached the end of the record by now 151 # Used to check with handle.read() but that breaks on Python 3.5 152 # due to http://bugs.python.org/issue26499 and could download 153 # lot of data needlessly if there were more records. 154 remainder = handle.readline() 155 if remainder: 156 raise ValueError("More than one SwissProt record found") 157 return record
158 159 160 # Everything below is considered private 161 162
163 -def _read(handle):
164 record = None 165 unread = "" 166 for line in handle: 167 # This is for Python 3 to cope with a binary handle (byte strings), 168 # or a text handle (unicode strings): 169 line = _as_string(line) 170 key, value = line[:2], line[5:].rstrip() 171 if unread: 172 value = unread + " " + value 173 unread = "" 174 if key == '**': 175 # See Bug 2353, some files from the EBI have extra lines 176 # starting "**" (two asterisks/stars). They appear 177 # to be unofficial automated annotations. e.g. 178 # ** 179 # ** ################# INTERNAL SECTION ################## 180 # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 181 pass 182 elif key == 'ID': 183 record = Record() 184 _read_id(record, line) 185 _sequence_lines = [] 186 elif key == 'AC': 187 accessions = [word for word in value.rstrip(";").split("; ")] 188 record.accessions.extend(accessions) 189 elif key == 'DT': 190 _read_dt(record, line) 191 elif key == 'DE': 192 record.description.append(value.strip()) 193 elif key == 'GN': 194 if record.gene_name: 195 record.gene_name += " " 196 record.gene_name += value 197 elif key == 'OS': 198 record.organism.append(value) 199 elif key == 'OG': 200 record.organelle += line[5:] 201 elif key == 'OC': 202 cols = [col for col in value.rstrip(";.").split("; ")] 203 record.organism_classification.extend(cols) 204 elif key == 'OX': 205 _read_ox(record, line) 206 elif key == 'OH': 207 _read_oh(record, line) 208 elif key == 'RN': 209 reference = Reference() 210 _read_rn(reference, value) 211 record.references.append(reference) 212 elif key == 'RP': 213 assert record.references, "RP: missing RN" 214 record.references[-1].positions.append(value) 215 elif key == 'RC': 216 assert record.references, "RC: missing RN" 217 reference = record.references[-1] 218 unread = _read_rc(reference, value) 219 elif key == 'RX': 220 assert record.references, "RX: missing RN" 221 reference = record.references[-1] 222 _read_rx(reference, value) 223 elif key == 'RL': 224 assert record.references, "RL: missing RN" 225 reference = record.references[-1] 226 reference.location.append(value) 227 # In UniProt release 1.12 of 6/21/04, there is a new RG 228 # (Reference Group) line, which references a group instead of 229 # an author. Each block must have at least 1 RA or RG line. 230 elif key == 'RA': 231 assert record.references, "RA: missing RN" 232 reference = record.references[-1] 233 reference.authors.append(value) 234 elif key == 'RG': 235 assert record.references, "RG: missing RN" 236 reference = record.references[-1] 237 reference.authors.append(value) 238 elif key == "RT": 239 assert record.references, "RT: missing RN" 240 reference = record.references[-1] 241 reference.title.append(value) 242 elif key == 'CC': 243 _read_cc(record, line) 244 elif key == 'DR': 245 _read_dr(record, value) 246 elif key == 'PE': 247 # TODO - Record this information? 248 pass 249 elif key == 'KW': 250 _read_kw(record, value) 251 elif key == 'FT': 252 _read_ft(record, line) 253 elif key == 'SQ': 254 cols = value.split() 255 assert len(cols) == 7, "I don't understand SQ line %s" % line 256 # Do more checking here? 257 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 258 elif key == ' ': 259 _sequence_lines.append(value.replace(" ", "").rstrip()) 260 elif key == '//': 261 # Join multiline data into one string 262 record.description = " ".join(record.description) 263 record.organism = " ".join(record.organism) 264 record.organelle = record.organelle.rstrip() 265 for reference in record.references: 266 reference.authors = " ".join(reference.authors).rstrip(";") 267 reference.title = " ".join(reference.title).rstrip(";") 268 if reference.title.startswith('"') and reference.title.endswith('"'): 269 reference.title = reference.title[1:-1] # remove quotes 270 reference.location = " ".join(reference.location) 271 record.sequence = "".join(_sequence_lines) 272 return record 273 else: 274 raise ValueError("Unknown keyword '%s' found" % key) 275 if record: 276 raise ValueError("Unexpected end of stream.")
277 278
279 -def _read_id(record, line):
280 cols = line[5:].split() 281 # Prior to release 51, included with MoleculeType: 282 # ID EntryName DataClass; MoleculeType; SequenceLength AA. 283 # 284 # Newer files lack the MoleculeType: 285 # ID EntryName DataClass; SequenceLength AA. 286 if len(cols) == 5: 287 record.entry_name = cols[0] 288 record.data_class = cols[1].rstrip(";") 289 record.molecule_type = cols[2].rstrip(";") 290 record.sequence_length = int(cols[3]) 291 elif len(cols) == 4: 292 record.entry_name = cols[0] 293 record.data_class = cols[1].rstrip(";") 294 record.molecule_type = None 295 record.sequence_length = int(cols[2]) 296 else: 297 raise ValueError("ID line has unrecognised format:\n" + line) 298 # check if the data class is one of the allowed values 299 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 300 if record.data_class not in allowed: 301 raise ValueError("Unrecognized data class %s in line\n%s" % 302 (record.data_class, line)) 303 # molecule_type should be 'PRT' for PRoTein 304 # Note that has been removed in recent releases (set to None) 305 if record.molecule_type not in (None, 'PRT'): 306 raise ValueError("Unrecognized molecule type %s in line\n%s" % 307 (record.molecule_type, line))
308 309
310 -def _read_dt(record, line):
311 value = line[5:] 312 uprline = value.upper() 313 cols = value.rstrip().split() 314 if 'CREATED' in uprline \ 315 or 'LAST SEQUENCE UPDATE' in uprline \ 316 or 'LAST ANNOTATION UPDATE' in uprline: 317 # Old style DT line 318 # ================= 319 # e.g. 320 # DT 01-FEB-1995 (Rel. 31, Created) 321 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 322 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 323 # 324 # or: 325 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 326 # ... 327 328 # find where the version information will be located 329 # This is needed for when you have cases like IPI where 330 # the release version is in a different spot: 331 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 332 uprcols = uprline.split() 333 rel_index = -1 334 for index in range(len(uprcols)): 335 if 'REL.' in uprcols[index]: 336 rel_index = index 337 assert rel_index >= 0, \ 338 "Could not find Rel. in DT line: %s" % line 339 version_index = rel_index + 1 340 # get the version information 341 str_version = cols[version_index].rstrip(",") 342 # no version number 343 if str_version == '': 344 version = 0 345 # dot versioned 346 elif '.' in str_version: 347 version = str_version 348 # integer versioned 349 else: 350 version = int(str_version) 351 date = cols[0] 352 353 if 'CREATED' in uprline: 354 record.created = date, version 355 elif 'LAST SEQUENCE UPDATE' in uprline: 356 record.sequence_update = date, version 357 elif 'LAST ANNOTATION UPDATE' in uprline: 358 record.annotation_update = date, version 359 else: 360 assert False, "Shouldn't reach this line!" 361 elif 'INTEGRATED INTO' in uprline \ 362 or 'SEQUENCE VERSION' in uprline \ 363 or 'ENTRY VERSION' in uprline: 364 # New style DT line 365 # ================= 366 # As of UniProt Knowledgebase release 7.0 (including 367 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 368 # format of the DT lines and the version information 369 # in them was changed - the release number was dropped. 370 # 371 # For more information see bug 1948 and 372 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 373 # 374 # e.g. 375 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 376 # DT 15-OCT-2001, sequence version 3. 377 # DT 01-APR-2004, entry version 14. 378 # 379 # This is a new style DT line... 380 381 # The date should be in string cols[1] 382 # Get the version number if there is one. 383 # For the three DT lines above: 0, 3, 14 384 try: 385 version = int(cols[-1]) 386 except ValueError: 387 version = 0 388 date = cols[0].rstrip(",") 389 390 # Re-use the historical property names, even though 391 # the meaning has changed slighty: 392 if "INTEGRATED" in uprline: 393 record.created = date, version 394 elif 'SEQUENCE VERSION' in uprline: 395 record.sequence_update = date, version 396 elif 'ENTRY VERSION' in uprline: 397 record.annotation_update = date, version 398 else: 399 assert False, "Shouldn't reach this line!" 400 else: 401 raise ValueError("I don't understand the date line %s" % line)
402 403
404 -def _read_ox(record, line):
405 # The OX line used to be in the simple format: 406 # OX DESCRIPTION=ID[, ID]...; 407 # If there are too many id's to fit onto a line, then the ID's 408 # continue directly onto the next line, e.g. 409 # OX DESCRIPTION=ID[, ID]... 410 # OX ID[, ID]...; 411 # Currently, the description is always "NCBI_TaxID". 412 # To parse this, I need to check to see whether I'm at the 413 # first line. If I am, grab the description and make sure 414 # it's an NCBI ID. Then, grab all the id's. 415 # 416 # As of the 2014-10-01 release, there may be an evidence code, e.g. 417 # OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1}; 418 # In the short term, we will ignore any evidence codes: 419 line = line.split('{')[0] 420 if record.taxonomy_id: 421 ids = line[5:].rstrip().rstrip(";") 422 else: 423 descr, ids = line[5:].rstrip().rstrip(";").split("=") 424 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 425 record.taxonomy_id.extend(ids.split(', '))
426 427
428 -def _read_oh(record, line):
429 # Line type OH (Organism Host) for viral hosts 430 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 431 line = line[16:].rstrip() 432 assert line[-1] == "." and line.count(";") == 1, line 433 taxid, name = line[:-1].split(";") 434 record.host_taxonomy_id.append(taxid.strip()) 435 record.host_organism.append(name.strip())
436 437
438 -def _read_rn(reference, rn):
439 # This used to be a very simple line with a reference number, e.g. 440 # RN [1] 441 # As of the 2014-10-01 release, there may be an evidence code, e.g. 442 # RN [1] {ECO:0000313|EMBL:AEX14553.1} 443 words = rn.split(None, 1) 444 number = words[0] 445 assert number.startswith('[') and number.endswith(']'), "Missing brackets %s" % number 446 reference.number = int(number[1:-1]) 447 if len(words) > 1: 448 evidence = words[1] 449 assert evidence.startswith('{') and evidence.endswith('}'), "Missing braces %s" % evidence 450 reference.evidence = evidence[1:-1].split('|')
451 452
453 -def _read_rc(reference, value):
454 cols = value.split(';') 455 if value[-1] == ';': 456 unread = "" 457 else: 458 cols, unread = cols[:-1], cols[-1] 459 for col in cols: 460 if not col: # last column will be the empty string 461 return 462 # The token is everything before the first '=' character. 463 i = col.find("=") 464 if i >= 0: 465 token, text = col[:i], col[i + 1:] 466 comment = token.lstrip(), text 467 reference.comments.append(comment) 468 else: 469 comment = reference.comments[-1] 470 comment = "%s %s" % (comment, col) 471 reference.comments[-1] = comment 472 return unread
473 474
475 -def _read_rx(reference, value):
476 # The basic (older?) RX line is of the form: 477 # RX MEDLINE; 85132727. 478 # but there are variants of this that need to be dealt with (see below) 479 480 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 481 # have extraneous information in the RX line. Check for 482 # this and chop it out of the line. 483 # (noticed by katel@worldpath.net) 484 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 485 486 # RX lines can also be used of the form 487 # RX PubMed=9603189; 488 # reported by edvard@farmasi.uit.no 489 # and these can be more complicated like: 490 # RX MEDLINE=95385798; PubMed=7656980; 491 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 492 # We look for these cases first and deal with them 493 warn = False 494 if "=" in value: 495 cols = value.split("; ") 496 cols = [x.strip() for x in cols] 497 cols = [x for x in cols if x] 498 for col in cols: 499 x = col.split("=") 500 if len(x) != 2 or x == ("DOI", "DOI"): 501 warn = True 502 break 503 assert len(x) == 2, "I don't understand RX line %s" % value 504 reference.references.append((x[0], x[1].rstrip(";"))) 505 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 506 else: 507 cols = value.split("; ") 508 # normally we split into the three parts 509 if len(cols) != 2: 510 warn = True 511 else: 512 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 513 if warn: 514 import warnings 515 from Bio import BiopythonParserWarning 516 warnings.warn("Possibly corrupt RX line %r" % value, 517 BiopythonParserWarning)
518 519
520 -def _read_cc(record, line):
521 key, value = line[5:8], line[9:].rstrip() 522 if key == '-!-': # Make a new comment 523 record.comments.append(value) 524 elif key == ' ': # add to the previous comment 525 if not record.comments: 526 # TCMO_STRGA in Release 37 has comment with no topic 527 record.comments.append(value) 528 else: 529 record.comments[-1] += " " + value
530 531
532 -def _read_dr(record, value):
533 cols = value.rstrip(".").split('; ') 534 record.cross_references.append(tuple(cols))
535 536
537 -def _read_kw(record, value):
538 # Old style - semi-colon separated, multi-line. e.g. Q13639.txt 539 # KW Alternative splicing; Cell membrane; Complete proteome; 540 # KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein; 541 # KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer; 542 # KW Transmembrane. 543 # 544 # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt 545 # KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1}; 546 # KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}. 547 # For now to match the XML parser, drop the evidence codes. 548 for value in value.rstrip(";.").split('; '): 549 if value.endswith("}"): 550 # Discard the evidence code 551 value = value.rsplit("{", 1)[0] 552 record.keywords.append(value.strip())
553 554
555 -def _read_ft(record, line):
556 line = line[5:] # get rid of junk in front 557 name = line[0:8].rstrip() 558 try: 559 from_res = int(line[9:15]) 560 except ValueError: 561 from_res = line[9:15].lstrip() 562 try: 563 to_res = int(line[16:22]) 564 except ValueError: 565 to_res = line[16:22].lstrip() 566 # if there is a feature_id (FTId), store it away 567 if line[29:35] == r"/FTId=": 568 ft_id = line[35:70].rstrip()[:-1] 569 description = "" 570 else: 571 ft_id = "" 572 description = line[29:70].rstrip() 573 if not name: # is continuation of last one 574 assert not from_res and not to_res 575 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 576 del record.features[-1] 577 description = ("%s %s" % (old_description, description)).strip() 578 579 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 580 if name == "VARSPLIC": 581 # Remove unwanted spaces in sequences. 582 # During line carryover, the sequences in VARSPLIC can get mangled 583 # with unwanted spaces like: 584 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 585 # We want to check for this case and correct it as it happens. 586 descr_cols = description.split(" -> ") 587 if len(descr_cols) == 2: 588 first_seq, second_seq = descr_cols 589 extra_info = '' 590 # we might have more information at the end of the 591 # second sequence, which should be in parenthesis 592 extra_info_pos = second_seq.find(" (") 593 if extra_info_pos != -1: 594 extra_info = second_seq[extra_info_pos:] 595 second_seq = second_seq[:extra_info_pos] 596 # now clean spaces out of the first and second string 597 first_seq = first_seq.replace(" ", "") 598 second_seq = second_seq.replace(" ", "") 599 # reassemble the description 600 description = first_seq + " -> " + second_seq + extra_info 601 record.features.append((name, from_res, to_res, description, ft_id))
602 603 if __name__ == "__main__": 604 from Bio._utils import run_doctest 605 run_doctest(verbose=0) 606