Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Code to work with the sprotXX.dat file from SwissProt. 
  6   
  7  http://www.expasy.ch/sprot/sprot-top.html 
  8   
  9  Tested with: 
 10  Release 56.9, 03-March-2009. 
 11   
 12   
 13  Classes: 
 14   - Record             Holds SwissProt data. 
 15   - Reference          Holds reference data from a SwissProt record. 
 16   
 17  Functions: 
 18   - read               Read one SwissProt record 
 19   - parse              Read multiple SwissProt records 
 20   
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  from Bio._py3k import _as_string 
 26   
 27   
28 -class Record(object):
29 """Holds information from a SwissProt record. 30 31 Attributes: 32 - entry_name Name of this entry, e.g. RL1_ECOLI. 33 - data_class Either 'STANDARD' or 'PRELIMINARY'. 34 - molecule_type Type of molecule, 'PRT', 35 - sequence_length Number of residues. 36 - accessions List of the accession numbers, e.g. ['P00321'] 37 - created A tuple of (date, release). 38 - sequence_update A tuple of (date, release). 39 - annotation_update A tuple of (date, release). 40 - description Free-format description. 41 - gene_name Gene name. See userman.txt for description. 42 - organism The source of the sequence. 43 - organelle The origin of the sequence. 44 - organism_classification The taxonomy classification. List of strings. 45 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 46 - taxonomy_id A list of NCBI taxonomy id's. 47 - host_organism A list of names of the hosts of a virus, if any. 48 - host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 49 - references List of Reference objects. 50 - comments List of strings. 51 - cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 52 - keywords List of the keywords. 53 - features List of tuples (key name, from, to, description). 54 from and to can be either integers for the residue 55 numbers, '<', '>', or '?' 56 - protein_existence Numerical value describing the evidence for the existence of the protein. 57 - seqinfo tuple of (length, molecular weight, CRC32 value) 58 - sequence The sequence. 59 60 Example: 61 62 >>> import Bio.SwissProt as sp 63 >>> example_filename = "SwissProt/sp008" 64 >>> with open(example_filename) as handle: 65 ... records = sp.parse(handle) 66 ... for record in records: 67 ... print(record.entry_name) 68 ... print(",".join(record.accessions)) 69 ... print(record.keywords) 70 ... print(repr(record.organism)) 71 ... print(record.sequence[:20] + "...") 72 ... 73 1A02_HUMAN 74 P01892,P06338,P30514,P30444,P30445,P30446,Q29680,Q29899,Q95352,Q29837,Q95380 75 ['MHC I', 'Transmembrane', 'Glycoprotein', 'Signal', 'Polymorphism', '3D-structure'] 76 'Homo sapiens (Human).' 77 MAVMAPRTLVLLLSGALALT... 78 79 """ 80
81 - def __init__(self):
82 self.entry_name = None 83 self.data_class = None 84 self.molecule_type = None 85 self.sequence_length = None 86 87 self.accessions = [] 88 self.created = None 89 self.sequence_update = None 90 self.annotation_update = None 91 92 self.description = [] 93 self.gene_name = '' 94 self.organism = [] 95 self.organelle = '' 96 self.organism_classification = [] 97 self.taxonomy_id = [] 98 self.host_organism = [] 99 self.host_taxonomy_id = [] 100 self.references = [] 101 self.comments = [] 102 self.cross_references = [] 103 self.keywords = [] 104 self.features = [] 105 self.protein_existence = '' 106 107 self.seqinfo = None 108 self.sequence = ''
109 110
111 -class Reference(object):
112 """Holds information from one reference in a SwissProt entry. 113 114 Attribues: 115 - number Number of reference in an entry. 116 - evidence Evidence code. List of strings. 117 - positions Describes extent of work. List of strings. 118 - comments Comments. List of (token, text). 119 - references References. List of (dbname, identifier). 120 - authors The authors of the work. 121 - title Title of the work. 122 - location A citation for the work. 123 124 """ 125
126 - def __init__(self):
127 self.number = None 128 self.positions = [] 129 self.comments = [] 130 self.references = [] 131 self.authors = [] 132 self.title = [] 133 self.location = []
134 135
136 -def parse(handle):
137 while True: 138 record = _read(handle) 139 if not record: 140 return 141 yield record
142 143
144 -def read(handle):
145 record = _read(handle) 146 if not record: 147 raise ValueError("No SwissProt record found") 148 # We should have reached the end of the record by now 149 # Used to check with handle.read() but that breaks on Python 3.5 150 # due to http://bugs.python.org/issue26499 and could download 151 # lot of data needlessly if there were more records. 152 remainder = handle.readline() 153 if remainder: 154 raise ValueError("More than one SwissProt record found") 155 return record
156 157 158 # Everything below is considered private 159 160
161 -def _read(handle):
162 record = None 163 unread = "" 164 for line in handle: 165 # This is for Python 3 to cope with a binary handle (byte strings), 166 # or a text handle (unicode strings): 167 line = _as_string(line) 168 key, value = line[:2], line[5:].rstrip() 169 if unread: 170 value = unread + " " + value 171 unread = "" 172 if key == '**': 173 # See Bug 2353, some files from the EBI have extra lines 174 # starting "**" (two asterisks/stars). They appear 175 # to be unofficial automated annotations. e.g. 176 # ** 177 # ** ################# INTERNAL SECTION ################## 178 # **HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 179 pass 180 elif key == 'ID': 181 record = Record() 182 _read_id(record, line) 183 _sequence_lines = [] 184 elif key == 'AC': 185 accessions = [word for word in value.rstrip(";").split("; ")] 186 record.accessions.extend(accessions) 187 elif key == 'DT': 188 _read_dt(record, line) 189 elif key == 'DE': 190 record.description.append(value.strip()) 191 elif key == 'GN': 192 if record.gene_name: 193 record.gene_name += " " 194 record.gene_name += value 195 elif key == 'OS': 196 record.organism.append(value) 197 elif key == 'OG': 198 record.organelle += line[5:] 199 elif key == 'OC': 200 cols = [col for col in value.rstrip(";.").split("; ")] 201 record.organism_classification.extend(cols) 202 elif key == 'OX': 203 _read_ox(record, line) 204 elif key == 'OH': 205 _read_oh(record, line) 206 elif key == 'RN': 207 reference = Reference() 208 _read_rn(reference, value) 209 record.references.append(reference) 210 elif key == 'RP': 211 assert record.references, "RP: missing RN" 212 record.references[-1].positions.append(value) 213 elif key == 'RC': 214 assert record.references, "RC: missing RN" 215 reference = record.references[-1] 216 unread = _read_rc(reference, value) 217 elif key == 'RX': 218 assert record.references, "RX: missing RN" 219 reference = record.references[-1] 220 _read_rx(reference, value) 221 elif key == 'RL': 222 assert record.references, "RL: missing RN" 223 reference = record.references[-1] 224 reference.location.append(value) 225 # In UniProt release 1.12 of 6/21/04, there is a new RG 226 # (Reference Group) line, which references a group instead of 227 # an author. Each block must have at least 1 RA or RG line. 228 elif key == 'RA': 229 assert record.references, "RA: missing RN" 230 reference = record.references[-1] 231 reference.authors.append(value) 232 elif key == 'RG': 233 assert record.references, "RG: missing RN" 234 reference = record.references[-1] 235 reference.authors.append(value) 236 elif key == "RT": 237 assert record.references, "RT: missing RN" 238 reference = record.references[-1] 239 reference.title.append(value) 240 elif key == 'CC': 241 _read_cc(record, line) 242 elif key == 'DR': 243 _read_dr(record, value) 244 elif key == 'PE': 245 _read_pe(record, value) 246 elif key == 'KW': 247 _read_kw(record, value) 248 elif key == 'FT': 249 _read_ft(record, line) 250 elif key == 'SQ': 251 cols = value.split() 252 assert len(cols) == 7, "I don't understand SQ line %s" % line 253 # Do more checking here? 254 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 255 elif key == ' ': 256 _sequence_lines.append(value.replace(" ", "").rstrip()) 257 elif key == '//': 258 # Join multiline data into one string 259 record.description = " ".join(record.description) 260 record.organism = " ".join(record.organism) 261 record.organelle = record.organelle.rstrip() 262 for reference in record.references: 263 reference.authors = " ".join(reference.authors).rstrip(";") 264 reference.title = " ".join(reference.title).rstrip(";") 265 if reference.title.startswith('"') and reference.title.endswith('"'): 266 reference.title = reference.title[1:-1] # remove quotes 267 reference.location = " ".join(reference.location) 268 record.sequence = "".join(_sequence_lines) 269 return record 270 else: 271 raise ValueError("Unknown keyword '%s' found" % key) 272 if record: 273 raise ValueError("Unexpected end of stream.")
274 275
276 -def _read_id(record, line):
277 cols = line[5:].split() 278 # Prior to release 51, included with MoleculeType: 279 # ID EntryName DataClass; MoleculeType; SequenceLength AA. 280 # 281 # Newer files lack the MoleculeType: 282 # ID EntryName DataClass; SequenceLength AA. 283 if len(cols) == 5: 284 record.entry_name = cols[0] 285 record.data_class = cols[1].rstrip(";") 286 record.molecule_type = cols[2].rstrip(";") 287 record.sequence_length = int(cols[3]) 288 elif len(cols) == 4: 289 record.entry_name = cols[0] 290 record.data_class = cols[1].rstrip(";") 291 record.molecule_type = None 292 record.sequence_length = int(cols[2]) 293 else: 294 raise ValueError("ID line has unrecognised format:\n" + line) 295 # check if the data class is one of the allowed values 296 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 297 if record.data_class not in allowed: 298 raise ValueError("Unrecognized data class %s in line\n%s" % 299 (record.data_class, line)) 300 # molecule_type should be 'PRT' for PRoTein 301 # Note that has been removed in recent releases (set to None) 302 if record.molecule_type not in (None, 'PRT'): 303 raise ValueError("Unrecognized molecule type %s in line\n%s" % 304 (record.molecule_type, line))
305 306
307 -def _read_dt(record, line):
308 value = line[5:] 309 uprline = value.upper() 310 cols = value.rstrip().split() 311 if 'CREATED' in uprline \ 312 or 'LAST SEQUENCE UPDATE' in uprline \ 313 or 'LAST ANNOTATION UPDATE' in uprline: 314 # Old style DT line 315 # ================= 316 # e.g. 317 # DT 01-FEB-1995 (Rel. 31, Created) 318 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 319 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 320 # 321 # or: 322 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 323 # ... 324 325 # find where the version information will be located 326 # This is needed for when you have cases like IPI where 327 # the release version is in a different spot: 328 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 329 uprcols = uprline.split() 330 rel_index = -1 331 for index in range(len(uprcols)): 332 if 'REL.' in uprcols[index]: 333 rel_index = index 334 assert rel_index >= 0, \ 335 "Could not find Rel. in DT line: %s" % line 336 version_index = rel_index + 1 337 # get the version information 338 str_version = cols[version_index].rstrip(",") 339 # no version number 340 if str_version == '': 341 version = 0 342 # dot versioned 343 elif '.' in str_version: 344 version = str_version 345 # integer versioned 346 else: 347 version = int(str_version) 348 date = cols[0] 349 350 if 'CREATED' in uprline: 351 record.created = date, version 352 elif 'LAST SEQUENCE UPDATE' in uprline: 353 record.sequence_update = date, version 354 elif 'LAST ANNOTATION UPDATE' in uprline: 355 record.annotation_update = date, version 356 else: 357 assert False, "Shouldn't reach this line!" 358 elif 'INTEGRATED INTO' in uprline \ 359 or 'SEQUENCE VERSION' in uprline \ 360 or 'ENTRY VERSION' in uprline: 361 # New style DT line 362 # ================= 363 # As of UniProt Knowledgebase release 7.0 (including 364 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 365 # format of the DT lines and the version information 366 # in them was changed - the release number was dropped. 367 # 368 # For more information see bug 1948 and 369 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 370 # 371 # e.g. 372 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 373 # DT 15-OCT-2001, sequence version 3. 374 # DT 01-APR-2004, entry version 14. 375 # 376 # This is a new style DT line... 377 378 # The date should be in string cols[1] 379 # Get the version number if there is one. 380 # For the three DT lines above: 0, 3, 14 381 try: 382 version = 0 383 for s in cols[-1].split('.'): 384 if s.isdigit(): 385 version = int(s) 386 except ValueError: 387 version = 0 388 date = cols[0].rstrip(",") 389 390 # Re-use the historical property names, even though 391 # the meaning has changed slighty: 392 if "INTEGRATED" in uprline: 393 record.created = date, version 394 elif 'SEQUENCE VERSION' in uprline: 395 record.sequence_update = date, version 396 elif 'ENTRY VERSION' in uprline: 397 record.annotation_update = date, version 398 else: 399 assert False, "Shouldn't reach this line!" 400 else: 401 raise ValueError("I don't understand the date line %s" % line)
402 403
404 -def _read_ox(record, line):
405 # The OX line used to be in the simple format: 406 # OX DESCRIPTION=ID[, ID]...; 407 # If there are too many id's to fit onto a line, then the ID's 408 # continue directly onto the next line, e.g. 409 # OX DESCRIPTION=ID[, ID]... 410 # OX ID[, ID]...; 411 # Currently, the description is always "NCBI_TaxID". 412 # To parse this, I need to check to see whether I'm at the 413 # first line. If I am, grab the description and make sure 414 # it's an NCBI ID. Then, grab all the id's. 415 # 416 # As of the 2014-10-01 release, there may be an evidence code, e.g. 417 # OX NCBI_TaxID=418404 {ECO:0000313|EMBL:AEX14553.1}; 418 # In the short term, we will ignore any evidence codes: 419 line = line.split('{')[0] 420 if record.taxonomy_id: 421 ids = line[5:].rstrip().rstrip(";") 422 else: 423 descr, ids = line[5:].rstrip().rstrip(";").split("=") 424 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 425 record.taxonomy_id.extend(ids.split(', '))
426 427
428 -def _read_oh(record, line):
429 # Line type OH (Organism Host) for viral hosts 430 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 431 line = line[16:].rstrip() 432 assert line[-1] == "." and line.count(";") == 1, line 433 taxid, name = line[:-1].split(";") 434 record.host_taxonomy_id.append(taxid.strip()) 435 record.host_organism.append(name.strip())
436 437
438 -def _read_rn(reference, rn):
439 # This used to be a very simple line with a reference number, e.g. 440 # RN [1] 441 # As of the 2014-10-01 release, there may be an evidence code, e.g. 442 # RN [1] {ECO:0000313|EMBL:AEX14553.1} 443 words = rn.split(None, 1) 444 number = words[0] 445 assert number.startswith('[') and number.endswith(']'), "Missing brackets %s" % number 446 reference.number = int(number[1:-1]) 447 if len(words) > 1: 448 evidence = words[1] 449 assert evidence.startswith('{') and evidence.endswith('}'), "Missing braces %s" % evidence 450 reference.evidence = evidence[1:-1].split('|')
451 452
453 -def _read_rc(reference, value):
454 cols = value.split(';') 455 if value[-1] == ';': 456 unread = "" 457 else: 458 cols, unread = cols[:-1], cols[-1] 459 for col in cols: 460 if not col: # last column will be the empty string 461 return 462 # The token is everything before the first '=' character. 463 i = col.find("=") 464 if i >= 0: 465 token, text = col[:i], col[i + 1:] 466 comment = token.lstrip(), text 467 reference.comments.append(comment) 468 else: 469 comment = reference.comments[-1] 470 comment = "%s %s" % (comment, col) 471 reference.comments[-1] = comment 472 return unread
473 474
475 -def _read_rx(reference, value):
476 # The basic (older?) RX line is of the form: 477 # RX MEDLINE; 85132727. 478 # but there are variants of this that need to be dealt with (see below) 479 480 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 481 # have extraneous information in the RX line. Check for 482 # this and chop it out of the line. 483 # (noticed by katel@worldpath.net) 484 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 485 486 # RX lines can also be used of the form 487 # RX PubMed=9603189; 488 # reported by edvard@farmasi.uit.no 489 # and these can be more complicated like: 490 # RX MEDLINE=95385798; PubMed=7656980; 491 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 492 # We look for these cases first and deal with them 493 warn = False 494 if "=" in value: 495 cols = value.split("; ") 496 cols = [x.strip() for x in cols] 497 cols = [x for x in cols if x] 498 for col in cols: 499 x = col.split("=") 500 if len(x) != 2 or x == ("DOI", "DOI"): 501 warn = True 502 break 503 assert len(x) == 2, "I don't understand RX line %s" % value 504 reference.references.append((x[0], x[1].rstrip(";"))) 505 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 506 else: 507 cols = value.split("; ") 508 # normally we split into the three parts 509 if len(cols) != 2: 510 warn = True 511 else: 512 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 513 if warn: 514 import warnings 515 from Bio import BiopythonParserWarning 516 warnings.warn("Possibly corrupt RX line %r" % value, 517 BiopythonParserWarning)
518 519
520 -def _read_cc(record, line):
521 key, value = line[5:8], line[9:].rstrip() 522 if key == '-!-': # Make a new comment 523 record.comments.append(value) 524 elif key == ' ': # add to the previous comment 525 if not record.comments: 526 # TCMO_STRGA in Release 37 has comment with no topic 527 record.comments.append(value) 528 else: 529 record.comments[-1] += " " + value
530 531
532 -def _read_dr(record, value):
533 cols = value.rstrip(".").split('; ') 534 record.cross_references.append(tuple(cols))
535 536
537 -def _read_pe(record, value):
538 pe = value.split(":") 539 record.protein_existence = int(pe[0])
540 541
542 -def _read_kw(record, value):
543 # Old style - semi-colon separated, multi-line. e.g. Q13639.txt 544 # KW Alternative splicing; Cell membrane; Complete proteome; 545 # KW Disulfide bond; Endosome; G-protein coupled receptor; Glycoprotein; 546 # KW Lipoprotein; Membrane; Palmitate; Polymorphism; Receptor; Transducer; 547 # KW Transmembrane. 548 # 549 # New style as of 2014-10-01 release with evidence codes, e.g. H2CNN8.txt 550 # KW Monooxygenase {ECO:0000313|EMBL:AEX14553.1}; 551 # KW Oxidoreductase {ECO:0000313|EMBL:AEX14553.1}. 552 # For now to match the XML parser, drop the evidence codes. 553 for value in value.rstrip(";.").split('; '): 554 if value.endswith("}"): 555 # Discard the evidence code 556 value = value.rsplit("{", 1)[0] 557 record.keywords.append(value.strip())
558 559
560 -def _read_ft(record, line):
561 line = line[5:] # get rid of junk in front 562 name = line[0:8].rstrip() 563 try: 564 from_res = int(line[9:15]) 565 except ValueError: 566 from_res = line[9:15].lstrip() 567 try: 568 to_res = int(line[16:22]) 569 except ValueError: 570 to_res = line[16:22].lstrip() 571 # if there is a feature_id (FTId), store it away 572 if line[29:35] == r"/FTId=": 573 ft_id = line[35:70].rstrip()[:-1] 574 description = "" 575 else: 576 ft_id = "" 577 description = line[29:70].rstrip() 578 if not name: # is continuation of last one 579 assert not from_res and not to_res 580 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 581 del record.features[-1] 582 description = ("%s %s" % (old_description, description)).strip() 583 584 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 585 if name == "VARSPLIC": 586 # Remove unwanted spaces in sequences. 587 # During line carryover, the sequences in VARSPLIC can get mangled 588 # with unwanted spaces like: 589 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 590 # We want to check for this case and correct it as it happens. 591 descr_cols = description.split(" -> ") 592 if len(descr_cols) == 2: 593 first_seq, second_seq = descr_cols 594 extra_info = '' 595 # we might have more information at the end of the 596 # second sequence, which should be in parenthesis 597 extra_info_pos = second_seq.find(" (") 598 if extra_info_pos != -1: 599 extra_info = second_seq[extra_info_pos:] 600 second_seq = second_seq[:extra_info_pos] 601 # now clean spaces out of the first and second string 602 first_seq = first_seq.replace(" ", "") 603 second_seq = second_seq.replace(" ", "") 604 # reassemble the description 605 description = first_seq + " -> " + second_seq + extra_info 606 record.features.append((name, from_res, to_res, description, ft_id))
607 608 609 if __name__ == "__main__": 610 from Bio._utils import run_doctest 611 run_doctest(verbose=0) 612