Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code to work with the sprotXX.dat file from 
  7  SwissProt. 
  8  http://www.expasy.ch/sprot/sprot-top.html 
  9   
 10  Tested with: 
 11  Release 56.9, 03-March-2009. 
 12   
 13   
 14  Classes: 
 15  Record             Holds SwissProt data. 
 16  Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19  read               Read one SwissProt record 
 20  parse              Read multiple SwissProt records 
 21   
 22  """ 
 23   
 24  from Bio._py3k import _as_string 
 25   
 26   
27 -class Record(object):
28 """Holds information from a SwissProt record. 29 30 Members: 31 entry_name Name of this entry, e.g. RL1_ECOLI. 32 data_class Either 'STANDARD' or 'PRELIMINARY'. 33 molecule_type Type of molecule, 'PRT', 34 sequence_length Number of residues. 35 36 accessions List of the accession numbers, e.g. ['P00321'] 37 created A tuple of (date, release). 38 sequence_update A tuple of (date, release). 39 annotation_update A tuple of (date, release). 40 41 description Free-format description. 42 gene_name Gene name. See userman.txt for description. 43 organism The source of the sequence. 44 organelle The origin of the sequence. 45 organism_classification The taxonomy classification. List of strings. 46 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 47 taxonomy_id A list of NCBI taxonomy id's. 48 host_organism A list of names of the hosts of a virus, if any. 49 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 50 references List of Reference objects. 51 comments List of strings. 52 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 53 keywords List of the keywords. 54 features List of tuples (key name, from, to, description). 55 from and to can be either integers for the residue 56 numbers, '<', '>', or '?' 57 58 seqinfo tuple of (length, molecular weight, CRC32 value) 59 sequence The sequence. 60 61 """
62 - def __init__(self):
63 self.entry_name = None 64 self.data_class = None 65 self.molecule_type = None 66 self.sequence_length = None 67 68 self.accessions = [] 69 self.created = None 70 self.sequence_update = None 71 self.annotation_update = None 72 73 self.description = [] 74 self.gene_name = '' 75 self.organism = [] 76 self.organelle = '' 77 self.organism_classification = [] 78 self.taxonomy_id = [] 79 self.host_organism = [] 80 self.host_taxonomy_id = [] 81 self.references = [] 82 self.comments = [] 83 self.cross_references = [] 84 self.keywords = [] 85 self.features = [] 86 87 self.seqinfo = None 88 self.sequence = ''
89 90
91 -class Reference(object):
92 """Holds information from one reference in a SwissProt entry. 93 94 Members: 95 number Number of reference in an entry. 96 positions Describes extent of work. List of strings. 97 comments Comments. List of (token, text). 98 references References. List of (dbname, identifier). 99 authors The authors of the work. 100 title Title of the work. 101 location A citation for the work. 102 103 """
104 - def __init__(self):
105 self.number = None 106 self.positions = [] 107 self.comments = [] 108 self.references = [] 109 self.authors = [] 110 self.title = [] 111 self.location = []
112 113
114 -def parse(handle):
115 while True: 116 record = _read(handle) 117 if not record: 118 return 119 yield record
120 121
122 -def read(handle):
123 record = _read(handle) 124 if not record: 125 raise ValueError("No SwissProt record found") 126 # We should have reached the end of the record by now 127 remainder = handle.read() 128 if remainder: 129 raise ValueError("More than one SwissProt record found") 130 return record
131 132 133 # Everything below is considered private 134 135
136 -def _read(handle):
137 record = None 138 unread = "" 139 for line in handle: 140 #This is for Python 3 to cope with a binary handle (byte strings), 141 #or a text handle (unicode strings): 142 line = _as_string(line) 143 key, value = line[:2], line[5:].rstrip() 144 if unread: 145 value = unread + " " + value 146 unread = "" 147 if key == '**': 148 #See Bug 2353, some files from the EBI have extra lines 149 #starting "**" (two asterisks/stars). They appear 150 #to be unofficial automated annotations. e.g. 151 #** 152 #** ################# INTERNAL SECTION ################## 153 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 154 pass 155 elif key == 'ID': 156 record = Record() 157 _read_id(record, line) 158 _sequence_lines = [] 159 elif key == 'AC': 160 accessions = [word for word in value.rstrip(";").split("; ")] 161 record.accessions.extend(accessions) 162 elif key == 'DT': 163 _read_dt(record, line) 164 elif key == 'DE': 165 record.description.append(value.strip()) 166 elif key == 'GN': 167 if record.gene_name: 168 record.gene_name += " " 169 record.gene_name += value 170 elif key == 'OS': 171 record.organism.append(value) 172 elif key == 'OG': 173 record.organelle += line[5:] 174 elif key == 'OC': 175 cols = [col for col in value.rstrip(";.").split("; ")] 176 record.organism_classification.extend(cols) 177 elif key == 'OX': 178 _read_ox(record, line) 179 elif key == 'OH': 180 _read_oh(record, line) 181 elif key == 'RN': 182 reference = Reference() 183 _read_rn(reference, value) 184 record.references.append(reference) 185 elif key == 'RP': 186 assert record.references, "RP: missing RN" 187 record.references[-1].positions.append(value) 188 elif key == 'RC': 189 assert record.references, "RC: missing RN" 190 reference = record.references[-1] 191 unread = _read_rc(reference, value) 192 elif key == 'RX': 193 assert record.references, "RX: missing RN" 194 reference = record.references[-1] 195 _read_rx(reference, value) 196 elif key == 'RL': 197 assert record.references, "RL: missing RN" 198 reference = record.references[-1] 199 reference.location.append(value) 200 # In UniProt release 1.12 of 6/21/04, there is a new RG 201 # (Reference Group) line, which references a group instead of 202 # an author. Each block must have at least 1 RA or RG line. 203 elif key == 'RA': 204 assert record.references, "RA: missing RN" 205 reference = record.references[-1] 206 reference.authors.append(value) 207 elif key == 'RG': 208 assert record.references, "RG: missing RN" 209 reference = record.references[-1] 210 reference.authors.append(value) 211 elif key == "RT": 212 assert record.references, "RT: missing RN" 213 reference = record.references[-1] 214 reference.title.append(value) 215 elif key == 'CC': 216 _read_cc(record, line) 217 elif key == 'DR': 218 _read_dr(record, value) 219 elif key == 'PE': 220 #TODO - Record this information? 221 pass 222 elif key == 'KW': 223 cols = value.rstrip(";.").split('; ') 224 record.keywords.extend(cols) 225 elif key == 'FT': 226 _read_ft(record, line) 227 elif key == 'SQ': 228 cols = value.split() 229 assert len(cols) == 7, "I don't understand SQ line %s" % line 230 # Do more checking here? 231 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 232 elif key == ' ': 233 _sequence_lines.append(value.replace(" ", "").rstrip()) 234 elif key == '//': 235 # Join multiline data into one string 236 record.description = " ".join(record.description) 237 record.organism = " ".join(record.organism) 238 record.organelle = record.organelle.rstrip() 239 for reference in record.references: 240 reference.authors = " ".join(reference.authors).rstrip(";") 241 reference.title = " ".join(reference.title).rstrip(";") 242 if reference.title.startswith('"') and reference.title.endswith('"'): 243 reference.title = reference.title[1:-1] # remove quotes 244 reference.location = " ".join(reference.location) 245 record.sequence = "".join(_sequence_lines) 246 return record 247 else: 248 raise ValueError("Unknown keyword '%s' found" % key) 249 if record: 250 raise ValueError("Unexpected end of stream.")
251 252
253 -def _read_id(record, line):
254 cols = line[5:].split() 255 #Prior to release 51, included with MoleculeType: 256 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 257 # 258 #Newer files lack the MoleculeType: 259 #ID EntryName DataClass; SequenceLength AA. 260 if len(cols) == 5: 261 record.entry_name = cols[0] 262 record.data_class = cols[1].rstrip(";") 263 record.molecule_type = cols[2].rstrip(";") 264 record.sequence_length = int(cols[3]) 265 elif len(cols) == 4: 266 record.entry_name = cols[0] 267 record.data_class = cols[1].rstrip(";") 268 record.molecule_type = None 269 record.sequence_length = int(cols[2]) 270 else: 271 raise ValueError("ID line has unrecognised format:\n" + line) 272 # check if the data class is one of the allowed values 273 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 274 if record.data_class not in allowed: 275 raise ValueError("Unrecognized data class %s in line\n%s" % 276 (record.data_class, line)) 277 # molecule_type should be 'PRT' for PRoTein 278 # Note that has been removed in recent releases (set to None) 279 if record.molecule_type not in (None, 'PRT'): 280 raise ValueError("Unrecognized molecule type %s in line\n%s" % 281 (record.molecule_type, line))
282 283
284 -def _read_dt(record, line):
285 value = line[5:] 286 uprline = value.upper() 287 cols = value.rstrip().split() 288 if 'CREATED' in uprline \ 289 or 'LAST SEQUENCE UPDATE' in uprline \ 290 or 'LAST ANNOTATION UPDATE' in uprline: 291 # Old style DT line 292 # ================= 293 # e.g. 294 # DT 01-FEB-1995 (Rel. 31, Created) 295 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 296 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 297 # 298 # or: 299 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 300 # ... 301 302 # find where the version information will be located 303 # This is needed for when you have cases like IPI where 304 # the release verison is in a different spot: 305 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 306 uprcols = uprline.split() 307 rel_index = -1 308 for index in range(len(uprcols)): 309 if 'REL.' in uprcols[index]: 310 rel_index = index 311 assert rel_index >= 0, \ 312 "Could not find Rel. in DT line: %s" % line 313 version_index = rel_index + 1 314 # get the version information 315 str_version = cols[version_index].rstrip(",") 316 # no version number 317 if str_version == '': 318 version = 0 319 # dot versioned 320 elif '.' in str_version: 321 version = str_version 322 # integer versioned 323 else: 324 version = int(str_version) 325 date = cols[0] 326 327 if 'CREATED' in uprline: 328 record.created = date, version 329 elif 'LAST SEQUENCE UPDATE' in uprline: 330 record.sequence_update = date, version 331 elif 'LAST ANNOTATION UPDATE' in uprline: 332 record.annotation_update = date, version 333 else: 334 assert False, "Shouldn't reach this line!" 335 elif 'INTEGRATED INTO' in uprline \ 336 or 'SEQUENCE VERSION' in uprline \ 337 or 'ENTRY VERSION' in uprline: 338 # New style DT line 339 # ================= 340 # As of UniProt Knowledgebase release 7.0 (including 341 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 342 # format of the DT lines and the version information 343 # in them was changed - the release number was dropped. 344 # 345 # For more information see bug 1948 and 346 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 347 # 348 # e.g. 349 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 350 # DT 15-OCT-2001, sequence version 3. 351 # DT 01-APR-2004, entry version 14. 352 # 353 #This is a new style DT line... 354 355 # The date should be in string cols[1] 356 # Get the version number if there is one. 357 # For the three DT lines above: 0, 3, 14 358 try: 359 version = int(cols[-1]) 360 except ValueError: 361 version = 0 362 date = cols[0].rstrip(",") 363 364 # Re-use the historical property names, even though 365 # the meaning has changed slighty: 366 if "INTEGRATED" in uprline: 367 record.created = date, version 368 elif 'SEQUENCE VERSION' in uprline: 369 record.sequence_update = date, version 370 elif 'ENTRY VERSION' in uprline: 371 record.annotation_update = date, version 372 else: 373 assert False, "Shouldn't reach this line!" 374 else: 375 raise ValueError("I don't understand the date line %s" % line)
376 377
378 -def _read_ox(record, line):
379 # The OX line is in the format: 380 # OX DESCRIPTION=ID[, ID]...; 381 # If there are too many id's to fit onto a line, then the ID's 382 # continue directly onto the next line, e.g. 383 # OX DESCRIPTION=ID[, ID]... 384 # OX ID[, ID]...; 385 # Currently, the description is always "NCBI_TaxID". 386 # To parse this, I need to check to see whether I'm at the 387 # first line. If I am, grab the description and make sure 388 # it's an NCBI ID. Then, grab all the id's. 389 if record.taxonomy_id: 390 ids = line[5:].rstrip().rstrip(";") 391 else: 392 descr, ids = line[5:].rstrip().rstrip(";").split("=") 393 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 394 record.taxonomy_id.extend(ids.split(', '))
395 396
397 -def _read_oh(record, line):
398 # Line type OH (Organism Host) for viral hosts 399 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 400 line = line[16:].rstrip() 401 assert line[-1] == "." and line.count(";") == 1, line 402 taxid, name = line[:-1].split(";") 403 record.host_taxonomy_id.append(taxid.strip()) 404 record.host_organism.append(name.strip())
405 406
407 -def _read_rn(reference, rn):
408 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 409 reference.number = int(rn[1:-1])
410 411
412 -def _read_rc(reference, value):
413 cols = value.split(';') 414 if value[-1] == ';': 415 unread = "" 416 else: 417 cols, unread = cols[:-1], cols[-1] 418 for col in cols: 419 if not col: # last column will be the empty string 420 return 421 # The token is everything before the first '=' character. 422 i = col.find("=") 423 if i >= 0: 424 token, text = col[:i], col[i+1:] 425 comment = token.lstrip(), text 426 reference.comments.append(comment) 427 else: 428 comment = reference.comments[-1] 429 comment = "%s %s" % (comment, col) 430 reference.comments[-1] = comment 431 return unread
432 433
434 -def _read_rx(reference, value):
435 # The basic (older?) RX line is of the form: 436 # RX MEDLINE; 85132727. 437 # but there are variants of this that need to be dealt with (see below) 438 439 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 440 # have extraneous information in the RX line. Check for 441 # this and chop it out of the line. 442 # (noticed by katel@worldpath.net) 443 value = value.replace(' [NCBI, ExPASy, Israel, Japan]', '') 444 445 # RX lines can also be used of the form 446 # RX PubMed=9603189; 447 # reported by edvard@farmasi.uit.no 448 # and these can be more complicated like: 449 # RX MEDLINE=95385798; PubMed=7656980; 450 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 451 # We look for these cases first and deal with them 452 warn = False 453 if "=" in value: 454 cols = value.split("; ") 455 cols = [x.strip() for x in cols] 456 cols = [x for x in cols if x] 457 for col in cols: 458 x = col.split("=") 459 if len(x) != 2 or x == ("DOI", "DOI"): 460 warn = True 461 break 462 assert len(x) == 2, "I don't understand RX line %s" % value 463 reference.references.append((x[0], x[1].rstrip(";"))) 464 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 465 else: 466 cols = value.split("; ") 467 # normally we split into the three parts 468 if len(cols) != 2: 469 warn = True 470 else: 471 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip("."))) 472 if warn: 473 import warnings 474 from Bio import BiopythonParserWarning 475 warnings.warn("Possibly corrupt RX line %r" % value, 476 BiopythonParserWarning)
477 478
479 -def _read_cc(record, line):
480 key, value = line[5:8], line[9:].rstrip() 481 if key == '-!-': # Make a new comment 482 record.comments.append(value) 483 elif key == ' ': # add to the previous comment 484 if not record.comments: 485 # TCMO_STRGA in Release 37 has comment with no topic 486 record.comments.append(value) 487 else: 488 record.comments[-1] += " " + value
489 490
491 -def _read_dr(record, value):
492 # Remove the comments at the end of the line 493 i = value.find(' [') 494 if i >= 0: 495 value = value[:i] 496 cols = value.rstrip(".").split('; ') 497 record.cross_references.append(tuple(cols))
498 499
500 -def _read_ft(record, line):
501 line = line[5:] # get rid of junk in front 502 name = line[0:8].rstrip() 503 try: 504 from_res = int(line[9:15]) 505 except ValueError: 506 from_res = line[9:15].lstrip() 507 try: 508 to_res = int(line[16:22]) 509 except ValueError: 510 to_res = line[16:22].lstrip() 511 #if there is a feature_id (FTId), store it away 512 if line[29:35] == r"/FTId=": 513 ft_id = line[35:70].rstrip()[:-1] 514 description = "" 515 else: 516 ft_id = "" 517 description = line[29:70].rstrip() 518 if not name: # is continuation of last one 519 assert not from_res and not to_res 520 name, from_res, to_res, old_description, old_ft_id = record.features[-1] 521 del record.features[-1] 522 description = ("%s %s" % (old_description, description)).strip() 523 524 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 525 if name == "VARSPLIC": 526 # Remove unwanted spaces in sequences. 527 # During line carryover, the sequences in VARSPLIC can get mangled 528 # with unwanted spaces like: 529 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 530 # We want to check for this case and correct it as it happens. 531 descr_cols = description.split(" -> ") 532 if len(descr_cols) == 2: 533 first_seq, second_seq = descr_cols 534 extra_info = '' 535 # we might have more information at the end of the 536 # second sequence, which should be in parenthesis 537 extra_info_pos = second_seq.find(" (") 538 if extra_info_pos != -1: 539 extra_info = second_seq[extra_info_pos:] 540 second_seq = second_seq[:extra_info_pos] 541 # now clean spaces out of the first and second string 542 first_seq = first_seq.replace(" ", "") 543 second_seq = second_seq.replace(" ", "") 544 # reassemble the description 545 description = first_seq + " -> " + second_seq + extra_info 546 record.features.append((name, from_res, to_res, description, ft_id))
547 548 549 if __name__ == "__main__": 550 print "Quick self test..." 551 552 example_filename = "../../Tests/SwissProt/sp008" 553 554 import os 555 if not os.path.isfile(example_filename): 556 print "Missing test file %s" % example_filename 557 else: 558 #Try parsing it! 559 560 handle = open(example_filename) 561 records = parse(handle) 562 for record in records: 563 print record.entry_name 564 print ",".join(record.accessions) 565 print record.keywords 566 print repr(record.organism) 567 print record.sequence[:20] + "..." 568 handle.close() 569