Package Bio :: Package SeqIO :: Module SwissIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SwissIO

  1  # Copyright 2006-2013 by Peter Cock. 
  2  # Revisions copyright 2008-2009 by Michiel de Hoon. 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format. 
 10   
 11  You are expected to use this module via the Bio.SeqIO functions. 
 12  See also the Bio.SwissProt module which offers more than just accessing 
 13  the sequences as SeqRecord objects. 
 14   
 15  See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format. 
 16  """ 
 17   
 18  from __future__ import print_function 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqRecord 
 22  from Bio import Alphabet 
 23  from Bio import SeqFeature 
 24  from Bio import SwissProt 
 25   
 26  __docformat__ = "restructuredtext en" 
 27   
 28   
29 -def _make_position(location_string, offset=0):
30 """Turn a Swiss location position into a SeqFeature position object (PRIVATE). 31 32 An offset of -1 is used with a start location to make it pythonic. 33 """ 34 if location_string == "?": 35 return SeqFeature.UnknownPosition() 36 # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0. 37 try: 38 return SeqFeature.ExactPosition(max(0, offset + int(location_string))) 39 except ValueError: 40 pass 41 if location_string.startswith("<"): 42 try: 43 return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:]))) 44 except ValueError: 45 pass 46 elif location_string.startswith(">"): # e.g. ">13" 47 try: 48 return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:]))) 49 except ValueError: 50 pass 51 elif location_string.startswith("?"): # e.g. "?22" 52 try: 53 return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:]))) 54 except ValueError: 55 pass 56 raise NotImplementedError("Cannot parse location '%s'" % location_string)
57 58
59 -def _make_seqfeature(name, from_res, to_res, description, ft_id):
60 """Construct SeqFeature from feature data from parser (PRIVATE).""" 61 loc = SeqFeature.FeatureLocation(_make_position(from_res, -1), 62 _make_position(to_res, 0)) 63 if not ft_id: 64 ft_id = "<unknown id>" # The default in SeqFeature object 65 return SeqFeature.SeqFeature(loc, type=name, id=ft_id, 66 qualifiers={"description": description})
67 68
69 -def SwissIterator(handle):
70 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects. 71 72 Every section from the ID line to the terminating // becomes 73 a single SeqRecord with associated annotation and features. 74 75 This parser is for the flat file "swiss" format as used by: 76 - Swiss-Prot aka SwissProt 77 - TrEMBL 78 - UniProtKB aka UniProt Knowledgebase 79 80 For consistency with BioPerl and EMBOSS we call this the "swiss" 81 format. See also the SeqIO support for "uniprot-xml" format. 82 """ 83 swiss_records = SwissProt.parse(handle) 84 for swiss_record in swiss_records: 85 # Convert the SwissProt record to a SeqRecord 86 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein) 87 record = SeqRecord.SeqRecord(seq, 88 id=swiss_record.accessions[0], 89 name=swiss_record.entry_name, 90 description=swiss_record.description, 91 features=[_make_seqfeature(*f) for f 92 in swiss_record.features], 93 ) 94 record.description = swiss_record.description 95 for cross_reference in swiss_record.cross_references: 96 if len(cross_reference) < 2: 97 continue 98 database, accession = cross_reference[:2] 99 dbxref = "%s:%s" % (database, accession) 100 if dbxref not in record.dbxrefs: 101 record.dbxrefs.append(dbxref) 102 annotations = record.annotations 103 annotations['accessions'] = swiss_record.accessions 104 if swiss_record.created: 105 annotations['date'] = swiss_record.created[0] 106 if swiss_record.sequence_update: 107 annotations[ 108 'date_last_sequence_update'] = swiss_record.sequence_update[0] 109 if swiss_record.annotation_update: 110 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0] 111 if swiss_record.gene_name: 112 annotations['gene_name'] = swiss_record.gene_name 113 annotations['organism'] = swiss_record.organism.rstrip(".") 114 annotations['taxonomy'] = swiss_record.organism_classification 115 annotations['ncbi_taxid'] = swiss_record.taxonomy_id 116 if swiss_record.host_organism: 117 annotations['organism_host'] = swiss_record.host_organism 118 if swiss_record.host_taxonomy_id: 119 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id 120 if swiss_record.comments: 121 annotations['comment'] = "\n".join(swiss_record.comments) 122 if swiss_record.references: 123 annotations['references'] = [] 124 for reference in swiss_record.references: 125 feature = SeqFeature.Reference() 126 feature.comment = " ".join("%s=%s;" % k_v for k_v in reference.comments) 127 for key, value in reference.references: 128 if key == 'PubMed': 129 feature.pubmed_id = value 130 elif key == 'MEDLINE': 131 feature.medline_id = value 132 elif key == 'DOI': 133 pass 134 elif key == 'AGRICOLA': 135 pass 136 else: 137 raise ValueError( 138 "Unknown key %s found in references" % key) 139 feature.authors = reference.authors 140 feature.title = reference.title 141 feature.journal = reference.location 142 annotations['references'].append(feature) 143 if swiss_record.keywords: 144 record.annotations['keywords'] = swiss_record.keywords 145 yield record
146 147 if __name__ == "__main__": 148 print("Quick self test...") 149 150 example_filename = "../../Tests/SwissProt/sp008" 151 152 import os 153 if not os.path.isfile(example_filename): 154 print("Missing test file %s" % example_filename) 155 else: 156 # Try parsing it! 157 with open(example_filename) as handle: 158 records = SwissIterator(handle) 159 for record in records: 160 print(record.name) 161 print(record.id) 162 print(record.annotations['keywords']) 163 print(repr(record.annotations['organism'])) 164 print(str(record.seq)[:20] + "...") 165 for f in record.features: 166 print(f) 167