Package Bio :: Package SeqIO :: Module SwissIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.SwissIO

  1  # Copyright 2006-2013 by Peter Cock. 
  2  # Revisions copyright 2008-2009 by Michiel de Hoon. 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format. 
 10   
 11  You are expected to use this module via the Bio.SeqIO functions. 
 12  See also the Bio.SwissProt module which offers more than just accessing 
 13  the sequences as SeqRecord objects. 
 14   
 15  See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format. 
 16  """ 
 17   
 18  from __future__ import print_function 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqRecord 
 22  from Bio import Alphabet 
 23  from Bio import SeqFeature 
 24  from Bio import SwissProt 
 25   
 26   
27 -def _make_position(location_string, offset=0):
28 """Turn a Swiss location position into a SeqFeature position object (PRIVATE). 29 30 An offset of -1 is used with a start location to make it pythonic. 31 """ 32 if location_string == "?": 33 return SeqFeature.UnknownPosition() 34 # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0. 35 try: 36 return SeqFeature.ExactPosition(max(0, offset + int(location_string))) 37 except ValueError: 38 pass 39 if location_string.startswith("<"): 40 try: 41 return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:]))) 42 except ValueError: 43 pass 44 elif location_string.startswith(">"): # e.g. ">13" 45 try: 46 return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:]))) 47 except ValueError: 48 pass 49 elif location_string.startswith("?"): # e.g. "?22" 50 try: 51 return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:]))) 52 except ValueError: 53 pass 54 raise NotImplementedError("Cannot parse location '%s'" % location_string)
55 56
57 -def _make_seqfeature(name, from_res, to_res, description, ft_id):
58 """Construct SeqFeature from feature data from parser (PRIVATE).""" 59 loc = SeqFeature.FeatureLocation(_make_position(from_res, -1), 60 _make_position(to_res, 0)) 61 if not ft_id: 62 ft_id = "<unknown id>" # The default in SeqFeature object 63 return SeqFeature.SeqFeature(loc, type=name, id=ft_id, 64 qualifiers={"description": description})
65 66
67 -def SwissIterator(handle):
68 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects. 69 70 Every section from the ID line to the terminating // becomes 71 a single SeqRecord with associated annotation and features. 72 73 This parser is for the flat file "swiss" format as used by: 74 - Swiss-Prot aka SwissProt 75 - TrEMBL 76 - UniProtKB aka UniProt Knowledgebase 77 78 For consistency with BioPerl and EMBOSS we call this the "swiss" 79 format. See also the SeqIO support for "uniprot-xml" format. 80 81 Rather than calling it directly, you are expected to use this 82 parser via Bio.SeqIO.parse(..., format="swiss") instead. 83 """ 84 swiss_records = SwissProt.parse(handle) 85 for swiss_record in swiss_records: 86 # Convert the SwissProt record to a SeqRecord 87 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein) 88 record = SeqRecord.SeqRecord(seq, 89 id=swiss_record.accessions[0], 90 name=swiss_record.entry_name, 91 description=swiss_record.description, 92 features=[_make_seqfeature(*f) for f 93 in swiss_record.features], 94 ) 95 record.description = swiss_record.description 96 for cross_reference in swiss_record.cross_references: 97 if len(cross_reference) < 2: 98 continue 99 database, accession = cross_reference[:2] 100 dbxref = "%s:%s" % (database, accession) 101 if dbxref not in record.dbxrefs: 102 record.dbxrefs.append(dbxref) 103 annotations = record.annotations 104 annotations['accessions'] = swiss_record.accessions 105 if swiss_record.protein_existence: 106 annotations['protein_existence'] = swiss_record.protein_existence 107 if swiss_record.created: 108 annotations['date'] = swiss_record.created[0] 109 annotations['sequence_version'] = swiss_record.created[1] 110 if swiss_record.sequence_update: 111 annotations[ 112 'date_last_sequence_update'] = swiss_record.sequence_update[0] 113 annotations['sequence_version'] = swiss_record.sequence_update[1] 114 if swiss_record.annotation_update: 115 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0] 116 annotations['entry_version'] = swiss_record.annotation_update[1] 117 if swiss_record.gene_name: 118 annotations['gene_name'] = swiss_record.gene_name 119 annotations['organism'] = swiss_record.organism.rstrip(".") 120 annotations['taxonomy'] = swiss_record.organism_classification 121 annotations['ncbi_taxid'] = swiss_record.taxonomy_id 122 if swiss_record.host_organism: 123 annotations['organism_host'] = swiss_record.host_organism 124 if swiss_record.host_taxonomy_id: 125 annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id 126 if swiss_record.comments: 127 annotations['comment'] = "\n".join(swiss_record.comments) 128 if swiss_record.references: 129 annotations['references'] = [] 130 for reference in swiss_record.references: 131 feature = SeqFeature.Reference() 132 feature.comment = " ".join("%s=%s;" % k_v for k_v in reference.comments) 133 for key, value in reference.references: 134 if key == 'PubMed': 135 feature.pubmed_id = value 136 elif key == 'MEDLINE': 137 feature.medline_id = value 138 elif key == 'DOI': 139 pass 140 elif key == 'AGRICOLA': 141 pass 142 else: 143 raise ValueError( 144 "Unknown key %s found in references" % key) 145 feature.authors = reference.authors 146 feature.title = reference.title 147 feature.journal = reference.location 148 annotations['references'].append(feature) 149 if swiss_record.keywords: 150 record.annotations['keywords'] = swiss_record.keywords 151 yield record
152