Package Bio :: Package SeqIO :: Module SeqXmlIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.SeqXmlIO

  1  # Copyright 2010 by Thomas Schmitt. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.SeqIO support for the "seqxml" file format, SeqXML. 
  8   
  9  This module is for reading and writing SeqXML format files as 
 10  SeqRecord objects, and is expected to be used via the Bio.SeqIO API. 
 11   
 12  SeqXML is a lightweight XML format which is supposed to be an alternative for 
 13  FASTA files. For more information see http://www.seqXML.org and Schmitt et al 
 14  (2011), https://doi.org/10.1093/bib/bbr025 
 15  """ 
 16   
 17  from __future__ import print_function 
 18   
 19  from xml.sax.saxutils import XMLGenerator 
 20  from xml.sax.xmlreader import AttributesImpl 
 21  from xml.dom import pulldom 
 22  from xml.sax import SAXParseException 
 23   
 24  from Bio._py3k import range 
 25  from Bio._py3k import basestring 
 26   
 27  from Bio import Alphabet 
 28  from Bio.Seq import Seq 
 29  from Bio.Seq import UnknownSeq 
 30  from Bio.SeqRecord import SeqRecord 
 31  from .Interfaces import SequentialSequenceWriter 
 32   
 33   
class XMLRecordIterator(object):
    """Base class for building iterators for record style XML formats.

    It is assumed that all information for one record can be found within a
    record element or above. Two kinds of hook methods are looked up when the
    start tag of an element is reached. To receive only the attributes of an
    element before its end tag is reached implement _attr_TAGNAME.
    To get an element and its children as a DOM tree implement _elem_TAGNAME.
    Everything that is part of the DOM tree will not trigger any further
    method calls.
    """

    def __init__(self, handle, recordTag, namespace=None):
        """Create the object and initialize the XML parser.

        Arguments:
         - handle - passed to xml.dom.pulldom.parse to obtain the event stream
         - recordTag - local name of the element that delimits one record
         - namespace - namespace URI an element must have to be considered
           (None by default)

        """
        self._events = pulldom.parse(handle)
        self._namespace = namespace
        self._recordTag = recordTag

    # TODO: Implement __next__ in order for Python to treat this class as
    # an iterator and not just as an iterable. The SequenceIterator API
    # expects base implementation of __iter__ to call __next__ internally.

    def __iter__(self):
        """Iterate over the records in the XML file.

        A fresh SeqRecord is started at every start tag of the record element
        and yielded when the matching end tag is seen. A parse error reported
        at line 1, column 0 (or line 1, column 1 on Jython) is treated as an
        empty file and ends the iteration silently; any other parse error is
        re-raised.
        """
        record = None
        try:
            for event, node in self._events:
                if event == "START_ELEMENT":
                    if node.namespaceURI != self._namespace:
                        continue
                    tag = node.localName

                    if tag == self._recordTag:
                        # start a new, still empty, SeqRecord
                        record = SeqRecord('', id='')

                    # hook that only wants the attributes
                    attr_hook = "_attr_" + tag
                    if hasattr(self, attr_hook):
                        getattr(self, attr_hook)(self._attributes(node), record)

                    # hook that wants the element as a DOM tree
                    elem_hook = "_elem_" + tag
                    if hasattr(self, elem_hook):
                        # pull the element and everything nested in it
                        # into a DOM tree before handing it over
                        self._events.expandNode(node)
                        node.normalize()
                        getattr(self, elem_hook)(node, record)

                elif event == "END_ELEMENT":
                    if (node.namespaceURI == self._namespace
                            and node.localName == self._recordTag):
                        yield record

        except SAXParseException as err:
            if err.getLineNumber() == 1 and err.getColumnNumber() == 0:
                # empty file
                return
            import os
            if (err.getLineNumber() == 1 and err.getColumnNumber() == 1
                    and os.name == "java"):
                # empty file, see http://bugs.jython.org/issue1774
                return
            raise

    def _attributes(self, node):
        """Return the attributes of a DOM node as dictionary (PRIVATE)."""
        attr_map = node.attributes
        result = {}
        for index in range(attr_map.length):
            attribute = attr_map.item(index)
            result[attribute.name] = attribute.value
        return result
102 103
class SeqXmlIterator(XMLRecordIterator):
    """Breaks seqXML file into SeqRecords.

    Assumes valid seqXML; please validate beforehand.
    """

    def __init__(self, handle):
        """Create the object.

        ``handle`` is passed on to XMLRecordIterator together with the
        record tag "entry".
        """
        XMLRecordIterator.__init__(self, handle, "entry")

        # Document level metadata, filled in by _attr_seqXML.
        self._source = None
        self._source_version = None
        self._version = None
        self._speciesName = None
        self._ncbiTaxId = None

    def _attr_seqXML(self, attr_dict, record):
        """Parse the document metadata (PRIVATE).

        Only attributes that are present are stored; ``record`` is unused.
        """
        if "source" in attr_dict:
            self._source = attr_dict["source"]
        if "sourceVersion" in attr_dict:
            self._source_version = attr_dict["sourceVersion"]
        # The key tested must be the key read: checking for "version" and
        # then reading "seqXMLversion" either raised KeyError or never
        # stored the version at all.
        if "seqXMLversion" in attr_dict:
            self._version = attr_dict["seqXMLversion"]
        if "ncbiTaxID" in attr_dict:
            self._ncbiTaxId = attr_dict["ncbiTaxID"]
        if "speciesName" in attr_dict:
            self._speciesName = attr_dict["speciesName"]

    def _attr_property(self, attr_dict, record):
        """Parse key value pair properties and store them as annotations (PRIVATE).

        A property seen once is stored as a plain value (None if it has no
        "value" attribute); repeated properties are collected in a list.
        Raises ValueError if the "name" attribute is missing.
        """
        if "name" not in attr_dict:
            raise ValueError("Malformed property element.")

        name = attr_dict["name"]
        value = attr_dict.get("value")
        annotations = record.annotations

        if name not in annotations:
            annotations[name] = value
        elif isinstance(annotations[name], list):
            annotations[name].append(value)
        else:
            # second occurrence: promote the single value to a list
            annotations[name] = [annotations[name], value]

    def _attr_species(self, attr_dict, record):
        """Parse the species information (PRIVATE).

        Raises ValueError unless both "name" and "ncbiTaxID" are given.
        """
        if "name" not in attr_dict or "ncbiTaxID" not in attr_dict:
            raise ValueError("Malformed species element!")

        # the keywords for the species annotation are taken from SwissIO
        record.annotations["organism"] = attr_dict["name"]
        # TODO - Should have been a list to match SwissProt parser:
        record.annotations["ncbi_taxid"] = attr_dict["ncbiTaxID"]

    def _attr_entry(self, attr_dict, record):
        """Set new entry with id and the optional entry source (PRIVATE).

        Raises ValueError if the "id" attribute is missing.
        """
        if "id" not in attr_dict:
            raise ValueError("Malformed entry! Identifier is missing.")

        record.id = attr_dict["id"]
        # an entry level source wins over the document level one
        if "source" in attr_dict:
            record.annotations["source"] = attr_dict["source"]
        elif self._source is not None:
            record.annotations["source"] = self._source

        # initialize entry with global species definition
        # the keywords for the species annotation are taken from SwissIO
        if self._ncbiTaxId is not None:
            record.annotations["ncbi_taxid"] = self._ncbiTaxId
        if self._speciesName is not None:
            record.annotations["organism"] = self._speciesName

    def _parse_sequence(self, node, alphabet):
        """Return the text of a sequence element as a Seq (PRIVATE).

        Raises ValueError if the element has no child or its first child
        holds an empty string.
        """
        if not (node.hasChildNodes() and len(node.firstChild.data) > 0):
            raise ValueError("Sequence length should be greater than 0.")
        return Seq(node.firstChild.data, alphabet)

    def _elem_DNAseq(self, node, record):
        """Parse DNA sequence (PRIVATE)."""
        record.seq = self._parse_sequence(node, Alphabet.generic_dna)

    def _elem_RNAseq(self, node, record):
        """Parse RNA sequence (PRIVATE)."""
        record.seq = self._parse_sequence(node, Alphabet.generic_rna)

    def _elem_AAseq(self, node, record):
        """Parse protein sequence (PRIVATE)."""
        record.seq = self._parse_sequence(node, Alphabet.generic_protein)

    def _elem_description(self, node, record):
        """Parse the description (PRIVATE).

        An empty description element leaves the record untouched.
        """
        if node.hasChildNodes() and len(node.firstChild.data) > 0:
            record.description = node.firstChild.data

    def _attr_DBRef(self, attr_dict, record):
        """Parse a database cross reference (PRIVATE).

        The reference is stored as "source:id" in record.dbxrefs unless it is
        already there. Raises ValueError if "source" or "id" is missing.
        """
        if "source" not in attr_dict or "id" not in attr_dict:
            raise ValueError("Invalid DB cross reference.")

        dbxref = "%s:%s" % (attr_dict["source"], attr_dict["id"])
        if dbxref not in record.dbxrefs:
            record.dbxrefs.append(dbxref)
210 211
212 -class SeqXmlWriter(SequentialSequenceWriter):
213 """Writes SeqRecords into seqXML file. 214 215 SeqXML requires the sequence alphabet be explicitly RNA, DNA or protein, 216 i.e. an instance or subclass of Bio.Alphabet.RNAAlphabet, 217 Bio.Alphabet.DNAAlphabet or Bio.Alphabet.ProteinAlphabet. 218 """ 219
def __init__(self, handle, source=None, source_version=None,
             species=None, ncbiTaxId=None):
    """Create the writer and start the XML document.

    Arguments:
     - handle - output handle, given to both the base class and the
       XMLGenerator (which is set up for "utf-8")
     - source, source_version, species, ncbiTaxId - optional document
       level metadata, stored as given for later use by write_header

    """
    SequentialSequenceWriter.__init__(self, handle)

    # Document level metadata; nothing is validated at this point.
    self.ncbiTaxId = ncbiTaxId
    self.species = species
    self.source_version = source_version
    self.source = source

    self.xml_generator = XMLGenerator(handle, "utf-8")
    self.xml_generator.startDocument()
231
def write_header(self):
    """Write root node with document metadata.

    Opens the "seqXML" root element. The schema attributes are always
    written; source, sourceVersion, speciesName and ncbiTaxID are added
    only when the corresponding value is not None.

    Raises TypeError if the species is not a string or the NCBI taxonomy
    identifier is neither a string nor an int.
    """
    SequentialSequenceWriter.write_header(self)

    attrs = {"xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
             "xsi:noNamespaceSchemaLocation": "http://www.seqxml.org/0.4/seqxml.xsd",
             "seqXMLversion": "0.4"}

    if self.source is not None:
        attrs["source"] = self.source
    if self.source_version is not None:
        attrs["sourceVersion"] = self.source_version
    if self.species is not None:
        if not isinstance(self.species, basestring):
            raise TypeError("species should be of type string")
        attrs["speciesName"] = self.species
    if self.ncbiTaxId is not None:
        taxid = self.ncbiTaxId
        if not isinstance(taxid, (basestring, int)):
            raise TypeError("ncbiTaxID should be of type string or int")
        # An int is explicitly accepted above, but XMLGenerator can only
        # quote text attribute values, so convert before handing it over.
        if not isinstance(taxid, basestring):
            taxid = str(taxid)
        attrs["ncbiTaxID"] = taxid

    self.xml_generator.startElement("seqXML", AttributesImpl(attrs))
254
def write_record(self, record):
    """Write one record.

    Emits an "entry" element carrying the record id (and the record's own
    source when it differs from the writer's), followed by species,
    description, sequence, database references and properties.

    Raises ValueError if the id is empty or the "<unknown id>" placeholder,
    and TypeError if the id or the source annotation is not a string.
    """
    identifier = record.id
    if not identifier or identifier == "<unknown id>":
        raise ValueError("SeqXML requires identifier")
    if not isinstance(identifier, basestring):
        raise TypeError("Identifier should be of type string")

    entry_attrs = {"id": identifier}

    annotations = record.annotations
    if "source" in annotations and self.source != annotations["source"]:
        local_source = annotations["source"]
        if not isinstance(local_source, basestring):
            raise TypeError("source should be of type string")
        entry_attrs["source"] = local_source

    self.xml_generator.startElement("entry", AttributesImpl(entry_attrs))
    # The children are written in this fixed order.
    for emit in (self._write_species,
                 self._write_description,
                 self._write_seq,
                 self._write_dbxrefs,
                 self._write_properties):
        emit(record)
    self.xml_generator.endElement("entry")
277 284
285 - def _write_species(self, record):
286 """Write the species if given (PRIVATE).""" 287 local_ncbi_taxid = None 288 if "ncbi_taxid" in record.annotations: 289 local_ncbi_taxid = record.annotations["ncbi_taxid"] 290 if isinstance(local_ncbi_taxid, list): 291 # SwissProt parser uses a list (which could cope with chimeras) 292 if len(local_ncbi_taxid) == 1: 293 local_ncbi_taxid = local_ncbi_taxid[0] 294 elif len(local_ncbi_taxid) == 0: 295 local_ncbi_taxid = None 296 else: 297 ValueError('Multiple entries for record.annotations["ncbi_taxid"], %r' 298 % local_ncbi_taxid) 299 if "organism" in record.annotations and local_ncbi_taxid: 300 local_org = record.annotations["organism"] 301 302 if not isinstance(local_org, basestring): 303 raise TypeError("organism should be of type string") 304 305 if not isinstance(local_ncbi_taxid, (basestring, int)): 306 raise TypeError("ncbiTaxID should be of type string or int") 307 308 # The local species definition is only written if it differs from the global species definition 309 if local_org != self.species or local_ncbi_taxid != self.ncbiTaxId: 310 311 attr = {"name": local_org, 312 "ncbiTaxID": local_ncbi_taxid} 313 self.xml_generator.startElement( 314 "species", AttributesImpl(attr)) 315 self.xml_generator.endElement("species")
316
317 - def _write_description(self, record):
318 """Write the description if given (PRIVATE).""" 319 if record.description: 320 321 if not isinstance(record.description, basestring): 322 raise TypeError("Description should be of type string") 323 324 description = record.description 325 if description == "<unknown description>": 326 description = "" 327 328 if len(record.description) > 0: 329 self.xml_generator.startElement( 330 "description", AttributesImpl({})) 331 self.xml_generator.characters(description) 332 self.xml_generator.endElement("description")
333
def _write_seq(self, record):
    """Write the sequence (PRIVATE).

    Note that SeqXML requires a DNA, RNA or protein alphabet.

    Raises TypeError for an UnknownSeq, and ValueError for an empty
    sequence or an alphabet that is none of the three supported kinds.
    """
    if isinstance(record.seq, UnknownSeq):
        raise TypeError(
            "Sequence type is UnknownSeq but SeqXML requires sequence")

    sequence = str(record.seq)
    if len(sequence) == 0:
        raise ValueError("The sequence length should be greater than 0")

    # Get the base alphabet (underneath any Gapped or StopCodon encoding)
    base_alphabet = Alphabet._get_base_alphabet(record.seq.alphabet)

    # Checked in this order; the first matching class decides the tag.
    element_for_alphabet = (
        (Alphabet.RNAAlphabet, "RNAseq"),
        (Alphabet.DNAAlphabet, "DNAseq"),
        (Alphabet.ProteinAlphabet, "AAseq"),
    )
    for alphabet_class, element_name in element_for_alphabet:
        if isinstance(base_alphabet, alphabet_class):
            break
    else:
        raise ValueError("Need a DNA, RNA or Protein alphabet")

    generator = self.xml_generator
    generator.startElement(element_name, AttributesImpl({}))
    generator.characters(sequence)
    generator.endElement(element_name)
362
363 - def _write_dbxrefs(self, record):
364 """Write all database cross references (PRIVATE).""" 365 if record.dbxrefs is not None: 366 367 for dbxref in record.dbxrefs: 368 369 if not isinstance(dbxref, basestring): 370 raise TypeError("dbxrefs should be of type list of string") 371 if dbxref.find(':') < 1: 372 raise ValueError("dbxrefs should be in the form ['source:id', 'source:id' ]") 373 374 dbsource, dbid = dbxref.split(':', 1) 375 376 attr = {"source": dbsource, "id": dbid} 377 self.xml_generator.startElement("DBRef", AttributesImpl(attr)) 378 self.xml_generator.endElement("DBRef")
379
380 - def _write_properties(self, record):
381 """Write all annotations that are key value pairs with values of a primitive type or list of primitive types (PRIVATE).""" 382 for key, value in record.annotations.items(): 383 384 if key not in ("organism", "ncbi_taxid", "source"): 385 386 if value is None: 387 388 attr = {"name": key} 389 self.xml_generator.startElement( 390 "property", AttributesImpl(attr)) 391 self.xml_generator.endElement("property") 392 393 elif isinstance(value, list): 394 395 for v in value: 396 if isinstance(value, (int, float, basestring)): 397 attr = {"name": key, "value": v} 398 self.xml_generator.startElement( 399 "property", AttributesImpl(attr)) 400 self.xml_generator.endElement("property") 401 402 elif isinstance(value, (int, float, basestring)): 403 404 attr = {"name": key, "value": str(value)} 405 self.xml_generator.startElement( 406 "property", AttributesImpl(attr)) 407 self.xml_generator.endElement("property")
408