Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2016 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  import warnings 
  37  from Bio import BiopythonWarning 
  38   
  39  from Bio.Seq import UnknownSeq 
  40  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  41  from Bio import Alphabet 
  42  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
  43  from Bio import SeqFeature 
  44   
  45  from Bio._py3k import _is_int_or_long 
  46  from Bio._py3k import basestring 
  47   
  48   
  49  # NOTE 
  50  # ==== 
  51  # The "brains" for parsing GenBank, EMBL and IMGT files (and any 
  52  # other flat file variants from the INSDC in future) is in 
  53  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
  54  # However, all the writing code is in this file. 
  55   
  56   
57 -def GenBankIterator(handle):
58 """Breaks up a Genbank file into SeqRecord objects. 59 60 Every section from the LOCUS line to the terminating // becomes 61 a single SeqRecord with associated annotation and features. 62 63 Note that for genomes or chromosomes, there is typically only 64 one record. 65 66 This gets called internally by Bio.SeqIO for the GenBank file format: 67 68 >>> from Bio import SeqIO 69 >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"): 70 ... print(record.id) 71 ... 72 X55053.1 73 X62281.1 74 M81224.1 75 AJ237582.1 76 L31939.1 77 AF297471.1 78 79 Equivalently, 80 81 >>> with open("GenBank/cor6_6.gb") as handle: 82 ... for record in GenBankIterator(handle): 83 ... print(record.id) 84 ... 85 X55053.1 86 X62281.1 87 M81224.1 88 AJ237582.1 89 L31939.1 90 AF297471.1 91 92 """ 93 # This calls a generator function: 94 return GenBankScanner(debug=0).parse_records(handle)
95 96
97 -def EmblIterator(handle):
98 """Breaks up an EMBL file into SeqRecord objects. 99 100 Every section from the LOCUS line to the terminating // becomes 101 a single SeqRecord with associated annotation and features. 102 103 Note that for genomes or chromosomes, there is typically only 104 one record. 105 106 This gets called internally by Bio.SeqIO for the EMBL file format: 107 108 >>> from Bio import SeqIO 109 >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"): 110 ... print(record.id) 111 ... 112 A00022.1 113 A00028.1 114 A00031.1 115 A00034.1 116 A00060.1 117 A00071.1 118 A00072.1 119 A00078.1 120 CQ797900.1 121 122 Equivalently, 123 124 >>> with open("EMBL/epo_prt_selection.embl") as handle: 125 ... for record in EmblIterator(handle): 126 ... print(record.id) 127 ... 128 A00022.1 129 A00028.1 130 A00031.1 131 A00034.1 132 A00060.1 133 A00071.1 134 A00072.1 135 A00078.1 136 CQ797900.1 137 138 """ 139 # This calls a generator function: 140 return EmblScanner(debug=0).parse_records(handle)
141 142
143 -def ImgtIterator(handle):
144 """Breaks up an IMGT file into SeqRecord objects. 145 146 Every section from the LOCUS line to the terminating // becomes 147 a single SeqRecord with associated annotation and features. 148 149 Note that for genomes or chromosomes, there is typically only 150 one record.""" 151 # This calls a generator function: 152 return _ImgtScanner(debug=0).parse_records(handle)
153 154
155 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
156 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 157 158 Every section from the LOCUS line to the terminating // can contain 159 many CDS features. These are returned as with the stated amino acid 160 translation sequence (if given). 161 """ 162 # This calls a generator function: 163 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
164 165
166 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
167 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 168 169 Every section from the LOCUS line to the terminating // can contain 170 many CDS features. These are returned as with the stated amino acid 171 translation sequence (if given). 172 """ 173 # This calls a generator function: 174 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
175 176
177 -def _insdc_feature_position_string(pos, offset=0):
178 """Build a GenBank/EMBL position string (PRIVATE). 179 180 Use offset=1 to add one to convert a start position from python counting. 181 """ 182 if isinstance(pos, SeqFeature.ExactPosition): 183 return "%i" % (pos.position + offset) 184 elif isinstance(pos, SeqFeature.WithinPosition): 185 return "(%i.%i)" % (pos.position + offset, 186 pos.position + pos.extension + offset) 187 elif isinstance(pos, SeqFeature.BetweenPosition): 188 return "(%i^%i)" % (pos.position + offset, 189 pos.position + pos.extension + offset) 190 elif isinstance(pos, SeqFeature.BeforePosition): 191 return "<%i" % (pos.position + offset) 192 elif isinstance(pos, SeqFeature.AfterPosition): 193 return ">%i" % (pos.position + offset) 194 elif isinstance(pos, SeqFeature.OneOfPosition): 195 return "one-of(%s)" \ 196 % ",".join(_insdc_feature_position_string(p, offset) 197 for p in pos.position_choices) 198 elif isinstance(pos, SeqFeature.AbstractPosition): 199 raise NotImplementedError("Please report this as a bug in Biopython.") 200 else: 201 raise ValueError("Expected a SeqFeature position object.")
202 203
204 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
205 if location.ref: 206 ref = "%s:" % location.ref 207 else: 208 ref = "" 209 assert not location.ref_db 210 if isinstance(location.start, SeqFeature.ExactPosition) \ 211 and isinstance(location.end, SeqFeature.ExactPosition) \ 212 and location.start.position == location.end.position: 213 # Special case, for 12:12 return 12^13 214 # (a zero length slice, meaning the point between two letters) 215 if location.end.position == rec_length: 216 # Very special case, for a between position at the end of a 217 # sequence (used on some circular genomes, Bug 3098) we have 218 # N:N so return N^1 219 return "%s%i^1" % (ref, rec_length) 220 else: 221 return "%s%i^%i" % (ref, location.end.position, 222 location.end.position + 1) 223 if isinstance(location.start, SeqFeature.ExactPosition) \ 224 and isinstance(location.end, SeqFeature.ExactPosition) \ 225 and location.start.position + 1 == location.end.position: 226 # Special case, for 11:12 return 12 rather than 12..12 227 # (a length one slice, meaning a single letter) 228 return "%s%i" % (ref, location.end.position) 229 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 230 or isinstance(location.end, SeqFeature.UnknownPosition): 231 # Special case for features from SwissProt/UniProt files 232 if isinstance(location.start, SeqFeature.UnknownPosition) \ 233 and isinstance(location.end, SeqFeature.UnknownPosition): 234 # warnings.warn("Feature with unknown location", BiopythonWarning) 235 # return "?" 236 raise ValueError("Feature with unknown location") 237 elif isinstance(location.start, SeqFeature.UnknownPosition): 238 # Treat the unknown start position as a BeforePosition 239 return "%s<%i..%s" \ 240 % (ref, 241 location.nofuzzy_end, 242 _insdc_feature_position_string(location.end)) 243 else: 244 # Treat the unknown end position as an AfterPosition 245 return "%s%s..>%i" \ 246 % (ref, 247 _insdc_feature_position_string(location.start, +1), 248 location.nofuzzy_start + 1) 249 else: 250 # Typical case, e.g. 12..15 gets mapped to 11:15 251 return ref \ 252 + _insdc_feature_position_string(location.start, +1) \ 253 + ".." + \ 254 _insdc_feature_position_string(location.end)
255 256
257 -def _insdc_location_string(location, rec_length):
258 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 259 260 There is a choice of how to show joins on the reverse complement strand, 261 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 262 "join(complement(20,100),complement(1,10))" instead (but appears to have 263 now adopted the GenBank convention). Notice that the order of the entries 264 is reversed! This function therefore uses the first form. In this situation 265 we expect the CompoundFeatureLocation and its parts to all be marked as 266 strand == -1, and to be in the order 19:100 then 0:10. 267 """ 268 try: 269 parts = location.parts 270 # CompoundFeatureLocation 271 if location.strand == -1: 272 # Special case, put complement outside the join/order/... and reverse order 273 return "complement(%s(%s))" % (location.operator, 274 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 275 for p in parts[::-1])) 276 else: 277 return "%s(%s)" % (location.operator, 278 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 279 except AttributeError: 280 # Simple FeatureLocation 281 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 282 if location.strand == -1: 283 return "complement(%s)" % loc 284 else: 285 return loc
286 287
288 -class _InsdcWriter(SequentialSequenceWriter):
289 """Base class for GenBank and EMBL writers (PRIVATE).""" 290 MAX_WIDTH = 80 291 QUALIFIER_INDENT = 21 292 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 293 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 294 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 295 "direction", "estimated_length", "mod_base", "number", 296 "rpt_type", "rpt_unit_range", "tag_peptide", 297 "transl_except", "transl_table") 298
299 - def _write_feature_qualifier(self, key, value=None, quote=None):
300 if value is None: 301 # Value-less entry like /pseudo 302 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 303 return 304 # Quick hack with no line wrapping, may be useful for testing: 305 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 306 if quote is None: 307 # Try to mimic unwritten rules about when quotes can be left out: 308 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 309 quote = False 310 else: 311 quote = True 312 if quote: 313 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 314 else: 315 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 316 if len(line) <= self.MAX_WIDTH: 317 self.handle.write(line + "\n") 318 return 319 while line.lstrip(): 320 if len(line) <= self.MAX_WIDTH: 321 self.handle.write(line + "\n") 322 return 323 # Insert line break... 324 for index in range(min(len(line) - 1, self.MAX_WIDTH), 325 self.QUALIFIER_INDENT + 1, -1): 326 if line[index] == " ": 327 break 328 if line[index] != " ": 329 # No nice place to break... 330 index = self.MAX_WIDTH 331 assert index <= self.MAX_WIDTH 332 self.handle.write(line[:index] + "\n") 333 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
334
335 - def _wrap_location(self, location):
336 """Split a feature location into lines (break at commas).""" 337 # TODO - Rewrite this not to recurse! 338 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 339 if len(location) <= length: 340 return location 341 index = location[:length].rfind(",") 342 if index == -1: 343 # No good place to split (!) 344 warnings.warn("Couldn't split location:\n%s" % location, 345 BiopythonWarning) 346 return location 347 return location[:index + 1] + "\n" + \ 348 self.QUALIFIER_INDENT_STR + \ 349 self._wrap_location(location[index + 1:])
350
351 - def _write_feature(self, feature, record_length):
352 """Write a single SeqFeature object to features table.""" 353 assert feature.type, feature 354 location = _insdc_location_string(feature.location, record_length) 355 f_type = feature.type.replace(" ", "_") 356 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 357 + self._wrap_location(location) + "\n" 358 self.handle.write(line) 359 # Now the qualifiers... 360 for key in sorted(feature.qualifiers.keys()): 361 values = feature.qualifiers[key] 362 if isinstance(values, (list, tuple)): 363 for value in values: 364 self._write_feature_qualifier(key, value) 365 else: 366 # String, int, etc - or None for a /pseudo tpy entry 367 self._write_feature_qualifier(key, values)
368
369 - def _get_annotation_str(self, record, key, default=".", just_first=False):
370 """Get an annotation dictionary entry (as a string). 371 372 Some entries are lists, in which case if just_first=True the first entry 373 is returned. If just_first=False (default) this verifies there is only 374 one entry before returning it.""" 375 try: 376 answer = record.annotations[key] 377 except KeyError: 378 return default 379 if isinstance(answer, list): 380 if not just_first: 381 assert len(answer) == 1 382 return str(answer[0]) 383 else: 384 return str(answer)
385
386 - def _split_multi_line(self, text, max_len):
387 """Returns a list of strings. 388 389 Any single words which are too long get returned as a whole line 390 (e.g. URLs) without an exception or warning. 391 """ 392 # TODO - Do the line spliting while preserving white space? 393 text = text.strip() 394 if len(text) <= max_len: 395 return [text] 396 397 words = text.split() 398 text = "" 399 while words and len(text) + 1 + len(words[0]) <= max_len: 400 text += " " + words.pop(0) 401 text = text.strip() 402 # assert len(text) <= max_len 403 answer = [text] 404 while words: 405 text = words.pop(0) 406 while words and len(text) + 1 + len(words[0]) <= max_len: 407 text += " " + words.pop(0) 408 text = text.strip() 409 # assert len(text) <= max_len 410 answer.append(text) 411 assert not words 412 return answer
413
414 - def _split_contig(self, record, max_len):
415 """Returns a list of strings, splits on commas.""" 416 # TODO - Merge this with _write_multi_line method? 417 # It would need the addition of the comma splitting logic... 418 # are there any other cases where that would be sensible? 419 contig = record.annotations.get("contig", "") 420 if isinstance(contig, (list, tuple)): 421 contig = "".join(contig) 422 contig = self.clean(contig) 423 answer = [] 424 while contig: 425 if len(contig) > max_len: 426 # Split lines at the commas 427 pos = contig[:max_len - 1].rfind(",") 428 if pos == -1: 429 raise ValueError("Could not break up CONTIG") 430 text, contig = contig[:pos + 1], contig[pos + 1:] 431 else: 432 text, contig = contig, "" 433 answer.append(text) 434 return answer
435 436
437 -class GenBankWriter(_InsdcWriter):
438 """GenBank writer.""" 439 440 HEADER_WIDTH = 12 441 QUALIFIER_INDENT = 21 442
443 - def _write_single_line(self, tag, text):
444 """Used in the 'header' of each GenBank record.""" 445 assert len(tag) < self.HEADER_WIDTH 446 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 447 if tag: 448 warnings.warn("Annotation %r too long for %r line" % (text, tag), 449 BiopythonWarning) 450 else: 451 # Can't give such a precise warning 452 warnings.warn("Annotation %r too long" % text, BiopythonWarning) 453 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 454 text.replace("\n", " ")))
455
456 - def _write_multi_line(self, tag, text):
457 """Used in the 'header' of each GenBank record.""" 458 # TODO - Do the line spliting while preserving white space? 459 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 460 lines = self._split_multi_line(text, max_len) 461 self._write_single_line(tag, lines[0]) 462 for line in lines[1:]: 463 self._write_single_line("", line)
464
465 - def _write_multi_entries(self, tag, text_list):
466 # used for DBLINK and any similar later line types. 467 # If the list of strings is empty, nothing is written. 468 for i, text in enumerate(text_list): 469 if i == 0: 470 self._write_single_line(tag, text) 471 else: 472 self._write_single_line("", text)
473
474 - def _get_date(self, record):
475 default = "01-JAN-1980" 476 try: 477 date = record.annotations["date"] 478 except KeyError: 479 return default 480 # Cope with a list of one string: 481 if isinstance(date, list) and len(date) == 1: 482 date = date[0] 483 # TODO - allow a Python date object 484 if not isinstance(date, basestring) or len(date) != 11 \ 485 or date[2] != "-" or date[6] != "-" \ 486 or not date[:2].isdigit() or not date[7:].isdigit() \ 487 or int(date[:2]) > 31 \ 488 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 489 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 490 # TODO - Check is a valid date (e.g. not 31 Feb) 491 return default 492 return date
493
494 - def _get_data_division(self, record):
495 try: 496 division = record.annotations["data_file_division"] 497 except KeyError: 498 division = "UNK" 499 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 500 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 501 "GSS", "HTG", "HTC", "ENV", "CON"]: 502 # Good, already GenBank style 503 # PRI - primate sequences 504 # ROD - rodent sequences 505 # MAM - other mammalian sequences 506 # VRT - other vertebrate sequences 507 # INV - invertebrate sequences 508 # PLN - plant, fungal, and algal sequences 509 # BCT - bacterial sequences [plus archea] 510 # VRL - viral sequences 511 # PHG - bacteriophage sequences 512 # SYN - synthetic sequences 513 # UNA - unannotated sequences 514 # EST - EST sequences (expressed sequence tags) 515 # PAT - patent sequences 516 # STS - STS sequences (sequence tagged sites) 517 # GSS - GSS sequences (genome survey sequences) 518 # HTG - HTGS sequences (high throughput genomic sequences) 519 # HTC - HTC sequences (high throughput cDNA sequences) 520 # ENV - Environmental sampling sequences 521 # CON - Constructed sequences 522 # 523 # (plus UNK for unknown) 524 pass 525 else: 526 # See if this is in EMBL style: 527 # Division Code 528 # ----------------- ---- 529 # Bacteriophage PHG - common 530 # Environmental Sample ENV - common 531 # Fungal FUN - map to PLN (plants + fungal) 532 # Human HUM - map to PRI (primates) 533 # Invertebrate INV - common 534 # Other Mammal MAM - common 535 # Other Vertebrate VRT - common 536 # Mus musculus MUS - map to ROD (rodent) 537 # Plant PLN - common 538 # Prokaryote PRO - map to BCT (poor name) 539 # Other Rodent ROD - common 540 # Synthetic SYN - common 541 # Transgenic TGN - ??? map to SYN ??? 542 # Unclassified UNC - map to UNK 543 # Viral VRL - common 544 # 545 # (plus XXX for submiting which we can map to UNK) 546 embl_to_gbk = {"FUN": "PLN", 547 "HUM": "PRI", 548 "MUS": "ROD", 549 "PRO": "BCT", 550 "UNC": "UNK", 551 "XXX": "UNK", 552 } 553 try: 554 division = embl_to_gbk[division] 555 except KeyError: 556 division = "UNK" 557 assert len(division) == 3 558 return division
559
560 - def _get_topology(self, record):
561 """Set the topology to 'circular', 'linear' if defined""" 562 max_topology_len = len("circular") 563 564 topology = self._get_annotation_str(record, "topology", default="") 565 if topology and len(topology) <= max_topology_len: 566 return topology.ljust(max_topology_len) 567 else: 568 return " " * max_topology_len
569
570 - def _write_the_first_line(self, record):
571 """Write the LOCUS line.""" 572 573 locus = record.name 574 if not locus or locus == "<unknown name>": 575 locus = record.id 576 if not locus or locus == "<unknown id>": 577 locus = self._get_annotation_str( 578 record, "accession", just_first=True) 579 if len(locus) > 16: 580 if len(locus) + 1 + len(str(len(record))) > 28: 581 # Locus name and record length to long to squeeze in. 582 raise ValueError("Locus identifier %r is too long" % locus) 583 else: 584 warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning) 585 if len(locus.split()) > 1: 586 # locus could be unicode, and u'with space' versus 'with space' 587 # causes trouble with doctest or print-and-compare tests, so 588 tmp = repr(locus) 589 if tmp.startswith("u'") and tmp.endswith("'"): 590 tmp = tmp[1:] 591 raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp) 592 if len(record) > 99999999999: 593 # Currently GenBank only officially support up to 350000, but 594 # the length field can take eleven digits 595 raise ValueError("Sequence too long!") 596 597 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 598 a = Alphabet._get_base_alphabet(record.seq.alphabet) 599 if not isinstance(a, Alphabet.Alphabet): 600 raise TypeError("Invalid alphabet") 601 elif isinstance(a, Alphabet.ProteinAlphabet): 602 units = "aa" 603 elif isinstance(a, Alphabet.NucleotideAlphabet): 604 units = "bp" 605 else: 606 # Must be something like NucleotideAlphabet or 607 # just the generic Alphabet (default for fasta files) 608 raise ValueError("Need a Nucleotide or Protein alphabet") 609 610 # Get the molecule type 611 # TODO - record this explicitly in the parser? 612 if isinstance(a, Alphabet.ProteinAlphabet): 613 mol_type = "" 614 elif isinstance(a, Alphabet.DNAAlphabet): 615 mol_type = "DNA" 616 elif isinstance(a, Alphabet.RNAAlphabet): 617 mol_type = "RNA" 618 else: 619 # Must be something like NucleotideAlphabet or 620 # just the generic Alphabet (default for fasta files) 621 raise ValueError("Need a DNA, RNA or Protein alphabet") 622 623 topology = self._get_topology(record) 624 625 division = self._get_data_division(record) 626 627 name_length = str(len(record)).rjust(28) 628 name_length = locus + name_length[len(locus):] 629 assert len(name_length) == 28, name_length 630 assert " " in name_length, name_length 631 632 assert len(units) == 2 633 assert len(division) == 3 634 line = "LOCUS %s %s %s %s %s %s\n" \ 635 % (name_length, 636 units, 637 mol_type.ljust(7), 638 topology, 639 division, 640 self._get_date(record)) 641 assert len(line) == 79 + 1, repr(line) # plus one for new line 642 643 # We're bending the rules to allow an identifier over 16 characters 644 # if we can steal spaces from the length field: 645 # assert line[12:28].rstrip() == locus, \ 646 # 'LOCUS line does not contain the locus at the expected position:\n' + line 647 # assert line[28:29] == " " 648 # assert line[29:40].lstrip() == str(len(record)), \ 649 # 'LOCUS line does not contain the length at the expected position:\n' + line 650 assert line[12:40].split() == [locus, str(len(record))], line 651 652 # Tests copied from Bio.GenBank.Scanner 653 assert line[40:44] in [' bp ', ' aa '], \ 654 'LOCUS line does not contain size units at expected position:\n' + \ 655 line 656 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 657 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 658 assert line[47:54].strip() == "" \ 659 or 'DNA' in line[47:54].strip() \ 660 or 'RNA' in line[47:54].strip(), \ 661 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 662 assert line[54:55] == ' ', \ 663 'LOCUS line does not contain space at position 55:\n' + line 664 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 665 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 666 assert line[63:64] == ' ', \ 667 'LOCUS line does not contain space at position 64:\n' + line 668 assert line[67:68] == ' ', \ 669 'LOCUS line does not contain space at position 68:\n' + line 670 assert line[70:71] == '-', \ 671 'LOCUS line does not contain - at position 71 in date:\n' + line 672 assert line[74:75] == '-', \ 673 'LOCUS line does not contain - at position 75 in date:\n' + line 674 675 self.handle.write(line)
676
677 - def _write_references(self, record):
678 number = 0 679 for ref in record.annotations["references"]: 680 if not isinstance(ref, SeqFeature.Reference): 681 continue 682 number += 1 683 data = str(number) 684 # TODO - support more complex record reference locations? 685 if ref.location and len(ref.location) == 1: 686 a = Alphabet._get_base_alphabet(record.seq.alphabet) 687 if isinstance(a, Alphabet.ProteinAlphabet): 688 units = "residues" 689 else: 690 units = "bases" 691 data += " (%s %i to %i)" % (units, 692 ref.location[0].nofuzzy_start + 1, 693 ref.location[0].nofuzzy_end) 694 self._write_single_line("REFERENCE", data) 695 if ref.authors: 696 # We store the AUTHORS data as a single string 697 self._write_multi_line(" AUTHORS", ref.authors) 698 if ref.consrtm: 699 # We store the consortium as a single string 700 self._write_multi_line(" CONSRTM", ref.consrtm) 701 if ref.title: 702 # We store the title as a single string 703 self._write_multi_line(" TITLE", ref.title) 704 if ref.journal: 705 # We store this as a single string - holds the journal name, 706 # volume, year, and page numbers of the citation 707 self._write_multi_line(" JOURNAL", ref.journal) 708 if ref.medline_id: 709 # This line type is obsolete and was removed from the GenBank 710 # flatfile format in April 2005. Should we write it? 711 # Note this has a two space indent: 712 self._write_multi_line(" MEDLINE", ref.medline_id) 713 if ref.pubmed_id: 714 # Note this has a THREE space indent: 715 self._write_multi_line(" PUBMED", ref.pubmed_id) 716 if ref.comment: 717 self._write_multi_line(" REMARK", ref.comment)
718
719 - def _write_comment(self, record):
720 # This is a bit complicated due to the range of possible 721 # ways people might have done their annotation... 722 # Currently the parser uses a single string with newlines. 723 # A list of lines is also reasonable. 724 # A single (long) string is perhaps the most natural of all. 725 # This means we may need to deal with line wrapping. 726 comment = record.annotations["comment"] 727 if isinstance(comment, basestring): 728 lines = comment.split("\n") 729 elif isinstance(comment, (list, tuple)): 730 lines = comment 731 else: 732 raise ValueError("Could not understand comment annotation") 733 self._write_multi_line("COMMENT", lines[0]) 734 for line in lines[1:]: 735 self._write_multi_line("", line)
736
737 - def _write_contig(self, record):
738 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 739 lines = self._split_contig(record, max_len) 740 self._write_single_line("CONTIG", lines[0]) 741 for text in lines[1:]: 742 self._write_single_line("", text)
743
744 - def _write_sequence(self, record):
745 # Loosely based on code from Howard Salis 746 # TODO - Force lower case? 747 LETTERS_PER_LINE = 60 748 SEQUENCE_INDENT = 9 749 750 if isinstance(record.seq, UnknownSeq): 751 # We have already recorded the length, and there is no need 752 # to record a long sequence of NNNNNNN...NNN or whatever. 753 if "contig" in record.annotations: 754 self._write_contig(record) 755 else: 756 self.handle.write("ORIGIN\n") 757 return 758 759 # Catches sequence being None: 760 data = self._get_seq_string(record).lower() 761 seq_len = len(data) 762 self.handle.write("ORIGIN\n") 763 for line_number in range(0, seq_len, LETTERS_PER_LINE): 764 self.handle.write(str(line_number + 1).rjust(SEQUENCE_INDENT)) 765 for words in range(line_number, 766 min(line_number + LETTERS_PER_LINE, seq_len), 10): 767 self.handle.write(" %s" % data[words:words + 10]) 768 self.handle.write("\n")
769
770 - def write_record(self, record):
771 """Write a single record to the output file.""" 772 handle = self.handle 773 self._write_the_first_line(record) 774 775 default = record.id 776 if default.count(".") == 1 and default[default.index(".") + 1:].isdigit(): 777 # Good, looks like accesion.version and not something 778 # else like identifier.start-end 779 default = record.id.split(".", 1)[0] 780 accession = self._get_annotation_str(record, "accession", 781 default, 782 just_first=True) 783 acc_with_version = accession 784 if record.id.startswith(accession + "."): 785 try: 786 acc_with_version = "%s.%i" \ 787 % (accession, 788 int(record.id.split(".", 1)[1])) 789 except ValueError: 790 pass 791 gi = self._get_annotation_str(record, "gi", just_first=True) 792 793 descr = record.description 794 if descr == "<unknown description>": 795 descr = "." 796 self._write_multi_line("DEFINITION", descr) 797 798 self._write_single_line("ACCESSION", accession) 799 if gi != ".": 800 self._write_single_line("VERSION", "%s GI:%s" 801 % (acc_with_version, gi)) 802 else: 803 self._write_single_line("VERSION", "%s" % (acc_with_version)) 804 805 # The NCBI only expect two types of link so far, 806 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 807 # TODO - Filter the dbxrefs list to just these? 808 self._write_multi_entries("DBLINK", record.dbxrefs) 809 810 try: 811 # List of strings 812 # Keywords should be given separated with semi colons, 813 keywords = "; ".join(record.annotations["keywords"]) 814 # with a trailing period: 815 if not keywords.endswith("."): 816 keywords += "." 817 except KeyError: 818 # If no keywords, there should be just a period: 819 keywords = "." 820 self._write_multi_line("KEYWORDS", keywords) 821 822 if "segment" in record.annotations: 823 # Deal with SEGMENT line found only in segmented records, 824 # e.g. AH000819 825 segment = record.annotations["segment"] 826 if isinstance(segment, list): 827 assert len(segment) == 1, segment 828 segment = segment[0] 829 self._write_single_line("SEGMENT", segment) 830 831 self._write_multi_line("SOURCE", 832 self._get_annotation_str(record, "source")) 833 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 834 org = self._get_annotation_str(record, "organism") 835 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 836 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 837 self._write_single_line(" ORGANISM", org) 838 try: 839 # List of strings 840 # Taxonomy should be given separated with semi colons, 841 taxonomy = "; ".join(record.annotations["taxonomy"]) 842 # with a trailing period: 843 if not taxonomy.endswith("."): 844 taxonomy += "." 845 except KeyError: 846 taxonomy = "." 847 self._write_multi_line("", taxonomy) 848 849 if "references" in record.annotations: 850 self._write_references(record) 851 852 if "comment" in record.annotations: 853 self._write_comment(record) 854 855 handle.write("FEATURES Location/Qualifiers\n") 856 rec_length = len(record) 857 for feature in record.features: 858 self._write_feature(feature, rec_length) 859 self._write_sequence(record) 860 handle.write("//\n")
861 862
863 -class EmblWriter(_InsdcWriter):
864 """EMBL writer.""" 865 866 HEADER_WIDTH = 5 867 QUALIFIER_INDENT = 21 868 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 869 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 870 FEATURE_HEADER = "FH Key Location/Qualifiers\n" 871
872 - def _write_contig(self, record):
873 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 874 lines = self._split_contig(record, max_len) 875 for text in lines: 876 self._write_single_line("CO", text)
877
878 - def _write_sequence(self, record):
879 LETTERS_PER_BLOCK = 10 880 BLOCKS_PER_LINE = 6 881 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 882 POSITION_PADDING = 10 883 handle = self.handle # save looking up this multiple times 884 885 if isinstance(record.seq, UnknownSeq): 886 # We have already recorded the length, and there is no need 887 # to record a long sequence of NNNNNNN...NNN or whatever. 888 if "contig" in record.annotations: 889 self._write_contig(record) 890 else: 891 # TODO - Can the sequence just be left out as in GenBank files? 892 handle.write("SQ \n") 893 return 894 895 # Catches sequence being None 896 data = self._get_seq_string(record).lower() 897 seq_len = len(data) 898 899 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 900 a = Alphabet._get_base_alphabet(record.seq.alphabet) 901 if isinstance(a, Alphabet.DNAAlphabet): 902 # TODO - What if we have RNA? 903 a_count = data.count('A') + data.count('a') 904 c_count = data.count('C') + data.count('c') 905 g_count = data.count('G') + data.count('g') 906 t_count = data.count('T') + data.count('t') 907 other = seq_len - (a_count + c_count + g_count + t_count) 908 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 909 % (seq_len, a_count, c_count, g_count, t_count, other)) 910 else: 911 handle.write("SQ \n") 912 913 for line_number in range(0, seq_len // LETTERS_PER_LINE): 914 handle.write(" ") # Just four, not five 915 for block in range(BLOCKS_PER_LINE): 916 index = LETTERS_PER_LINE * line_number + \ 917 LETTERS_PER_BLOCK * block 918 handle.write((" %s" % data[index:index + LETTERS_PER_BLOCK])) 919 handle.write(str((line_number + 1) * 920 LETTERS_PER_LINE).rjust(POSITION_PADDING)) 921 handle.write("\n") 922 if seq_len % LETTERS_PER_LINE: 923 # Final (partial) line 924 line_number = (seq_len // LETTERS_PER_LINE) 925 handle.write(" ") # Just four, not five 926 for block in range(BLOCKS_PER_LINE): 927 index = LETTERS_PER_LINE * line_number + \ 928 LETTERS_PER_BLOCK * block 929 handle.write( 930 (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(11)) 931 handle.write(str(seq_len).rjust(POSITION_PADDING)) 932 handle.write("\n")
933
934 - def _write_single_line(self, tag, text):
935 assert len(tag) == 2 936 line = tag + " " + text 937 if len(text) > self.MAX_WIDTH: 938 warnings.warn("Line %r too long" % line, BiopythonWarning) 939 self.handle.write(line + "\n")
940
941 - def _write_multi_line(self, tag, text):
942 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 943 lines = self._split_multi_line(text, max_len) 944 for line in lines: 945 self._write_single_line(tag, line)
946
947 - def _write_the_first_lines(self, record):
948 """Write the ID and AC lines.""" 949 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 950 version = "SV " + record.id.rsplit(".", 1)[1] 951 accession = self._get_annotation_str(record, "accession", 952 record.id.rsplit(".", 1)[0], 953 just_first=True) 954 else: 955 version = "" 956 accession = self._get_annotation_str(record, "accession", 957 record.id, 958 just_first=True) 959 960 if ";" in accession: 961 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 962 % repr(str(accession))) 963 if " " in accession: 964 # This is out of practicallity... might it be allowed? 965 raise ValueError("Cannot have spaces in EMBL accession, %s" 966 % repr(str(accession))) 967 968 topology = self._get_annotation_str(record, "topology", default="") 969 970 # Get the molecule type 971 # TODO - record this explicitly in the parser? 972 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 973 a = Alphabet._get_base_alphabet(record.seq.alphabet) 974 if not isinstance(a, Alphabet.Alphabet): 975 raise TypeError("Invalid alphabet") 976 elif isinstance(a, Alphabet.DNAAlphabet): 977 mol_type = "DNA" 978 units = "BP" 979 elif isinstance(a, Alphabet.RNAAlphabet): 980 mol_type = "RNA" 981 units = "BP" 982 elif isinstance(a, Alphabet.ProteinAlphabet): 983 mol_type = "PROTEIN" 984 units = "AA" 985 else: 986 # Must be something like NucleotideAlphabet 987 raise ValueError("Need a DNA, RNA or Protein alphabet") 988 989 # Get the taxonomy division 990 division = self._get_data_division(record) 991 992 # TODO - Full ID line 993 handle = self.handle 994 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 995 # 1. Primary accession number 996 # 2. Sequence version number 997 # 3. Topology: 'circular' or 'linear' 998 # 4. Molecule type 999 # 5. Data class 1000 # 6. Taxonomic division 1001 # 7. Sequence length 1002 self._write_single_line("ID", "%s; %s; %s; %s; ; %s; %i %s." 1003 % (accession, version, topology, mol_type, 1004 division, len(record), units)) 1005 handle.write("XX\n") 1006 self._write_single_line("AC", accession + ";") 1007 handle.write("XX\n")
1008
1009 - def _get_data_division(self, record):
1010 try: 1011 division = record.annotations["data_file_division"] 1012 except KeyError: 1013 division = "UNC" 1014 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 1015 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 1016 "VRL", "XXX"]: 1017 # Good, already EMBL style 1018 # Division Code 1019 # ----------------- ---- 1020 # Bacteriophage PHG 1021 # Environmental Sample ENV 1022 # Fungal FUN 1023 # Human HUM 1024 # Invertebrate INV 1025 # Other Mammal MAM 1026 # Other Vertebrate VRT 1027 # Mus musculus MUS 1028 # Plant PLN 1029 # Prokaryote PRO 1030 # Other Rodent ROD 1031 # Synthetic SYN 1032 # Transgenic TGN 1033 # Unclassified UNC (i.e. unknown) 1034 # Viral VRL 1035 # 1036 # (plus XXX used for submiting data to EMBL) 1037 pass 1038 else: 1039 # See if this is in GenBank style & can be converted. 1040 # Generally a problem as the GenBank groups are wider 1041 # than those of EMBL. Note that GenBank use "BCT" for 1042 # both bacteria and acherea thus this maps to EMBL's 1043 # "PRO" nicely. 1044 gbk_to_embl = {"BCT": "PRO", 1045 "UNK": "UNC", 1046 } 1047 try: 1048 division = gbk_to_embl[division] 1049 except KeyError: 1050 division = "UNC" 1051 assert len(division) == 3 1052 return division
1053
1054 - def _write_keywords(self, record):
1055 # Put the keywords right after DE line. 1056 # Each 'keyword' can have multiple words and spaces, but we 1057 # must not split any 'keyword' between lines. 1058 # TODO - Combine short keywords onto one line 1059 for keyword in record.annotations["keywords"]: 1060 self._write_single_line("KW", keyword) 1061 self.handle.write("XX\n")
1062
1063 - def _write_references(self, record):
1064 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1065 number = 0 1066 for ref in record.annotations["references"]: 1067 if not isinstance(ref, SeqFeature.Reference): 1068 continue 1069 number += 1 1070 self._write_single_line("RN", "[%i]" % number) 1071 # TODO - support for RC line (needed in parser too) 1072 # TODO - support more complex record reference locations? 1073 if ref.location and len(ref.location) == 1: 1074 self._write_single_line( 1075 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1076 ref.location[0].nofuzzy_end)) 1077 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1078 if ref.pubmed_id: 1079 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1080 if ref.consrtm: 1081 self._write_single_line("RG", "%s" % ref.consrtm) 1082 if ref.authors: 1083 # We store the AUTHORS data as a single string 1084 self._write_multi_line("RA", ref.authors + ";") 1085 if ref.title: 1086 # We store the title as a single string 1087 self._write_multi_line("RT", '"%s";' % ref.title) 1088 if ref.journal: 1089 # We store this as a single string - holds the journal name, 1090 # volume, year, and page numbers of the citation 1091 self._write_multi_line("RL", ref.journal) 1092 self.handle.write("XX\n")
1093
1094 - def _write_comment(self, record):
1095 # This is a bit complicated due to the range of possible 1096 # ways people might have done their annotation... 1097 # Currently the parser uses a single string with newlines. 1098 # A list of lines is also reasonable. 1099 # A single (long) string is perhaps the most natural of all. 1100 # This means we may need to deal with line wrapping. 1101 comment = record.annotations["comment"] 1102 if isinstance(comment, basestring): 1103 lines = comment.split("\n") 1104 elif isinstance(comment, (list, tuple)): 1105 lines = comment 1106 else: 1107 raise ValueError("Could not understand comment annotation") 1108 # TODO - Merge this with the GenBank comment code? 1109 if not lines: 1110 return 1111 for line in lines: 1112 self._write_multi_line("CC", line) 1113 self.handle.write("XX\n")
1114
1115 - def write_record(self, record):
1116 """Write a single record to the output file.""" 1117 1118 handle = self.handle 1119 self._write_the_first_lines(record) 1120 1121 # PR line (0 or 1 lines only), project identifier 1122 # 1123 # Assuming can't use 2 lines, we should prefer newer GenBank 1124 # DBLINK BioProject:... entries over the older GenBank DBLINK 1125 # Project:... lines. 1126 # 1127 # In either case, seems EMBL usess just "PR Project:..." 1128 # regardless of the type of ID (old numeric only, or new 1129 # with alpha prefix), e.g. for CP002497 NCBI now uses: 1130 # 1131 # DBLINK BioProject: PRJNA60715 1132 # BioSample: SAMN03081426 1133 # 1134 # While EMBL uses: 1135 # 1136 # XX 1137 # PR Project:PRJNA60715; 1138 # XX 1139 # 1140 # Sorting ensures (new) BioProject:... is before old Project:... 1141 for xref in sorted(record.dbxrefs): 1142 if xref.startswith("BioProject:"): 1143 self._write_single_line("PR", xref[3:] + ";") 1144 handle.write("XX\n") 1145 break 1146 if xref.startswith("Project:"): 1147 self._write_single_line("PR", xref + ";") 1148 handle.write("XX\n") 1149 break 1150 1151 # TODO - DT lines (date) 1152 1153 descr = record.description 1154 if descr == "<unknown description>": 1155 descr = "." 1156 self._write_multi_line("DE", descr) 1157 handle.write("XX\n") 1158 1159 if "keywords" in record.annotations: 1160 self._write_keywords(record) 1161 1162 # Should this be "source" or "organism"? 1163 self._write_multi_line( 1164 "OS", self._get_annotation_str(record, "organism")) 1165 try: 1166 # List of strings 1167 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1168 except KeyError: 1169 taxonomy = "." 1170 self._write_multi_line("OC", taxonomy) 1171 handle.write("XX\n") 1172 1173 if "references" in record.annotations: 1174 self._write_references(record) 1175 1176 if "comment" in record.annotations: 1177 self._write_comment(record) 1178 1179 handle.write(self.FEATURE_HEADER) 1180 rec_length = len(record) 1181 for feature in record.features: 1182 self._write_feature(feature, rec_length) 1183 handle.write("XX\n") 1184 1185 self._write_sequence(record) 1186 handle.write("//\n")
1187 1188
1189 -class ImgtWriter(EmblWriter):
1190 """IMGT writer (EMBL format variant).""" 1191 1192 HEADER_WIDTH = 5 1193 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1194 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1195 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1196 FEATURE_HEADER = "FH Key Location/Qualifiers\n"
1197 1198 if __name__ == "__main__": 1199 from Bio._utils import run_doctest 1200 run_doctest(verbose=0) 1201