Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2016 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This file is part of the Biopython distribution and governed by your 
   4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
   5  # Please see the LICENSE file that should have been included as part of this 
   6  # package. 
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See Also: 
  14  International Nucleotide Sequence Database Collaboration 
  15  http://www.insdc.org/ 
  16   
  17  GenBank 
  18  http://www.ncbi.nlm.nih.gov/Genbank/ 
  19   
  20  EMBL Nucleotide Sequence Database 
  21  http://www.ebi.ac.uk/embl/ 
  22   
  23  DDBJ (DNA Data Bank of Japan) 
  24  http://www.ddbj.nig.ac.jp/ 
  25   
  26  IMGT (use a variant of EMBL format with longer feature indents) 
  27  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  28  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  29  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  30   
  31  """ 
  32   
  33  from __future__ import print_function 
  34   
  35  import warnings 
  36  from Bio import BiopythonWarning 
  37   
  38  from Bio.Seq import UnknownSeq 
  39  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  40  from Bio import Alphabet 
  41  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
  42  from Bio import SeqFeature 
  43   
  44  from Bio._py3k import _is_int_or_long 
  45  from Bio._py3k import basestring 
46 47 48 # NOTE 49 # ==== 50 # The "brains" for parsing GenBank, EMBL and IMGT files (and any 51 # other flat file variants from the INSDC in future) is in 52 # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 53 # However, all the writing code is in this file. 54 55 56 -def GenBankIterator(handle):
57 """Break up a Genbank file into SeqRecord objects. 58 59 Every section from the LOCUS line to the terminating // becomes 60 a single SeqRecord with associated annotation and features. 61 62 Note that for genomes or chromosomes, there is typically only 63 one record. 64 65 This gets called internally by Bio.SeqIO for the GenBank file format: 66 67 >>> from Bio import SeqIO 68 >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"): 69 ... print(record.id) 70 ... 71 X55053.1 72 X62281.1 73 M81224.1 74 AJ237582.1 75 L31939.1 76 AF297471.1 77 78 Equivalently, 79 80 >>> with open("GenBank/cor6_6.gb") as handle: 81 ... for record in GenBankIterator(handle): 82 ... print(record.id) 83 ... 84 X55053.1 85 X62281.1 86 M81224.1 87 AJ237582.1 88 L31939.1 89 AF297471.1 90 91 """ 92 # This calls a generator function: 93 return GenBankScanner(debug=0).parse_records(handle)
94
95 96 -def EmblIterator(handle):
97 """Break up an EMBL file into SeqRecord objects. 98 99 Every section from the LOCUS line to the terminating // becomes 100 a single SeqRecord with associated annotation and features. 101 102 Note that for genomes or chromosomes, there is typically only 103 one record. 104 105 This gets called internally by Bio.SeqIO for the EMBL file format: 106 107 >>> from Bio import SeqIO 108 >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"): 109 ... print(record.id) 110 ... 111 A00022.1 112 A00028.1 113 A00031.1 114 A00034.1 115 A00060.1 116 A00071.1 117 A00072.1 118 A00078.1 119 CQ797900.1 120 121 Equivalently, 122 123 >>> with open("EMBL/epo_prt_selection.embl") as handle: 124 ... for record in EmblIterator(handle): 125 ... print(record.id) 126 ... 127 A00022.1 128 A00028.1 129 A00031.1 130 A00034.1 131 A00060.1 132 A00071.1 133 A00072.1 134 A00078.1 135 CQ797900.1 136 137 """ 138 # This calls a generator function: 139 return EmblScanner(debug=0).parse_records(handle)
140
141 142 -def ImgtIterator(handle):
143 """Break up an IMGT file into SeqRecord objects. 144 145 Every section from the LOCUS line to the terminating // becomes 146 a single SeqRecord with associated annotation and features. 147 148 Note that for genomes or chromosomes, there is typically only 149 one record. 150 """ 151 # This calls a generator function: 152 return _ImgtScanner(debug=0).parse_records(handle)
153
154 155 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
156 """Break up a Genbank file into SeqRecord objects for each CDS feature. 157 158 Every section from the LOCUS line to the terminating // can contain 159 many CDS features. These are returned as with the stated amino acid 160 translation sequence (if given). 161 """ 162 # This calls a generator function: 163 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
164
165 166 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
167 """Break up a EMBL file into SeqRecord objects for each CDS feature. 168 169 Every section from the LOCUS line to the terminating // can contain 170 many CDS features. These are returned as with the stated amino acid 171 translation sequence (if given). 172 """ 173 # This calls a generator function: 174 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
175
176 177 -def _insdc_feature_position_string(pos, offset=0):
178 """Build a GenBank/EMBL position string (PRIVATE). 179 180 Use offset=1 to add one to convert a start position from python counting. 181 """ 182 if isinstance(pos, SeqFeature.ExactPosition): 183 return "%i" % (pos.position + offset) 184 elif isinstance(pos, SeqFeature.WithinPosition): 185 return "(%i.%i)" % (pos.position + offset, 186 pos.position + pos.extension + offset) 187 elif isinstance(pos, SeqFeature.BetweenPosition): 188 return "(%i^%i)" % (pos.position + offset, 189 pos.position + pos.extension + offset) 190 elif isinstance(pos, SeqFeature.BeforePosition): 191 return "<%i" % (pos.position + offset) 192 elif isinstance(pos, SeqFeature.AfterPosition): 193 return ">%i" % (pos.position + offset) 194 elif isinstance(pos, SeqFeature.OneOfPosition): 195 return "one-of(%s)" \ 196 % ",".join(_insdc_feature_position_string(p, offset) 197 for p in pos.position_choices) 198 elif isinstance(pos, SeqFeature.AbstractPosition): 199 raise NotImplementedError("Please report this as a bug in Biopython.") 200 else: 201 raise ValueError("Expected a SeqFeature position object.")
202
203 204 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
205 if location.ref: 206 ref = "%s:" % location.ref 207 else: 208 ref = "" 209 assert not location.ref_db 210 if isinstance(location.start, SeqFeature.ExactPosition) \ 211 and isinstance(location.end, SeqFeature.ExactPosition) \ 212 and location.start.position == location.end.position: 213 # Special case, for 12:12 return 12^13 214 # (a zero length slice, meaning the point between two letters) 215 if location.end.position == rec_length: 216 # Very special case, for a between position at the end of a 217 # sequence (used on some circular genomes, Bug 3098) we have 218 # N:N so return N^1 219 return "%s%i^1" % (ref, rec_length) 220 else: 221 return "%s%i^%i" % (ref, location.end.position, 222 location.end.position + 1) 223 if isinstance(location.start, SeqFeature.ExactPosition) \ 224 and isinstance(location.end, SeqFeature.ExactPosition) \ 225 and location.start.position + 1 == location.end.position: 226 # Special case, for 11:12 return 12 rather than 12..12 227 # (a length one slice, meaning a single letter) 228 return "%s%i" % (ref, location.end.position) 229 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 230 or isinstance(location.end, SeqFeature.UnknownPosition): 231 # Special case for features from SwissProt/UniProt files 232 if isinstance(location.start, SeqFeature.UnknownPosition) \ 233 and isinstance(location.end, SeqFeature.UnknownPosition): 234 # warnings.warn("Feature with unknown location", BiopythonWarning) 235 # return "?" 236 raise ValueError("Feature with unknown location") 237 elif isinstance(location.start, SeqFeature.UnknownPosition): 238 # Treat the unknown start position as a BeforePosition 239 return "%s<%i..%s" \ 240 % (ref, 241 location.nofuzzy_end, 242 _insdc_feature_position_string(location.end)) 243 else: 244 # Treat the unknown end position as an AfterPosition 245 return "%s%s..>%i" \ 246 % (ref, 247 _insdc_feature_position_string(location.start, +1), 248 location.nofuzzy_start + 1) 249 else: 250 # Typical case, e.g. 12..15 gets mapped to 11:15 251 return ref \ 252 + _insdc_feature_position_string(location.start, +1) \ 253 + ".." + \ 254 _insdc_feature_position_string(location.end)
255
256 257 -def _insdc_location_string(location, rec_length):
258 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 259 260 There is a choice of how to show joins on the reverse complement strand, 261 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 262 "join(complement(20,100),complement(1,10))" instead (but appears to have 263 now adopted the GenBank convention). Notice that the order of the entries 264 is reversed! This function therefore uses the first form. In this situation 265 we expect the CompoundFeatureLocation and its parts to all be marked as 266 strand == -1, and to be in the order 19:100 then 0:10. 267 """ 268 try: 269 parts = location.parts 270 # CompoundFeatureLocation 271 if location.strand == -1: 272 # Special case, put complement outside the join/order/... and reverse order 273 return "complement(%s(%s))" % (location.operator, 274 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 275 for p in parts[::-1])) 276 else: 277 return "%s(%s)" % (location.operator, 278 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 279 except AttributeError: 280 # Simple FeatureLocation 281 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 282 if location.strand == -1: 283 return "complement(%s)" % loc 284 else: 285 return loc
286
287 288 -class _InsdcWriter(SequentialSequenceWriter):
289 """Base class for GenBank and EMBL writers (PRIVATE).""" 290 291 MAX_WIDTH = 80 292 QUALIFIER_INDENT = 21 293 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 294 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 295 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 296 "direction", "estimated_length", "mod_base", "number", 297 "rpt_type", "rpt_unit_range", "tag_peptide", 298 "transl_except", "transl_table") 299
300 - def _write_feature_qualifier(self, key, value=None, quote=None):
301 if value is None: 302 # Value-less entry like /pseudo 303 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 304 return 305 # Quick hack with no line wrapping, may be useful for testing: 306 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 307 if quote is None: 308 # Try to mimic unwritten rules about when quotes can be left out: 309 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 310 quote = False 311 else: 312 quote = True 313 if quote: 314 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 315 else: 316 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 317 if len(line) <= self.MAX_WIDTH: 318 self.handle.write(line + "\n") 319 return 320 while line.lstrip(): 321 if len(line) <= self.MAX_WIDTH: 322 self.handle.write(line + "\n") 323 return 324 # Insert line break... 325 for index in range(min(len(line) - 1, self.MAX_WIDTH), 326 self.QUALIFIER_INDENT + 1, -1): 327 if line[index] == " ": 328 break 329 if line[index] != " ": 330 # No nice place to break... 331 index = self.MAX_WIDTH 332 assert index <= self.MAX_WIDTH 333 self.handle.write(line[:index] + "\n") 334 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
335
336 - def _wrap_location(self, location):
337 """Split a feature location into lines (break at commas) (PRIVATE).""" 338 # TODO - Rewrite this not to recurse! 339 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 340 if len(location) <= length: 341 return location 342 index = location[:length].rfind(",") 343 if index == -1: 344 # No good place to split (!) 345 warnings.warn("Couldn't split location:\n%s" % location, 346 BiopythonWarning) 347 return location 348 return location[:index + 1] + "\n" + \ 349 self.QUALIFIER_INDENT_STR + \ 350 self._wrap_location(location[index + 1:])
351
352 - def _write_feature(self, feature, record_length):
353 """Write a single SeqFeature object to features table (PRIVATE).""" 354 assert feature.type, feature 355 location = _insdc_location_string(feature.location, record_length) 356 f_type = feature.type.replace(" ", "_") 357 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 358 + self._wrap_location(location) + "\n" 359 self.handle.write(line) 360 # Now the qualifiers... 361 # Note as of Biopython 1.69, this is an ordered-dict, don't sort it: 362 for key, values in feature.qualifiers.items(): 363 if isinstance(values, (list, tuple)): 364 for value in values: 365 self._write_feature_qualifier(key, value) 366 else: 367 # String, int, etc - or None for a /pseudo tpy entry 368 self._write_feature_qualifier(key, values)
369 370 @staticmethod
371 - def _get_annotation_str(record, key, default=".", just_first=False):
372 """Get an annotation dictionary entry (as a string) (PRIVATE). 373 374 Some entries are lists, in which case if just_first=True the first entry 375 is returned. If just_first=False (default) this verifies there is only 376 one entry before returning it. 377 """ 378 try: 379 answer = record.annotations[key] 380 except KeyError: 381 return default 382 if isinstance(answer, list): 383 if not just_first: 384 assert len(answer) == 1 385 return str(answer[0]) 386 else: 387 return str(answer)
388 389 @staticmethod
390 - def _split_multi_line(text, max_len):
391 """Return a list of strings (PRIVATE). 392 393 Any single words which are too long get returned as a whole line 394 (e.g. URLs) without an exception or warning. 395 """ 396 # TODO - Do the line spliting while preserving white space? 397 text = text.strip() 398 if len(text) <= max_len: 399 return [text] 400 401 words = text.split() 402 text = "" 403 while words and len(text) + 1 + len(words[0]) <= max_len: 404 text += " " + words.pop(0) 405 text = text.strip() 406 # assert len(text) <= max_len 407 answer = [text] 408 while words: 409 text = words.pop(0) 410 while words and len(text) + 1 + len(words[0]) <= max_len: 411 text += " " + words.pop(0) 412 text = text.strip() 413 # assert len(text) <= max_len 414 answer.append(text) 415 assert not words 416 return answer
417
418 - def _split_contig(self, record, max_len):
419 """Return a list of strings, splits on commas (PRIVATE).""" 420 # TODO - Merge this with _write_multi_line method? 421 # It would need the addition of the comma splitting logic... 422 # are there any other cases where that would be sensible? 423 contig = record.annotations.get("contig", "") 424 if isinstance(contig, (list, tuple)): 425 contig = "".join(contig) 426 contig = self.clean(contig) 427 answer = [] 428 while contig: 429 if len(contig) > max_len: 430 # Split lines at the commas 431 pos = contig[:max_len - 1].rfind(",") 432 if pos == -1: 433 raise ValueError("Could not break up CONTIG") 434 text, contig = contig[:pos + 1], contig[pos + 1:] 435 else: 436 text, contig = contig, "" 437 answer.append(text) 438 return answer
439
440 441 -class GenBankWriter(_InsdcWriter):
442 """GenBank writer.""" 443 444 HEADER_WIDTH = 12 445 QUALIFIER_INDENT = 21 446 STRUCTURED_COMMENT_START = "-START##" 447 STRUCTURED_COMMENT_END = "-END##" 448 STRUCTURED_COMMENT_DELIM = " :: " 449 LETTERS_PER_LINE = 60 450 SEQUENCE_INDENT = 9 451
452 - def _write_single_line(self, tag, text):
453 """Write single line in each GenBank record (PRIVATE). 454 455 Used in the 'header' of each GenBank record. 456 """ 457 assert len(tag) < self.HEADER_WIDTH 458 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 459 if tag: 460 warnings.warn("Annotation %r too long for %r line" % (text, tag), 461 BiopythonWarning) 462 else: 463 # Can't give such a precise warning 464 warnings.warn("Annotation %r too long" % text, BiopythonWarning) 465 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 466 text.replace("\n", " ")))
467
468 - def _write_multi_line(self, tag, text):
469 """Write multiple lines in each GenBank record (PRIVATE). 470 471 Used in the 'header' of each GenBank record. 472 """ 473 # TODO - Do the line spliting while preserving white space? 474 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 475 lines = self._split_multi_line(text, max_len) 476 self._write_single_line(tag, lines[0]) 477 for line in lines[1:]: 478 self._write_single_line("", line)
479
480 - def _write_multi_entries(self, tag, text_list):
481 # used for DBLINK and any similar later line types. 482 # If the list of strings is empty, nothing is written. 483 for i, text in enumerate(text_list): 484 if i == 0: 485 self._write_single_line(tag, text) 486 else: 487 self._write_single_line("", text)
488 489 @staticmethod
490 - def _get_date(record):
491 default = "01-JAN-1980" 492 try: 493 date = record.annotations["date"] 494 except KeyError: 495 return default 496 # Cope with a list of one string: 497 if isinstance(date, list) and len(date) == 1: 498 date = date[0] 499 # TODO - allow a Python date object 500 if not isinstance(date, basestring) or len(date) != 11 \ 501 or date[2] != "-" or date[6] != "-" \ 502 or not date[:2].isdigit() or not date[7:].isdigit() \ 503 or int(date[:2]) > 31 \ 504 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 505 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 506 # TODO - Check is a valid date (e.g. not 31 Feb) 507 return default 508 return date
509 510 @staticmethod
511 - def _get_data_division(record):
512 try: 513 division = record.annotations["data_file_division"] 514 except KeyError: 515 division = "UNK" 516 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 517 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 518 "GSS", "HTG", "HTC", "ENV", "CON"]: 519 # Good, already GenBank style 520 # PRI - primate sequences 521 # ROD - rodent sequences 522 # MAM - other mammalian sequences 523 # VRT - other vertebrate sequences 524 # INV - invertebrate sequences 525 # PLN - plant, fungal, and algal sequences 526 # BCT - bacterial sequences [plus archea] 527 # VRL - viral sequences 528 # PHG - bacteriophage sequences 529 # SYN - synthetic sequences 530 # UNA - unannotated sequences 531 # EST - EST sequences (expressed sequence tags) 532 # PAT - patent sequences 533 # STS - STS sequences (sequence tagged sites) 534 # GSS - GSS sequences (genome survey sequences) 535 # HTG - HTGS sequences (high throughput genomic sequences) 536 # HTC - HTC sequences (high throughput cDNA sequences) 537 # ENV - Environmental sampling sequences 538 # CON - Constructed sequences 539 # 540 # (plus UNK for unknown) 541 pass 542 else: 543 # See if this is in EMBL style: 544 # Division Code 545 # ----------------- ---- 546 # Bacteriophage PHG - common 547 # Environmental Sample ENV - common 548 # Fungal FUN - map to PLN (plants + fungal) 549 # Human HUM - map to PRI (primates) 550 # Invertebrate INV - common 551 # Other Mammal MAM - common 552 # Other Vertebrate VRT - common 553 # Mus musculus MUS - map to ROD (rodent) 554 # Plant PLN - common 555 # Prokaryote PRO - map to BCT (poor name) 556 # Other Rodent ROD - common 557 # Synthetic SYN - common 558 # Transgenic TGN - ??? map to SYN ??? 559 # Unclassified UNC - map to UNK 560 # Viral VRL - common 561 # 562 # (plus XXX for submiting which we can map to UNK) 563 embl_to_gbk = {"FUN": "PLN", 564 "HUM": "PRI", 565 "MUS": "ROD", 566 "PRO": "BCT", 567 "UNC": "UNK", 568 "XXX": "UNK", 569 } 570 try: 571 division = embl_to_gbk[division] 572 except KeyError: 573 division = "UNK" 574 assert len(division) == 3 575 return division
576
577 - def _get_topology(self, record):
578 """Set the topology to 'circular', 'linear' if defined (PRIVATE).""" 579 max_topology_len = len("circular") 580 581 topology = self._get_annotation_str(record, "topology", default="") 582 if topology and len(topology) <= max_topology_len: 583 return topology.ljust(max_topology_len) 584 else: 585 return " " * max_topology_len
586
587 - def _write_the_first_line(self, record):
588 """Write the LOCUS line (PRIVATE).""" 589 locus = record.name 590 if not locus or locus == "<unknown name>": 591 locus = record.id 592 if not locus or locus == "<unknown id>": 593 locus = self._get_annotation_str( 594 record, "accession", just_first=True) 595 if len(locus) > 16: 596 if len(locus) + 1 + len(str(len(record))) > 28: 597 # Locus name and record length to long to squeeze in. 598 raise ValueError("Locus identifier %r is too long" % locus) 599 else: 600 warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning) 601 if len(locus.split()) > 1: 602 # locus could be unicode, and u'with space' versus 'with space' 603 # causes trouble with doctest or print-and-compare tests, so 604 tmp = repr(locus) 605 if tmp.startswith("u'") and tmp.endswith("'"): 606 tmp = tmp[1:] 607 raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp) 608 if len(record) > 99999999999: 609 # Currently GenBank only officially support up to 350000, but 610 # the length field can take eleven digits 611 raise ValueError("Sequence too long!") 612 613 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 614 a = Alphabet._get_base_alphabet(record.seq.alphabet) 615 if not isinstance(a, Alphabet.Alphabet): 616 raise TypeError("Invalid alphabet") 617 elif isinstance(a, Alphabet.ProteinAlphabet): 618 units = "aa" 619 elif isinstance(a, Alphabet.NucleotideAlphabet): 620 units = "bp" 621 else: 622 # Must be something like NucleotideAlphabet or 623 # just the generic Alphabet (default for fasta files) 624 raise ValueError("Need a Nucleotide or Protein alphabet") 625 626 # Get the molecule type 627 mol_type = self._get_annotation_str(record, "molecule_type", default=None) 628 if mol_type and len(mol_type) > 7: 629 # Deal with common cases from EMBL to GenBank 630 mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "") 631 if len(mol_type) > 7: 632 warnings.warn("Molecule type %r too long" % mol_type, 633 BiopythonWarning) 634 mol_type = None 635 if mol_type in ["protein", "PROTEIN"]: 636 mol_type = "" 637 638 if mol_type: 639 pass 640 elif isinstance(a, Alphabet.ProteinAlphabet): 641 mol_type = "" 642 elif isinstance(a, Alphabet.DNAAlphabet): 643 mol_type = "DNA" 644 elif isinstance(a, Alphabet.RNAAlphabet): 645 mol_type = "RNA" 646 else: 647 # Must be something like NucleotideAlphabet or 648 # just the generic Alphabet (default for fasta files) 649 raise ValueError("Need a DNA, RNA or Protein alphabet") 650 651 topology = self._get_topology(record) 652 653 division = self._get_data_division(record) 654 655 name_length = str(len(record)).rjust(28) 656 name_length = locus + name_length[len(locus):] 657 assert len(name_length) == 28, name_length 658 assert " " in name_length, name_length 659 660 assert len(units) == 2 661 assert len(division) == 3 662 line = "LOCUS %s %s %s %s %s %s\n" \ 663 % (name_length, 664 units, 665 mol_type.ljust(7), 666 topology, 667 division, 668 self._get_date(record)) 669 assert len(line) == 79 + 1, repr(line) # plus one for new line 670 671 # We're bending the rules to allow an identifier over 16 characters 672 # if we can steal spaces from the length field: 673 # assert line[12:28].rstrip() == locus, \ 674 # 'LOCUS line does not contain the locus at the expected position:\n' + line 675 # assert line[28:29] == " " 676 # assert line[29:40].lstrip() == str(len(record)), \ 677 # 'LOCUS line does not contain the length at the expected position:\n' + line 678 assert line[12:40].split() == [locus, str(len(record))], line 679 680 # Tests copied from Bio.GenBank.Scanner 681 assert line[40:44] in [' bp ', ' aa '], \ 682 'LOCUS line does not contain size units at expected position:\n' + \ 683 line 684 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 685 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 686 assert line[47:54].strip() == "" \ 687 or 'DNA' in line[47:54].strip() \ 688 or 'RNA' in line[47:54].strip(), \ 689 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 690 assert line[54:55] == ' ', \ 691 'LOCUS line does not contain space at position 55:\n' + line 692 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 693 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 694 assert line[63:64] == ' ', \ 695 'LOCUS line does not contain space at position 64:\n' + line 696 assert line[67:68] == ' ', \ 697 'LOCUS line does not contain space at position 68:\n' + line 698 assert line[70:71] == '-', \ 699 'LOCUS line does not contain - at position 71 in date:\n' + line 700 assert line[74:75] == '-', \ 701 'LOCUS line does not contain - at position 75 in date:\n' + line 702 703 self.handle.write(line)
704
705 - def _write_references(self, record):
706 number = 0 707 for ref in record.annotations["references"]: 708 if not isinstance(ref, SeqFeature.Reference): 709 continue 710 number += 1 711 data = str(number) 712 # TODO - support more complex record reference locations? 713 if ref.location and len(ref.location) == 1: 714 a = Alphabet._get_base_alphabet(record.seq.alphabet) 715 if isinstance(a, Alphabet.ProteinAlphabet): 716 units = "residues" 717 else: 718 units = "bases" 719 data += " (%s %i to %i)" % (units, 720 ref.location[0].nofuzzy_start + 1, 721 ref.location[0].nofuzzy_end) 722 self._write_single_line("REFERENCE", data) 723 if ref.authors: 724 # We store the AUTHORS data as a single string 725 self._write_multi_line(" AUTHORS", ref.authors) 726 if ref.consrtm: 727 # We store the consortium as a single string 728 self._write_multi_line(" CONSRTM", ref.consrtm) 729 if ref.title: 730 # We store the title as a single string 731 self._write_multi_line(" TITLE", ref.title) 732 if ref.journal: 733 # We store this as a single string - holds the journal name, 734 # volume, year, and page numbers of the citation 735 self._write_multi_line(" JOURNAL", ref.journal) 736 if ref.medline_id: 737 # This line type is obsolete and was removed from the GenBank 738 # flatfile format in April 2005. Should we write it? 739 # Note this has a two space indent: 740 self._write_multi_line(" MEDLINE", ref.medline_id) 741 if ref.pubmed_id: 742 # Note this has a THREE space indent: 743 self._write_multi_line(" PUBMED", ref.pubmed_id) 744 if ref.comment: 745 self._write_multi_line(" REMARK", ref.comment)
746
747 - def _write_comment(self, record):
748 # This is a bit complicated due to the range of possible 749 # ways people might have done their annotation... 750 # Currently the parser uses a single string with newlines. 751 # A list of lines is also reasonable. 752 # A single (long) string is perhaps the most natural of all. 753 # This means we may need to deal with line wrapping. 754 lines = [] 755 if "structured_comment" in record.annotations: 756 comment = record.annotations["structured_comment"] 757 # Find max length of keys for equal padded printing 758 padding = 0 759 for key, data in comment.items(): 760 for subkey, subdata in data.items(): 761 padding = len(subkey) if len(subkey) > padding else padding 762 # Construct output 763 for key, data in comment.items(): 764 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_START)) 765 for subkey, subdata in data.items(): 766 spaces = " " * (padding - len(subkey)) 767 lines.append("{0}{1}{2}{3}".format(subkey, spaces, self.STRUCTURED_COMMENT_DELIM, subdata)) 768 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_END)) 769 if "comment" in record.annotations: 770 comment = record.annotations["comment"] 771 if isinstance(comment, basestring): 772 lines += comment.split("\n") 773 elif isinstance(comment, (list, tuple)): 774 lines += list(comment) 775 else: 776 raise ValueError("Could not understand comment annotation") 777 self._write_multi_line("COMMENT", lines[0]) 778 for line in lines[1:]: 779 self._write_multi_line("", line)
780
781 - def _write_contig(self, record):
782 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 783 lines = self._split_contig(record, max_len) 784 self._write_single_line("CONTIG", lines[0]) 785 for text in lines[1:]: 786 self._write_single_line("", text)
787
788 - def _write_sequence(self, record):
789 # Loosely based on code from Howard Salis 790 # TODO - Force lower case? 791 792 if isinstance(record.seq, UnknownSeq): 793 # We have already recorded the length, and there is no need 794 # to record a long sequence of NNNNNNN...NNN or whatever. 795 if "contig" in record.annotations: 796 self._write_contig(record) 797 else: 798 self.handle.write("ORIGIN\n") 799 return 800 801 # Catches sequence being None: 802 data = self._get_seq_string(record).lower() 803 seq_len = len(data) 804 self.handle.write("ORIGIN\n") 805 for line_number in range(0, seq_len, self.LETTERS_PER_LINE): 806 self.handle.write(str(line_number + 1).rjust(self.SEQUENCE_INDENT)) 807 for words in range(line_number, 808 min(line_number + self.LETTERS_PER_LINE, seq_len), 10): 809 self.handle.write(" %s" % data[words:words + 10]) 810 self.handle.write("\n")
811
812 - def write_record(self, record):
813 """Write a single record to the output file.""" 814 handle = self.handle 815 self._write_the_first_line(record) 816 817 default = record.id 818 if default.count(".") == 1 and default[default.index(".") + 1:].isdigit(): 819 # Good, looks like accesion.version and not something 820 # else like identifier.start-end 821 default = record.id.split(".", 1)[0] 822 accession = self._get_annotation_str(record, "accession", 823 default, 824 just_first=True) 825 acc_with_version = accession 826 if record.id.startswith(accession + "."): 827 try: 828 acc_with_version = "%s.%i" \ 829 % (accession, 830 int(record.id.split(".", 1)[1])) 831 except ValueError: 832 pass 833 gi = self._get_annotation_str(record, "gi", just_first=True) 834 835 descr = record.description 836 if descr == "<unknown description>": 837 descr = "" # Trailing dot will be added later 838 839 # The DEFINITION field must end with a period 840 # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5] 841 # and discussion https://github.com/biopython/biopython/pull/616 842 # So let's add a period 843 descr += '.' 844 self._write_multi_line("DEFINITION", descr) 845 846 self._write_single_line("ACCESSION", accession) 847 if gi != ".": 848 self._write_single_line("VERSION", "%s GI:%s" 849 % (acc_with_version, gi)) 850 else: 851 self._write_single_line("VERSION", "%s" % acc_with_version) 852 853 # The NCBI initially expected two types of link, 854 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 855 # 856 # This changed and at some point the formatting switched to 857 # include a space after the colon, e.g. 858 # 859 # LOCUS NC_000011 1606 bp DNA linear CON 06-JUN-2016 860 # DEFINITION Homo sapiens chromosome 11, GRCh38.p7 Primary Assembly. 861 # ACCESSION NC_000011 REGION: complement(5225466..5227071) GPC_000001303 862 # VERSION NC_000011.10 GI:568815587 863 # DBLINK BioProject: PRJNA168 864 # Assembly: GCF_000001405.33 865 # ... 866 # 867 # Or, 868 # 869 # LOCUS JU120277 1044 bp mRNA linear TSA 27-NOV-2012 870 # DEFINITION TSA: Tupaia chinensis tbc000002.Tuchadli mRNA sequence. 871 # ACCESSION JU120277 872 # VERSION JU120277.1 GI:379775257 873 # DBLINK BioProject: PRJNA87013 874 # Sequence Read Archive: SRR433859 875 # ... 876 dbxrefs_with_space = [] 877 for x in record.dbxrefs: 878 if ": " not in x: 879 x = x.replace(":", ": ") 880 dbxrefs_with_space.append(x) 881 self._write_multi_entries("DBLINK", dbxrefs_with_space) 882 del dbxrefs_with_space 883 884 try: 885 # List of strings 886 # Keywords should be given separated with semi colons, 887 keywords = "; ".join(record.annotations["keywords"]) 888 # with a trailing period: 889 if not keywords.endswith("."): 890 keywords += "." 891 except KeyError: 892 # If no keywords, there should be just a period: 893 keywords = "." 894 self._write_multi_line("KEYWORDS", keywords) 895 896 if "segment" in record.annotations: 897 # Deal with SEGMENT line found only in segmented records, 898 # e.g. AH000819 899 segment = record.annotations["segment"] 900 if isinstance(segment, list): 901 assert len(segment) == 1, segment 902 segment = segment[0] 903 self._write_single_line("SEGMENT", segment) 904 905 self._write_multi_line("SOURCE", 906 self._get_annotation_str(record, "source")) 907 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 908 org = self._get_annotation_str(record, "organism") 909 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 910 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 911 self._write_single_line(" ORGANISM", org) 912 try: 913 # List of strings 914 # Taxonomy should be given separated with semi colons, 915 taxonomy = "; ".join(record.annotations["taxonomy"]) 916 # with a trailing period: 917 if not taxonomy.endswith("."): 918 taxonomy += "." 919 except KeyError: 920 taxonomy = "." 921 self._write_multi_line("", taxonomy) 922 923 if "references" in record.annotations: 924 self._write_references(record) 925 926 if "comment" in record.annotations or "structured_comment" in record.annotations: 927 self._write_comment(record) 928 929 handle.write("FEATURES Location/Qualifiers\n") 930 rec_length = len(record) 931 for feature in record.features: 932 self._write_feature(feature, rec_length) 933 self._write_sequence(record) 934 handle.write("//\n")
935
936 937 -class EmblWriter(_InsdcWriter):
938 """EMBL writer.""" 939 940 HEADER_WIDTH = 5 941 QUALIFIER_INDENT = 21 942 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 943 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 944 # Note second spacer line of just FH is expected: 945 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n" 946 947 LETTERS_PER_BLOCK = 10 948 BLOCKS_PER_LINE = 6 949 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 950 POSITION_PADDING = 10 951
952 - def _write_contig(self, record):
953 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 954 lines = self._split_contig(record, max_len) 955 for text in lines: 956 self._write_single_line("CO", text)
957
958 - def _write_sequence(self, record):
959 handle = self.handle # save looking up this multiple times 960 961 if isinstance(record.seq, UnknownSeq): 962 # We have already recorded the length, and there is no need 963 # to record a long sequence of NNNNNNN...NNN or whatever. 964 if "contig" in record.annotations: 965 self._write_contig(record) 966 else: 967 # TODO - Can the sequence just be left out as in GenBank files? 968 handle.write("SQ \n") 969 return 970 971 # Catches sequence being None 972 data = self._get_seq_string(record).lower() 973 seq_len = len(data) 974 975 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 976 a = Alphabet._get_base_alphabet(record.seq.alphabet) 977 if isinstance(a, Alphabet.DNAAlphabet): 978 # TODO - What if we have RNA? 979 a_count = data.count('A') + data.count('a') 980 c_count = data.count('C') + data.count('c') 981 g_count = data.count('G') + data.count('g') 982 t_count = data.count('T') + data.count('t') 983 other = seq_len - (a_count + c_count + g_count + t_count) 984 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 985 % (seq_len, a_count, c_count, g_count, t_count, other)) 986 else: 987 handle.write("SQ \n") 988 989 for line_number in range(0, seq_len // self.LETTERS_PER_LINE): 990 handle.write(" ") # Just four, not five 991 for block in range(self.BLOCKS_PER_LINE): 992 index = self.LETTERS_PER_LINE * line_number + \ 993 self.LETTERS_PER_BLOCK * block 994 handle.write((" %s" % data[index:index + self.LETTERS_PER_BLOCK])) 995 handle.write(str((line_number + 1) * 996 self.LETTERS_PER_LINE).rjust(self.POSITION_PADDING)) 997 handle.write("\n") 998 if seq_len % self.LETTERS_PER_LINE: 999 # Final (partial) line 1000 line_number = (seq_len // self.LETTERS_PER_LINE) 1001 handle.write(" ") # Just four, not five 1002 for block in range(self.BLOCKS_PER_LINE): 1003 index = self.LETTERS_PER_LINE * line_number + \ 1004 self.LETTERS_PER_BLOCK * block 1005 handle.write( 1006 (" %s" % data[index:index + self.LETTERS_PER_BLOCK]).ljust(11)) 1007 handle.write(str(seq_len).rjust(self.POSITION_PADDING)) 1008 handle.write("\n")
1009
1010 - def _write_single_line(self, tag, text):
1011 assert len(tag) == 2 1012 line = tag + " " + text 1013 if len(text) > self.MAX_WIDTH: 1014 warnings.warn("Line %r too long" % line, BiopythonWarning) 1015 self.handle.write(line + "\n")
1016
1017 - def _write_multi_line(self, tag, text):
1018 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 1019 lines = self._split_multi_line(text, max_len) 1020 for line in lines: 1021 self._write_single_line(tag, line)
1022
1023 - def _write_the_first_lines(self, record):
1024 """Write the ID and AC lines (PRIVATE).""" 1025 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 1026 version = "SV " + record.id.rsplit(".", 1)[1] 1027 accession = self._get_annotation_str(record, "accession", 1028 record.id.rsplit(".", 1)[0], 1029 just_first=True) 1030 else: 1031 version = "" 1032 accession = self._get_annotation_str(record, "accession", 1033 record.id, 1034 just_first=True) 1035 1036 if ";" in accession: 1037 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 1038 % repr(str(accession))) 1039 if " " in accession: 1040 # This is out of practicallity... might it be allowed? 1041 raise ValueError("Cannot have spaces in EMBL accession, %s" 1042 % repr(str(accession))) 1043 1044 topology = self._get_annotation_str(record, "topology", default="") 1045 1046 # Get the molecule type 1047 # TODO - record this explicitly in the parser? 1048 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 1049 a = Alphabet._get_base_alphabet(record.seq.alphabet) 1050 if not isinstance(a, Alphabet.Alphabet): 1051 raise TypeError("Invalid alphabet") 1052 elif isinstance(a, Alphabet.DNAAlphabet): 1053 mol_type = "DNA" 1054 units = "BP" 1055 elif isinstance(a, Alphabet.RNAAlphabet): 1056 mol_type = "RNA" 1057 units = "BP" 1058 elif isinstance(a, Alphabet.ProteinAlphabet): 1059 mol_type = "PROTEIN" 1060 units = "AA" 1061 else: 1062 # Must be something like NucleotideAlphabet 1063 raise ValueError("Need a DNA, RNA or Protein alphabet") 1064 1065 if record.annotations.get("molecule_type", None): 1066 # Note often get RNA vs DNA discrepancy in real EMBL/NCBI files 1067 mol_type = record.annotations["molecule_type"] 1068 if mol_type in ["protein"]: 1069 mol_type = "PROTEIN" 1070 1071 # Get the taxonomy division 1072 division = self._get_data_division(record) 1073 1074 # TODO - Full ID line 1075 handle = self.handle 1076 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 1077 # 1. Primary accession number 1078 # 2. Sequence version number 1079 # 3. Topology: 'circular' or 'linear' 1080 # 4. Molecule type 1081 # 5. Data class 1082 # 6. Taxonomic division 1083 # 7. Sequence length 1084 self._write_single_line("ID", "%s; %s; %s; %s; ; %s; %i %s." 1085 % (accession, version, topology, mol_type, 1086 division, len(record), units)) 1087 handle.write("XX\n") 1088 self._write_single_line("AC", accession + ";") 1089 handle.write("XX\n")
1090 1091 @staticmethod
1092 - def _get_data_division(record):
1093 try: 1094 division = record.annotations["data_file_division"] 1095 except KeyError: 1096 division = "UNC" 1097 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 1098 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 1099 "VRL", "XXX"]: 1100 # Good, already EMBL style 1101 # Division Code 1102 # ----------------- ---- 1103 # Bacteriophage PHG 1104 # Environmental Sample ENV 1105 # Fungal FUN 1106 # Human HUM 1107 # Invertebrate INV 1108 # Other Mammal MAM 1109 # Other Vertebrate VRT 1110 # Mus musculus MUS 1111 # Plant PLN 1112 # Prokaryote PRO 1113 # Other Rodent ROD 1114 # Synthetic SYN 1115 # Transgenic TGN 1116 # Unclassified UNC (i.e. unknown) 1117 # Viral VRL 1118 # 1119 # (plus XXX used for submiting data to EMBL) 1120 pass 1121 else: 1122 # See if this is in GenBank style & can be converted. 1123 # Generally a problem as the GenBank groups are wider 1124 # than those of EMBL. Note that GenBank use "BCT" for 1125 # both bacteria and acherea thus this maps to EMBL's 1126 # "PRO" nicely. 1127 gbk_to_embl = {"BCT": "PRO", 1128 "UNK": "UNC", 1129 } 1130 try: 1131 division = gbk_to_embl[division] 1132 except KeyError: 1133 division = "UNC" 1134 assert len(division) == 3 1135 return division
1136
1137 - def _write_keywords(self, record):
1138 # Put the keywords right after DE line. 1139 # Each 'keyword' can have multiple words and spaces, but we 1140 # must not split any 'keyword' between lines. 1141 # TODO - Combine short keywords onto one line 1142 for keyword in record.annotations["keywords"]: 1143 self._write_single_line("KW", keyword) 1144 self.handle.write("XX\n")
1145
1146 - def _write_references(self, record):
1147 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1148 number = 0 1149 for ref in record.annotations["references"]: 1150 if not isinstance(ref, SeqFeature.Reference): 1151 continue 1152 number += 1 1153 self._write_single_line("RN", "[%i]" % number) 1154 # TODO - support for RC line (needed in parser too) 1155 # TODO - support more complex record reference locations? 1156 if ref.location and len(ref.location) == 1: 1157 self._write_single_line( 1158 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1159 ref.location[0].nofuzzy_end)) 1160 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1161 if ref.pubmed_id: 1162 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1163 if ref.consrtm: 1164 self._write_single_line("RG", "%s" % ref.consrtm) 1165 if ref.authors: 1166 # We store the AUTHORS data as a single string 1167 self._write_multi_line("RA", ref.authors + ";") 1168 if ref.title: 1169 # We store the title as a single string 1170 self._write_multi_line("RT", '"%s";' % ref.title) 1171 if ref.journal: 1172 # We store this as a single string - holds the journal name, 1173 # volume, year, and page numbers of the citation 1174 self._write_multi_line("RL", ref.journal) 1175 self.handle.write("XX\n")
1176
1177 - def _write_comment(self, record):
1178 # This is a bit complicated due to the range of possible 1179 # ways people might have done their annotation... 1180 # Currently the parser uses a single string with newlines. 1181 # A list of lines is also reasonable. 1182 # A single (long) string is perhaps the most natural of all. 1183 # This means we may need to deal with line wrapping. 1184 comment = record.annotations["comment"] 1185 if isinstance(comment, basestring): 1186 lines = comment.split("\n") 1187 elif isinstance(comment, (list, tuple)): 1188 lines = comment 1189 else: 1190 raise ValueError("Could not understand comment annotation") 1191 # TODO - Merge this with the GenBank comment code? 1192 if not lines: 1193 return 1194 for line in lines: 1195 self._write_multi_line("CC", line) 1196 self.handle.write("XX\n")
1197
1198 - def write_record(self, record):
1199 """Write a single record to the output file.""" 1200 handle = self.handle 1201 self._write_the_first_lines(record) 1202 1203 # PR line (0 or 1 lines only), project identifier 1204 # 1205 # Assuming can't use 2 lines, we should prefer newer GenBank 1206 # DBLINK BioProject:... entries over the older GenBank DBLINK 1207 # Project:... lines. 1208 # 1209 # In either case, seems EMBL usess just "PR Project:..." 1210 # regardless of the type of ID (old numeric only, or new 1211 # with alpha prefix), e.g. for CP002497 NCBI now uses: 1212 # 1213 # DBLINK BioProject: PRJNA60715 1214 # BioSample: SAMN03081426 1215 # 1216 # While EMBL uses: 1217 # 1218 # XX 1219 # PR Project:PRJNA60715; 1220 # XX 1221 # 1222 # Sorting ensures (new) BioProject:... is before old Project:... 1223 for xref in sorted(record.dbxrefs): 1224 if xref.startswith("BioProject:"): 1225 self._write_single_line("PR", xref[3:] + ";") 1226 handle.write("XX\n") 1227 break 1228 if xref.startswith("Project:"): 1229 self._write_single_line("PR", xref + ";") 1230 handle.write("XX\n") 1231 break 1232 1233 # TODO - DT lines (date) 1234 1235 descr = record.description 1236 if descr == "<unknown description>": 1237 descr = "." 1238 self._write_multi_line("DE", descr) 1239 handle.write("XX\n") 1240 1241 if "keywords" in record.annotations: 1242 self._write_keywords(record) 1243 1244 # Should this be "source" or "organism"? 1245 self._write_multi_line( 1246 "OS", self._get_annotation_str(record, "organism")) 1247 try: 1248 # List of strings 1249 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1250 except KeyError: 1251 taxonomy = "." 1252 self._write_multi_line("OC", taxonomy) 1253 handle.write("XX\n") 1254 1255 if "references" in record.annotations: 1256 self._write_references(record) 1257 1258 if "comment" in record.annotations: 1259 self._write_comment(record) 1260 1261 handle.write(self.FEATURE_HEADER) 1262 rec_length = len(record) 1263 for feature in record.features: 1264 self._write_feature(feature, rec_length) 1265 handle.write("XX\n") 1266 1267 self._write_sequence(record) 1268 handle.write("//\n")
1269
1270 1271 -class ImgtWriter(EmblWriter):
1272 """IMGT writer (EMBL format variant).""" 1273 1274 HEADER_WIDTH = 5 1275 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1276 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1277 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1278 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n"
1279 1280 1281 if __name__ == "__main__": 1282 from Bio._utils import run_doctest 1283 run_doctest(verbose=0) 1284