Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2016 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See Also: 
  14  International Nucleotide Sequence Database Collaboration 
  15  http://www.insdc.org/ 
  16   
  17  GenBank 
  18  http://www.ncbi.nlm.nih.gov/Genbank/ 
  19   
  20  EMBL Nucleotide Sequence Database 
  21  http://www.ebi.ac.uk/embl/ 
  22   
  23  DDBJ (DNA Data Bank of Japan) 
  24  http://www.ddbj.nig.ac.jp/ 
  25   
  26  IMGT (use a variant of EMBL format with longer feature indents) 
  27  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  28  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  29  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  30   
  31  """ 
  32   
  33  from __future__ import print_function 
  34   
  35  import warnings 
  36  from Bio import BiopythonWarning 
  37   
  38  from Bio.Seq import UnknownSeq 
  39  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  40  from Bio import Alphabet 
  41  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
  42  from Bio import SeqFeature 
  43   
  44  from Bio._py3k import _is_int_or_long 
  45  from Bio._py3k import basestring 
46 47 48 # NOTE 49 # ==== 50 # The "brains" for parsing GenBank, EMBL and IMGT files (and any 51 # other flat file variants from the INSDC in future) is in 52 # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 53 # However, all the writing code is in this file. 54 55 56 -def GenBankIterator(handle):
57 """Breaks up a Genbank file into SeqRecord objects. 58 59 Every section from the LOCUS line to the terminating // becomes 60 a single SeqRecord with associated annotation and features. 61 62 Note that for genomes or chromosomes, there is typically only 63 one record. 64 65 This gets called internally by Bio.SeqIO for the GenBank file format: 66 67 >>> from Bio import SeqIO 68 >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"): 69 ... print(record.id) 70 ... 71 X55053.1 72 X62281.1 73 M81224.1 74 AJ237582.1 75 L31939.1 76 AF297471.1 77 78 Equivalently, 79 80 >>> with open("GenBank/cor6_6.gb") as handle: 81 ... for record in GenBankIterator(handle): 82 ... print(record.id) 83 ... 84 X55053.1 85 X62281.1 86 M81224.1 87 AJ237582.1 88 L31939.1 89 AF297471.1 90 91 """ 92 # This calls a generator function: 93 return GenBankScanner(debug=0).parse_records(handle)
94
95 96 -def EmblIterator(handle):
97 """Breaks up an EMBL file into SeqRecord objects. 98 99 Every section from the LOCUS line to the terminating // becomes 100 a single SeqRecord with associated annotation and features. 101 102 Note that for genomes or chromosomes, there is typically only 103 one record. 104 105 This gets called internally by Bio.SeqIO for the EMBL file format: 106 107 >>> from Bio import SeqIO 108 >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"): 109 ... print(record.id) 110 ... 111 A00022.1 112 A00028.1 113 A00031.1 114 A00034.1 115 A00060.1 116 A00071.1 117 A00072.1 118 A00078.1 119 CQ797900.1 120 121 Equivalently, 122 123 >>> with open("EMBL/epo_prt_selection.embl") as handle: 124 ... for record in EmblIterator(handle): 125 ... print(record.id) 126 ... 127 A00022.1 128 A00028.1 129 A00031.1 130 A00034.1 131 A00060.1 132 A00071.1 133 A00072.1 134 A00078.1 135 CQ797900.1 136 137 """ 138 # This calls a generator function: 139 return EmblScanner(debug=0).parse_records(handle)
140
141 142 -def ImgtIterator(handle):
143 """Breaks up an IMGT file into SeqRecord objects. 144 145 Every section from the LOCUS line to the terminating // becomes 146 a single SeqRecord with associated annotation and features. 147 148 Note that for genomes or chromosomes, there is typically only 149 one record. 150 """ 151 # This calls a generator function: 152 return _ImgtScanner(debug=0).parse_records(handle)
153
154 155 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
156 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 157 158 Every section from the LOCUS line to the terminating // can contain 159 many CDS features. These are returned as with the stated amino acid 160 translation sequence (if given). 161 """ 162 # This calls a generator function: 163 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
164
165 166 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
167 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 168 169 Every section from the LOCUS line to the terminating // can contain 170 many CDS features. These are returned as with the stated amino acid 171 translation sequence (if given). 172 """ 173 # This calls a generator function: 174 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
175
176 177 -def _insdc_feature_position_string(pos, offset=0):
178 """Build a GenBank/EMBL position string (PRIVATE). 179 180 Use offset=1 to add one to convert a start position from python counting. 181 """ 182 if isinstance(pos, SeqFeature.ExactPosition): 183 return "%i" % (pos.position + offset) 184 elif isinstance(pos, SeqFeature.WithinPosition): 185 return "(%i.%i)" % (pos.position + offset, 186 pos.position + pos.extension + offset) 187 elif isinstance(pos, SeqFeature.BetweenPosition): 188 return "(%i^%i)" % (pos.position + offset, 189 pos.position + pos.extension + offset) 190 elif isinstance(pos, SeqFeature.BeforePosition): 191 return "<%i" % (pos.position + offset) 192 elif isinstance(pos, SeqFeature.AfterPosition): 193 return ">%i" % (pos.position + offset) 194 elif isinstance(pos, SeqFeature.OneOfPosition): 195 return "one-of(%s)" \ 196 % ",".join(_insdc_feature_position_string(p, offset) 197 for p in pos.position_choices) 198 elif isinstance(pos, SeqFeature.AbstractPosition): 199 raise NotImplementedError("Please report this as a bug in Biopython.") 200 else: 201 raise ValueError("Expected a SeqFeature position object.")
202
203 204 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
205 if location.ref: 206 ref = "%s:" % location.ref 207 else: 208 ref = "" 209 assert not location.ref_db 210 if isinstance(location.start, SeqFeature.ExactPosition) \ 211 and isinstance(location.end, SeqFeature.ExactPosition) \ 212 and location.start.position == location.end.position: 213 # Special case, for 12:12 return 12^13 214 # (a zero length slice, meaning the point between two letters) 215 if location.end.position == rec_length: 216 # Very special case, for a between position at the end of a 217 # sequence (used on some circular genomes, Bug 3098) we have 218 # N:N so return N^1 219 return "%s%i^1" % (ref, rec_length) 220 else: 221 return "%s%i^%i" % (ref, location.end.position, 222 location.end.position + 1) 223 if isinstance(location.start, SeqFeature.ExactPosition) \ 224 and isinstance(location.end, SeqFeature.ExactPosition) \ 225 and location.start.position + 1 == location.end.position: 226 # Special case, for 11:12 return 12 rather than 12..12 227 # (a length one slice, meaning a single letter) 228 return "%s%i" % (ref, location.end.position) 229 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 230 or isinstance(location.end, SeqFeature.UnknownPosition): 231 # Special case for features from SwissProt/UniProt files 232 if isinstance(location.start, SeqFeature.UnknownPosition) \ 233 and isinstance(location.end, SeqFeature.UnknownPosition): 234 # warnings.warn("Feature with unknown location", BiopythonWarning) 235 # return "?" 236 raise ValueError("Feature with unknown location") 237 elif isinstance(location.start, SeqFeature.UnknownPosition): 238 # Treat the unknown start position as a BeforePosition 239 return "%s<%i..%s" \ 240 % (ref, 241 location.nofuzzy_end, 242 _insdc_feature_position_string(location.end)) 243 else: 244 # Treat the unknown end position as an AfterPosition 245 return "%s%s..>%i" \ 246 % (ref, 247 _insdc_feature_position_string(location.start, +1), 248 location.nofuzzy_start + 1) 249 else: 250 # Typical case, e.g. 12..15 gets mapped to 11:15 251 return ref \ 252 + _insdc_feature_position_string(location.start, +1) \ 253 + ".." + \ 254 _insdc_feature_position_string(location.end)
255
256 257 -def _insdc_location_string(location, rec_length):
258 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 259 260 There is a choice of how to show joins on the reverse complement strand, 261 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 262 "join(complement(20,100),complement(1,10))" instead (but appears to have 263 now adopted the GenBank convention). Notice that the order of the entries 264 is reversed! This function therefore uses the first form. In this situation 265 we expect the CompoundFeatureLocation and its parts to all be marked as 266 strand == -1, and to be in the order 19:100 then 0:10. 267 """ 268 try: 269 parts = location.parts 270 # CompoundFeatureLocation 271 if location.strand == -1: 272 # Special case, put complement outside the join/order/... and reverse order 273 return "complement(%s(%s))" % (location.operator, 274 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 275 for p in parts[::-1])) 276 else: 277 return "%s(%s)" % (location.operator, 278 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 279 except AttributeError: 280 # Simple FeatureLocation 281 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 282 if location.strand == -1: 283 return "complement(%s)" % loc 284 else: 285 return loc
286
287 288 -class _InsdcWriter(SequentialSequenceWriter):
289 """Base class for GenBank and EMBL writers (PRIVATE).""" 290 291 MAX_WIDTH = 80 292 QUALIFIER_INDENT = 21 293 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 294 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 295 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 296 "direction", "estimated_length", "mod_base", "number", 297 "rpt_type", "rpt_unit_range", "tag_peptide", 298 "transl_except", "transl_table") 299
300 - def _write_feature_qualifier(self, key, value=None, quote=None):
301 if value is None: 302 # Value-less entry like /pseudo 303 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 304 return 305 # Quick hack with no line wrapping, may be useful for testing: 306 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 307 if quote is None: 308 # Try to mimic unwritten rules about when quotes can be left out: 309 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 310 quote = False 311 else: 312 quote = True 313 if quote: 314 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 315 else: 316 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 317 if len(line) <= self.MAX_WIDTH: 318 self.handle.write(line + "\n") 319 return 320 while line.lstrip(): 321 if len(line) <= self.MAX_WIDTH: 322 self.handle.write(line + "\n") 323 return 324 # Insert line break... 325 for index in range(min(len(line) - 1, self.MAX_WIDTH), 326 self.QUALIFIER_INDENT + 1, -1): 327 if line[index] == " ": 328 break 329 if line[index] != " ": 330 # No nice place to break... 331 index = self.MAX_WIDTH 332 assert index <= self.MAX_WIDTH 333 self.handle.write(line[:index] + "\n") 334 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
335
336 - def _wrap_location(self, location):
337 """Split a feature location into lines (break at commas).""" 338 # TODO - Rewrite this not to recurse! 339 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 340 if len(location) <= length: 341 return location 342 index = location[:length].rfind(",") 343 if index == -1: 344 # No good place to split (!) 345 warnings.warn("Couldn't split location:\n%s" % location, 346 BiopythonWarning) 347 return location 348 return location[:index + 1] + "\n" + \ 349 self.QUALIFIER_INDENT_STR + \ 350 self._wrap_location(location[index + 1:])
351
352 - def _write_feature(self, feature, record_length):
353 """Write a single SeqFeature object to features table.""" 354 assert feature.type, feature 355 location = _insdc_location_string(feature.location, record_length) 356 f_type = feature.type.replace(" ", "_") 357 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 358 + self._wrap_location(location) + "\n" 359 self.handle.write(line) 360 # Now the qualifiers... 361 # Note as of Biopython 1.69, this is an ordered-dict, don't sort it: 362 for key, values in feature.qualifiers.items(): 363 if isinstance(values, (list, tuple)): 364 for value in values: 365 self._write_feature_qualifier(key, value) 366 else: 367 # String, int, etc - or None for a /pseudo tpy entry 368 self._write_feature_qualifier(key, values)
369 370 @staticmethod
371 - def _get_annotation_str(record, key, default=".", just_first=False):
372 """Get an annotation dictionary entry (as a string). 373 374 Some entries are lists, in which case if just_first=True the first entry 375 is returned. If just_first=False (default) this verifies there is only 376 one entry before returning it. 377 """ 378 try: 379 answer = record.annotations[key] 380 except KeyError: 381 return default 382 if isinstance(answer, list): 383 if not just_first: 384 assert len(answer) == 1 385 return str(answer[0]) 386 else: 387 return str(answer)
388 389 @staticmethod
390 - def _split_multi_line(text, max_len):
391 """Returns a list of strings. 392 393 Any single words which are too long get returned as a whole line 394 (e.g. URLs) without an exception or warning. 395 """ 396 # TODO - Do the line spliting while preserving white space? 397 text = text.strip() 398 if len(text) <= max_len: 399 return [text] 400 401 words = text.split() 402 text = "" 403 while words and len(text) + 1 + len(words[0]) <= max_len: 404 text += " " + words.pop(0) 405 text = text.strip() 406 # assert len(text) <= max_len 407 answer = [text] 408 while words: 409 text = words.pop(0) 410 while words and len(text) + 1 + len(words[0]) <= max_len: 411 text += " " + words.pop(0) 412 text = text.strip() 413 # assert len(text) <= max_len 414 answer.append(text) 415 assert not words 416 return answer
417
418 - def _split_contig(self, record, max_len):
419 """Returns a list of strings, splits on commas.""" 420 # TODO - Merge this with _write_multi_line method? 421 # It would need the addition of the comma splitting logic... 422 # are there any other cases where that would be sensible? 423 contig = record.annotations.get("contig", "") 424 if isinstance(contig, (list, tuple)): 425 contig = "".join(contig) 426 contig = self.clean(contig) 427 answer = [] 428 while contig: 429 if len(contig) > max_len: 430 # Split lines at the commas 431 pos = contig[:max_len - 1].rfind(",") 432 if pos == -1: 433 raise ValueError("Could not break up CONTIG") 434 text, contig = contig[:pos + 1], contig[pos + 1:] 435 else: 436 text, contig = contig, "" 437 answer.append(text) 438 return answer
439
440 441 -class GenBankWriter(_InsdcWriter):
442 """GenBank writer.""" 443 444 HEADER_WIDTH = 12 445 QUALIFIER_INDENT = 21 446 STRUCTURED_COMMENT_START = "-START##" 447 STRUCTURED_COMMENT_END = "-END##" 448 STRUCTURED_COMMENT_DELIM = " :: " 449 LETTERS_PER_LINE = 60 450 SEQUENCE_INDENT = 9 451
452 - def _write_single_line(self, tag, text):
453 """Used in the 'header' of each GenBank record.""" 454 assert len(tag) < self.HEADER_WIDTH 455 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 456 if tag: 457 warnings.warn("Annotation %r too long for %r line" % (text, tag), 458 BiopythonWarning) 459 else: 460 # Can't give such a precise warning 461 warnings.warn("Annotation %r too long" % text, BiopythonWarning) 462 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 463 text.replace("\n", " ")))
464
465 - def _write_multi_line(self, tag, text):
466 """Used in the 'header' of each GenBank record.""" 467 # TODO - Do the line spliting while preserving white space? 468 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 469 lines = self._split_multi_line(text, max_len) 470 self._write_single_line(tag, lines[0]) 471 for line in lines[1:]: 472 self._write_single_line("", line)
473
474 - def _write_multi_entries(self, tag, text_list):
475 # used for DBLINK and any similar later line types. 476 # If the list of strings is empty, nothing is written. 477 for i, text in enumerate(text_list): 478 if i == 0: 479 self._write_single_line(tag, text) 480 else: 481 self._write_single_line("", text)
482 483 @staticmethod
484 - def _get_date(record):
485 default = "01-JAN-1980" 486 try: 487 date = record.annotations["date"] 488 except KeyError: 489 return default 490 # Cope with a list of one string: 491 if isinstance(date, list) and len(date) == 1: 492 date = date[0] 493 # TODO - allow a Python date object 494 if not isinstance(date, basestring) or len(date) != 11 \ 495 or date[2] != "-" or date[6] != "-" \ 496 or not date[:2].isdigit() or not date[7:].isdigit() \ 497 or int(date[:2]) > 31 \ 498 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 499 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 500 # TODO - Check is a valid date (e.g. not 31 Feb) 501 return default 502 return date
503 504 @staticmethod
505 - def _get_data_division(record):
506 try: 507 division = record.annotations["data_file_division"] 508 except KeyError: 509 division = "UNK" 510 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 511 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 512 "GSS", "HTG", "HTC", "ENV", "CON"]: 513 # Good, already GenBank style 514 # PRI - primate sequences 515 # ROD - rodent sequences 516 # MAM - other mammalian sequences 517 # VRT - other vertebrate sequences 518 # INV - invertebrate sequences 519 # PLN - plant, fungal, and algal sequences 520 # BCT - bacterial sequences [plus archea] 521 # VRL - viral sequences 522 # PHG - bacteriophage sequences 523 # SYN - synthetic sequences 524 # UNA - unannotated sequences 525 # EST - EST sequences (expressed sequence tags) 526 # PAT - patent sequences 527 # STS - STS sequences (sequence tagged sites) 528 # GSS - GSS sequences (genome survey sequences) 529 # HTG - HTGS sequences (high throughput genomic sequences) 530 # HTC - HTC sequences (high throughput cDNA sequences) 531 # ENV - Environmental sampling sequences 532 # CON - Constructed sequences 533 # 534 # (plus UNK for unknown) 535 pass 536 else: 537 # See if this is in EMBL style: 538 # Division Code 539 # ----------------- ---- 540 # Bacteriophage PHG - common 541 # Environmental Sample ENV - common 542 # Fungal FUN - map to PLN (plants + fungal) 543 # Human HUM - map to PRI (primates) 544 # Invertebrate INV - common 545 # Other Mammal MAM - common 546 # Other Vertebrate VRT - common 547 # Mus musculus MUS - map to ROD (rodent) 548 # Plant PLN - common 549 # Prokaryote PRO - map to BCT (poor name) 550 # Other Rodent ROD - common 551 # Synthetic SYN - common 552 # Transgenic TGN - ??? map to SYN ??? 553 # Unclassified UNC - map to UNK 554 # Viral VRL - common 555 # 556 # (plus XXX for submiting which we can map to UNK) 557 embl_to_gbk = {"FUN": "PLN", 558 "HUM": "PRI", 559 "MUS": "ROD", 560 "PRO": "BCT", 561 "UNC": "UNK", 562 "XXX": "UNK", 563 } 564 try: 565 division = embl_to_gbk[division] 566 except KeyError: 567 division = "UNK" 568 assert len(division) == 3 569 return division
570
571 - def _get_topology(self, record):
572 """Set the topology to 'circular', 'linear' if defined""" 573 max_topology_len = len("circular") 574 575 topology = self._get_annotation_str(record, "topology", default="") 576 if topology and len(topology) <= max_topology_len: 577 return topology.ljust(max_topology_len) 578 else: 579 return " " * max_topology_len
580
581 - def _write_the_first_line(self, record):
582 """Write the LOCUS line.""" 583 locus = record.name 584 if not locus or locus == "<unknown name>": 585 locus = record.id 586 if not locus or locus == "<unknown id>": 587 locus = self._get_annotation_str( 588 record, "accession", just_first=True) 589 if len(locus) > 16: 590 if len(locus) + 1 + len(str(len(record))) > 28: 591 # Locus name and record length to long to squeeze in. 592 raise ValueError("Locus identifier %r is too long" % locus) 593 else: 594 warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning) 595 if len(locus.split()) > 1: 596 # locus could be unicode, and u'with space' versus 'with space' 597 # causes trouble with doctest or print-and-compare tests, so 598 tmp = repr(locus) 599 if tmp.startswith("u'") and tmp.endswith("'"): 600 tmp = tmp[1:] 601 raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp) 602 if len(record) > 99999999999: 603 # Currently GenBank only officially support up to 350000, but 604 # the length field can take eleven digits 605 raise ValueError("Sequence too long!") 606 607 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 608 a = Alphabet._get_base_alphabet(record.seq.alphabet) 609 if not isinstance(a, Alphabet.Alphabet): 610 raise TypeError("Invalid alphabet") 611 elif isinstance(a, Alphabet.ProteinAlphabet): 612 units = "aa" 613 elif isinstance(a, Alphabet.NucleotideAlphabet): 614 units = "bp" 615 else: 616 # Must be something like NucleotideAlphabet or 617 # just the generic Alphabet (default for fasta files) 618 raise ValueError("Need a Nucleotide or Protein alphabet") 619 620 # Get the molecule type 621 mol_type = self._get_annotation_str(record, "molecule_type", default=None) 622 if mol_type and len(mol_type) > 7: 623 # Deal with common cases from EMBL to GenBank 624 mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "") 625 if len(mol_type) > 7: 626 warnings.warn("Molecule type %r too long" % mol_type, 627 BiopythonWarning) 628 mol_type = None 629 if mol_type == "protein": 630 mol_type = "" 631 632 if mol_type: 633 pass 634 elif isinstance(a, Alphabet.ProteinAlphabet): 635 mol_type = "" 636 elif isinstance(a, Alphabet.DNAAlphabet): 637 mol_type = "DNA" 638 elif isinstance(a, Alphabet.RNAAlphabet): 639 mol_type = "RNA" 640 else: 641 # Must be something like NucleotideAlphabet or 642 # just the generic Alphabet (default for fasta files) 643 raise ValueError("Need a DNA, RNA or Protein alphabet") 644 645 topology = self._get_topology(record) 646 647 division = self._get_data_division(record) 648 649 name_length = str(len(record)).rjust(28) 650 name_length = locus + name_length[len(locus):] 651 assert len(name_length) == 28, name_length 652 assert " " in name_length, name_length 653 654 assert len(units) == 2 655 assert len(division) == 3 656 line = "LOCUS %s %s %s %s %s %s\n" \ 657 % (name_length, 658 units, 659 mol_type.ljust(7), 660 topology, 661 division, 662 self._get_date(record)) 663 assert len(line) == 79 + 1, repr(line) # plus one for new line 664 665 # We're bending the rules to allow an identifier over 16 characters 666 # if we can steal spaces from the length field: 667 # assert line[12:28].rstrip() == locus, \ 668 # 'LOCUS line does not contain the locus at the expected position:\n' + line 669 # assert line[28:29] == " " 670 # assert line[29:40].lstrip() == str(len(record)), \ 671 # 'LOCUS line does not contain the length at the expected position:\n' + line 672 assert line[12:40].split() == [locus, str(len(record))], line 673 674 # Tests copied from Bio.GenBank.Scanner 675 assert line[40:44] in [' bp ', ' aa '], \ 676 'LOCUS line does not contain size units at expected position:\n' + \ 677 line 678 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 679 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 680 assert line[47:54].strip() == "" \ 681 or 'DNA' in line[47:54].strip() \ 682 or 'RNA' in line[47:54].strip(), \ 683 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 684 assert line[54:55] == ' ', \ 685 'LOCUS line does not contain space at position 55:\n' + line 686 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 687 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 688 assert line[63:64] == ' ', \ 689 'LOCUS line does not contain space at position 64:\n' + line 690 assert line[67:68] == ' ', \ 691 'LOCUS line does not contain space at position 68:\n' + line 692 assert line[70:71] == '-', \ 693 'LOCUS line does not contain - at position 71 in date:\n' + line 694 assert line[74:75] == '-', \ 695 'LOCUS line does not contain - at position 75 in date:\n' + line 696 697 self.handle.write(line)
698
699 - def _write_references(self, record):
700 number = 0 701 for ref in record.annotations["references"]: 702 if not isinstance(ref, SeqFeature.Reference): 703 continue 704 number += 1 705 data = str(number) 706 # TODO - support more complex record reference locations? 707 if ref.location and len(ref.location) == 1: 708 a = Alphabet._get_base_alphabet(record.seq.alphabet) 709 if isinstance(a, Alphabet.ProteinAlphabet): 710 units = "residues" 711 else: 712 units = "bases" 713 data += " (%s %i to %i)" % (units, 714 ref.location[0].nofuzzy_start + 1, 715 ref.location[0].nofuzzy_end) 716 self._write_single_line("REFERENCE", data) 717 if ref.authors: 718 # We store the AUTHORS data as a single string 719 self._write_multi_line(" AUTHORS", ref.authors) 720 if ref.consrtm: 721 # We store the consortium as a single string 722 self._write_multi_line(" CONSRTM", ref.consrtm) 723 if ref.title: 724 # We store the title as a single string 725 self._write_multi_line(" TITLE", ref.title) 726 if ref.journal: 727 # We store this as a single string - holds the journal name, 728 # volume, year, and page numbers of the citation 729 self._write_multi_line(" JOURNAL", ref.journal) 730 if ref.medline_id: 731 # This line type is obsolete and was removed from the GenBank 732 # flatfile format in April 2005. Should we write it? 733 # Note this has a two space indent: 734 self._write_multi_line(" MEDLINE", ref.medline_id) 735 if ref.pubmed_id: 736 # Note this has a THREE space indent: 737 self._write_multi_line(" PUBMED", ref.pubmed_id) 738 if ref.comment: 739 self._write_multi_line(" REMARK", ref.comment)
740
741 - def _write_comment(self, record):
742 # This is a bit complicated due to the range of possible 743 # ways people might have done their annotation... 744 # Currently the parser uses a single string with newlines. 745 # A list of lines is also reasonable. 746 # A single (long) string is perhaps the most natural of all. 747 # This means we may need to deal with line wrapping. 748 lines = [] 749 if "structured_comment" in record.annotations: 750 comment = record.annotations["structured_comment"] 751 # Find max length of keys for equal padded printing 752 padding = 0 753 for key, data in comment.items(): 754 for subkey, subdata in data.items(): 755 padding = len(subkey) if len(subkey) > padding else padding 756 # Construct output 757 for key, data in comment.items(): 758 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_START)) 759 for subkey, subdata in data.items(): 760 spaces = " " * (padding - len(subkey)) 761 lines.append("{0}{1}{2}{3}".format(subkey, spaces, self.STRUCTURED_COMMENT_DELIM, subdata)) 762 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_END)) 763 if "comment" in record.annotations: 764 comment = record.annotations["comment"] 765 if isinstance(comment, basestring): 766 lines += comment.split("\n") 767 elif isinstance(comment, (list, tuple)): 768 lines += list(comment) 769 else: 770 raise ValueError("Could not understand comment annotation") 771 self._write_multi_line("COMMENT", lines[0]) 772 for line in lines[1:]: 773 self._write_multi_line("", line)
774
775 - def _write_contig(self, record):
776 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 777 lines = self._split_contig(record, max_len) 778 self._write_single_line("CONTIG", lines[0]) 779 for text in lines[1:]: 780 self._write_single_line("", text)
781
782 - def _write_sequence(self, record):
783 # Loosely based on code from Howard Salis 784 # TODO - Force lower case? 785 786 if isinstance(record.seq, UnknownSeq): 787 # We have already recorded the length, and there is no need 788 # to record a long sequence of NNNNNNN...NNN or whatever. 789 if "contig" in record.annotations: 790 self._write_contig(record) 791 else: 792 self.handle.write("ORIGIN\n") 793 return 794 795 # Catches sequence being None: 796 data = self._get_seq_string(record).lower() 797 seq_len = len(data) 798 self.handle.write("ORIGIN\n") 799 for line_number in range(0, seq_len, self.LETTERS_PER_LINE): 800 self.handle.write(str(line_number + 1).rjust(self.SEQUENCE_INDENT)) 801 for words in range(line_number, 802 min(line_number + self.LETTERS_PER_LINE, seq_len), 10): 803 self.handle.write(" %s" % data[words:words + 10]) 804 self.handle.write("\n")
805
806 - def write_record(self, record):
807 """Write a single record to the output file.""" 808 handle = self.handle 809 self._write_the_first_line(record) 810 811 default = record.id 812 if default.count(".") == 1 and default[default.index(".") + 1:].isdigit(): 813 # Good, looks like accesion.version and not something 814 # else like identifier.start-end 815 default = record.id.split(".", 1)[0] 816 accession = self._get_annotation_str(record, "accession", 817 default, 818 just_first=True) 819 acc_with_version = accession 820 if record.id.startswith(accession + "."): 821 try: 822 acc_with_version = "%s.%i" \ 823 % (accession, 824 int(record.id.split(".", 1)[1])) 825 except ValueError: 826 pass 827 gi = self._get_annotation_str(record, "gi", just_first=True) 828 829 descr = record.description 830 if descr == "<unknown description>": 831 descr = "." 832 833 # The DEFINITION field must end with a period 834 # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5] 835 # and discussion https://github.com/biopython/biopython/pull/616 836 # So let's add a period 837 descr += '.' 838 self._write_multi_line("DEFINITION", descr) 839 840 self._write_single_line("ACCESSION", accession) 841 if gi != ".": 842 self._write_single_line("VERSION", "%s GI:%s" 843 % (acc_with_version, gi)) 844 else: 845 self._write_single_line("VERSION", "%s" % acc_with_version) 846 847 # The NCBI initially expected two types of link, 848 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 849 # 850 # This changed and at some point the formatting switched to 851 # include a space after the colon, e.g. 852 # 853 # LOCUS NC_000011 1606 bp DNA linear CON 06-JUN-2016 854 # DEFINITION Homo sapiens chromosome 11, GRCh38.p7 Primary Assembly. 855 # ACCESSION NC_000011 REGION: complement(5225466..5227071) GPC_000001303 856 # VERSION NC_000011.10 GI:568815587 857 # DBLINK BioProject: PRJNA168 858 # Assembly: GCF_000001405.33 859 # ... 860 # 861 # Or, 862 # 863 # LOCUS JU120277 1044 bp mRNA linear TSA 27-NOV-2012 864 # DEFINITION TSA: Tupaia chinensis tbc000002.Tuchadli mRNA sequence. 865 # ACCESSION JU120277 866 # VERSION JU120277.1 GI:379775257 867 # DBLINK BioProject: PRJNA87013 868 # Sequence Read Archive: SRR433859 869 # ... 870 dbxrefs_with_space = [] 871 for x in record.dbxrefs: 872 if ": " not in x: 873 x = x.replace(":", ": ") 874 dbxrefs_with_space.append(x) 875 self._write_multi_entries("DBLINK", dbxrefs_with_space) 876 del dbxrefs_with_space 877 878 try: 879 # List of strings 880 # Keywords should be given separated with semi colons, 881 keywords = "; ".join(record.annotations["keywords"]) 882 # with a trailing period: 883 if not keywords.endswith("."): 884 keywords += "." 885 except KeyError: 886 # If no keywords, there should be just a period: 887 keywords = "." 888 self._write_multi_line("KEYWORDS", keywords) 889 890 if "segment" in record.annotations: 891 # Deal with SEGMENT line found only in segmented records, 892 # e.g. AH000819 893 segment = record.annotations["segment"] 894 if isinstance(segment, list): 895 assert len(segment) == 1, segment 896 segment = segment[0] 897 self._write_single_line("SEGMENT", segment) 898 899 self._write_multi_line("SOURCE", 900 self._get_annotation_str(record, "source")) 901 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 902 org = self._get_annotation_str(record, "organism") 903 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 904 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 905 self._write_single_line(" ORGANISM", org) 906 try: 907 # List of strings 908 # Taxonomy should be given separated with semi colons, 909 taxonomy = "; ".join(record.annotations["taxonomy"]) 910 # with a trailing period: 911 if not taxonomy.endswith("."): 912 taxonomy += "." 913 except KeyError: 914 taxonomy = "." 915 self._write_multi_line("", taxonomy) 916 917 if "references" in record.annotations: 918 self._write_references(record) 919 920 if "comment" in record.annotations or "structured_comment" in record.annotations: 921 self._write_comment(record) 922 923 handle.write("FEATURES Location/Qualifiers\n") 924 rec_length = len(record) 925 for feature in record.features: 926 self._write_feature(feature, rec_length) 927 self._write_sequence(record) 928 handle.write("//\n")
929
930 931 -class EmblWriter(_InsdcWriter):
932 """EMBL writer.""" 933 934 HEADER_WIDTH = 5 935 QUALIFIER_INDENT = 21 936 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 937 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 938 # Note second spacer line of just FH is expected: 939 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n" 940 941 LETTERS_PER_BLOCK = 10 942 BLOCKS_PER_LINE = 6 943 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 944 POSITION_PADDING = 10 945
946 - def _write_contig(self, record):
947 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 948 lines = self._split_contig(record, max_len) 949 for text in lines: 950 self._write_single_line("CO", text)
951
952 - def _write_sequence(self, record):
953 handle = self.handle # save looking up this multiple times 954 955 if isinstance(record.seq, UnknownSeq): 956 # We have already recorded the length, and there is no need 957 # to record a long sequence of NNNNNNN...NNN or whatever. 958 if "contig" in record.annotations: 959 self._write_contig(record) 960 else: 961 # TODO - Can the sequence just be left out as in GenBank files? 962 handle.write("SQ \n") 963 return 964 965 # Catches sequence being None 966 data = self._get_seq_string(record).lower() 967 seq_len = len(data) 968 969 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 970 a = Alphabet._get_base_alphabet(record.seq.alphabet) 971 if isinstance(a, Alphabet.DNAAlphabet): 972 # TODO - What if we have RNA? 973 a_count = data.count('A') + data.count('a') 974 c_count = data.count('C') + data.count('c') 975 g_count = data.count('G') + data.count('g') 976 t_count = data.count('T') + data.count('t') 977 other = seq_len - (a_count + c_count + g_count + t_count) 978 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 979 % (seq_len, a_count, c_count, g_count, t_count, other)) 980 else: 981 handle.write("SQ \n") 982 983 for line_number in range(0, seq_len // self.LETTERS_PER_LINE): 984 handle.write(" ") # Just four, not five 985 for block in range(self.BLOCKS_PER_LINE): 986 index = self.LETTERS_PER_LINE * line_number + \ 987 self.LETTERS_PER_BLOCK * block 988 handle.write((" %s" % data[index:index + self.LETTERS_PER_BLOCK])) 989 handle.write(str((line_number + 1) * 990 self.LETTERS_PER_LINE).rjust(self.POSITION_PADDING)) 991 handle.write("\n") 992 if seq_len % self.LETTERS_PER_LINE: 993 # Final (partial) line 994 line_number = (seq_len // self.LETTERS_PER_LINE) 995 handle.write(" ") # Just four, not five 996 for block in range(self.BLOCKS_PER_LINE): 997 index = self.LETTERS_PER_LINE * line_number + \ 998 self.LETTERS_PER_BLOCK * block 999 handle.write( 1000 (" %s" % data[index:index + self.LETTERS_PER_BLOCK]).ljust(11)) 1001 handle.write(str(seq_len).rjust(self.POSITION_PADDING)) 1002 handle.write("\n")
1003
1004 - def _write_single_line(self, tag, text):
1005 assert len(tag) == 2 1006 line = tag + " " + text 1007 if len(text) > self.MAX_WIDTH: 1008 warnings.warn("Line %r too long" % line, BiopythonWarning) 1009 self.handle.write(line + "\n")
1010
1011 - def _write_multi_line(self, tag, text):
1012 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 1013 lines = self._split_multi_line(text, max_len) 1014 for line in lines: 1015 self._write_single_line(tag, line)
1016
1017 - def _write_the_first_lines(self, record):
1018 """Write the ID and AC lines.""" 1019 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 1020 version = "SV " + record.id.rsplit(".", 1)[1] 1021 accession = self._get_annotation_str(record, "accession", 1022 record.id.rsplit(".", 1)[0], 1023 just_first=True) 1024 else: 1025 version = "" 1026 accession = self._get_annotation_str(record, "accession", 1027 record.id, 1028 just_first=True) 1029 1030 if ";" in accession: 1031 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 1032 % repr(str(accession))) 1033 if " " in accession: 1034 # This is out of practicallity... might it be allowed? 1035 raise ValueError("Cannot have spaces in EMBL accession, %s" 1036 % repr(str(accession))) 1037 1038 topology = self._get_annotation_str(record, "topology", default="") 1039 1040 # Get the molecule type 1041 # TODO - record this explicitly in the parser? 1042 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 1043 a = Alphabet._get_base_alphabet(record.seq.alphabet) 1044 if not isinstance(a, Alphabet.Alphabet): 1045 raise TypeError("Invalid alphabet") 1046 elif isinstance(a, Alphabet.DNAAlphabet): 1047 mol_type = "DNA" 1048 units = "BP" 1049 elif isinstance(a, Alphabet.RNAAlphabet): 1050 mol_type = "RNA" 1051 units = "BP" 1052 elif isinstance(a, Alphabet.ProteinAlphabet): 1053 mol_type = "PROTEIN" 1054 units = "AA" 1055 else: 1056 # Must be something like NucleotideAlphabet 1057 raise ValueError("Need a DNA, RNA or Protein alphabet") 1058 1059 if record.annotations.get("molecule_type", None): 1060 # Note often get RNA vs DNA discrepancy in real EMBL/NCBI files 1061 mol_type = record.annotations["molecule_type"] 1062 if mol_type in ["protein"]: 1063 mol_type = "PROTEIN" 1064 1065 # Get the taxonomy division 1066 division = self._get_data_division(record) 1067 1068 # TODO - Full ID line 1069 handle = self.handle 1070 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 1071 # 1. Primary accession number 1072 # 2. Sequence version number 1073 # 3. Topology: 'circular' or 'linear' 1074 # 4. Molecule type 1075 # 5. Data class 1076 # 6. Taxonomic division 1077 # 7. Sequence length 1078 self._write_single_line("ID", "%s; %s; %s; %s; ; %s; %i %s." 1079 % (accession, version, topology, mol_type, 1080 division, len(record), units)) 1081 handle.write("XX\n") 1082 self._write_single_line("AC", accession + ";") 1083 handle.write("XX\n")
1084 1085 @staticmethod
1086 - def _get_data_division(record):
1087 try: 1088 division = record.annotations["data_file_division"] 1089 except KeyError: 1090 division = "UNC" 1091 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 1092 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 1093 "VRL", "XXX"]: 1094 # Good, already EMBL style 1095 # Division Code 1096 # ----------------- ---- 1097 # Bacteriophage PHG 1098 # Environmental Sample ENV 1099 # Fungal FUN 1100 # Human HUM 1101 # Invertebrate INV 1102 # Other Mammal MAM 1103 # Other Vertebrate VRT 1104 # Mus musculus MUS 1105 # Plant PLN 1106 # Prokaryote PRO 1107 # Other Rodent ROD 1108 # Synthetic SYN 1109 # Transgenic TGN 1110 # Unclassified UNC (i.e. unknown) 1111 # Viral VRL 1112 # 1113 # (plus XXX used for submiting data to EMBL) 1114 pass 1115 else: 1116 # See if this is in GenBank style & can be converted. 1117 # Generally a problem as the GenBank groups are wider 1118 # than those of EMBL. Note that GenBank use "BCT" for 1119 # both bacteria and acherea thus this maps to EMBL's 1120 # "PRO" nicely. 1121 gbk_to_embl = {"BCT": "PRO", 1122 "UNK": "UNC", 1123 } 1124 try: 1125 division = gbk_to_embl[division] 1126 except KeyError: 1127 division = "UNC" 1128 assert len(division) == 3 1129 return division
1130
1131 - def _write_keywords(self, record):
1132 # Put the keywords right after DE line. 1133 # Each 'keyword' can have multiple words and spaces, but we 1134 # must not split any 'keyword' between lines. 1135 # TODO - Combine short keywords onto one line 1136 for keyword in record.annotations["keywords"]: 1137 self._write_single_line("KW", keyword) 1138 self.handle.write("XX\n")
1139
1140 - def _write_references(self, record):
1141 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1142 number = 0 1143 for ref in record.annotations["references"]: 1144 if not isinstance(ref, SeqFeature.Reference): 1145 continue 1146 number += 1 1147 self._write_single_line("RN", "[%i]" % number) 1148 # TODO - support for RC line (needed in parser too) 1149 # TODO - support more complex record reference locations? 1150 if ref.location and len(ref.location) == 1: 1151 self._write_single_line( 1152 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1153 ref.location[0].nofuzzy_end)) 1154 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1155 if ref.pubmed_id: 1156 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1157 if ref.consrtm: 1158 self._write_single_line("RG", "%s" % ref.consrtm) 1159 if ref.authors: 1160 # We store the AUTHORS data as a single string 1161 self._write_multi_line("RA", ref.authors + ";") 1162 if ref.title: 1163 # We store the title as a single string 1164 self._write_multi_line("RT", '"%s";' % ref.title) 1165 if ref.journal: 1166 # We store this as a single string - holds the journal name, 1167 # volume, year, and page numbers of the citation 1168 self._write_multi_line("RL", ref.journal) 1169 self.handle.write("XX\n")
1170
1171 - def _write_comment(self, record):
1172 # This is a bit complicated due to the range of possible 1173 # ways people might have done their annotation... 1174 # Currently the parser uses a single string with newlines. 1175 # A list of lines is also reasonable. 1176 # A single (long) string is perhaps the most natural of all. 1177 # This means we may need to deal with line wrapping. 1178 comment = record.annotations["comment"] 1179 if isinstance(comment, basestring): 1180 lines = comment.split("\n") 1181 elif isinstance(comment, (list, tuple)): 1182 lines = comment 1183 else: 1184 raise ValueError("Could not understand comment annotation") 1185 # TODO - Merge this with the GenBank comment code? 1186 if not lines: 1187 return 1188 for line in lines: 1189 self._write_multi_line("CC", line) 1190 self.handle.write("XX\n")
1191
1192 - def write_record(self, record):
1193 """Write a single record to the output file.""" 1194 handle = self.handle 1195 self._write_the_first_lines(record) 1196 1197 # PR line (0 or 1 lines only), project identifier 1198 # 1199 # Assuming can't use 2 lines, we should prefer newer GenBank 1200 # DBLINK BioProject:... entries over the older GenBank DBLINK 1201 # Project:... lines. 1202 # 1203 # In either case, seems EMBL usess just "PR Project:..." 1204 # regardless of the type of ID (old numeric only, or new 1205 # with alpha prefix), e.g. for CP002497 NCBI now uses: 1206 # 1207 # DBLINK BioProject: PRJNA60715 1208 # BioSample: SAMN03081426 1209 # 1210 # While EMBL uses: 1211 # 1212 # XX 1213 # PR Project:PRJNA60715; 1214 # XX 1215 # 1216 # Sorting ensures (new) BioProject:... is before old Project:... 1217 for xref in sorted(record.dbxrefs): 1218 if xref.startswith("BioProject:"): 1219 self._write_single_line("PR", xref[3:] + ";") 1220 handle.write("XX\n") 1221 break 1222 if xref.startswith("Project:"): 1223 self._write_single_line("PR", xref + ";") 1224 handle.write("XX\n") 1225 break 1226 1227 # TODO - DT lines (date) 1228 1229 descr = record.description 1230 if descr == "<unknown description>": 1231 descr = "." 1232 self._write_multi_line("DE", descr) 1233 handle.write("XX\n") 1234 1235 if "keywords" in record.annotations: 1236 self._write_keywords(record) 1237 1238 # Should this be "source" or "organism"? 1239 self._write_multi_line( 1240 "OS", self._get_annotation_str(record, "organism")) 1241 try: 1242 # List of strings 1243 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1244 except KeyError: 1245 taxonomy = "." 1246 self._write_multi_line("OC", taxonomy) 1247 handle.write("XX\n") 1248 1249 if "references" in record.annotations: 1250 self._write_references(record) 1251 1252 if "comment" in record.annotations: 1253 self._write_comment(record) 1254 1255 handle.write(self.FEATURE_HEADER) 1256 rec_length = len(record) 1257 for feature in record.features: 1258 self._write_feature(feature, rec_length) 1259 handle.write("XX\n") 1260 1261 self._write_sequence(record) 1262 handle.write("//\n")
1263
1264 1265 -class ImgtWriter(EmblWriter):
1266 """IMGT writer (EMBL format variant).""" 1267 1268 HEADER_WIDTH = 5 1269 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1270 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1271 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1272 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n"
1273 1274 1275 if __name__ == "__main__": 1276 from Bio._utils import run_doctest 1277 run_doctest(verbose=0) 1278