Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2016 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  import warnings 
  37  from Bio import BiopythonWarning 
  38   
  39  from Bio.Seq import UnknownSeq 
  40  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  41  from Bio import Alphabet 
  42  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
  43  from Bio import SeqFeature 
  44   
  45  from Bio._py3k import _is_int_or_long 
  46  from Bio._py3k import basestring 
47 48 49 # NOTE 50 # ==== 51 # The "brains" for parsing GenBank, EMBL and IMGT files (and any 52 # other flat file variants from the INSDC in future) is in 53 # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 54 # However, all the writing code is in this file. 55 56 57 -def GenBankIterator(handle):
58 """Breaks up a Genbank file into SeqRecord objects. 59 60 Every section from the LOCUS line to the terminating // becomes 61 a single SeqRecord with associated annotation and features. 62 63 Note that for genomes or chromosomes, there is typically only 64 one record. 65 66 This gets called internally by Bio.SeqIO for the GenBank file format: 67 68 >>> from Bio import SeqIO 69 >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"): 70 ... print(record.id) 71 ... 72 X55053.1 73 X62281.1 74 M81224.1 75 AJ237582.1 76 L31939.1 77 AF297471.1 78 79 Equivalently, 80 81 >>> with open("GenBank/cor6_6.gb") as handle: 82 ... for record in GenBankIterator(handle): 83 ... print(record.id) 84 ... 85 X55053.1 86 X62281.1 87 M81224.1 88 AJ237582.1 89 L31939.1 90 AF297471.1 91 92 """ 93 # This calls a generator function: 94 return GenBankScanner(debug=0).parse_records(handle)
95
96 97 -def EmblIterator(handle):
98 """Breaks up an EMBL file into SeqRecord objects. 99 100 Every section from the LOCUS line to the terminating // becomes 101 a single SeqRecord with associated annotation and features. 102 103 Note that for genomes or chromosomes, there is typically only 104 one record. 105 106 This gets called internally by Bio.SeqIO for the EMBL file format: 107 108 >>> from Bio import SeqIO 109 >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"): 110 ... print(record.id) 111 ... 112 A00022.1 113 A00028.1 114 A00031.1 115 A00034.1 116 A00060.1 117 A00071.1 118 A00072.1 119 A00078.1 120 CQ797900.1 121 122 Equivalently, 123 124 >>> with open("EMBL/epo_prt_selection.embl") as handle: 125 ... for record in EmblIterator(handle): 126 ... print(record.id) 127 ... 128 A00022.1 129 A00028.1 130 A00031.1 131 A00034.1 132 A00060.1 133 A00071.1 134 A00072.1 135 A00078.1 136 CQ797900.1 137 138 """ 139 # This calls a generator function: 140 return EmblScanner(debug=0).parse_records(handle)
141
142 143 -def ImgtIterator(handle):
144 """Breaks up an IMGT file into SeqRecord objects. 145 146 Every section from the LOCUS line to the terminating // becomes 147 a single SeqRecord with associated annotation and features. 148 149 Note that for genomes or chromosomes, there is typically only 150 one record.""" 151 # This calls a generator function: 152 return _ImgtScanner(debug=0).parse_records(handle)
153
154 155 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
156 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 157 158 Every section from the LOCUS line to the terminating // can contain 159 many CDS features. These are returned as with the stated amino acid 160 translation sequence (if given). 161 """ 162 # This calls a generator function: 163 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
164
165 166 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
167 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 168 169 Every section from the LOCUS line to the terminating // can contain 170 many CDS features. These are returned as with the stated amino acid 171 translation sequence (if given). 172 """ 173 # This calls a generator function: 174 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
175
176 177 -def _insdc_feature_position_string(pos, offset=0):
178 """Build a GenBank/EMBL position string (PRIVATE). 179 180 Use offset=1 to add one to convert a start position from python counting. 181 """ 182 if isinstance(pos, SeqFeature.ExactPosition): 183 return "%i" % (pos.position + offset) 184 elif isinstance(pos, SeqFeature.WithinPosition): 185 return "(%i.%i)" % (pos.position + offset, 186 pos.position + pos.extension + offset) 187 elif isinstance(pos, SeqFeature.BetweenPosition): 188 return "(%i^%i)" % (pos.position + offset, 189 pos.position + pos.extension + offset) 190 elif isinstance(pos, SeqFeature.BeforePosition): 191 return "<%i" % (pos.position + offset) 192 elif isinstance(pos, SeqFeature.AfterPosition): 193 return ">%i" % (pos.position + offset) 194 elif isinstance(pos, SeqFeature.OneOfPosition): 195 return "one-of(%s)" \ 196 % ",".join(_insdc_feature_position_string(p, offset) 197 for p in pos.position_choices) 198 elif isinstance(pos, SeqFeature.AbstractPosition): 199 raise NotImplementedError("Please report this as a bug in Biopython.") 200 else: 201 raise ValueError("Expected a SeqFeature position object.")
202
203 204 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
205 if location.ref: 206 ref = "%s:" % location.ref 207 else: 208 ref = "" 209 assert not location.ref_db 210 if isinstance(location.start, SeqFeature.ExactPosition) \ 211 and isinstance(location.end, SeqFeature.ExactPosition) \ 212 and location.start.position == location.end.position: 213 # Special case, for 12:12 return 12^13 214 # (a zero length slice, meaning the point between two letters) 215 if location.end.position == rec_length: 216 # Very special case, for a between position at the end of a 217 # sequence (used on some circular genomes, Bug 3098) we have 218 # N:N so return N^1 219 return "%s%i^1" % (ref, rec_length) 220 else: 221 return "%s%i^%i" % (ref, location.end.position, 222 location.end.position + 1) 223 if isinstance(location.start, SeqFeature.ExactPosition) \ 224 and isinstance(location.end, SeqFeature.ExactPosition) \ 225 and location.start.position + 1 == location.end.position: 226 # Special case, for 11:12 return 12 rather than 12..12 227 # (a length one slice, meaning a single letter) 228 return "%s%i" % (ref, location.end.position) 229 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 230 or isinstance(location.end, SeqFeature.UnknownPosition): 231 # Special case for features from SwissProt/UniProt files 232 if isinstance(location.start, SeqFeature.UnknownPosition) \ 233 and isinstance(location.end, SeqFeature.UnknownPosition): 234 # warnings.warn("Feature with unknown location", BiopythonWarning) 235 # return "?" 236 raise ValueError("Feature with unknown location") 237 elif isinstance(location.start, SeqFeature.UnknownPosition): 238 # Treat the unknown start position as a BeforePosition 239 return "%s<%i..%s" \ 240 % (ref, 241 location.nofuzzy_end, 242 _insdc_feature_position_string(location.end)) 243 else: 244 # Treat the unknown end position as an AfterPosition 245 return "%s%s..>%i" \ 246 % (ref, 247 _insdc_feature_position_string(location.start, +1), 248 location.nofuzzy_start + 1) 249 else: 250 # Typical case, e.g. 12..15 gets mapped to 11:15 251 return ref \ 252 + _insdc_feature_position_string(location.start, +1) \ 253 + ".." + \ 254 _insdc_feature_position_string(location.end)
255
256 257 -def _insdc_location_string(location, rec_length):
258 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 259 260 There is a choice of how to show joins on the reverse complement strand, 261 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 262 "join(complement(20,100),complement(1,10))" instead (but appears to have 263 now adopted the GenBank convention). Notice that the order of the entries 264 is reversed! This function therefore uses the first form. In this situation 265 we expect the CompoundFeatureLocation and its parts to all be marked as 266 strand == -1, and to be in the order 19:100 then 0:10. 267 """ 268 try: 269 parts = location.parts 270 # CompoundFeatureLocation 271 if location.strand == -1: 272 # Special case, put complement outside the join/order/... and reverse order 273 return "complement(%s(%s))" % (location.operator, 274 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 275 for p in parts[::-1])) 276 else: 277 return "%s(%s)" % (location.operator, 278 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 279 except AttributeError: 280 # Simple FeatureLocation 281 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 282 if location.strand == -1: 283 return "complement(%s)" % loc 284 else: 285 return loc
286
287 288 -class _InsdcWriter(SequentialSequenceWriter):
289 """Base class for GenBank and EMBL writers (PRIVATE).""" 290 MAX_WIDTH = 80 291 QUALIFIER_INDENT = 21 292 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 293 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 294 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 295 "direction", "estimated_length", "mod_base", "number", 296 "rpt_type", "rpt_unit_range", "tag_peptide", 297 "transl_except", "transl_table") 298
299 - def _write_feature_qualifier(self, key, value=None, quote=None):
300 if value is None: 301 # Value-less entry like /pseudo 302 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 303 return 304 # Quick hack with no line wrapping, may be useful for testing: 305 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 306 if quote is None: 307 # Try to mimic unwritten rules about when quotes can be left out: 308 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 309 quote = False 310 else: 311 quote = True 312 if quote: 313 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 314 else: 315 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 316 if len(line) <= self.MAX_WIDTH: 317 self.handle.write(line + "\n") 318 return 319 while line.lstrip(): 320 if len(line) <= self.MAX_WIDTH: 321 self.handle.write(line + "\n") 322 return 323 # Insert line break... 324 for index in range(min(len(line) - 1, self.MAX_WIDTH), 325 self.QUALIFIER_INDENT + 1, -1): 326 if line[index] == " ": 327 break 328 if line[index] != " ": 329 # No nice place to break... 330 index = self.MAX_WIDTH 331 assert index <= self.MAX_WIDTH 332 self.handle.write(line[:index] + "\n") 333 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
334
335 - def _wrap_location(self, location):
336 """Split a feature location into lines (break at commas).""" 337 # TODO - Rewrite this not to recurse! 338 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 339 if len(location) <= length: 340 return location 341 index = location[:length].rfind(",") 342 if index == -1: 343 # No good place to split (!) 344 warnings.warn("Couldn't split location:\n%s" % location, 345 BiopythonWarning) 346 return location 347 return location[:index + 1] + "\n" + \ 348 self.QUALIFIER_INDENT_STR + \ 349 self._wrap_location(location[index + 1:])
350
351 - def _write_feature(self, feature, record_length):
352 """Write a single SeqFeature object to features table.""" 353 assert feature.type, feature 354 location = _insdc_location_string(feature.location, record_length) 355 f_type = feature.type.replace(" ", "_") 356 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 357 + self._wrap_location(location) + "\n" 358 self.handle.write(line) 359 # Now the qualifiers... 360 # Note as of Biopython 1.69, this is an ordered-dict, don't sort it: 361 for key, values in feature.qualifiers.items(): 362 if isinstance(values, (list, tuple)): 363 for value in values: 364 self._write_feature_qualifier(key, value) 365 else: 366 # String, int, etc - or None for a /pseudo tpy entry 367 self._write_feature_qualifier(key, values)
368 369 @staticmethod
370 - def _get_annotation_str(record, key, default=".", just_first=False):
371 """Get an annotation dictionary entry (as a string). 372 373 Some entries are lists, in which case if just_first=True the first entry 374 is returned. If just_first=False (default) this verifies there is only 375 one entry before returning it.""" 376 try: 377 answer = record.annotations[key] 378 except KeyError: 379 return default 380 if isinstance(answer, list): 381 if not just_first: 382 assert len(answer) == 1 383 return str(answer[0]) 384 else: 385 return str(answer)
386 387 @staticmethod
388 - def _split_multi_line(text, max_len):
389 """Returns a list of strings. 390 391 Any single words which are too long get returned as a whole line 392 (e.g. URLs) without an exception or warning. 393 """ 394 # TODO - Do the line spliting while preserving white space? 395 text = text.strip() 396 if len(text) <= max_len: 397 return [text] 398 399 words = text.split() 400 text = "" 401 while words and len(text) + 1 + len(words[0]) <= max_len: 402 text += " " + words.pop(0) 403 text = text.strip() 404 # assert len(text) <= max_len 405 answer = [text] 406 while words: 407 text = words.pop(0) 408 while words and len(text) + 1 + len(words[0]) <= max_len: 409 text += " " + words.pop(0) 410 text = text.strip() 411 # assert len(text) <= max_len 412 answer.append(text) 413 assert not words 414 return answer
415
416 - def _split_contig(self, record, max_len):
417 """Returns a list of strings, splits on commas.""" 418 # TODO - Merge this with _write_multi_line method? 419 # It would need the addition of the comma splitting logic... 420 # are there any other cases where that would be sensible? 421 contig = record.annotations.get("contig", "") 422 if isinstance(contig, (list, tuple)): 423 contig = "".join(contig) 424 contig = self.clean(contig) 425 answer = [] 426 while contig: 427 if len(contig) > max_len: 428 # Split lines at the commas 429 pos = contig[:max_len - 1].rfind(",") 430 if pos == -1: 431 raise ValueError("Could not break up CONTIG") 432 text, contig = contig[:pos + 1], contig[pos + 1:] 433 else: 434 text, contig = contig, "" 435 answer.append(text) 436 return answer
437
438 439 -class GenBankWriter(_InsdcWriter):
440 """GenBank writer.""" 441 442 HEADER_WIDTH = 12 443 QUALIFIER_INDENT = 21 444 STRUCTURED_COMMENT_START = "-START##" 445 STRUCTURED_COMMENT_END = "-END##" 446 STRUCTURED_COMMENT_DELIM = " :: " 447 LETTERS_PER_LINE = 60 448 SEQUENCE_INDENT = 9 449
450 - def _write_single_line(self, tag, text):
451 """Used in the 'header' of each GenBank record.""" 452 assert len(tag) < self.HEADER_WIDTH 453 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 454 if tag: 455 warnings.warn("Annotation %r too long for %r line" % (text, tag), 456 BiopythonWarning) 457 else: 458 # Can't give such a precise warning 459 warnings.warn("Annotation %r too long" % text, BiopythonWarning) 460 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 461 text.replace("\n", " ")))
462
463 - def _write_multi_line(self, tag, text):
464 """Used in the 'header' of each GenBank record.""" 465 # TODO - Do the line spliting while preserving white space? 466 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 467 lines = self._split_multi_line(text, max_len) 468 self._write_single_line(tag, lines[0]) 469 for line in lines[1:]: 470 self._write_single_line("", line)
471
472 - def _write_multi_entries(self, tag, text_list):
473 # used for DBLINK and any similar later line types. 474 # If the list of strings is empty, nothing is written. 475 for i, text in enumerate(text_list): 476 if i == 0: 477 self._write_single_line(tag, text) 478 else: 479 self._write_single_line("", text)
480 481 @staticmethod
482 - def _get_date(record):
483 default = "01-JAN-1980" 484 try: 485 date = record.annotations["date"] 486 except KeyError: 487 return default 488 # Cope with a list of one string: 489 if isinstance(date, list) and len(date) == 1: 490 date = date[0] 491 # TODO - allow a Python date object 492 if not isinstance(date, basestring) or len(date) != 11 \ 493 or date[2] != "-" or date[6] != "-" \ 494 or not date[:2].isdigit() or not date[7:].isdigit() \ 495 or int(date[:2]) > 31 \ 496 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 497 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 498 # TODO - Check is a valid date (e.g. not 31 Feb) 499 return default 500 return date
501 502 @staticmethod
503 - def _get_data_division(record):
504 try: 505 division = record.annotations["data_file_division"] 506 except KeyError: 507 division = "UNK" 508 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 509 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 510 "GSS", "HTG", "HTC", "ENV", "CON"]: 511 # Good, already GenBank style 512 # PRI - primate sequences 513 # ROD - rodent sequences 514 # MAM - other mammalian sequences 515 # VRT - other vertebrate sequences 516 # INV - invertebrate sequences 517 # PLN - plant, fungal, and algal sequences 518 # BCT - bacterial sequences [plus archea] 519 # VRL - viral sequences 520 # PHG - bacteriophage sequences 521 # SYN - synthetic sequences 522 # UNA - unannotated sequences 523 # EST - EST sequences (expressed sequence tags) 524 # PAT - patent sequences 525 # STS - STS sequences (sequence tagged sites) 526 # GSS - GSS sequences (genome survey sequences) 527 # HTG - HTGS sequences (high throughput genomic sequences) 528 # HTC - HTC sequences (high throughput cDNA sequences) 529 # ENV - Environmental sampling sequences 530 # CON - Constructed sequences 531 # 532 # (plus UNK for unknown) 533 pass 534 else: 535 # See if this is in EMBL style: 536 # Division Code 537 # ----------------- ---- 538 # Bacteriophage PHG - common 539 # Environmental Sample ENV - common 540 # Fungal FUN - map to PLN (plants + fungal) 541 # Human HUM - map to PRI (primates) 542 # Invertebrate INV - common 543 # Other Mammal MAM - common 544 # Other Vertebrate VRT - common 545 # Mus musculus MUS - map to ROD (rodent) 546 # Plant PLN - common 547 # Prokaryote PRO - map to BCT (poor name) 548 # Other Rodent ROD - common 549 # Synthetic SYN - common 550 # Transgenic TGN - ??? map to SYN ??? 551 # Unclassified UNC - map to UNK 552 # Viral VRL - common 553 # 554 # (plus XXX for submiting which we can map to UNK) 555 embl_to_gbk = {"FUN": "PLN", 556 "HUM": "PRI", 557 "MUS": "ROD", 558 "PRO": "BCT", 559 "UNC": "UNK", 560 "XXX": "UNK", 561 } 562 try: 563 division = embl_to_gbk[division] 564 except KeyError: 565 division = "UNK" 566 assert len(division) == 3 567 return division
568
569 - def _get_topology(self, record):
570 """Set the topology to 'circular', 'linear' if defined""" 571 max_topology_len = len("circular") 572 573 topology = self._get_annotation_str(record, "topology", default="") 574 if topology and len(topology) <= max_topology_len: 575 return topology.ljust(max_topology_len) 576 else: 577 return " " * max_topology_len
578
579 - def _write_the_first_line(self, record):
580 """Write the LOCUS line.""" 581 locus = record.name 582 if not locus or locus == "<unknown name>": 583 locus = record.id 584 if not locus or locus == "<unknown id>": 585 locus = self._get_annotation_str( 586 record, "accession", just_first=True) 587 if len(locus) > 16: 588 if len(locus) + 1 + len(str(len(record))) > 28: 589 # Locus name and record length to long to squeeze in. 590 raise ValueError("Locus identifier %r is too long" % locus) 591 else: 592 warnings.warn("Stealing space from length field to allow long name in LOCUS line", BiopythonWarning) 593 if len(locus.split()) > 1: 594 # locus could be unicode, and u'with space' versus 'with space' 595 # causes trouble with doctest or print-and-compare tests, so 596 tmp = repr(locus) 597 if tmp.startswith("u'") and tmp.endswith("'"): 598 tmp = tmp[1:] 599 raise ValueError("Invalid whitespace in %s for LOCUS line" % tmp) 600 if len(record) > 99999999999: 601 # Currently GenBank only officially support up to 350000, but 602 # the length field can take eleven digits 603 raise ValueError("Sequence too long!") 604 605 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 606 a = Alphabet._get_base_alphabet(record.seq.alphabet) 607 if not isinstance(a, Alphabet.Alphabet): 608 raise TypeError("Invalid alphabet") 609 elif isinstance(a, Alphabet.ProteinAlphabet): 610 units = "aa" 611 elif isinstance(a, Alphabet.NucleotideAlphabet): 612 units = "bp" 613 else: 614 # Must be something like NucleotideAlphabet or 615 # just the generic Alphabet (default for fasta files) 616 raise ValueError("Need a Nucleotide or Protein alphabet") 617 618 # Get the molecule type 619 mol_type = self._get_annotation_str(record, "molecule_type", default=None) 620 if mol_type and len(mol_type) > 7: 621 # Deal with common cases from EMBL to GenBank 622 mol_type = mol_type.replace("unassigned ", "").replace("genomic ", "") 623 if len(mol_type) > 7: 624 warnings.warn("Molecule type %r too long" % mol_type, 625 BiopythonWarning) 626 mol_type = None 627 if mol_type == "protein": 628 mol_type = "" 629 630 if mol_type: 631 pass 632 elif isinstance(a, Alphabet.ProteinAlphabet): 633 mol_type = "" 634 elif isinstance(a, Alphabet.DNAAlphabet): 635 mol_type = "DNA" 636 elif isinstance(a, Alphabet.RNAAlphabet): 637 mol_type = "RNA" 638 else: 639 # Must be something like NucleotideAlphabet or 640 # just the generic Alphabet (default for fasta files) 641 raise ValueError("Need a DNA, RNA or Protein alphabet") 642 643 topology = self._get_topology(record) 644 645 division = self._get_data_division(record) 646 647 name_length = str(len(record)).rjust(28) 648 name_length = locus + name_length[len(locus):] 649 assert len(name_length) == 28, name_length 650 assert " " in name_length, name_length 651 652 assert len(units) == 2 653 assert len(division) == 3 654 line = "LOCUS %s %s %s %s %s %s\n" \ 655 % (name_length, 656 units, 657 mol_type.ljust(7), 658 topology, 659 division, 660 self._get_date(record)) 661 assert len(line) == 79 + 1, repr(line) # plus one for new line 662 663 # We're bending the rules to allow an identifier over 16 characters 664 # if we can steal spaces from the length field: 665 # assert line[12:28].rstrip() == locus, \ 666 # 'LOCUS line does not contain the locus at the expected position:\n' + line 667 # assert line[28:29] == " " 668 # assert line[29:40].lstrip() == str(len(record)), \ 669 # 'LOCUS line does not contain the length at the expected position:\n' + line 670 assert line[12:40].split() == [locus, str(len(record))], line 671 672 # Tests copied from Bio.GenBank.Scanner 673 assert line[40:44] in [' bp ', ' aa '], \ 674 'LOCUS line does not contain size units at expected position:\n' + \ 675 line 676 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 677 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 678 assert line[47:54].strip() == "" \ 679 or 'DNA' in line[47:54].strip() \ 680 or 'RNA' in line[47:54].strip(), \ 681 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 682 assert line[54:55] == ' ', \ 683 'LOCUS line does not contain space at position 55:\n' + line 684 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 685 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 686 assert line[63:64] == ' ', \ 687 'LOCUS line does not contain space at position 64:\n' + line 688 assert line[67:68] == ' ', \ 689 'LOCUS line does not contain space at position 68:\n' + line 690 assert line[70:71] == '-', \ 691 'LOCUS line does not contain - at position 71 in date:\n' + line 692 assert line[74:75] == '-', \ 693 'LOCUS line does not contain - at position 75 in date:\n' + line 694 695 self.handle.write(line)
696
697 - def _write_references(self, record):
698 number = 0 699 for ref in record.annotations["references"]: 700 if not isinstance(ref, SeqFeature.Reference): 701 continue 702 number += 1 703 data = str(number) 704 # TODO - support more complex record reference locations? 705 if ref.location and len(ref.location) == 1: 706 a = Alphabet._get_base_alphabet(record.seq.alphabet) 707 if isinstance(a, Alphabet.ProteinAlphabet): 708 units = "residues" 709 else: 710 units = "bases" 711 data += " (%s %i to %i)" % (units, 712 ref.location[0].nofuzzy_start + 1, 713 ref.location[0].nofuzzy_end) 714 self._write_single_line("REFERENCE", data) 715 if ref.authors: 716 # We store the AUTHORS data as a single string 717 self._write_multi_line(" AUTHORS", ref.authors) 718 if ref.consrtm: 719 # We store the consortium as a single string 720 self._write_multi_line(" CONSRTM", ref.consrtm) 721 if ref.title: 722 # We store the title as a single string 723 self._write_multi_line(" TITLE", ref.title) 724 if ref.journal: 725 # We store this as a single string - holds the journal name, 726 # volume, year, and page numbers of the citation 727 self._write_multi_line(" JOURNAL", ref.journal) 728 if ref.medline_id: 729 # This line type is obsolete and was removed from the GenBank 730 # flatfile format in April 2005. Should we write it? 731 # Note this has a two space indent: 732 self._write_multi_line(" MEDLINE", ref.medline_id) 733 if ref.pubmed_id: 734 # Note this has a THREE space indent: 735 self._write_multi_line(" PUBMED", ref.pubmed_id) 736 if ref.comment: 737 self._write_multi_line(" REMARK", ref.comment)
738
739 - def _write_comment(self, record):
740 # This is a bit complicated due to the range of possible 741 # ways people might have done their annotation... 742 # Currently the parser uses a single string with newlines. 743 # A list of lines is also reasonable. 744 # A single (long) string is perhaps the most natural of all. 745 # This means we may need to deal with line wrapping. 746 lines = [] 747 if "structured_comment" in record.annotations: 748 comment = record.annotations["structured_comment"] 749 # Find max length of keys for equal padded printing 750 padding = 0 751 for key, data in comment.items(): 752 for subkey, subdata in data.items(): 753 padding = len(subkey) if len(subkey) > padding else padding 754 # Construct output 755 for key, data in comment.items(): 756 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_START)) 757 for subkey, subdata in data.items(): 758 spaces = " " * (padding - len(subkey)) 759 lines.append("{0}{1}{2}{3}".format(subkey, spaces, self.STRUCTURED_COMMENT_DELIM, subdata)) 760 lines.append("##{0}{1}".format(key, self.STRUCTURED_COMMENT_END)) 761 if "comment" in record.annotations: 762 comment = record.annotations["comment"] 763 if isinstance(comment, basestring): 764 lines += comment.split("\n") 765 elif isinstance(comment, (list, tuple)): 766 lines += list(comment) 767 else: 768 raise ValueError("Could not understand comment annotation") 769 self._write_multi_line("COMMENT", lines[0]) 770 for line in lines[1:]: 771 self._write_multi_line("", line)
772
773 - def _write_contig(self, record):
774 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 775 lines = self._split_contig(record, max_len) 776 self._write_single_line("CONTIG", lines[0]) 777 for text in lines[1:]: 778 self._write_single_line("", text)
779
780 - def _write_sequence(self, record):
781 # Loosely based on code from Howard Salis 782 # TODO - Force lower case? 783 784 if isinstance(record.seq, UnknownSeq): 785 # We have already recorded the length, and there is no need 786 # to record a long sequence of NNNNNNN...NNN or whatever. 787 if "contig" in record.annotations: 788 self._write_contig(record) 789 else: 790 self.handle.write("ORIGIN\n") 791 return 792 793 # Catches sequence being None: 794 data = self._get_seq_string(record).lower() 795 seq_len = len(data) 796 self.handle.write("ORIGIN\n") 797 for line_number in range(0, seq_len, self.LETTERS_PER_LINE): 798 self.handle.write(str(line_number + 1).rjust(self.SEQUENCE_INDENT)) 799 for words in range(line_number, 800 min(line_number + self.LETTERS_PER_LINE, seq_len), 10): 801 self.handle.write(" %s" % data[words:words + 10]) 802 self.handle.write("\n")
803
804 - def write_record(self, record):
805 """Write a single record to the output file.""" 806 handle = self.handle 807 self._write_the_first_line(record) 808 809 default = record.id 810 if default.count(".") == 1 and default[default.index(".") + 1:].isdigit(): 811 # Good, looks like accesion.version and not something 812 # else like identifier.start-end 813 default = record.id.split(".", 1)[0] 814 accession = self._get_annotation_str(record, "accession", 815 default, 816 just_first=True) 817 acc_with_version = accession 818 if record.id.startswith(accession + "."): 819 try: 820 acc_with_version = "%s.%i" \ 821 % (accession, 822 int(record.id.split(".", 1)[1])) 823 except ValueError: 824 pass 825 gi = self._get_annotation_str(record, "gi", just_first=True) 826 827 descr = record.description 828 if descr == "<unknown description>": 829 descr = "." 830 831 # The DEFINITION field must end with a period 832 # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5] 833 # and discussion https://github.com/biopython/biopython/pull/616 834 # So let's add a period 835 descr += '.' 836 self._write_multi_line("DEFINITION", descr) 837 838 self._write_single_line("ACCESSION", accession) 839 if gi != ".": 840 self._write_single_line("VERSION", "%s GI:%s" 841 % (acc_with_version, gi)) 842 else: 843 self._write_single_line("VERSION", "%s" % acc_with_version) 844 845 # The NCBI initially expected two types of link, 846 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 847 # 848 # This changed and at some point the formatting switched to 849 # include a space after the colon, e.g. 850 # 851 # LOCUS NC_000011 1606 bp DNA linear CON 06-JUN-2016 852 # DEFINITION Homo sapiens chromosome 11, GRCh38.p7 Primary Assembly. 853 # ACCESSION NC_000011 REGION: complement(5225466..5227071) GPC_000001303 854 # VERSION NC_000011.10 GI:568815587 855 # DBLINK BioProject: PRJNA168 856 # Assembly: GCF_000001405.33 857 # ... 858 # 859 # Or, 860 # 861 # LOCUS JU120277 1044 bp mRNA linear TSA 27-NOV-2012 862 # DEFINITION TSA: Tupaia chinensis tbc000002.Tuchadli mRNA sequence. 863 # ACCESSION JU120277 864 # VERSION JU120277.1 GI:379775257 865 # DBLINK BioProject: PRJNA87013 866 # Sequence Read Archive: SRR433859 867 # ... 868 dbxrefs_with_space = [] 869 for x in record.dbxrefs: 870 if ": " not in x: 871 x = x.replace(":", ": ") 872 dbxrefs_with_space.append(x) 873 self._write_multi_entries("DBLINK", dbxrefs_with_space) 874 del dbxrefs_with_space 875 876 try: 877 # List of strings 878 # Keywords should be given separated with semi colons, 879 keywords = "; ".join(record.annotations["keywords"]) 880 # with a trailing period: 881 if not keywords.endswith("."): 882 keywords += "." 883 except KeyError: 884 # If no keywords, there should be just a period: 885 keywords = "." 886 self._write_multi_line("KEYWORDS", keywords) 887 888 if "segment" in record.annotations: 889 # Deal with SEGMENT line found only in segmented records, 890 # e.g. AH000819 891 segment = record.annotations["segment"] 892 if isinstance(segment, list): 893 assert len(segment) == 1, segment 894 segment = segment[0] 895 self._write_single_line("SEGMENT", segment) 896 897 self._write_multi_line("SOURCE", 898 self._get_annotation_str(record, "source")) 899 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 900 org = self._get_annotation_str(record, "organism") 901 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 902 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 903 self._write_single_line(" ORGANISM", org) 904 try: 905 # List of strings 906 # Taxonomy should be given separated with semi colons, 907 taxonomy = "; ".join(record.annotations["taxonomy"]) 908 # with a trailing period: 909 if not taxonomy.endswith("."): 910 taxonomy += "." 911 except KeyError: 912 taxonomy = "." 913 self._write_multi_line("", taxonomy) 914 915 if "references" in record.annotations: 916 self._write_references(record) 917 918 if "comment" in record.annotations or "structured_comment" in record.annotations: 919 self._write_comment(record) 920 921 handle.write("FEATURES Location/Qualifiers\n") 922 rec_length = len(record) 923 for feature in record.features: 924 self._write_feature(feature, rec_length) 925 self._write_sequence(record) 926 handle.write("//\n")
927
928 929 -class EmblWriter(_InsdcWriter):
930 """EMBL writer.""" 931 932 HEADER_WIDTH = 5 933 QUALIFIER_INDENT = 21 934 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 935 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 936 # Note second spacer line of just FH is expected: 937 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n" 938 939 LETTERS_PER_BLOCK = 10 940 BLOCKS_PER_LINE = 6 941 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 942 POSITION_PADDING = 10 943
944 - def _write_contig(self, record):
945 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 946 lines = self._split_contig(record, max_len) 947 for text in lines: 948 self._write_single_line("CO", text)
949
950 - def _write_sequence(self, record):
951 handle = self.handle # save looking up this multiple times 952 953 if isinstance(record.seq, UnknownSeq): 954 # We have already recorded the length, and there is no need 955 # to record a long sequence of NNNNNNN...NNN or whatever. 956 if "contig" in record.annotations: 957 self._write_contig(record) 958 else: 959 # TODO - Can the sequence just be left out as in GenBank files? 960 handle.write("SQ \n") 961 return 962 963 # Catches sequence being None 964 data = self._get_seq_string(record).lower() 965 seq_len = len(data) 966 967 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 968 a = Alphabet._get_base_alphabet(record.seq.alphabet) 969 if isinstance(a, Alphabet.DNAAlphabet): 970 # TODO - What if we have RNA? 971 a_count = data.count('A') + data.count('a') 972 c_count = data.count('C') + data.count('c') 973 g_count = data.count('G') + data.count('g') 974 t_count = data.count('T') + data.count('t') 975 other = seq_len - (a_count + c_count + g_count + t_count) 976 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 977 % (seq_len, a_count, c_count, g_count, t_count, other)) 978 else: 979 handle.write("SQ \n") 980 981 for line_number in range(0, seq_len // self.LETTERS_PER_LINE): 982 handle.write(" ") # Just four, not five 983 for block in range(self.BLOCKS_PER_LINE): 984 index = self.LETTERS_PER_LINE * line_number + \ 985 self.LETTERS_PER_BLOCK * block 986 handle.write((" %s" % data[index:index + self.LETTERS_PER_BLOCK])) 987 handle.write(str((line_number + 1) * 988 self.LETTERS_PER_LINE).rjust(self.POSITION_PADDING)) 989 handle.write("\n") 990 if seq_len % self.LETTERS_PER_LINE: 991 # Final (partial) line 992 line_number = (seq_len // self.LETTERS_PER_LINE) 993 handle.write(" ") # Just four, not five 994 for block in range(self.BLOCKS_PER_LINE): 995 index = self.LETTERS_PER_LINE * line_number + \ 996 self.LETTERS_PER_BLOCK * block 997 handle.write( 998 (" %s" % data[index:index + self.LETTERS_PER_BLOCK]).ljust(11)) 999 handle.write(str(seq_len).rjust(self.POSITION_PADDING)) 1000 handle.write("\n")
1001
1002 - def _write_single_line(self, tag, text):
1003 assert len(tag) == 2 1004 line = tag + " " + text 1005 if len(text) > self.MAX_WIDTH: 1006 warnings.warn("Line %r too long" % line, BiopythonWarning) 1007 self.handle.write(line + "\n")
1008
1009 - def _write_multi_line(self, tag, text):
1010 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 1011 lines = self._split_multi_line(text, max_len) 1012 for line in lines: 1013 self._write_single_line(tag, line)
1014
1015 - def _write_the_first_lines(self, record):
1016 """Write the ID and AC lines.""" 1017 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 1018 version = "SV " + record.id.rsplit(".", 1)[1] 1019 accession = self._get_annotation_str(record, "accession", 1020 record.id.rsplit(".", 1)[0], 1021 just_first=True) 1022 else: 1023 version = "" 1024 accession = self._get_annotation_str(record, "accession", 1025 record.id, 1026 just_first=True) 1027 1028 if ";" in accession: 1029 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 1030 % repr(str(accession))) 1031 if " " in accession: 1032 # This is out of practicallity... might it be allowed? 1033 raise ValueError("Cannot have spaces in EMBL accession, %s" 1034 % repr(str(accession))) 1035 1036 topology = self._get_annotation_str(record, "topology", default="") 1037 1038 # Get the molecule type 1039 # TODO - record this explicitly in the parser? 1040 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 1041 a = Alphabet._get_base_alphabet(record.seq.alphabet) 1042 if not isinstance(a, Alphabet.Alphabet): 1043 raise TypeError("Invalid alphabet") 1044 elif isinstance(a, Alphabet.DNAAlphabet): 1045 mol_type = "DNA" 1046 units = "BP" 1047 elif isinstance(a, Alphabet.RNAAlphabet): 1048 mol_type = "RNA" 1049 units = "BP" 1050 elif isinstance(a, Alphabet.ProteinAlphabet): 1051 mol_type = "PROTEIN" 1052 units = "AA" 1053 else: 1054 # Must be something like NucleotideAlphabet 1055 raise ValueError("Need a DNA, RNA or Protein alphabet") 1056 1057 if record.annotations.get("molecule_type", None): 1058 # Note often get RNA vs DNA discrepancy in real EMBL/NCBI files 1059 mol_type = record.annotations["molecule_type"] 1060 if mol_type in ["protein"]: 1061 mol_type = "PROTEIN" 1062 1063 # Get the taxonomy division 1064 division = self._get_data_division(record) 1065 1066 # TODO - Full ID line 1067 handle = self.handle 1068 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 1069 # 1. Primary accession number 1070 # 2. Sequence version number 1071 # 3. Topology: 'circular' or 'linear' 1072 # 4. Molecule type 1073 # 5. Data class 1074 # 6. Taxonomic division 1075 # 7. Sequence length 1076 self._write_single_line("ID", "%s; %s; %s; %s; ; %s; %i %s." 1077 % (accession, version, topology, mol_type, 1078 division, len(record), units)) 1079 handle.write("XX\n") 1080 self._write_single_line("AC", accession + ";") 1081 handle.write("XX\n")
1082 1083 @staticmethod
1084 - def _get_data_division(record):
1085 try: 1086 division = record.annotations["data_file_division"] 1087 except KeyError: 1088 division = "UNC" 1089 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 1090 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 1091 "VRL", "XXX"]: 1092 # Good, already EMBL style 1093 # Division Code 1094 # ----------------- ---- 1095 # Bacteriophage PHG 1096 # Environmental Sample ENV 1097 # Fungal FUN 1098 # Human HUM 1099 # Invertebrate INV 1100 # Other Mammal MAM 1101 # Other Vertebrate VRT 1102 # Mus musculus MUS 1103 # Plant PLN 1104 # Prokaryote PRO 1105 # Other Rodent ROD 1106 # Synthetic SYN 1107 # Transgenic TGN 1108 # Unclassified UNC (i.e. unknown) 1109 # Viral VRL 1110 # 1111 # (plus XXX used for submiting data to EMBL) 1112 pass 1113 else: 1114 # See if this is in GenBank style & can be converted. 1115 # Generally a problem as the GenBank groups are wider 1116 # than those of EMBL. Note that GenBank use "BCT" for 1117 # both bacteria and acherea thus this maps to EMBL's 1118 # "PRO" nicely. 1119 gbk_to_embl = {"BCT": "PRO", 1120 "UNK": "UNC", 1121 } 1122 try: 1123 division = gbk_to_embl[division] 1124 except KeyError: 1125 division = "UNC" 1126 assert len(division) == 3 1127 return division
1128
1129 - def _write_keywords(self, record):
1130 # Put the keywords right after DE line. 1131 # Each 'keyword' can have multiple words and spaces, but we 1132 # must not split any 'keyword' between lines. 1133 # TODO - Combine short keywords onto one line 1134 for keyword in record.annotations["keywords"]: 1135 self._write_single_line("KW", keyword) 1136 self.handle.write("XX\n")
1137
1138 - def _write_references(self, record):
1139 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1140 number = 0 1141 for ref in record.annotations["references"]: 1142 if not isinstance(ref, SeqFeature.Reference): 1143 continue 1144 number += 1 1145 self._write_single_line("RN", "[%i]" % number) 1146 # TODO - support for RC line (needed in parser too) 1147 # TODO - support more complex record reference locations? 1148 if ref.location and len(ref.location) == 1: 1149 self._write_single_line( 1150 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1151 ref.location[0].nofuzzy_end)) 1152 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1153 if ref.pubmed_id: 1154 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1155 if ref.consrtm: 1156 self._write_single_line("RG", "%s" % ref.consrtm) 1157 if ref.authors: 1158 # We store the AUTHORS data as a single string 1159 self._write_multi_line("RA", ref.authors + ";") 1160 if ref.title: 1161 # We store the title as a single string 1162 self._write_multi_line("RT", '"%s";' % ref.title) 1163 if ref.journal: 1164 # We store this as a single string - holds the journal name, 1165 # volume, year, and page numbers of the citation 1166 self._write_multi_line("RL", ref.journal) 1167 self.handle.write("XX\n")
1168
1169 - def _write_comment(self, record):
1170 # This is a bit complicated due to the range of possible 1171 # ways people might have done their annotation... 1172 # Currently the parser uses a single string with newlines. 1173 # A list of lines is also reasonable. 1174 # A single (long) string is perhaps the most natural of all. 1175 # This means we may need to deal with line wrapping. 1176 comment = record.annotations["comment"] 1177 if isinstance(comment, basestring): 1178 lines = comment.split("\n") 1179 elif isinstance(comment, (list, tuple)): 1180 lines = comment 1181 else: 1182 raise ValueError("Could not understand comment annotation") 1183 # TODO - Merge this with the GenBank comment code? 1184 if not lines: 1185 return 1186 for line in lines: 1187 self._write_multi_line("CC", line) 1188 self.handle.write("XX\n")
1189
1190 - def write_record(self, record):
1191 """Write a single record to the output file.""" 1192 handle = self.handle 1193 self._write_the_first_lines(record) 1194 1195 # PR line (0 or 1 lines only), project identifier 1196 # 1197 # Assuming can't use 2 lines, we should prefer newer GenBank 1198 # DBLINK BioProject:... entries over the older GenBank DBLINK 1199 # Project:... lines. 1200 # 1201 # In either case, seems EMBL usess just "PR Project:..." 1202 # regardless of the type of ID (old numeric only, or new 1203 # with alpha prefix), e.g. for CP002497 NCBI now uses: 1204 # 1205 # DBLINK BioProject: PRJNA60715 1206 # BioSample: SAMN03081426 1207 # 1208 # While EMBL uses: 1209 # 1210 # XX 1211 # PR Project:PRJNA60715; 1212 # XX 1213 # 1214 # Sorting ensures (new) BioProject:... is before old Project:... 1215 for xref in sorted(record.dbxrefs): 1216 if xref.startswith("BioProject:"): 1217 self._write_single_line("PR", xref[3:] + ";") 1218 handle.write("XX\n") 1219 break 1220 if xref.startswith("Project:"): 1221 self._write_single_line("PR", xref + ";") 1222 handle.write("XX\n") 1223 break 1224 1225 # TODO - DT lines (date) 1226 1227 descr = record.description 1228 if descr == "<unknown description>": 1229 descr = "." 1230 self._write_multi_line("DE", descr) 1231 handle.write("XX\n") 1232 1233 if "keywords" in record.annotations: 1234 self._write_keywords(record) 1235 1236 # Should this be "source" or "organism"? 1237 self._write_multi_line( 1238 "OS", self._get_annotation_str(record, "organism")) 1239 try: 1240 # List of strings 1241 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1242 except KeyError: 1243 taxonomy = "." 1244 self._write_multi_line("OC", taxonomy) 1245 handle.write("XX\n") 1246 1247 if "references" in record.annotations: 1248 self._write_references(record) 1249 1250 if "comment" in record.annotations: 1251 self._write_comment(record) 1252 1253 handle.write(self.FEATURE_HEADER) 1254 rec_length = len(record) 1255 for feature in record.features: 1256 self._write_feature(feature, rec_length) 1257 handle.write("XX\n") 1258 1259 self._write_sequence(record) 1260 handle.write("//\n")
1261
1262 1263 -class ImgtWriter(EmblWriter):
1264 """IMGT writer (EMBL format variant).""" 1265 1266 HEADER_WIDTH = 5 1267 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1268 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1269 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1270 FEATURE_HEADER = "FH Key Location/Qualifiers\nFH\n"
1271 1272 if __name__ == "__main__": 1273 from Bio._utils import run_doctest 1274 run_doctest(verbose=0) 1275