Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2016 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  import warnings 
  37  from Bio import BiopythonWarning 
  38   
  39  from Bio.Seq import UnknownSeq 
  40  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  41  from Bio import Alphabet 
  42  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
  43  from Bio import SeqFeature 
  44   
  45  from Bio._py3k import _is_int_or_long 
  46  from Bio._py3k import basestring 
  47   
  48   
  49  # NOTE 
  50  # ==== 
  51  # The "brains" for parsing GenBank, EMBL and IMGT files (and any 
  52  # other flat file variants from the INSDC in future) is in 
  53  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
  54  # However, all the writing code is in this file. 
  55   
  56   
57 -def GenBankIterator(handle):
58 """Breaks up a Genbank file into SeqRecord objects. 59 60 Every section from the LOCUS line to the terminating // becomes 61 a single SeqRecord with associated annotation and features. 62 63 Note that for genomes or chromosomes, there is typically only 64 one record. 65 66 This gets called internally by Bio.SeqIO for the GenBank file format: 67 68 >>> from Bio import SeqIO 69 >>> for record in SeqIO.parse("GenBank/cor6_6.gb", "gb"): 70 ... print(record.id) 71 ... 72 X55053.1 73 X62281.1 74 M81224.1 75 AJ237582.1 76 L31939.1 77 AF297471.1 78 79 Equivalently, 80 81 >>> with open("GenBank/cor6_6.gb") as handle: 82 ... for record in GenBankIterator(handle): 83 ... print(record.id) 84 ... 85 X55053.1 86 X62281.1 87 M81224.1 88 AJ237582.1 89 L31939.1 90 AF297471.1 91 92 """ 93 # This calls a generator function: 94 return GenBankScanner(debug=0).parse_records(handle)
95 96
97 -def EmblIterator(handle):
98 """Breaks up an EMBL file into SeqRecord objects. 99 100 Every section from the LOCUS line to the terminating // becomes 101 a single SeqRecord with associated annotation and features. 102 103 Note that for genomes or chromosomes, there is typically only 104 one record. 105 106 This gets called internally by Bio.SeqIO for the EMBL file format: 107 108 >>> from Bio import SeqIO 109 >>> for record in SeqIO.parse("EMBL/epo_prt_selection.embl", "embl"): 110 ... print(record.id) 111 ... 112 A00022.1 113 A00028.1 114 A00031.1 115 A00034.1 116 A00060.1 117 A00071.1 118 A00072.1 119 A00078.1 120 CQ797900.1 121 122 Equivalently, 123 124 >>> with open("EMBL/epo_prt_selection.embl") as handle: 125 ... for record in EmblIterator(handle): 126 ... print(record.id) 127 ... 128 A00022.1 129 A00028.1 130 A00031.1 131 A00034.1 132 A00060.1 133 A00071.1 134 A00072.1 135 A00078.1 136 CQ797900.1 137 138 """ 139 # This calls a generator function: 140 return EmblScanner(debug=0).parse_records(handle)
141 142
143 -def ImgtIterator(handle):
144 """Breaks up an IMGT file into SeqRecord objects. 145 146 Every section from the LOCUS line to the terminating // becomes 147 a single SeqRecord with associated annotation and features. 148 149 Note that for genomes or chromosomes, there is typically only 150 one record.""" 151 # This calls a generator function: 152 return _ImgtScanner(debug=0).parse_records(handle)
153 154
155 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
156 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 157 158 Every section from the LOCUS line to the terminating // can contain 159 many CDS features. These are returned as with the stated amino acid 160 translation sequence (if given). 161 """ 162 # This calls a generator function: 163 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
164 165
166 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
167 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 168 169 Every section from the LOCUS line to the terminating // can contain 170 many CDS features. These are returned as with the stated amino acid 171 translation sequence (if given). 172 """ 173 # This calls a generator function: 174 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
175 176
177 -def _insdc_feature_position_string(pos, offset=0):
178 """Build a GenBank/EMBL position string (PRIVATE). 179 180 Use offset=1 to add one to convert a start position from python counting. 181 """ 182 if isinstance(pos, SeqFeature.ExactPosition): 183 return "%i" % (pos.position + offset) 184 elif isinstance(pos, SeqFeature.WithinPosition): 185 return "(%i.%i)" % (pos.position + offset, 186 pos.position + pos.extension + offset) 187 elif isinstance(pos, SeqFeature.BetweenPosition): 188 return "(%i^%i)" % (pos.position + offset, 189 pos.position + pos.extension + offset) 190 elif isinstance(pos, SeqFeature.BeforePosition): 191 return "<%i" % (pos.position + offset) 192 elif isinstance(pos, SeqFeature.AfterPosition): 193 return ">%i" % (pos.position + offset) 194 elif isinstance(pos, SeqFeature.OneOfPosition): 195 return "one-of(%s)" \ 196 % ",".join(_insdc_feature_position_string(p, offset) 197 for p in pos.position_choices) 198 elif isinstance(pos, SeqFeature.AbstractPosition): 199 raise NotImplementedError("Please report this as a bug in Biopython.") 200 else: 201 raise ValueError("Expected a SeqFeature position object.")
202 203
204 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
205 if location.ref: 206 ref = "%s:" % location.ref 207 else: 208 ref = "" 209 assert not location.ref_db 210 if isinstance(location.start, SeqFeature.ExactPosition) \ 211 and isinstance(location.end, SeqFeature.ExactPosition) \ 212 and location.start.position == location.end.position: 213 # Special case, for 12:12 return 12^13 214 # (a zero length slice, meaning the point between two letters) 215 if location.end.position == rec_length: 216 # Very special case, for a between position at the end of a 217 # sequence (used on some circular genomes, Bug 3098) we have 218 # N:N so return N^1 219 return "%s%i^1" % (ref, rec_length) 220 else: 221 return "%s%i^%i" % (ref, location.end.position, 222 location.end.position + 1) 223 if isinstance(location.start, SeqFeature.ExactPosition) \ 224 and isinstance(location.end, SeqFeature.ExactPosition) \ 225 and location.start.position + 1 == location.end.position: 226 # Special case, for 11:12 return 12 rather than 12..12 227 # (a length one slice, meaning a single letter) 228 return "%s%i" % (ref, location.end.position) 229 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 230 or isinstance(location.end, SeqFeature.UnknownPosition): 231 # Special case for features from SwissProt/UniProt files 232 if isinstance(location.start, SeqFeature.UnknownPosition) \ 233 and isinstance(location.end, SeqFeature.UnknownPosition): 234 # warnings.warn("Feature with unknown location", BiopythonWarning) 235 # return "?" 236 raise ValueError("Feature with unknown location") 237 elif isinstance(location.start, SeqFeature.UnknownPosition): 238 # Treat the unknown start position as a BeforePosition 239 return "%s<%i..%s" \ 240 % (ref, 241 location.nofuzzy_end, 242 _insdc_feature_position_string(location.end)) 243 else: 244 # Treat the unknown end position as an AfterPosition 245 return "%s%s..>%i" \ 246 % (ref, 247 _insdc_feature_position_string(location.start, +1), 248 location.nofuzzy_start + 1) 249 else: 250 # Typical case, e.g. 12..15 gets mapped to 11:15 251 return ref \ 252 + _insdc_feature_position_string(location.start, +1) \ 253 + ".." + \ 254 _insdc_feature_position_string(location.end)
255 256
257 -def _insdc_location_string(location, rec_length):
258 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 259 260 There is a choice of how to show joins on the reverse complement strand, 261 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 262 "join(complement(20,100),complement(1,10))" instead (but appears to have 263 now adopted the GenBank convention). Notice that the order of the entries 264 is reversed! This function therefore uses the first form. In this situation 265 we expect the CompoundFeatureLocation and its parts to all be marked as 266 strand == -1, and to be in the order 19:100 then 0:10. 267 """ 268 try: 269 parts = location.parts 270 # CompoundFeatureLocation 271 if location.strand == -1: 272 # Special case, put complement outside the join/order/... and reverse order 273 return "complement(%s(%s))" % (location.operator, 274 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 275 for p in parts[::-1])) 276 else: 277 return "%s(%s)" % (location.operator, 278 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 279 except AttributeError: 280 # Simple FeatureLocation 281 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 282 if location.strand == -1: 283 return "complement(%s)" % loc 284 else: 285 return loc
286 287
288 -def _insdc_feature_location_string(feature, rec_length):
289 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE, OBSOLETE). 290 291 There is a choice of how to show joins on the reverse complement strand, 292 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 293 "join(complement(20,100),complement(1,10))" instead (but appears to have 294 now adopted the GenBank convention). Notice that the order of the entries 295 is reversed! This function therefore uses the first form. In this situation 296 we expect the parent feature and the two children to all be marked as 297 strand == -1, and in the order 0:10 then 19:100. 298 299 Also need to consider dual-strand examples like these from the Arabidopsis 300 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 301 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) 302 which is further complicated by a splice: 303 join(complement(69611..69724),139856..140087,140625..140650) 304 305 For this mixed strand feature, the parent SeqFeature should have 306 no strand (either 0 or None) while the child features should have either 307 strand +1 or -1 as appropriate, and be listed in the order given here. 308 """ 309 # Using private variable to avoid deprecation warning 310 if not feature._sub_features: 311 # Non-recursive. 312 # assert feature.location_operator == "", \ 313 # "%s has no subfeatures but location_operator %s" \ 314 # % (repr(feature), feature.location_operator) 315 location = _insdc_location_string_ignoring_strand_and_subfeatures( 316 feature.location, rec_length) 317 if feature.strand == -1: 318 location = "complement(%s)" % location 319 return location 320 # As noted above, treat reverse complement strand features carefully: 321 if feature.strand == -1: 322 for f in feature._sub_features: 323 if f.strand != -1: 324 raise ValueError("Inconsistent strands: %r for parent, %r for child" 325 % (feature.strand, f.strand)) 326 return "complement(%s(%s))" \ 327 % (feature.location_operator, 328 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f.location, rec_length) 329 for f in feature._sub_features)) 330 # if feature.strand == +1: 331 # for f in feature.sub_features: 332 # assert f.strand == +1 333 # This covers typical forward strand features, and also an evil mixed strand: 334 assert feature.location_operator != "" 335 return "%s(%s)" % (feature.location_operator, 336 ",".join(_insdc_feature_location_string(f, rec_length) 337 for f in feature._sub_features))
338 339
340 -class _InsdcWriter(SequentialSequenceWriter):
341 """Base class for GenBank and EMBL writers (PRIVATE).""" 342 MAX_WIDTH = 80 343 QUALIFIER_INDENT = 21 344 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 345 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 346 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 347 "direction", "estimated_length", "mod_base", "number", 348 "rpt_type", "rpt_unit_range", "tag_peptide", 349 "transl_except", "transl_table") 350
351 - def _write_feature_qualifier(self, key, value=None, quote=None):
352 if value is None: 353 # Value-less entry like /pseudo 354 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 355 return 356 # Quick hack with no line wrapping, may be useful for testing: 357 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 358 if quote is None: 359 # Try to mimic unwritten rules about when quotes can be left out: 360 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 361 quote = False 362 else: 363 quote = True 364 if quote: 365 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 366 else: 367 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 368 if len(line) <= self.MAX_WIDTH: 369 self.handle.write(line + "\n") 370 return 371 while line.lstrip(): 372 if len(line) <= self.MAX_WIDTH: 373 self.handle.write(line + "\n") 374 return 375 # Insert line break... 376 for index in range(min(len(line) - 1, self.MAX_WIDTH), 377 self.QUALIFIER_INDENT + 1, -1): 378 if line[index] == " ": 379 break 380 if line[index] != " ": 381 # No nice place to break... 382 index = self.MAX_WIDTH 383 assert index <= self.MAX_WIDTH 384 self.handle.write(line[:index] + "\n") 385 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
386
387 - def _wrap_location(self, location):
388 """Split a feature location into lines (break at commas).""" 389 # TODO - Rewrite this not to recurse! 390 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 391 if len(location) <= length: 392 return location 393 index = location[:length].rfind(",") 394 if index == -1: 395 # No good place to split (!) 396 warnings.warn("Couldn't split location:\n%s" % location, 397 BiopythonWarning) 398 return location 399 return location[:index + 1] + "\n" + \ 400 self.QUALIFIER_INDENT_STR + \ 401 self._wrap_location(location[index + 1:])
402
403 - def _write_feature(self, feature, record_length):
404 """Write a single SeqFeature object to features table.""" 405 assert feature.type, feature 406 location = _insdc_location_string(feature.location, record_length) 407 f_type = feature.type.replace(" ", "_") 408 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 409 + self._wrap_location(location) + "\n" 410 self.handle.write(line) 411 # Now the qualifiers... 412 for key in sorted(feature.qualifiers.keys()): 413 values = feature.qualifiers[key] 414 if isinstance(values, (list, tuple)): 415 for value in values: 416 self._write_feature_qualifier(key, value) 417 else: 418 # String, int, etc - or None for a /pseudo tpy entry 419 self._write_feature_qualifier(key, values)
420
421 - def _get_annotation_str(self, record, key, default=".", just_first=False):
422 """Get an annotation dictionary entry (as a string). 423 424 Some entries are lists, in which case if just_first=True the first entry 425 is returned. If just_first=False (default) this verifies there is only 426 one entry before returning it.""" 427 try: 428 answer = record.annotations[key] 429 except KeyError: 430 return default 431 if isinstance(answer, list): 432 if not just_first: 433 assert len(answer) == 1 434 return str(answer[0]) 435 else: 436 return str(answer)
437
438 - def _split_multi_line(self, text, max_len):
439 """Returns a list of strings. 440 441 Any single words which are too long get returned as a whole line 442 (e.g. URLs) without an exception or warning. 443 """ 444 # TODO - Do the line spliting while preserving white space? 445 text = text.strip() 446 if len(text) <= max_len: 447 return [text] 448 449 words = text.split() 450 text = "" 451 while words and len(text) + 1 + len(words[0]) <= max_len: 452 text += " " + words.pop(0) 453 text = text.strip() 454 # assert len(text) <= max_len 455 answer = [text] 456 while words: 457 text = words.pop(0) 458 while words and len(text) + 1 + len(words[0]) <= max_len: 459 text += " " + words.pop(0) 460 text = text.strip() 461 # assert len(text) <= max_len 462 answer.append(text) 463 assert not words 464 return answer
465
466 - def _split_contig(self, record, max_len):
467 """Returns a list of strings, splits on commas.""" 468 # TODO - Merge this with _write_multi_line method? 469 # It would need the addition of the comma splitting logic... 470 # are there any other cases where that would be sensible? 471 contig = record.annotations.get("contig", "") 472 if isinstance(contig, (list, tuple)): 473 contig = "".join(contig) 474 contig = self.clean(contig) 475 answer = [] 476 while contig: 477 if len(contig) > max_len: 478 # Split lines at the commas 479 pos = contig[:max_len - 1].rfind(",") 480 if pos == -1: 481 raise ValueError("Could not break up CONTIG") 482 text, contig = contig[:pos + 1], contig[pos + 1:] 483 else: 484 text, contig = contig, "" 485 answer.append(text) 486 return answer
487 488
489 -class GenBankWriter(_InsdcWriter):
490 HEADER_WIDTH = 12 491 QUALIFIER_INDENT = 21 492
493 - def _write_single_line(self, tag, text):
494 """Used in the 'header' of each GenBank record.""" 495 assert len(tag) < self.HEADER_WIDTH 496 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 497 if tag: 498 warnings.warn("Annotation %r too long for %r line" % (text, tag), 499 BiopythonWarning) 500 else: 501 # Can't give such a precise warning 502 warnings.warn("Annotation %r too long" % text, BiopythonWarning) 503 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 504 text.replace("\n", " ")))
505
506 - def _write_multi_line(self, tag, text):
507 """Used in the 'header' of each GenBank record.""" 508 # TODO - Do the line spliting while preserving white space? 509 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 510 lines = self._split_multi_line(text, max_len) 511 self._write_single_line(tag, lines[0]) 512 for line in lines[1:]: 513 self._write_single_line("", line)
514
515 - def _write_multi_entries(self, tag, text_list):
516 # used for DBLINK and any similar later line types. 517 # If the list of strings is empty, nothing is written. 518 for i, text in enumerate(text_list): 519 if i == 0: 520 self._write_single_line(tag, text) 521 else: 522 self._write_single_line("", text)
523
524 - def _get_date(self, record):
525 default = "01-JAN-1980" 526 try: 527 date = record.annotations["date"] 528 except KeyError: 529 return default 530 # Cope with a list of one string: 531 if isinstance(date, list) and len(date) == 1: 532 date = date[0] 533 # TODO - allow a Python date object 534 if not isinstance(date, basestring) or len(date) != 11 \ 535 or date[2] != "-" or date[6] != "-" \ 536 or not date[:2].isdigit() or not date[7:].isdigit() \ 537 or int(date[:2]) > 31 \ 538 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 539 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 540 # TODO - Check is a valid date (e.g. not 31 Feb) 541 return default 542 return date
543
544 - def _get_data_division(self, record):
545 try: 546 division = record.annotations["data_file_division"] 547 except KeyError: 548 division = "UNK" 549 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 550 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 551 "GSS", "HTG", "HTC", "ENV", "CON"]: 552 # Good, already GenBank style 553 # PRI - primate sequences 554 # ROD - rodent sequences 555 # MAM - other mammalian sequences 556 # VRT - other vertebrate sequences 557 # INV - invertebrate sequences 558 # PLN - plant, fungal, and algal sequences 559 # BCT - bacterial sequences [plus archea] 560 # VRL - viral sequences 561 # PHG - bacteriophage sequences 562 # SYN - synthetic sequences 563 # UNA - unannotated sequences 564 # EST - EST sequences (expressed sequence tags) 565 # PAT - patent sequences 566 # STS - STS sequences (sequence tagged sites) 567 # GSS - GSS sequences (genome survey sequences) 568 # HTG - HTGS sequences (high throughput genomic sequences) 569 # HTC - HTC sequences (high throughput cDNA sequences) 570 # ENV - Environmental sampling sequences 571 # CON - Constructed sequences 572 # 573 # (plus UNK for unknown) 574 pass 575 else: 576 # See if this is in EMBL style: 577 # Division Code 578 # ----------------- ---- 579 # Bacteriophage PHG - common 580 # Environmental Sample ENV - common 581 # Fungal FUN - map to PLN (plants + fungal) 582 # Human HUM - map to PRI (primates) 583 # Invertebrate INV - common 584 # Other Mammal MAM - common 585 # Other Vertebrate VRT - common 586 # Mus musculus MUS - map to ROD (rodent) 587 # Plant PLN - common 588 # Prokaryote PRO - map to BCT (poor name) 589 # Other Rodent ROD - common 590 # Synthetic SYN - common 591 # Transgenic TGN - ??? map to SYN ??? 592 # Unclassified UNC - map to UNK 593 # Viral VRL - common 594 # 595 # (plus XXX for submiting which we can map to UNK) 596 embl_to_gbk = {"FUN": "PLN", 597 "HUM": "PRI", 598 "MUS": "ROD", 599 "PRO": "BCT", 600 "UNC": "UNK", 601 "XXX": "UNK", 602 } 603 try: 604 division = embl_to_gbk[division] 605 except KeyError: 606 division = "UNK" 607 assert len(division) == 3 608 return division
609
610 - def _write_the_first_line(self, record):
611 """Write the LOCUS line.""" 612 613 locus = record.name 614 if not locus or locus == "<unknown name>": 615 locus = record.id 616 if not locus or locus == "<unknown id>": 617 locus = self._get_annotation_str( 618 record, "accession", just_first=True) 619 if len(locus) > 16: 620 raise ValueError("Locus identifier %r is too long" % str(locus)) 621 622 if len(record) > 99999999999: 623 # Currently GenBank only officially support up to 350000, but 624 # the length field can take eleven digits 625 raise ValueError("Sequence too long!") 626 627 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 628 a = Alphabet._get_base_alphabet(record.seq.alphabet) 629 if not isinstance(a, Alphabet.Alphabet): 630 raise TypeError("Invalid alphabet") 631 elif isinstance(a, Alphabet.ProteinAlphabet): 632 units = "aa" 633 elif isinstance(a, Alphabet.NucleotideAlphabet): 634 units = "bp" 635 else: 636 # Must be something like NucleotideAlphabet or 637 # just the generic Alphabet (default for fasta files) 638 raise ValueError("Need a Nucleotide or Protein alphabet") 639 640 # Get the molecule type 641 # TODO - record this explicitly in the parser? 642 if isinstance(a, Alphabet.ProteinAlphabet): 643 mol_type = "" 644 elif isinstance(a, Alphabet.DNAAlphabet): 645 mol_type = "DNA" 646 elif isinstance(a, Alphabet.RNAAlphabet): 647 mol_type = "RNA" 648 else: 649 # Must be something like NucleotideAlphabet or 650 # just the generic Alphabet (default for fasta files) 651 raise ValueError("Need a DNA, RNA or Protein alphabet") 652 653 division = self._get_data_division(record) 654 655 assert len(units) == 2 656 assert len(division) == 3 657 # TODO - date 658 # TODO - mol_type 659 line = "LOCUS %s %s %s %s %s %s\n" \ 660 % (locus.ljust(16), 661 str(len(record)).rjust(11), 662 units, 663 mol_type.ljust(6), 664 division, 665 self._get_date(record)) 666 assert len(line) == 79 + 1, repr(line) # plus one for new line 667 668 assert line[12:28].rstrip() == locus, \ 669 'LOCUS line does not contain the locus at the expected position:\n' + line 670 assert line[28:29] == " " 671 assert line[29:40].lstrip() == str(len(record)), \ 672 'LOCUS line does not contain the length at the expected position:\n' + line 673 674 # Tests copied from Bio.GenBank.Scanner 675 assert line[40:44] in [' bp ', ' aa '], \ 676 'LOCUS line does not contain size units at expected position:\n' + \ 677 line 678 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 679 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 680 assert line[47:54].strip() == "" \ 681 or 'DNA' in line[47:54].strip() \ 682 or 'RNA' in line[47:54].strip(), \ 683 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 684 assert line[54:55] == ' ', \ 685 'LOCUS line does not contain space at position 55:\n' + line 686 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 687 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 688 assert line[63:64] == ' ', \ 689 'LOCUS line does not contain space at position 64:\n' + line 690 assert line[67:68] == ' ', \ 691 'LOCUS line does not contain space at position 68:\n' + line 692 assert line[70:71] == '-', \ 693 'LOCUS line does not contain - at position 71 in date:\n' + line 694 assert line[74:75] == '-', \ 695 'LOCUS line does not contain - at position 75 in date:\n' + line 696 697 self.handle.write(line)
698
699 - def _write_references(self, record):
700 number = 0 701 for ref in record.annotations["references"]: 702 if not isinstance(ref, SeqFeature.Reference): 703 continue 704 number += 1 705 data = str(number) 706 # TODO - support more complex record reference locations? 707 if ref.location and len(ref.location) == 1: 708 a = Alphabet._get_base_alphabet(record.seq.alphabet) 709 if isinstance(a, Alphabet.ProteinAlphabet): 710 units = "residues" 711 else: 712 units = "bases" 713 data += " (%s %i to %i)" % (units, 714 ref.location[0].nofuzzy_start + 1, 715 ref.location[0].nofuzzy_end) 716 self._write_single_line("REFERENCE", data) 717 if ref.authors: 718 # We store the AUTHORS data as a single string 719 self._write_multi_line(" AUTHORS", ref.authors) 720 if ref.consrtm: 721 # We store the consortium as a single string 722 self._write_multi_line(" CONSRTM", ref.consrtm) 723 if ref.title: 724 # We store the title as a single string 725 self._write_multi_line(" TITLE", ref.title) 726 if ref.journal: 727 # We store this as a single string - holds the journal name, 728 # volume, year, and page numbers of the citation 729 self._write_multi_line(" JOURNAL", ref.journal) 730 if ref.medline_id: 731 # This line type is obsolete and was removed from the GenBank 732 # flatfile format in April 2005. Should we write it? 733 # Note this has a two space indent: 734 self._write_multi_line(" MEDLINE", ref.medline_id) 735 if ref.pubmed_id: 736 # Note this has a THREE space indent: 737 self._write_multi_line(" PUBMED", ref.pubmed_id) 738 if ref.comment: 739 self._write_multi_line(" REMARK", ref.comment)
740
741 - def _write_comment(self, record):
742 # This is a bit complicated due to the range of possible 743 # ways people might have done their annotation... 744 # Currently the parser uses a single string with newlines. 745 # A list of lines is also reasonable. 746 # A single (long) string is perhaps the most natural of all. 747 # This means we may need to deal with line wrapping. 748 comment = record.annotations["comment"] 749 if isinstance(comment, basestring): 750 lines = comment.split("\n") 751 elif isinstance(comment, (list, tuple)): 752 lines = comment 753 else: 754 raise ValueError("Could not understand comment annotation") 755 self._write_multi_line("COMMENT", lines[0]) 756 for line in lines[1:]: 757 self._write_multi_line("", line)
758
759 - def _write_contig(self, record):
760 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 761 lines = self._split_contig(record, max_len) 762 self._write_single_line("CONTIG", lines[0]) 763 for text in lines[1:]: 764 self._write_single_line("", text)
765
766 - def _write_sequence(self, record):
767 # Loosely based on code from Howard Salis 768 # TODO - Force lower case? 769 LETTERS_PER_LINE = 60 770 SEQUENCE_INDENT = 9 771 772 if isinstance(record.seq, UnknownSeq): 773 # We have already recorded the length, and there is no need 774 # to record a long sequence of NNNNNNN...NNN or whatever. 775 if "contig" in record.annotations: 776 self._write_contig(record) 777 else: 778 self.handle.write("ORIGIN\n") 779 return 780 781 # Catches sequence being None: 782 data = self._get_seq_string(record).lower() 783 seq_len = len(data) 784 self.handle.write("ORIGIN\n") 785 for line_number in range(0, seq_len, LETTERS_PER_LINE): 786 self.handle.write(str(line_number + 1).rjust(SEQUENCE_INDENT)) 787 for words in range(line_number, 788 min(line_number + LETTERS_PER_LINE, seq_len), 10): 789 self.handle.write(" %s" % data[words:words + 10]) 790 self.handle.write("\n")
791
792 - def write_record(self, record):
793 """Write a single record to the output file.""" 794 handle = self.handle 795 self._write_the_first_line(record) 796 797 accession = self._get_annotation_str(record, "accession", 798 record.id.split(".", 1)[0], 799 just_first=True) 800 acc_with_version = accession 801 if record.id.startswith(accession + "."): 802 try: 803 acc_with_version = "%s.%i" \ 804 % (accession, 805 int(record.id.split(".", 1)[1])) 806 except ValueError: 807 pass 808 gi = self._get_annotation_str(record, "gi", just_first=True) 809 810 descr = record.description 811 if descr == "<unknown description>": 812 descr = "." 813 self._write_multi_line("DEFINITION", descr) 814 815 self._write_single_line("ACCESSION", accession) 816 if gi != ".": 817 self._write_single_line("VERSION", "%s GI:%s" 818 % (acc_with_version, gi)) 819 else: 820 self._write_single_line("VERSION", "%s" % (acc_with_version)) 821 822 # The NCBI only expect two types of link so far, 823 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 824 # TODO - Filter the dbxrefs list to just these? 825 self._write_multi_entries("DBLINK", record.dbxrefs) 826 827 try: 828 # List of strings 829 # Keywords should be given separated with semi colons, 830 keywords = "; ".join(record.annotations["keywords"]) 831 # with a trailing period: 832 if not keywords.endswith("."): 833 keywords += "." 834 except KeyError: 835 # If no keywords, there should be just a period: 836 keywords = "." 837 self._write_multi_line("KEYWORDS", keywords) 838 839 if "segment" in record.annotations: 840 # Deal with SEGMENT line found only in segmented records, 841 # e.g. AH000819 842 segment = record.annotations["segment"] 843 if isinstance(segment, list): 844 assert len(segment) == 1, segment 845 segment = segment[0] 846 self._write_single_line("SEGMENT", segment) 847 848 self._write_multi_line("SOURCE", 849 self._get_annotation_str(record, "source")) 850 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 851 org = self._get_annotation_str(record, "organism") 852 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 853 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 854 self._write_single_line(" ORGANISM", org) 855 try: 856 # List of strings 857 # Taxonomy should be given separated with semi colons, 858 taxonomy = "; ".join(record.annotations["taxonomy"]) 859 # with a trailing period: 860 if not taxonomy.endswith("."): 861 taxonomy += "." 862 except KeyError: 863 taxonomy = "." 864 self._write_multi_line("", taxonomy) 865 866 if "references" in record.annotations: 867 self._write_references(record) 868 869 if "comment" in record.annotations: 870 self._write_comment(record) 871 872 handle.write("FEATURES Location/Qualifiers\n") 873 rec_length = len(record) 874 for feature in record.features: 875 self._write_feature(feature, rec_length) 876 self._write_sequence(record) 877 handle.write("//\n")
878 879
880 -class EmblWriter(_InsdcWriter):
881 HEADER_WIDTH = 5 882 QUALIFIER_INDENT = 21 883 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 884 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 885 FEATURE_HEADER = "FH Key Location/Qualifiers\n" 886
887 - def _write_contig(self, record):
888 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 889 lines = self._split_contig(record, max_len) 890 for text in lines: 891 self._write_single_line("CO", text)
892
893 - def _write_sequence(self, record):
894 LETTERS_PER_BLOCK = 10 895 BLOCKS_PER_LINE = 6 896 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 897 POSITION_PADDING = 10 898 handle = self.handle # save looking up this multiple times 899 900 if isinstance(record.seq, UnknownSeq): 901 # We have already recorded the length, and there is no need 902 # to record a long sequence of NNNNNNN...NNN or whatever. 903 if "contig" in record.annotations: 904 self._write_contig(record) 905 else: 906 # TODO - Can the sequence just be left out as in GenBank files? 907 handle.write("SQ \n") 908 return 909 910 # Catches sequence being None 911 data = self._get_seq_string(record).lower() 912 seq_len = len(data) 913 914 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 915 a = Alphabet._get_base_alphabet(record.seq.alphabet) 916 if isinstance(a, Alphabet.DNAAlphabet): 917 # TODO - What if we have RNA? 918 a_count = data.count('A') + data.count('a') 919 c_count = data.count('C') + data.count('c') 920 g_count = data.count('G') + data.count('g') 921 t_count = data.count('T') + data.count('t') 922 other = seq_len - (a_count + c_count + g_count + t_count) 923 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 924 % (seq_len, a_count, c_count, g_count, t_count, other)) 925 else: 926 handle.write("SQ \n") 927 928 for line_number in range(0, seq_len // LETTERS_PER_LINE): 929 handle.write(" ") # Just four, not five 930 for block in range(BLOCKS_PER_LINE): 931 index = LETTERS_PER_LINE * line_number + \ 932 LETTERS_PER_BLOCK * block 933 handle.write((" %s" % data[index:index + LETTERS_PER_BLOCK])) 934 handle.write(str((line_number + 1) * 935 LETTERS_PER_LINE).rjust(POSITION_PADDING)) 936 handle.write("\n") 937 if seq_len % LETTERS_PER_LINE: 938 # Final (partial) line 939 line_number = (seq_len // LETTERS_PER_LINE) 940 handle.write(" ") # Just four, not five 941 for block in range(BLOCKS_PER_LINE): 942 index = LETTERS_PER_LINE * line_number + \ 943 LETTERS_PER_BLOCK * block 944 handle.write( 945 (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(11)) 946 handle.write(str(seq_len).rjust(POSITION_PADDING)) 947 handle.write("\n")
948
949 - def _write_single_line(self, tag, text):
950 assert len(tag) == 2 951 line = tag + " " + text 952 if len(text) > self.MAX_WIDTH: 953 warnings.warn("Line %r too long" % line, BiopythonWarning) 954 self.handle.write(line + "\n")
955
956 - def _write_multi_line(self, tag, text):
957 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 958 lines = self._split_multi_line(text, max_len) 959 for line in lines: 960 self._write_single_line(tag, line)
961
962 - def _write_the_first_lines(self, record):
963 """Write the ID and AC lines.""" 964 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 965 version = "SV " + record.id.rsplit(".", 1)[1] 966 accession = self._get_annotation_str(record, "accession", 967 record.id.rsplit(".", 1)[0], 968 just_first=True) 969 else: 970 version = "" 971 accession = self._get_annotation_str(record, "accession", 972 record.id, 973 just_first=True) 974 975 if ";" in accession: 976 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 977 % repr(str(accession))) 978 if " " in accession: 979 # This is out of practicallity... might it be allowed? 980 raise ValueError("Cannot have spaces in EMBL accession, %s" 981 % repr(str(accession))) 982 983 # Get the molecule type 984 # TODO - record this explicitly in the parser? 985 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 986 a = Alphabet._get_base_alphabet(record.seq.alphabet) 987 if not isinstance(a, Alphabet.Alphabet): 988 raise TypeError("Invalid alphabet") 989 elif isinstance(a, Alphabet.DNAAlphabet): 990 mol_type = "DNA" 991 units = "BP" 992 elif isinstance(a, Alphabet.RNAAlphabet): 993 mol_type = "RNA" 994 units = "BP" 995 elif isinstance(a, Alphabet.ProteinAlphabet): 996 mol_type = "PROTEIN" 997 units = "AA" 998 else: 999 # Must be something like NucleotideAlphabet 1000 raise ValueError("Need a DNA, RNA or Protein alphabet") 1001 1002 # Get the taxonomy division 1003 division = self._get_data_division(record) 1004 1005 # TODO - Full ID line 1006 handle = self.handle 1007 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 1008 # 1. Primary accession number 1009 # 2. Sequence version number 1010 # 3. Topology: 'circular' or 'linear' 1011 # 4. Molecule type 1012 # 5. Data class 1013 # 6. Taxonomic division 1014 # 7. Sequence length 1015 self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." 1016 % (accession, version, mol_type, 1017 division, len(record), units)) 1018 handle.write("XX\n") 1019 self._write_single_line("AC", accession + ";") 1020 handle.write("XX\n")
1021
1022 - def _get_data_division(self, record):
1023 try: 1024 division = record.annotations["data_file_division"] 1025 except KeyError: 1026 division = "UNC" 1027 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 1028 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 1029 "VRL", "XXX"]: 1030 # Good, already EMBL style 1031 # Division Code 1032 # ----------------- ---- 1033 # Bacteriophage PHG 1034 # Environmental Sample ENV 1035 # Fungal FUN 1036 # Human HUM 1037 # Invertebrate INV 1038 # Other Mammal MAM 1039 # Other Vertebrate VRT 1040 # Mus musculus MUS 1041 # Plant PLN 1042 # Prokaryote PRO 1043 # Other Rodent ROD 1044 # Synthetic SYN 1045 # Transgenic TGN 1046 # Unclassified UNC (i.e. unknown) 1047 # Viral VRL 1048 # 1049 # (plus XXX used for submiting data to EMBL) 1050 pass 1051 else: 1052 # See if this is in GenBank style & can be converted. 1053 # Generally a problem as the GenBank groups are wider 1054 # than those of EMBL. Note that GenBank use "BCT" for 1055 # both bacteria and acherea thus this maps to EMBL's 1056 # "PRO" nicely. 1057 gbk_to_embl = {"BCT": "PRO", 1058 "UNK": "UNC", 1059 } 1060 try: 1061 division = gbk_to_embl[division] 1062 except KeyError: 1063 division = "UNC" 1064 assert len(division) == 3 1065 return division
1066
1067 - def _write_keywords(self, record):
1068 # Put the keywords right after DE line. 1069 # Each 'keyword' can have multiple words and spaces, but we 1070 # must not split any 'keyword' between lines. 1071 # TODO - Combine short keywords onto one line 1072 for keyword in record.annotations["keywords"]: 1073 self._write_single_line("KW", keyword) 1074 self.handle.write("XX\n")
1075
1076 - def _write_references(self, record):
1077 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1078 number = 0 1079 for ref in record.annotations["references"]: 1080 if not isinstance(ref, SeqFeature.Reference): 1081 continue 1082 number += 1 1083 self._write_single_line("RN", "[%i]" % number) 1084 # TODO - support for RC line (needed in parser too) 1085 # TODO - support more complex record reference locations? 1086 if ref.location and len(ref.location) == 1: 1087 self._write_single_line( 1088 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1089 ref.location[0].nofuzzy_end)) 1090 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1091 if ref.pubmed_id: 1092 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1093 if ref.consrtm: 1094 self._write_single_line("RG", "%s" % ref.consrtm) 1095 if ref.authors: 1096 # We store the AUTHORS data as a single string 1097 self._write_multi_line("RA", ref.authors + ";") 1098 if ref.title: 1099 # We store the title as a single string 1100 self._write_multi_line("RT", '"%s";' % ref.title) 1101 if ref.journal: 1102 # We store this as a single string - holds the journal name, 1103 # volume, year, and page numbers of the citation 1104 self._write_multi_line("RL", ref.journal) 1105 self.handle.write("XX\n")
1106
1107 - def _write_comment(self, record):
1108 # This is a bit complicated due to the range of possible 1109 # ways people might have done their annotation... 1110 # Currently the parser uses a single string with newlines. 1111 # A list of lines is also reasonable. 1112 # A single (long) string is perhaps the most natural of all. 1113 # This means we may need to deal with line wrapping. 1114 comment = record.annotations["comment"] 1115 if isinstance(comment, basestring): 1116 lines = comment.split("\n") 1117 elif isinstance(comment, (list, tuple)): 1118 lines = comment 1119 else: 1120 raise ValueError("Could not understand comment annotation") 1121 # TODO - Merge this with the GenBank comment code? 1122 if not lines: 1123 return 1124 for line in lines: 1125 self._write_multi_line("CC", line) 1126 self.handle.write("XX\n")
1127
1128 - def write_record(self, record):
1129 """Write a single record to the output file.""" 1130 1131 handle = self.handle 1132 self._write_the_first_lines(record) 1133 1134 # PR line (0 or 1 lines only), project identifier 1135 # 1136 # Assuming can't use 2 lines, we should prefer newer GenBank 1137 # DBLINK BioProject:... entries over the older GenBank DBLINK 1138 # Project:... lines. 1139 # 1140 # In either case, seems EMBL usess just "PR Project:..." 1141 # regardless of the type of ID (old numeric only, or new 1142 # with alpha prefix), e.g. for CP002497 NCBI now uses: 1143 # 1144 # DBLINK BioProject: PRJNA60715 1145 # BioSample: SAMN03081426 1146 # 1147 # While EMBL uses: 1148 # 1149 # XX 1150 # PR Project:PRJNA60715; 1151 # XX 1152 # 1153 # Sorting ensures (new) BioProject:... is before old Project:... 1154 for xref in sorted(record.dbxrefs): 1155 if xref.startswith("BioProject:"): 1156 self._write_single_line("PR", xref[3:] + ";") 1157 handle.write("XX\n") 1158 break 1159 if xref.startswith("Project:"): 1160 self._write_single_line("PR", xref + ";") 1161 handle.write("XX\n") 1162 break 1163 1164 # TODO - DT lines (date) 1165 1166 descr = record.description 1167 if descr == "<unknown description>": 1168 descr = "." 1169 self._write_multi_line("DE", descr) 1170 handle.write("XX\n") 1171 1172 if "keywords" in record.annotations: 1173 self._write_keywords(record) 1174 1175 # Should this be "source" or "organism"? 1176 self._write_multi_line( 1177 "OS", self._get_annotation_str(record, "organism")) 1178 try: 1179 # List of strings 1180 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1181 except KeyError: 1182 taxonomy = "." 1183 self._write_multi_line("OC", taxonomy) 1184 handle.write("XX\n") 1185 1186 if "references" in record.annotations: 1187 self._write_references(record) 1188 1189 if "comment" in record.annotations: 1190 self._write_comment(record) 1191 1192 handle.write(self.FEATURE_HEADER) 1193 rec_length = len(record) 1194 for feature in record.features: 1195 self._write_feature(feature, rec_length) 1196 handle.write("XX\n") 1197 1198 self._write_sequence(record) 1199 handle.write("//\n")
1200 1201
1202 -class ImgtWriter(EmblWriter):
1203 HEADER_WIDTH = 5 1204 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1205 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1206 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1207 FEATURE_HEADER = "FH Key Location/Qualifiers\n"
1208 1209 if __name__ == "__main__": 1210 from Bio._utils import run_doctest 1211 run_doctest(verbose=0) 1212