Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2011 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  import warnings 
  37  from Bio import BiopythonWarning 
  38   
  39  from Bio.Seq import UnknownSeq 
  40  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  41  from Bio import Alphabet 
  42  from .Interfaces import SequentialSequenceWriter 
  43  from Bio import SeqFeature 
  44   
  45  from Bio._py3k import _is_int_or_long 
  46  from Bio._py3k import basestring 
  47   
  48  __docformat__ = "restructuredtext en" 
  49   
  50  # NOTE 
  51  # ==== 
  52  # The "brains" for parsing GenBank, EMBL and IMGT files (and any 
  53  # other flat file variants from the INSDC in future) is in 
  54  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
  55  # However, all the writing code is in this file. 
  56   
  57   
58 -def GenBankIterator(handle):
59 """Breaks up a Genbank file into SeqRecord objects. 60 61 Every section from the LOCUS line to the terminating // becomes 62 a single SeqRecord with associated annotation and features. 63 64 Note that for genomes or chromosomes, there is typically only 65 one record.""" 66 # This calls a generator function: 67 return GenBankScanner(debug=0).parse_records(handle)
68 69
70 -def EmblIterator(handle):
71 """Breaks up an EMBL file into SeqRecord objects. 72 73 Every section from the LOCUS line to the terminating // becomes 74 a single SeqRecord with associated annotation and features. 75 76 Note that for genomes or chromosomes, there is typically only 77 one record.""" 78 # This calls a generator function: 79 return EmblScanner(debug=0).parse_records(handle)
80 81
82 -def ImgtIterator(handle):
83 """Breaks up an IMGT file into SeqRecord objects. 84 85 Every section from the LOCUS line to the terminating // becomes 86 a single SeqRecord with associated annotation and features. 87 88 Note that for genomes or chromosomes, there is typically only 89 one record.""" 90 # This calls a generator function: 91 return _ImgtScanner(debug=0).parse_records(handle)
92 93
94 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
95 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 96 97 Every section from the LOCUS line to the terminating // can contain 98 many CDS features. These are returned as with the stated amino acid 99 translation sequence (if given). 100 """ 101 # This calls a generator function: 102 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
103 104
105 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
106 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 107 108 Every section from the LOCUS line to the terminating // can contain 109 many CDS features. These are returned as with the stated amino acid 110 translation sequence (if given). 111 """ 112 # This calls a generator function: 113 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
114 115
116 -def _insdc_feature_position_string(pos, offset=0):
117 """Build a GenBank/EMBL position string (PRIVATE). 118 119 Use offset=1 to add one to convert a start position from python counting. 120 """ 121 if isinstance(pos, SeqFeature.ExactPosition): 122 return "%i" % (pos.position + offset) 123 elif isinstance(pos, SeqFeature.WithinPosition): 124 return "(%i.%i)" % (pos.position + offset, 125 pos.position + pos.extension + offset) 126 elif isinstance(pos, SeqFeature.BetweenPosition): 127 return "(%i^%i)" % (pos.position + offset, 128 pos.position + pos.extension + offset) 129 elif isinstance(pos, SeqFeature.BeforePosition): 130 return "<%i" % (pos.position + offset) 131 elif isinstance(pos, SeqFeature.AfterPosition): 132 return ">%i" % (pos.position + offset) 133 elif isinstance(pos, SeqFeature.OneOfPosition): 134 return "one-of(%s)" \ 135 % ",".join(_insdc_feature_position_string(p, offset) 136 for p in pos.position_choices) 137 elif isinstance(pos, SeqFeature.AbstractPosition): 138 raise NotImplementedError("Please report this as a bug in Biopython.") 139 else: 140 raise ValueError("Expected a SeqFeature position object.")
141 142
143 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
144 if location.ref: 145 ref = "%s:" % location.ref 146 else: 147 ref = "" 148 assert not location.ref_db 149 if isinstance(location.start, SeqFeature.ExactPosition) \ 150 and isinstance(location.end, SeqFeature.ExactPosition) \ 151 and location.start.position == location.end.position: 152 # Special case, for 12:12 return 12^13 153 # (a zero length slice, meaning the point between two letters) 154 if location.end.position == rec_length: 155 # Very special case, for a between position at the end of a 156 # sequence (used on some circular genomes, Bug 3098) we have 157 # N:N so return N^1 158 return "%s%i^1" % (ref, rec_length) 159 else: 160 return "%s%i^%i" % (ref, location.end.position, 161 location.end.position + 1) 162 if isinstance(location.start, SeqFeature.ExactPosition) \ 163 and isinstance(location.end, SeqFeature.ExactPosition) \ 164 and location.start.position + 1 == location.end.position: 165 # Special case, for 11:12 return 12 rather than 12..12 166 # (a length one slice, meaning a single letter) 167 return "%s%i" % (ref, location.end.position) 168 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 169 or isinstance(location.end, SeqFeature.UnknownPosition): 170 # Special case for features from SwissProt/UniProt files 171 if isinstance(location.start, SeqFeature.UnknownPosition) \ 172 and isinstance(location.end, SeqFeature.UnknownPosition): 173 # warnings.warn("Feature with unknown location", BiopythonWarning) 174 # return "?" 175 raise ValueError("Feature with unknown location") 176 elif isinstance(location.start, SeqFeature.UnknownPosition): 177 # Treat the unknown start position as a BeforePosition 178 return "%s<%i..%s" \ 179 % (ref, 180 location.nofuzzy_end, 181 _insdc_feature_position_string(location.end)) 182 else: 183 # Treat the unknown end position as an AfterPosition 184 return "%s%s..>%i" \ 185 % (ref, 186 _insdc_feature_position_string(location.start), 187 location.nofuzzy_start) 188 else: 189 # Typical case, e.g. 12..15 gets mapped to 11:15 190 return ref \ 191 + _insdc_feature_position_string(location.start, +1) \ 192 + ".." + \ 193 _insdc_feature_position_string(location.end)
194 195
196 -def _insdc_location_string(location, rec_length):
197 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 198 199 There is a choice of how to show joins on the reverse complement strand, 200 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 201 "join(complement(20,100),complement(1,10))" instead (but appears to have 202 now adopted the GenBank convention). Notice that the order of the entries 203 is reversed! This function therefore uses the first form. In this situation 204 we expect the CompoundFeatureLocation and its parts to all be marked as 205 strand == -1, and to be in the order 19:100 then 0:10. 206 """ 207 try: 208 parts = location.parts 209 # CompoundFeatureLocation 210 if location.strand == -1: 211 # Special case, put complement outside the join/order/... and reverse order 212 return "complement(%s(%s))" % (location.operator, 213 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 214 for p in parts[::-1])) 215 else: 216 return "%s(%s)" % (location.operator, 217 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 218 except AttributeError: 219 # Simple FeatureLocation 220 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 221 if location.strand == -1: 222 return "complement(%s)" % loc 223 else: 224 return loc
225 226
227 -def _insdc_feature_location_string(feature, rec_length):
228 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE, OBSOLETE). 229 230 There is a choice of how to show joins on the reverse complement strand, 231 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 232 "join(complement(20,100),complement(1,10))" instead (but appears to have 233 now adopted the GenBank convention). Notice that the order of the entries 234 is reversed! This function therefore uses the first form. In this situation 235 we expect the parent feature and the two children to all be marked as 236 strand == -1, and in the order 0:10 then 19:100. 237 238 Also need to consider dual-strand examples like these from the Arabidopsis 239 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 240 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) 241 which is further complicated by a splice: 242 join(complement(69611..69724),139856..140087,140625..140650) 243 244 For this mixed strand feature, the parent SeqFeature should have 245 no strand (either 0 or None) while the child features should have either 246 strand +1 or -1 as appropriate, and be listed in the order given here. 247 """ 248 # Using private variable to avoid deprecation warning 249 if not feature._sub_features: 250 # Non-recursive. 251 # assert feature.location_operator == "", \ 252 # "%s has no subfeatures but location_operator %s" \ 253 # % (repr(feature), feature.location_operator) 254 location = _insdc_location_string_ignoring_strand_and_subfeatures( 255 feature.location, rec_length) 256 if feature.strand == -1: 257 location = "complement(%s)" % location 258 return location 259 # As noted above, treat reverse complement strand features carefully: 260 if feature.strand == -1: 261 for f in feature._sub_features: 262 if f.strand != -1: 263 raise ValueError("Inconsistent strands: %r for parent, %r for child" 264 % (feature.strand, f.strand)) 265 return "complement(%s(%s))" \ 266 % (feature.location_operator, 267 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f.location, rec_length) 268 for f in feature._sub_features)) 269 # if feature.strand == +1: 270 # for f in feature.sub_features: 271 # assert f.strand == +1 272 # This covers typical forward strand features, and also an evil mixed strand: 273 assert feature.location_operator != "" 274 return "%s(%s)" % (feature.location_operator, 275 ",".join(_insdc_feature_location_string(f, rec_length) 276 for f in feature._sub_features))
277 278
279 -class _InsdcWriter(SequentialSequenceWriter):
280 """Base class for GenBank and EMBL writers (PRIVATE).""" 281 MAX_WIDTH = 80 282 QUALIFIER_INDENT = 21 283 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 284 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 285 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 286 "direction", "estimated_length", "mod_base", "number", 287 "rpt_type", "rpt_unit_range", "tag_peptide", 288 "transl_except", "transl_table") 289
290 - def _write_feature_qualifier(self, key, value=None, quote=None):
291 if value is None: 292 # Value-less entry like /pseudo 293 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 294 return 295 # Quick hack with no line wrapping, may be useful for testing: 296 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 297 if quote is None: 298 # Try to mimic unwritten rules about when quotes can be left out: 299 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 300 quote = False 301 else: 302 quote = True 303 if quote: 304 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 305 else: 306 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 307 if len(line) <= self.MAX_WIDTH: 308 self.handle.write(line + "\n") 309 return 310 while line.lstrip(): 311 if len(line) <= self.MAX_WIDTH: 312 self.handle.write(line + "\n") 313 return 314 # Insert line break... 315 for index in range(min(len(line) - 1, self.MAX_WIDTH), 316 self.QUALIFIER_INDENT + 1, -1): 317 if line[index] == " ": 318 break 319 if line[index] != " ": 320 # No nice place to break... 321 index = self.MAX_WIDTH 322 assert index <= self.MAX_WIDTH 323 self.handle.write(line[:index] + "\n") 324 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
325
326 - def _wrap_location(self, location):
327 """Split a feature location into lines (break at commas).""" 328 # TODO - Rewrite this not to recurse! 329 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 330 if len(location) <= length: 331 return location 332 index = location[:length].rfind(",") 333 if index == -1: 334 # No good place to split (!) 335 warnings.warn("Couldn't split location:\n%s" % location, 336 BiopythonWarning) 337 return location 338 return location[:index + 1] + "\n" + \ 339 self.QUALIFIER_INDENT_STR + \ 340 self._wrap_location(location[index + 1:])
341
342 - def _write_feature(self, feature, record_length):
343 """Write a single SeqFeature object to features table.""" 344 assert feature.type, feature 345 location = _insdc_location_string(feature.location, record_length) 346 f_type = feature.type.replace(" ", "_") 347 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 348 + self._wrap_location(location) + "\n" 349 self.handle.write(line) 350 # Now the qualifiers... 351 for key in sorted(feature.qualifiers.keys()): 352 values = feature.qualifiers[key] 353 if isinstance(values, list) or isinstance(values, tuple): 354 for value in values: 355 self._write_feature_qualifier(key, value) 356 else: 357 # String, int, etc - or None for a /pseudo tpy entry 358 self._write_feature_qualifier(key, values)
359
360 - def _get_annotation_str(self, record, key, default=".", just_first=False):
361 """Get an annotation dictionary entry (as a string). 362 363 Some entries are lists, in which case if just_first=True the first entry 364 is returned. If just_first=False (default) this verifies there is only 365 one entry before returning it.""" 366 try: 367 answer = record.annotations[key] 368 except KeyError: 369 return default 370 if isinstance(answer, list): 371 if not just_first: 372 assert len(answer) == 1 373 return str(answer[0]) 374 else: 375 return str(answer)
376
377 - def _split_multi_line(self, text, max_len):
378 """Returns a list of strings. 379 380 Any single words which are too long get returned as a whole line 381 (e.g. URLs) without an exception or warning. 382 """ 383 # TODO - Do the line spliting while preserving white space? 384 text = text.strip() 385 if len(text) <= max_len: 386 return [text] 387 388 words = text.split() 389 text = "" 390 while words and len(text) + 1 + len(words[0]) <= max_len: 391 text += " " + words.pop(0) 392 text = text.strip() 393 # assert len(text) <= max_len 394 answer = [text] 395 while words: 396 text = words.pop(0) 397 while words and len(text) + 1 + len(words[0]) <= max_len: 398 text += " " + words.pop(0) 399 text = text.strip() 400 # assert len(text) <= max_len 401 answer.append(text) 402 assert not words 403 return answer
404
405 - def _split_contig(self, record, max_len):
406 """Returns a list of strings, splits on commas.""" 407 # TODO - Merge this with _write_multi_line method? 408 # It would need the addition of the comma splitting logic... 409 # are there any other cases where that would be sensible? 410 contig = record.annotations.get("contig", "") 411 if isinstance(contig, list) or isinstance(contig, tuple): 412 contig = "".join(contig) 413 contig = self.clean(contig) 414 answer = [] 415 while contig: 416 if len(contig) > max_len: 417 # Split lines at the commas 418 pos = contig[:max_len - 1].rfind(",") 419 if pos == -1: 420 raise ValueError("Could not break up CONTIG") 421 text, contig = contig[:pos + 1], contig[pos + 1:] 422 else: 423 text, contig = contig, "" 424 answer.append(text) 425 return answer
426 427
428 -class GenBankWriter(_InsdcWriter):
429 HEADER_WIDTH = 12 430 QUALIFIER_INDENT = 21 431
432 - def _write_single_line(self, tag, text):
433 """Used in the 'header' of each GenBank record.""" 434 assert len(tag) < self.HEADER_WIDTH 435 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 436 warnings.warn("Annotation %r too long for %r line" % (text, tag), 437 BiopythonWarning) 438 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 439 text.replace("\n", " ")))
440
441 - def _write_multi_line(self, tag, text):
442 """Used in the 'header' of each GenBank record.""" 443 # TODO - Do the line spliting while preserving white space? 444 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 445 lines = self._split_multi_line(text, max_len) 446 self._write_single_line(tag, lines[0]) 447 for line in lines[1:]: 448 self._write_single_line("", line)
449
450 - def _write_multi_entries(self, tag, text_list):
451 # used for DBLINK and any similar later line types. 452 # If the list of strings is empty, nothing is written. 453 for i, text in enumerate(text_list): 454 if i == 0: 455 self._write_single_line(tag, text) 456 else: 457 self._write_single_line("", text)
458
459 - def _get_date(self, record):
460 default = "01-JAN-1980" 461 try: 462 date = record.annotations["date"] 463 except KeyError: 464 return default 465 # Cope with a list of one string: 466 if isinstance(date, list) and len(date) == 1: 467 date = date[0] 468 # TODO - allow a Python date object 469 if not isinstance(date, basestring) or len(date) != 11 \ 470 or date[2] != "-" or date[6] != "-" \ 471 or not date[:2].isdigit() or not date[7:].isdigit() \ 472 or int(date[:2]) > 31 \ 473 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 474 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 475 # TODO - Check is a valid date (e.g. not 31 Feb) 476 return default 477 return date
478
479 - def _get_data_division(self, record):
480 try: 481 division = record.annotations["data_file_division"] 482 except KeyError: 483 division = "UNK" 484 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 485 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 486 "GSS", "HTG", "HTC", "ENV", "CON"]: 487 # Good, already GenBank style 488 # PRI - primate sequences 489 # ROD - rodent sequences 490 # MAM - other mammalian sequences 491 # VRT - other vertebrate sequences 492 # INV - invertebrate sequences 493 # PLN - plant, fungal, and algal sequences 494 # BCT - bacterial sequences [plus archea] 495 # VRL - viral sequences 496 # PHG - bacteriophage sequences 497 # SYN - synthetic sequences 498 # UNA - unannotated sequences 499 # EST - EST sequences (expressed sequence tags) 500 # PAT - patent sequences 501 # STS - STS sequences (sequence tagged sites) 502 # GSS - GSS sequences (genome survey sequences) 503 # HTG - HTGS sequences (high throughput genomic sequences) 504 # HTC - HTC sequences (high throughput cDNA sequences) 505 # ENV - Environmental sampling sequences 506 # CON - Constructed sequences 507 # 508 # (plus UNK for unknown) 509 pass 510 else: 511 # See if this is in EMBL style: 512 # Division Code 513 # ----------------- ---- 514 # Bacteriophage PHG - common 515 # Environmental Sample ENV - common 516 # Fungal FUN - map to PLN (plants + fungal) 517 # Human HUM - map to PRI (primates) 518 # Invertebrate INV - common 519 # Other Mammal MAM - common 520 # Other Vertebrate VRT - common 521 # Mus musculus MUS - map to ROD (rodent) 522 # Plant PLN - common 523 # Prokaryote PRO - map to BCT (poor name) 524 # Other Rodent ROD - common 525 # Synthetic SYN - common 526 # Transgenic TGN - ??? map to SYN ??? 527 # Unclassified UNC - map to UNK 528 # Viral VRL - common 529 # 530 # (plus XXX for submiting which we can map to UNK) 531 embl_to_gbk = {"FUN": "PLN", 532 "HUM": "PRI", 533 "MUS": "ROD", 534 "PRO": "BCT", 535 "UNC": "UNK", 536 "XXX": "UNK", 537 } 538 try: 539 division = embl_to_gbk[division] 540 except KeyError: 541 division = "UNK" 542 assert len(division) == 3 543 return division
544
545 - def _write_the_first_line(self, record):
546 """Write the LOCUS line.""" 547 548 locus = record.name 549 if not locus or locus == "<unknown name>": 550 locus = record.id 551 if not locus or locus == "<unknown id>": 552 locus = self._get_annotation_str( 553 record, "accession", just_first=True) 554 if len(locus) > 16: 555 raise ValueError("Locus identifier %r is too long" % str(locus)) 556 557 if len(record) > 99999999999: 558 # Currently GenBank only officially support up to 350000, but 559 # the length field can take eleven digits 560 raise ValueError("Sequence too long!") 561 562 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 563 a = Alphabet._get_base_alphabet(record.seq.alphabet) 564 if not isinstance(a, Alphabet.Alphabet): 565 raise TypeError("Invalid alphabet") 566 elif isinstance(a, Alphabet.ProteinAlphabet): 567 units = "aa" 568 elif isinstance(a, Alphabet.NucleotideAlphabet): 569 units = "bp" 570 else: 571 # Must be something like NucleotideAlphabet or 572 # just the generic Alphabet (default for fasta files) 573 raise ValueError("Need a Nucleotide or Protein alphabet") 574 575 # Get the molecule type 576 # TODO - record this explicitly in the parser? 577 if isinstance(a, Alphabet.ProteinAlphabet): 578 mol_type = "" 579 elif isinstance(a, Alphabet.DNAAlphabet): 580 mol_type = "DNA" 581 elif isinstance(a, Alphabet.RNAAlphabet): 582 mol_type = "RNA" 583 else: 584 # Must be something like NucleotideAlphabet or 585 # just the generic Alphabet (default for fasta files) 586 raise ValueError("Need a DNA, RNA or Protein alphabet") 587 588 division = self._get_data_division(record) 589 590 assert len(units) == 2 591 assert len(division) == 3 592 # TODO - date 593 # TODO - mol_type 594 line = "LOCUS %s %s %s %s %s %s\n" \ 595 % (locus.ljust(16), 596 str(len(record)).rjust(11), 597 units, 598 mol_type.ljust(6), 599 division, 600 self._get_date(record)) 601 assert len(line) == 79 + 1, repr(line) # plus one for new line 602 603 assert line[12:28].rstrip() == locus, \ 604 'LOCUS line does not contain the locus at the expected position:\n' + line 605 assert line[28:29] == " " 606 assert line[29:40].lstrip() == str(len(record)), \ 607 'LOCUS line does not contain the length at the expected position:\n' + line 608 609 # Tests copied from Bio.GenBank.Scanner 610 assert line[40:44] in [' bp ', ' aa '], \ 611 'LOCUS line does not contain size units at expected position:\n' + \ 612 line 613 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 614 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 615 assert line[47:54].strip() == "" \ 616 or 'DNA' in line[47:54].strip() \ 617 or 'RNA' in line[47:54].strip(), \ 618 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 619 assert line[54:55] == ' ', \ 620 'LOCUS line does not contain space at position 55:\n' + line 621 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 622 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 623 assert line[63:64] == ' ', \ 624 'LOCUS line does not contain space at position 64:\n' + line 625 assert line[67:68] == ' ', \ 626 'LOCUS line does not contain space at position 68:\n' + line 627 assert line[70:71] == '-', \ 628 'LOCUS line does not contain - at position 71 in date:\n' + line 629 assert line[74:75] == '-', \ 630 'LOCUS line does not contain - at position 75 in date:\n' + line 631 632 self.handle.write(line)
633
634 - def _write_references(self, record):
635 number = 0 636 for ref in record.annotations["references"]: 637 if not isinstance(ref, SeqFeature.Reference): 638 continue 639 number += 1 640 data = str(number) 641 # TODO - support more complex record reference locations? 642 if ref.location and len(ref.location) == 1: 643 a = Alphabet._get_base_alphabet(record.seq.alphabet) 644 if isinstance(a, Alphabet.ProteinAlphabet): 645 units = "residues" 646 else: 647 units = "bases" 648 data += " (%s %i to %i)" % (units, 649 ref.location[0].nofuzzy_start + 1, 650 ref.location[0].nofuzzy_end) 651 self._write_single_line("REFERENCE", data) 652 if ref.authors: 653 # We store the AUTHORS data as a single string 654 self._write_multi_line(" AUTHORS", ref.authors) 655 if ref.consrtm: 656 # We store the consortium as a single string 657 self._write_multi_line(" CONSRTM", ref.consrtm) 658 if ref.title: 659 # We store the title as a single string 660 self._write_multi_line(" TITLE", ref.title) 661 if ref.journal: 662 # We store this as a single string - holds the journal name, 663 # volume, year, and page numbers of the citation 664 self._write_multi_line(" JOURNAL", ref.journal) 665 if ref.medline_id: 666 # This line type is obsolete and was removed from the GenBank 667 # flatfile format in April 2005. Should we write it? 668 # Note this has a two space indent: 669 self._write_multi_line(" MEDLINE", ref.medline_id) 670 if ref.pubmed_id: 671 # Note this has a THREE space indent: 672 self._write_multi_line(" PUBMED", ref.pubmed_id) 673 if ref.comment: 674 self._write_multi_line(" REMARK", ref.comment)
675
676 - def _write_comment(self, record):
677 # This is a bit complicated due to the range of possible 678 # ways people might have done their annotation... 679 # Currently the parser uses a single string with newlines. 680 # A list of lines is also reasonable. 681 # A single (long) string is perhaps the most natural of all. 682 # This means we may need to deal with line wrapping. 683 comment = record.annotations["comment"] 684 if isinstance(comment, basestring): 685 lines = comment.split("\n") 686 elif isinstance(comment, list) or isinstance(comment, tuple): 687 lines = comment 688 else: 689 raise ValueError("Could not understand comment annotation") 690 self._write_multi_line("COMMENT", lines[0]) 691 for line in lines[1:]: 692 self._write_multi_line("", line)
693
694 - def _write_contig(self, record):
695 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 696 lines = self._split_contig(record, max_len) 697 self._write_single_line("CONTIG", lines[0]) 698 for text in lines[1:]: 699 self._write_single_line("", text)
700
701 - def _write_sequence(self, record):
702 # Loosely based on code from Howard Salis 703 # TODO - Force lower case? 704 LETTERS_PER_LINE = 60 705 SEQUENCE_INDENT = 9 706 707 if isinstance(record.seq, UnknownSeq): 708 # We have already recorded the length, and there is no need 709 # to record a long sequence of NNNNNNN...NNN or whatever. 710 if "contig" in record.annotations: 711 self._write_contig(record) 712 else: 713 self.handle.write("ORIGIN\n") 714 return 715 716 # Catches sequence being None: 717 data = self._get_seq_string(record).lower() 718 seq_len = len(data) 719 self.handle.write("ORIGIN\n") 720 for line_number in range(0, seq_len, LETTERS_PER_LINE): 721 self.handle.write(str(line_number + 1).rjust(SEQUENCE_INDENT)) 722 for words in range(line_number, 723 min(line_number + LETTERS_PER_LINE, seq_len), 10): 724 self.handle.write(" %s" % data[words:words + 10]) 725 self.handle.write("\n")
726
727 - def write_record(self, record):
728 """Write a single record to the output file.""" 729 handle = self.handle 730 self._write_the_first_line(record) 731 732 accession = self._get_annotation_str(record, "accession", 733 record.id.split(".", 1)[0], 734 just_first=True) 735 acc_with_version = accession 736 if record.id.startswith(accession + "."): 737 try: 738 acc_with_version = "%s.%i" \ 739 % (accession, 740 int(record.id.split(".", 1)[1])) 741 except ValueError: 742 pass 743 gi = self._get_annotation_str(record, "gi", just_first=True) 744 745 descr = record.description 746 if descr == "<unknown description>": 747 descr = "." 748 self._write_multi_line("DEFINITION", descr) 749 750 self._write_single_line("ACCESSION", accession) 751 if gi != ".": 752 self._write_single_line("VERSION", "%s GI:%s" 753 % (acc_with_version, gi)) 754 else: 755 self._write_single_line("VERSION", "%s" % (acc_with_version)) 756 757 # The NCBI only expect two types of link so far, 758 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 759 # TODO - Filter the dbxrefs list to just these? 760 self._write_multi_entries("DBLINK", record.dbxrefs) 761 762 try: 763 # List of strings 764 # Keywords should be given separated with semi colons, 765 keywords = "; ".join(record.annotations["keywords"]) 766 # with a trailing period: 767 if not keywords.endswith("."): 768 keywords += "." 769 except KeyError: 770 # If no keywords, there should be just a period: 771 keywords = "." 772 self._write_multi_line("KEYWORDS", keywords) 773 774 if "segment" in record.annotations: 775 # Deal with SEGMENT line found only in segmented records, 776 # e.g. AH000819 777 segment = record.annotations["segment"] 778 if isinstance(segment, list): 779 assert len(segment) == 1, segment 780 segment = segment[0] 781 self._write_single_line("SEGMENT", segment) 782 783 self._write_multi_line("SOURCE", 784 self._get_annotation_str(record, "source")) 785 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 786 org = self._get_annotation_str(record, "organism") 787 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 788 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 789 self._write_single_line(" ORGANISM", org) 790 try: 791 # List of strings 792 # Taxonomy should be given separated with semi colons, 793 taxonomy = "; ".join(record.annotations["taxonomy"]) 794 # with a trailing period: 795 if not taxonomy.endswith("."): 796 taxonomy += "." 797 except KeyError: 798 taxonomy = "." 799 self._write_multi_line("", taxonomy) 800 801 if "references" in record.annotations: 802 self._write_references(record) 803 804 if "comment" in record.annotations: 805 self._write_comment(record) 806 807 handle.write("FEATURES Location/Qualifiers\n") 808 rec_length = len(record) 809 for feature in record.features: 810 self._write_feature(feature, rec_length) 811 self._write_sequence(record) 812 handle.write("//\n")
813 814
815 -class EmblWriter(_InsdcWriter):
816 HEADER_WIDTH = 5 817 QUALIFIER_INDENT = 21 818 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 819 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 820 FEATURE_HEADER = "FH Key Location/Qualifiers\n" 821
822 - def _write_contig(self, record):
823 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 824 lines = self._split_contig(record, max_len) 825 for text in lines: 826 self._write_single_line("CO", text)
827
828 - def _write_sequence(self, record):
829 LETTERS_PER_BLOCK = 10 830 BLOCKS_PER_LINE = 6 831 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 832 POSITION_PADDING = 10 833 handle = self.handle # save looking up this multiple times 834 835 if isinstance(record.seq, UnknownSeq): 836 # We have already recorded the length, and there is no need 837 # to record a long sequence of NNNNNNN...NNN or whatever. 838 if "contig" in record.annotations: 839 self._write_contig(record) 840 else: 841 # TODO - Can the sequence just be left out as in GenBank files? 842 handle.write("SQ \n") 843 return 844 845 # Catches sequence being None 846 data = self._get_seq_string(record).lower() 847 seq_len = len(data) 848 849 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 850 a = Alphabet._get_base_alphabet(record.seq.alphabet) 851 if isinstance(a, Alphabet.DNAAlphabet): 852 # TODO - What if we have RNA? 853 a_count = data.count('A') + data.count('a') 854 c_count = data.count('C') + data.count('c') 855 g_count = data.count('G') + data.count('g') 856 t_count = data.count('T') + data.count('t') 857 other = seq_len - (a_count + c_count + g_count + t_count) 858 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 859 % (seq_len, a_count, c_count, g_count, t_count, other)) 860 else: 861 handle.write("SQ \n") 862 863 for line_number in range(0, seq_len // LETTERS_PER_LINE): 864 handle.write(" ") # Just four, not five 865 for block in range(BLOCKS_PER_LINE): 866 index = LETTERS_PER_LINE * line_number + \ 867 LETTERS_PER_BLOCK * block 868 handle.write((" %s" % data[index:index + LETTERS_PER_BLOCK])) 869 handle.write(str((line_number + 1) 870 * LETTERS_PER_LINE).rjust(POSITION_PADDING)) 871 handle.write("\n") 872 if seq_len % LETTERS_PER_LINE: 873 # Final (partial) line 874 line_number = (seq_len // LETTERS_PER_LINE) 875 handle.write(" ") # Just four, not five 876 for block in range(BLOCKS_PER_LINE): 877 index = LETTERS_PER_LINE * line_number + \ 878 LETTERS_PER_BLOCK * block 879 handle.write( 880 (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(11)) 881 handle.write(str(seq_len).rjust(POSITION_PADDING)) 882 handle.write("\n")
883
884 - def _write_single_line(self, tag, text):
885 assert len(tag) == 2 886 line = tag + " " + text 887 if len(text) > self.MAX_WIDTH: 888 warnings.warn("Line %r too long" % line, BiopythonWarning) 889 self.handle.write(line + "\n")
890
891 - def _write_multi_line(self, tag, text):
892 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 893 lines = self._split_multi_line(text, max_len) 894 for line in lines: 895 self._write_single_line(tag, line)
896
897 - def _write_the_first_lines(self, record):
898 """Write the ID and AC lines.""" 899 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 900 version = "SV " + record.id.rsplit(".", 1)[1] 901 accession = self._get_annotation_str(record, "accession", 902 record.id.rsplit(".", 1)[0], 903 just_first=True) 904 else: 905 version = "" 906 accession = self._get_annotation_str(record, "accession", 907 record.id, 908 just_first=True) 909 910 if ";" in accession: 911 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 912 % repr(str(accession))) 913 if " " in accession: 914 # This is out of practicallity... might it be allowed? 915 raise ValueError("Cannot have spaces in EMBL accession, %s" 916 % repr(str(accession))) 917 918 # Get the molecule type 919 # TODO - record this explicitly in the parser? 920 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 921 a = Alphabet._get_base_alphabet(record.seq.alphabet) 922 if not isinstance(a, Alphabet.Alphabet): 923 raise TypeError("Invalid alphabet") 924 elif isinstance(a, Alphabet.DNAAlphabet): 925 mol_type = "DNA" 926 units = "BP" 927 elif isinstance(a, Alphabet.RNAAlphabet): 928 mol_type = "RNA" 929 units = "BP" 930 elif isinstance(a, Alphabet.ProteinAlphabet): 931 mol_type = "PROTEIN" 932 units = "AA" 933 else: 934 # Must be something like NucleotideAlphabet 935 raise ValueError("Need a DNA, RNA or Protein alphabet") 936 937 # Get the taxonomy division 938 division = self._get_data_division(record) 939 940 # TODO - Full ID line 941 handle = self.handle 942 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 943 # 1. Primary accession number 944 # 2. Sequence version number 945 # 3. Topology: 'circular' or 'linear' 946 # 4. Molecule type 947 # 5. Data class 948 # 6. Taxonomic division 949 # 7. Sequence length 950 self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." 951 % (accession, version, mol_type, 952 division, len(record), units)) 953 handle.write("XX\n") 954 self._write_single_line("AC", accession + ";") 955 handle.write("XX\n")
956
957 - def _get_data_division(self, record):
958 try: 959 division = record.annotations["data_file_division"] 960 except KeyError: 961 division = "UNC" 962 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 963 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 964 "VRL", "XXX"]: 965 # Good, already EMBL style 966 # Division Code 967 # ----------------- ---- 968 # Bacteriophage PHG 969 # Environmental Sample ENV 970 # Fungal FUN 971 # Human HUM 972 # Invertebrate INV 973 # Other Mammal MAM 974 # Other Vertebrate VRT 975 # Mus musculus MUS 976 # Plant PLN 977 # Prokaryote PRO 978 # Other Rodent ROD 979 # Synthetic SYN 980 # Transgenic TGN 981 # Unclassified UNC (i.e. unknown) 982 # Viral VRL 983 # 984 # (plus XXX used for submiting data to EMBL) 985 pass 986 else: 987 # See if this is in GenBank style & can be converted. 988 # Generally a problem as the GenBank groups are wider 989 # than those of EMBL. Note that GenBank use "BCT" for 990 # both bacteria and acherea thus this maps to EMBL's 991 # "PRO" nicely. 992 gbk_to_embl = {"BCT": "PRO", 993 "UNK": "UNC", 994 } 995 try: 996 division = gbk_to_embl[division] 997 except KeyError: 998 division = "UNC" 999 assert len(division) == 3 1000 return division
1001
1002 - def _write_keywords(self, record):
1003 # Put the keywords right after DE line. 1004 # Each 'keyword' can have multiple words and spaces, but we 1005 # must not split any 'keyword' between lines. 1006 # TODO - Combine short keywords onto one line 1007 for keyword in record.annotations["keywords"]: 1008 self._write_single_line("KW", keyword) 1009 self.handle.write("XX\n")
1010
1011 - def _write_references(self, record):
1012 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1013 number = 0 1014 for ref in record.annotations["references"]: 1015 if not isinstance(ref, SeqFeature.Reference): 1016 continue 1017 number += 1 1018 self._write_single_line("RN", "[%i]" % number) 1019 # TODO - support for RC line (needed in parser too) 1020 # TODO - support more complex record reference locations? 1021 if ref.location and len(ref.location) == 1: 1022 self._write_single_line( 1023 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1024 ref.location[0].nofuzzy_end)) 1025 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1026 if ref.pubmed_id: 1027 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1028 if ref.consrtm: 1029 self._write_single_line("RG", "%s" % ref.consrtm) 1030 if ref.authors: 1031 # We store the AUTHORS data as a single string 1032 self._write_multi_line("RA", ref.authors + ";") 1033 if ref.title: 1034 # We store the title as a single string 1035 self._write_multi_line("RT", '"%s";' % ref.title) 1036 if ref.journal: 1037 # We store this as a single string - holds the journal name, 1038 # volume, year, and page numbers of the citation 1039 self._write_multi_line("RL", ref.journal) 1040 self.handle.write("XX\n")
1041
1042 - def _write_comment(self, record):
1043 # This is a bit complicated due to the range of possible 1044 # ways people might have done their annotation... 1045 # Currently the parser uses a single string with newlines. 1046 # A list of lines is also reasonable. 1047 # A single (long) string is perhaps the most natural of all. 1048 # This means we may need to deal with line wrapping. 1049 comment = record.annotations["comment"] 1050 if isinstance(comment, basestring): 1051 lines = comment.split("\n") 1052 elif isinstance(comment, list) or isinstance(comment, tuple): 1053 lines = comment 1054 else: 1055 raise ValueError("Could not understand comment annotation") 1056 # TODO - Merge this with the GenBank comment code? 1057 if not lines: 1058 return 1059 for line in lines: 1060 self._write_multi_line("CC", line) 1061 self.handle.write("XX\n")
1062
1063 - def write_record(self, record):
1064 """Write a single record to the output file.""" 1065 1066 handle = self.handle 1067 self._write_the_first_lines(record) 1068 1069 # PR line (0 or 1 lines only), project identifier 1070 for xref in record.dbxrefs: 1071 if xref.startswith("Project:"): 1072 self._write_single_line("PR", xref + ";") 1073 handle.write("XX\n") 1074 break 1075 1076 # TODO - DT lines (date) 1077 1078 descr = record.description 1079 if descr == "<unknown description>": 1080 descr = "." 1081 self._write_multi_line("DE", descr) 1082 handle.write("XX\n") 1083 1084 if "keywords" in record.annotations: 1085 self._write_keywords(record) 1086 1087 # Should this be "source" or "organism"? 1088 self._write_multi_line( 1089 "OS", self._get_annotation_str(record, "organism")) 1090 try: 1091 # List of strings 1092 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1093 except KeyError: 1094 taxonomy = "." 1095 self._write_multi_line("OC", taxonomy) 1096 handle.write("XX\n") 1097 1098 if "references" in record.annotations: 1099 self._write_references(record) 1100 1101 if "comment" in record.annotations: 1102 self._write_comment(record) 1103 1104 handle.write(self.FEATURE_HEADER) 1105 rec_length = len(record) 1106 for feature in record.features: 1107 self._write_feature(feature, rec_length) 1108 handle.write("XX\n") 1109 1110 self._write_sequence(record) 1111 handle.write("//\n")
1112 1113
1114 -class ImgtWriter(EmblWriter):
1115 HEADER_WIDTH = 5 1116 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1117 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1118 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1119 FEATURE_HEADER = "FH Key Location/Qualifiers\n"
1120 1121 if __name__ == "__main__": 1122 print("Quick self test") 1123 import os 1124 from Bio._py3k import StringIO 1125
1126 - def compare_record(old, new):
1127 if old.id != new.id and old.name != new.name: 1128 raise ValueError("'%s' or '%s' vs '%s' or '%s' records" 1129 % (old.id, old.name, new.id, new.name)) 1130 if len(old.seq) != len(new.seq): 1131 raise ValueError("%i vs %i" % (len(old.seq), len(new.seq))) 1132 if str(old.seq).upper() != str(new.seq).upper(): 1133 if len(old.seq) < 200: 1134 raise ValueError("'%s' vs '%s'" % (old.seq, new.seq)) 1135 else: 1136 raise ValueError( 1137 "'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100])) 1138 if old.features and new.features: 1139 return compare_features(old.features, new.features) 1140 # Just insist on at least one word in common: 1141 if (old.description or new.description) \ 1142 and not set(old.description.split()).intersection(new.description.split()): 1143 raise ValueError("%s versus %s" 1144 % (repr(old.description), repr(new.description))) 1145 # TODO - check annotation 1146 if "contig" in old.annotations: 1147 assert old.annotations["contig"] == \ 1148 new.annotations["contig"] 1149 return True
1150
1151 - def compare_records(old_list, new_list):
1152 """Check two lists of SeqRecords agree, raises a ValueError if mismatch.""" 1153 if len(old_list) != len(new_list): 1154 raise ValueError( 1155 "%i vs %i records" % (len(old_list), len(new_list))) 1156 for old, new in zip(old_list, new_list): 1157 if not compare_record(old, new): 1158 return False 1159 return True
1160
1161 - def compare_feature(old, new, ignore_sub_features=False):
1162 """Check two SeqFeatures agree.""" 1163 if old.type != new.type: 1164 raise ValueError("Type %s versus %s" % (old.type, new.type)) 1165 if old.location.nofuzzy_start != new.location.nofuzzy_start \ 1166 or old.location.nofuzzy_end != new.location.nofuzzy_end: 1167 raise ValueError("%s versus %s:\n%s\nvs:\n%s" 1168 % (old.location, new.location, str(old), str(new))) 1169 if old.strand != new.strand: 1170 raise ValueError( 1171 "Different strand:\n%s\nvs:\n%s" % (str(old), str(new))) 1172 if old.location.start != new.location.start: 1173 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" 1174 % (old.location.start, new.location.start, str(old), str(new))) 1175 if old.location.end != new.location.end: 1176 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" 1177 % (old.location.end, new.location.end, str(old), str(new))) 1178 if not ignore_sub_features: 1179 if len(old.sub_features) != len(new.sub_features): 1180 raise ValueError("Different sub features") 1181 for a, b in zip(old.sub_features, new.sub_features): 1182 if not compare_feature(a, b): 1183 return False 1184 # This only checks key shared qualifiers 1185 # Would a white list be easier? 1186 # for key in ["name", "gene", "translation", "codon_table", "codon_start", "locus_tag"]: 1187 for key in set(old.qualifiers).intersection(new.qualifiers): 1188 if key in ["db_xref", "protein_id", "product", "note"]: 1189 # EMBL and GenBank files are use different references/notes/etc 1190 continue 1191 if old.qualifiers[key] != new.qualifiers[key]: 1192 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" 1193 % (key, old.qualifiers[key], new.qualifiers[key])) 1194 return True
1195
1196 - def compare_features(old_list, new_list, ignore_sub_features=False):
1197 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch.""" 1198 if len(old_list) != len(new_list): 1199 raise ValueError( 1200 "%i vs %i features" % (len(old_list), len(new_list))) 1201 for old, new in zip(old_list, new_list): 1202 # This assumes they are in the same order 1203 if not compare_feature(old, new, ignore_sub_features): 1204 return False 1205 return True
1206
1207 - def check_genbank_writer(records):
1208 handle = StringIO() 1209 GenBankWriter(handle).write_file(records) 1210 handle.seek(0) 1211 1212 records2 = list(GenBankIterator(handle)) 1213 assert compare_records(records, records2)
1214
1215 - def check_embl_writer(records):
1216 handle = StringIO() 1217 try: 1218 EmblWriter(handle).write_file(records) 1219 except ValueError as err: 1220 print(err) 1221 return 1222 handle.seek(0) 1223 1224 records2 = list(EmblIterator(handle)) 1225 assert compare_records(records, records2)
1226 1227 for filename in os.listdir("../../Tests/GenBank"): 1228 if not filename.endswith(".gbk") and not filename.endswith(".gb"): 1229 continue 1230 print(filename) 1231 1232 with open("../../Tests/GenBank/%s" % filename) as handle: 1233 records = list(GenBankIterator(handle)) 1234 1235 check_genbank_writer(records) 1236 check_embl_writer(records) 1237 1238 for filename in os.listdir("../../Tests/EMBL"): 1239 if not filename.endswith(".embl"): 1240 continue 1241 print(filename) 1242 1243 with open("../../Tests/EMBL/%s" % filename) as handle: 1244 records = list(EmblIterator(handle)) 1245 1246 check_genbank_writer(records) 1247 check_embl_writer(records) 1248 1249 from Bio import SeqIO 1250 for filename in os.listdir("../../Tests/SwissProt"): 1251 if not filename.startswith("sp"): 1252 continue 1253 print(filename) 1254 1255 with open("../../Tests/SwissProt/%s" % filename) as handle: 1256 records = list(SeqIO.parse(handle, "swiss")) 1257 1258 check_genbank_writer(records) 1259