Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2011 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  from Bio.Seq import UnknownSeq 
  37  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  38  from Bio import Alphabet 
  39  from .Interfaces import SequentialSequenceWriter 
  40  from Bio import SeqFeature 
  41   
  42  from Bio._py3k import _is_int_or_long 
  43  from Bio._py3k import basestring 
  44   
  45  __docformat__ = "restructuredtext en" 
  46   
  47  # NOTE 
  48  # ==== 
  49  # The "brains" for parsing GenBank, EMBL and IMGT files (and any 
  50  # other flat file variants from the INSDC in future) is in 
  51  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
  52  # However, all the writing code is in this file. 
  53   
  54   
55 -def GenBankIterator(handle):
56 """Breaks up a Genbank file into SeqRecord objects. 57 58 Every section from the LOCUS line to the terminating // becomes 59 a single SeqRecord with associated annotation and features. 60 61 Note that for genomes or chromosomes, there is typically only 62 one record.""" 63 # This calls a generator function: 64 return GenBankScanner(debug=0).parse_records(handle)
65 66
67 -def EmblIterator(handle):
68 """Breaks up an EMBL file into SeqRecord objects. 69 70 Every section from the LOCUS line to the terminating // becomes 71 a single SeqRecord with associated annotation and features. 72 73 Note that for genomes or chromosomes, there is typically only 74 one record.""" 75 # This calls a generator function: 76 return EmblScanner(debug=0).parse_records(handle)
77 78
79 -def ImgtIterator(handle):
80 """Breaks up an IMGT file into SeqRecord objects. 81 82 Every section from the LOCUS line to the terminating // becomes 83 a single SeqRecord with associated annotation and features. 84 85 Note that for genomes or chromosomes, there is typically only 86 one record.""" 87 # This calls a generator function: 88 return _ImgtScanner(debug=0).parse_records(handle)
89 90
91 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
92 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 93 94 Every section from the LOCUS line to the terminating // can contain 95 many CDS features. These are returned as with the stated amino acid 96 translation sequence (if given). 97 """ 98 # This calls a generator function: 99 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
100 101
102 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
103 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 104 105 Every section from the LOCUS line to the terminating // can contain 106 many CDS features. These are returned as with the stated amino acid 107 translation sequence (if given). 108 """ 109 # This calls a generator function: 110 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
111 112
113 -def _insdc_feature_position_string(pos, offset=0):
114 """Build a GenBank/EMBL position string (PRIVATE). 115 116 Use offset=1 to add one to convert a start position from python counting. 117 """ 118 if isinstance(pos, SeqFeature.ExactPosition): 119 return "%i" % (pos.position + offset) 120 elif isinstance(pos, SeqFeature.WithinPosition): 121 return "(%i.%i)" % (pos.position + offset, 122 pos.position + pos.extension + offset) 123 elif isinstance(pos, SeqFeature.BetweenPosition): 124 return "(%i^%i)" % (pos.position + offset, 125 pos.position + pos.extension + offset) 126 elif isinstance(pos, SeqFeature.BeforePosition): 127 return "<%i" % (pos.position + offset) 128 elif isinstance(pos, SeqFeature.AfterPosition): 129 return ">%i" % (pos.position + offset) 130 elif isinstance(pos, SeqFeature.OneOfPosition): 131 return "one-of(%s)" \ 132 % ",".join(_insdc_feature_position_string(p, offset) 133 for p in pos.position_choices) 134 elif isinstance(pos, SeqFeature.AbstractPosition): 135 raise NotImplementedError("Please report this as a bug in Biopython.") 136 else: 137 raise ValueError("Expected a SeqFeature position object.")
138 139
140 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
141 if location.ref: 142 ref = "%s:" % location.ref 143 else: 144 ref = "" 145 assert not location.ref_db 146 if isinstance(location.start, SeqFeature.ExactPosition) \ 147 and isinstance(location.end, SeqFeature.ExactPosition) \ 148 and location.start.position == location.end.position: 149 # Special case, for 12:12 return 12^13 150 # (a zero length slice, meaning the point between two letters) 151 if location.end.position == rec_length: 152 # Very special case, for a between position at the end of a 153 # sequence (used on some circular genomes, Bug 3098) we have 154 # N:N so return N^1 155 return "%s%i^1" % (ref, rec_length) 156 else: 157 return "%s%i^%i" % (ref, location.end.position, 158 location.end.position + 1) 159 if isinstance(location.start, SeqFeature.ExactPosition) \ 160 and isinstance(location.end, SeqFeature.ExactPosition) \ 161 and location.start.position + 1 == location.end.position: 162 # Special case, for 11:12 return 12 rather than 12..12 163 # (a length one slice, meaning a single letter) 164 return "%s%i" % (ref, location.end.position) 165 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 166 or isinstance(location.end, SeqFeature.UnknownPosition): 167 # Special case for features from SwissProt/UniProt files 168 if isinstance(location.start, SeqFeature.UnknownPosition) \ 169 and isinstance(location.end, SeqFeature.UnknownPosition): 170 # import warnings 171 # warnings.warn("Feature with unknown location") 172 # return "?" 173 raise ValueError("Feature with unknown location") 174 elif isinstance(location.start, SeqFeature.UnknownPosition): 175 # Treat the unknown start position as a BeforePosition 176 return "%s<%i..%s" \ 177 % (ref, 178 location.nofuzzy_end, 179 _insdc_feature_position_string(location.end)) 180 else: 181 # Treat the unknown end position as an AfterPosition 182 return "%s%s..>%i" \ 183 % (ref, 184 _insdc_feature_position_string(location.start), 185 location.nofuzzy_start) 186 else: 187 # Typical case, e.g. 12..15 gets mapped to 11:15 188 return ref \ 189 + _insdc_feature_position_string(location.start, +1) \ 190 + ".." + \ 191 _insdc_feature_position_string(location.end)
192 193
194 -def _insdc_location_string(location, rec_length):
195 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 196 197 There is a choice of how to show joins on the reverse complement strand, 198 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 199 "join(complement(20,100),complement(1,10))" instead (but appears to have 200 now adopted the GenBank convention). Notice that the order of the entries 201 is reversed! This function therefore uses the first form. In this situation 202 we expect the CompoundFeatureLocation and its parts to all be marked as 203 strand == -1, and to be in the order 19:100 then 0:10. 204 """ 205 try: 206 parts = location.parts 207 # CompoundFeatureLocation 208 if location.strand == -1: 209 # Special case, put complement outside the join/order/... and reverse order 210 return "complement(%s(%s))" % (location.operator, 211 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) 212 for p in parts[::-1])) 213 else: 214 return "%s(%s)" % (location.operator, 215 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 216 except AttributeError: 217 # Simple FeatureLocation 218 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 219 if location.strand == -1: 220 return "complement(%s)" % loc 221 else: 222 return loc
223 224
225 -def _insdc_feature_location_string(feature, rec_length):
226 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE, OBSOLETE). 227 228 There is a choice of how to show joins on the reverse complement strand, 229 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 230 "join(complement(20,100),complement(1,10))" instead (but appears to have 231 now adopted the GenBank convention). Notice that the order of the entries 232 is reversed! This function therefore uses the first form. In this situation 233 we expect the parent feature and the two children to all be marked as 234 strand == -1, and in the order 0:10 then 19:100. 235 236 Also need to consider dual-strand examples like these from the Arabidopsis 237 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 238 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) 239 which is further complicated by a splice: 240 join(complement(69611..69724),139856..140087,140625..140650) 241 242 For this mixed strand feature, the parent SeqFeature should have 243 no strand (either 0 or None) while the child features should have either 244 strand +1 or -1 as appropriate, and be listed in the order given here. 245 """ 246 # Using private variable to avoid deprecation warning 247 if not feature._sub_features: 248 # Non-recursive. 249 # assert feature.location_operator == "", \ 250 # "%s has no subfeatures but location_operator %s" \ 251 # % (repr(feature), feature.location_operator) 252 location = _insdc_location_string_ignoring_strand_and_subfeatures( 253 feature.location, rec_length) 254 if feature.strand == -1: 255 location = "complement(%s)" % location 256 return location 257 # As noted above, treat reverse complement strand features carefully: 258 if feature.strand == -1: 259 for f in feature._sub_features: 260 if f.strand != -1: 261 raise ValueError("Inconsistent strands: %r for parent, %r for child" 262 % (feature.strand, f.strand)) 263 return "complement(%s(%s))" \ 264 % (feature.location_operator, 265 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f.location, rec_length) 266 for f in feature._sub_features)) 267 # if feature.strand == +1: 268 # for f in feature.sub_features: 269 # assert f.strand == +1 270 # This covers typical forward strand features, and also an evil mixed strand: 271 assert feature.location_operator != "" 272 return "%s(%s)" % (feature.location_operator, 273 ",".join(_insdc_feature_location_string(f, rec_length) 274 for f in feature._sub_features))
275 276
277 -class _InsdcWriter(SequentialSequenceWriter):
278 """Base class for GenBank and EMBL writers (PRIVATE).""" 279 MAX_WIDTH = 80 280 QUALIFIER_INDENT = 21 281 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 282 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 283 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 284 "direction", "estimated_length", "mod_base", "number", 285 "rpt_type", "rpt_unit_range", "tag_peptide", 286 "transl_except", "transl_table") 287
288 - def _write_feature_qualifier(self, key, value=None, quote=None):
289 if not value: 290 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 291 return 292 # Quick hack with no line wrapping, may be useful for testing: 293 # self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 294 if quote is None: 295 # Try to mimic unwritten rules about when quotes can be left out: 296 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 297 quote = False 298 else: 299 quote = True 300 if quote: 301 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 302 else: 303 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 304 if len(line) <= self.MAX_WIDTH: 305 self.handle.write(line + "\n") 306 return 307 while line.lstrip(): 308 if len(line) <= self.MAX_WIDTH: 309 self.handle.write(line + "\n") 310 return 311 # Insert line break... 312 for index in range(min(len(line) - 1, self.MAX_WIDTH), 313 self.QUALIFIER_INDENT + 1, -1): 314 if line[index] == " ": 315 break 316 if line[index] != " ": 317 # No nice place to break... 318 index = self.MAX_WIDTH 319 assert index <= self.MAX_WIDTH 320 self.handle.write(line[:index] + "\n") 321 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
322
323 - def _wrap_location(self, location):
324 """Split a feature location into lines (break at commas).""" 325 # TODO - Rewrite this not to recurse! 326 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 327 if len(location) <= length: 328 return location 329 index = location[:length].rfind(",") 330 if index == -1: 331 # No good place to split (!) 332 import warnings 333 warnings.warn("Couldn't split location:\n%s" % location) 334 return location 335 return location[:index + 1] + "\n" + \ 336 self.QUALIFIER_INDENT_STR + \ 337 self._wrap_location(location[index + 1:])
338
339 - def _write_feature(self, feature, record_length):
340 """Write a single SeqFeature object to features table.""" 341 assert feature.type, feature 342 location = _insdc_location_string(feature.location, record_length) 343 f_type = feature.type.replace(" ", "_") 344 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 345 + self._wrap_location(location) + "\n" 346 self.handle.write(line) 347 # Now the qualifiers... 348 for key, values in feature.qualifiers.items(): 349 if isinstance(values, list) or isinstance(values, tuple): 350 for value in values: 351 self._write_feature_qualifier(key, value) 352 elif values: 353 # String, int, etc 354 self._write_feature_qualifier(key, values) 355 else: 356 # e.g. a /pseudo entry 357 self._write_feature_qualifier(key)
358
359 - def _get_annotation_str(self, record, key, default=".", just_first=False):
360 """Get an annotation dictionary entry (as a string). 361 362 Some entries are lists, in which case if just_first=True the first entry 363 is returned. If just_first=False (default) this verifies there is only 364 one entry before returning it.""" 365 try: 366 answer = record.annotations[key] 367 except KeyError: 368 return default 369 if isinstance(answer, list): 370 if not just_first: 371 assert len(answer) == 1 372 return str(answer[0]) 373 else: 374 return str(answer)
375
376 - def _split_multi_line(self, text, max_len):
377 """Returns a list of strings. 378 379 Any single words which are too long get returned as a whole line 380 (e.g. URLs) without an exception or warning. 381 """ 382 # TODO - Do the line spliting while preserving white space? 383 text = text.strip() 384 if len(text) <= max_len: 385 return [text] 386 387 words = text.split() 388 text = "" 389 while words and len(text) + 1 + len(words[0]) <= max_len: 390 text += " " + words.pop(0) 391 text = text.strip() 392 # assert len(text) <= max_len 393 answer = [text] 394 while words: 395 text = words.pop(0) 396 while words and len(text) + 1 + len(words[0]) <= max_len: 397 text += " " + words.pop(0) 398 text = text.strip() 399 # assert len(text) <= max_len 400 answer.append(text) 401 assert not words 402 return answer
403
404 - def _split_contig(self, record, max_len):
405 """Returns a list of strings, splits on commas.""" 406 # TODO - Merge this with _write_multi_line method? 407 # It would need the addition of the comma splitting logic... 408 # are there any other cases where that would be sensible? 409 contig = record.annotations.get("contig", "") 410 if isinstance(contig, list) or isinstance(contig, tuple): 411 contig = "".join(contig) 412 contig = self.clean(contig) 413 answer = [] 414 while contig: 415 if len(contig) > max_len: 416 # Split lines at the commas 417 pos = contig[:max_len - 1].rfind(",") 418 if pos == -1: 419 raise ValueError("Could not break up CONTIG") 420 text, contig = contig[:pos + 1], contig[pos + 1:] 421 else: 422 text, contig = contig, "" 423 answer.append(text) 424 return answer
425 426
427 -class GenBankWriter(_InsdcWriter):
428 HEADER_WIDTH = 12 429 QUALIFIER_INDENT = 21 430
431 - def _write_single_line(self, tag, text):
432 """Used in the 'header' of each GenBank record.""" 433 assert len(tag) < self.HEADER_WIDTH 434 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 435 import warnings 436 from Bio import BiopythonWarning 437 warnings.warn("Annotation %r too long for %r line" % (text, tag), 438 BiopythonWarning) 439 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 440 text.replace("\n", " ")))
441
442 - def _write_multi_line(self, tag, text):
443 """Used in the 'header' of each GenBank record.""" 444 # TODO - Do the line spliting while preserving white space? 445 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 446 lines = self._split_multi_line(text, max_len) 447 self._write_single_line(tag, lines[0]) 448 for line in lines[1:]: 449 self._write_single_line("", line)
450
451 - def _write_multi_entries(self, tag, text_list):
452 # used for DBLINK and any similar later line types. 453 # If the list of strings is empty, nothing is written. 454 for i, text in enumerate(text_list): 455 if i == 0: 456 self._write_single_line(tag, text) 457 else: 458 self._write_single_line("", text)
459
460 - def _get_date(self, record):
461 default = "01-JAN-1980" 462 try: 463 date = record.annotations["date"] 464 except KeyError: 465 return default 466 # Cope with a list of one string: 467 if isinstance(date, list) and len(date) == 1: 468 date = date[0] 469 # TODO - allow a Python date object 470 if not isinstance(date, basestring) or len(date) != 11 \ 471 or date[2] != "-" or date[6] != "-" \ 472 or not date[:2].isdigit() or not date[7:].isdigit() \ 473 or int(date[:2]) > 31 \ 474 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 475 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 476 # TODO - Check is a valid date (e.g. not 31 Feb) 477 return default 478 return date
479
480 - def _get_data_division(self, record):
481 try: 482 division = record.annotations["data_file_division"] 483 except KeyError: 484 division = "UNK" 485 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 486 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 487 "GSS", "HTG", "HTC", "ENV", "CON"]: 488 # Good, already GenBank style 489 # PRI - primate sequences 490 # ROD - rodent sequences 491 # MAM - other mammalian sequences 492 # VRT - other vertebrate sequences 493 # INV - invertebrate sequences 494 # PLN - plant, fungal, and algal sequences 495 # BCT - bacterial sequences [plus archea] 496 # VRL - viral sequences 497 # PHG - bacteriophage sequences 498 # SYN - synthetic sequences 499 # UNA - unannotated sequences 500 # EST - EST sequences (expressed sequence tags) 501 # PAT - patent sequences 502 # STS - STS sequences (sequence tagged sites) 503 # GSS - GSS sequences (genome survey sequences) 504 # HTG - HTGS sequences (high throughput genomic sequences) 505 # HTC - HTC sequences (high throughput cDNA sequences) 506 # ENV - Environmental sampling sequences 507 # CON - Constructed sequences 508 # 509 # (plus UNK for unknown) 510 pass 511 else: 512 # See if this is in EMBL style: 513 # Division Code 514 # ----------------- ---- 515 # Bacteriophage PHG - common 516 # Environmental Sample ENV - common 517 # Fungal FUN - map to PLN (plants + fungal) 518 # Human HUM - map to PRI (primates) 519 # Invertebrate INV - common 520 # Other Mammal MAM - common 521 # Other Vertebrate VRT - common 522 # Mus musculus MUS - map to ROD (rodent) 523 # Plant PLN - common 524 # Prokaryote PRO - map to BCT (poor name) 525 # Other Rodent ROD - common 526 # Synthetic SYN - common 527 # Transgenic TGN - ??? map to SYN ??? 528 # Unclassified UNC - map to UNK 529 # Viral VRL - common 530 # 531 # (plus XXX for submiting which we can map to UNK) 532 embl_to_gbk = {"FUN": "PLN", 533 "HUM": "PRI", 534 "MUS": "ROD", 535 "PRO": "BCT", 536 "UNC": "UNK", 537 "XXX": "UNK", 538 } 539 try: 540 division = embl_to_gbk[division] 541 except KeyError: 542 division = "UNK" 543 assert len(division) == 3 544 return division
545
546 - def _write_the_first_line(self, record):
547 """Write the LOCUS line.""" 548 549 locus = record.name 550 if not locus or locus == "<unknown name>": 551 locus = record.id 552 if not locus or locus == "<unknown id>": 553 locus = self._get_annotation_str( 554 record, "accession", just_first=True) 555 if len(locus) > 16: 556 raise ValueError("Locus identifier %r is too long" % str(locus)) 557 558 if len(record) > 99999999999: 559 # Currently GenBank only officially support up to 350000, but 560 # the length field can take eleven digits 561 raise ValueError("Sequence too long!") 562 563 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 564 a = Alphabet._get_base_alphabet(record.seq.alphabet) 565 if not isinstance(a, Alphabet.Alphabet): 566 raise TypeError("Invalid alphabet") 567 elif isinstance(a, Alphabet.ProteinAlphabet): 568 units = "aa" 569 elif isinstance(a, Alphabet.NucleotideAlphabet): 570 units = "bp" 571 else: 572 # Must be something like NucleotideAlphabet or 573 # just the generic Alphabet (default for fasta files) 574 raise ValueError("Need a Nucleotide or Protein alphabet") 575 576 # Get the molecule type 577 # TODO - record this explicitly in the parser? 578 if isinstance(a, Alphabet.ProteinAlphabet): 579 mol_type = "" 580 elif isinstance(a, Alphabet.DNAAlphabet): 581 mol_type = "DNA" 582 elif isinstance(a, Alphabet.RNAAlphabet): 583 mol_type = "RNA" 584 else: 585 # Must be something like NucleotideAlphabet or 586 # just the generic Alphabet (default for fasta files) 587 raise ValueError("Need a DNA, RNA or Protein alphabet") 588 589 division = self._get_data_division(record) 590 591 assert len(units) == 2 592 assert len(division) == 3 593 # TODO - date 594 # TODO - mol_type 595 line = "LOCUS %s %s %s %s %s %s\n" \ 596 % (locus.ljust(16), 597 str(len(record)).rjust(11), 598 units, 599 mol_type.ljust(6), 600 division, 601 self._get_date(record)) 602 assert len(line) == 79 + 1, repr(line) # plus one for new line 603 604 assert line[12:28].rstrip() == locus, \ 605 'LOCUS line does not contain the locus at the expected position:\n' + line 606 assert line[28:29] == " " 607 assert line[29:40].lstrip() == str(len(record)), \ 608 'LOCUS line does not contain the length at the expected position:\n' + line 609 610 # Tests copied from Bio.GenBank.Scanner 611 assert line[40:44] in [' bp ', ' aa '], \ 612 'LOCUS line does not contain size units at expected position:\n' + \ 613 line 614 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 615 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 616 assert line[47:54].strip() == "" \ 617 or 'DNA' in line[47:54].strip() \ 618 or 'RNA' in line[47:54].strip(), \ 619 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 620 assert line[54:55] == ' ', \ 621 'LOCUS line does not contain space at position 55:\n' + line 622 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 623 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 624 assert line[63:64] == ' ', \ 625 'LOCUS line does not contain space at position 64:\n' + line 626 assert line[67:68] == ' ', \ 627 'LOCUS line does not contain space at position 68:\n' + line 628 assert line[70:71] == '-', \ 629 'LOCUS line does not contain - at position 71 in date:\n' + line 630 assert line[74:75] == '-', \ 631 'LOCUS line does not contain - at position 75 in date:\n' + line 632 633 self.handle.write(line)
634
635 - def _write_references(self, record):
636 number = 0 637 for ref in record.annotations["references"]: 638 if not isinstance(ref, SeqFeature.Reference): 639 continue 640 number += 1 641 data = str(number) 642 # TODO - support more complex record reference locations? 643 if ref.location and len(ref.location) == 1: 644 a = Alphabet._get_base_alphabet(record.seq.alphabet) 645 if isinstance(a, Alphabet.ProteinAlphabet): 646 units = "residues" 647 else: 648 units = "bases" 649 data += " (%s %i to %i)" % (units, 650 ref.location[0].nofuzzy_start + 1, 651 ref.location[0].nofuzzy_end) 652 self._write_single_line("REFERENCE", data) 653 if ref.authors: 654 # We store the AUTHORS data as a single string 655 self._write_multi_line(" AUTHORS", ref.authors) 656 if ref.consrtm: 657 # We store the consortium as a single string 658 self._write_multi_line(" CONSRTM", ref.consrtm) 659 if ref.title: 660 # We store the title as a single string 661 self._write_multi_line(" TITLE", ref.title) 662 if ref.journal: 663 # We store this as a single string - holds the journal name, 664 # volume, year, and page numbers of the citation 665 self._write_multi_line(" JOURNAL", ref.journal) 666 if ref.medline_id: 667 # This line type is obsolete and was removed from the GenBank 668 # flatfile format in April 2005. Should we write it? 669 # Note this has a two space indent: 670 self._write_multi_line(" MEDLINE", ref.medline_id) 671 if ref.pubmed_id: 672 # Note this has a THREE space indent: 673 self._write_multi_line(" PUBMED", ref.pubmed_id) 674 if ref.comment: 675 self._write_multi_line(" REMARK", ref.comment)
676
677 - def _write_comment(self, record):
678 # This is a bit complicated due to the range of possible 679 # ways people might have done their annotation... 680 # Currently the parser uses a single string with newlines. 681 # A list of lines is also reasonable. 682 # A single (long) string is perhaps the most natural of all. 683 # This means we may need to deal with line wrapping. 684 comment = record.annotations["comment"] 685 if isinstance(comment, basestring): 686 lines = comment.split("\n") 687 elif isinstance(comment, list) or isinstance(comment, tuple): 688 lines = comment 689 else: 690 raise ValueError("Could not understand comment annotation") 691 self._write_multi_line("COMMENT", lines[0]) 692 for line in lines[1:]: 693 self._write_multi_line("", line)
694
695 - def _write_contig(self, record):
696 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 697 lines = self._split_contig(record, max_len) 698 self._write_single_line("CONTIG", lines[0]) 699 for text in lines[1:]: 700 self._write_single_line("", text)
701
702 - def _write_sequence(self, record):
703 # Loosely based on code from Howard Salis 704 # TODO - Force lower case? 705 LETTERS_PER_LINE = 60 706 SEQUENCE_INDENT = 9 707 708 if isinstance(record.seq, UnknownSeq): 709 # We have already recorded the length, and there is no need 710 # to record a long sequence of NNNNNNN...NNN or whatever. 711 if "contig" in record.annotations: 712 self._write_contig(record) 713 else: 714 self.handle.write("ORIGIN\n") 715 return 716 717 # Catches sequence being None: 718 data = self._get_seq_string(record).lower() 719 seq_len = len(data) 720 self.handle.write("ORIGIN\n") 721 for line_number in range(0, seq_len, LETTERS_PER_LINE): 722 self.handle.write(str(line_number + 1).rjust(SEQUENCE_INDENT)) 723 for words in range(line_number, 724 min(line_number + LETTERS_PER_LINE, seq_len), 10): 725 self.handle.write(" %s" % data[words:words + 10]) 726 self.handle.write("\n")
727
728 - def write_record(self, record):
729 """Write a single record to the output file.""" 730 handle = self.handle 731 self._write_the_first_line(record) 732 733 accession = self._get_annotation_str(record, "accession", 734 record.id.split(".", 1)[0], 735 just_first=True) 736 acc_with_version = accession 737 if record.id.startswith(accession + "."): 738 try: 739 acc_with_version = "%s.%i" \ 740 % (accession, 741 int(record.id.split(".", 1)[1])) 742 except ValueError: 743 pass 744 gi = self._get_annotation_str(record, "gi", just_first=True) 745 746 descr = record.description 747 if descr == "<unknown description>": 748 descr = "." 749 self._write_multi_line("DEFINITION", descr) 750 751 self._write_single_line("ACCESSION", accession) 752 if gi != ".": 753 self._write_single_line("VERSION", "%s GI:%s" 754 % (acc_with_version, gi)) 755 else: 756 self._write_single_line("VERSION", "%s" % (acc_with_version)) 757 758 # The NCBI only expect two types of link so far, 759 # e.g. "Project:28471" and "Trace Assembly Archive:123456" 760 # TODO - Filter the dbxrefs list to just these? 761 self._write_multi_entries("DBLINK", record.dbxrefs) 762 763 try: 764 # List of strings 765 # Keywords should be given separated with semi colons, 766 keywords = "; ".join(record.annotations["keywords"]) 767 # with a trailing period: 768 if not keywords.endswith("."): 769 keywords += "." 770 except KeyError: 771 # If no keywords, there should be just a period: 772 keywords = "." 773 self._write_multi_line("KEYWORDS", keywords) 774 775 if "segment" in record.annotations: 776 # Deal with SEGMENT line found only in segmented records, 777 # e.g. AH000819 778 segment = record.annotations["segment"] 779 if isinstance(segment, list): 780 assert len(segment) == 1, segment 781 segment = segment[0] 782 self._write_single_line("SEGMENT", segment) 783 784 self._write_multi_line("SOURCE", 785 self._get_annotation_str(record, "source")) 786 # The ORGANISM line MUST be a single line, as any continuation is the taxonomy 787 org = self._get_annotation_str(record, "organism") 788 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 789 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 790 self._write_single_line(" ORGANISM", org) 791 try: 792 # List of strings 793 # Taxonomy should be given separated with semi colons, 794 taxonomy = "; ".join(record.annotations["taxonomy"]) 795 # with a trailing period: 796 if not taxonomy.endswith("."): 797 taxonomy += "." 798 except KeyError: 799 taxonomy = "." 800 self._write_multi_line("", taxonomy) 801 802 if "references" in record.annotations: 803 self._write_references(record) 804 805 if "comment" in record.annotations: 806 self._write_comment(record) 807 808 handle.write("FEATURES Location/Qualifiers\n") 809 rec_length = len(record) 810 for feature in record.features: 811 self._write_feature(feature, rec_length) 812 self._write_sequence(record) 813 handle.write("//\n")
814 815
816 -class EmblWriter(_InsdcWriter):
817 HEADER_WIDTH = 5 818 QUALIFIER_INDENT = 21 819 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 820 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 821 FEATURE_HEADER = "FH Key Location/Qualifiers\n" 822
823 - def _write_contig(self, record):
824 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 825 lines = self._split_contig(record, max_len) 826 for text in lines: 827 self._write_single_line("CO", text)
828
829 - def _write_sequence(self, record):
830 LETTERS_PER_BLOCK = 10 831 BLOCKS_PER_LINE = 6 832 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 833 POSITION_PADDING = 10 834 handle = self.handle # save looking up this multiple times 835 836 if isinstance(record.seq, UnknownSeq): 837 # We have already recorded the length, and there is no need 838 # to record a long sequence of NNNNNNN...NNN or whatever. 839 if "contig" in record.annotations: 840 self._write_contig(record) 841 else: 842 # TODO - Can the sequence just be left out as in GenBank files? 843 handle.write("SQ \n") 844 return 845 846 # Catches sequence being None 847 data = self._get_seq_string(record).lower() 848 seq_len = len(data) 849 850 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 851 a = Alphabet._get_base_alphabet(record.seq.alphabet) 852 if isinstance(a, Alphabet.DNAAlphabet): 853 # TODO - What if we have RNA? 854 a_count = data.count('A') + data.count('a') 855 c_count = data.count('C') + data.count('c') 856 g_count = data.count('G') + data.count('g') 857 t_count = data.count('T') + data.count('t') 858 other = seq_len - (a_count + c_count + g_count + t_count) 859 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 860 % (seq_len, a_count, c_count, g_count, t_count, other)) 861 else: 862 handle.write("SQ \n") 863 864 for line_number in range(0, seq_len // LETTERS_PER_LINE): 865 handle.write(" ") # Just four, not five 866 for block in range(BLOCKS_PER_LINE): 867 index = LETTERS_PER_LINE * line_number + \ 868 LETTERS_PER_BLOCK * block 869 handle.write((" %s" % data[index:index + LETTERS_PER_BLOCK])) 870 handle.write(str((line_number + 1) 871 * LETTERS_PER_LINE).rjust(POSITION_PADDING)) 872 handle.write("\n") 873 if seq_len % LETTERS_PER_LINE: 874 # Final (partial) line 875 line_number = (seq_len // LETTERS_PER_LINE) 876 handle.write(" ") # Just four, not five 877 for block in range(BLOCKS_PER_LINE): 878 index = LETTERS_PER_LINE * line_number + \ 879 LETTERS_PER_BLOCK * block 880 handle.write( 881 (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(11)) 882 handle.write(str(seq_len).rjust(POSITION_PADDING)) 883 handle.write("\n")
884
885 - def _write_single_line(self, tag, text):
886 assert len(tag) == 2 887 line = tag + " " + text 888 if len(text) > self.MAX_WIDTH: 889 import warnings 890 warnings.warn("Line %r too long" % line) 891 self.handle.write(line + "\n")
892
893 - def _write_multi_line(self, tag, text):
894 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 895 lines = self._split_multi_line(text, max_len) 896 for line in lines: 897 self._write_single_line(tag, line)
898
899 - def _write_the_first_lines(self, record):
900 """Write the ID and AC lines.""" 901 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 902 version = "SV " + record.id.rsplit(".", 1)[1] 903 accession = self._get_annotation_str(record, "accession", 904 record.id.rsplit(".", 1)[0], 905 just_first=True) 906 else: 907 version = "" 908 accession = self._get_annotation_str(record, "accession", 909 record.id, 910 just_first=True) 911 912 if ";" in accession: 913 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 914 % repr(str(accession))) 915 if " " in accession: 916 # This is out of practicallity... might it be allowed? 917 raise ValueError("Cannot have spaces in EMBL accession, %s" 918 % repr(str(accession))) 919 920 # Get the molecule type 921 # TODO - record this explicitly in the parser? 922 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 923 a = Alphabet._get_base_alphabet(record.seq.alphabet) 924 if not isinstance(a, Alphabet.Alphabet): 925 raise TypeError("Invalid alphabet") 926 elif isinstance(a, Alphabet.DNAAlphabet): 927 mol_type = "DNA" 928 units = "BP" 929 elif isinstance(a, Alphabet.RNAAlphabet): 930 mol_type = "RNA" 931 units = "BP" 932 elif isinstance(a, Alphabet.ProteinAlphabet): 933 mol_type = "PROTEIN" 934 units = "AA" 935 else: 936 # Must be something like NucleotideAlphabet 937 raise ValueError("Need a DNA, RNA or Protein alphabet") 938 939 # Get the taxonomy division 940 division = self._get_data_division(record) 941 942 # TODO - Full ID line 943 handle = self.handle 944 # ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 945 # 1. Primary accession number 946 # 2. Sequence version number 947 # 3. Topology: 'circular' or 'linear' 948 # 4. Molecule type 949 # 5. Data class 950 # 6. Taxonomic division 951 # 7. Sequence length 952 self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." 953 % (accession, version, mol_type, 954 division, len(record), units)) 955 handle.write("XX\n") 956 self._write_single_line("AC", accession + ";") 957 handle.write("XX\n")
958
959 - def _get_data_division(self, record):
960 try: 961 division = record.annotations["data_file_division"] 962 except KeyError: 963 division = "UNC" 964 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 965 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 966 "VRL", "XXX"]: 967 # Good, already EMBL style 968 # Division Code 969 # ----------------- ---- 970 # Bacteriophage PHG 971 # Environmental Sample ENV 972 # Fungal FUN 973 # Human HUM 974 # Invertebrate INV 975 # Other Mammal MAM 976 # Other Vertebrate VRT 977 # Mus musculus MUS 978 # Plant PLN 979 # Prokaryote PRO 980 # Other Rodent ROD 981 # Synthetic SYN 982 # Transgenic TGN 983 # Unclassified UNC (i.e. unknown) 984 # Viral VRL 985 # 986 # (plus XXX used for submiting data to EMBL) 987 pass 988 else: 989 # See if this is in GenBank style & can be converted. 990 # Generally a problem as the GenBank groups are wider 991 # than those of EMBL. Note that GenBank use "BCT" for 992 # both bacteria and acherea thus this maps to EMBL's 993 # "PRO" nicely. 994 gbk_to_embl = {"BCT": "PRO", 995 "UNK": "UNC", 996 } 997 try: 998 division = gbk_to_embl[division] 999 except KeyError: 1000 division = "UNC" 1001 assert len(division) == 3 1002 return division
1003
1004 - def _write_keywords(self, record):
1005 # Put the keywords right after DE line. 1006 # Each 'keyword' can have multiple words and spaces, but we 1007 # must not split any 'keyword' between lines. 1008 # TODO - Combine short keywords onto one line 1009 for keyword in record.annotations["keywords"]: 1010 self._write_single_line("KW", keyword) 1011 self.handle.write("XX\n")
1012
1013 - def _write_references(self, record):
1014 # The order should be RN, RC, RP, RX, RG, RA, RT, RL 1015 number = 0 1016 for ref in record.annotations["references"]: 1017 if not isinstance(ref, SeqFeature.Reference): 1018 continue 1019 number += 1 1020 self._write_single_line("RN", "[%i]" % number) 1021 # TODO - support for RC line (needed in parser too) 1022 # TODO - support more complex record reference locations? 1023 if ref.location and len(ref.location) == 1: 1024 self._write_single_line( 1025 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1026 ref.location[0].nofuzzy_end)) 1027 # TODO - record any DOI or AGRICOLA identifier in the reference object? 1028 if ref.pubmed_id: 1029 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1030 if ref.consrtm: 1031 self._write_single_line("RG", "%s" % ref.consrtm) 1032 if ref.authors: 1033 # We store the AUTHORS data as a single string 1034 self._write_multi_line("RA", ref.authors + ";") 1035 if ref.title: 1036 # We store the title as a single string 1037 self._write_multi_line("RT", '"%s";' % ref.title) 1038 if ref.journal: 1039 # We store this as a single string - holds the journal name, 1040 # volume, year, and page numbers of the citation 1041 self._write_multi_line("RL", ref.journal) 1042 self.handle.write("XX\n")
1043
1044 - def _write_comment(self, record):
1045 # This is a bit complicated due to the range of possible 1046 # ways people might have done their annotation... 1047 # Currently the parser uses a single string with newlines. 1048 # A list of lines is also reasonable. 1049 # A single (long) string is perhaps the most natural of all. 1050 # This means we may need to deal with line wrapping. 1051 comment = record.annotations["comment"] 1052 if isinstance(comment, basestring): 1053 lines = comment.split("\n") 1054 elif isinstance(comment, list) or isinstance(comment, tuple): 1055 lines = comment 1056 else: 1057 raise ValueError("Could not understand comment annotation") 1058 # TODO - Merge this with the GenBank comment code? 1059 if not lines: 1060 return 1061 for line in lines: 1062 self._write_multi_line("CC", line) 1063 self.handle.write("XX\n")
1064
1065 - def write_record(self, record):
1066 """Write a single record to the output file.""" 1067 1068 handle = self.handle 1069 self._write_the_first_lines(record) 1070 1071 # PR line (0 or 1 lines only), project identifier 1072 for xref in record.dbxrefs: 1073 if xref.startswith("Project:"): 1074 self._write_single_line("PR", xref + ";") 1075 handle.write("XX\n") 1076 break 1077 1078 # TODO - DT lines (date) 1079 1080 descr = record.description 1081 if descr == "<unknown description>": 1082 descr = "." 1083 self._write_multi_line("DE", descr) 1084 handle.write("XX\n") 1085 1086 if "keywords" in record.annotations: 1087 self._write_keywords(record) 1088 1089 # Should this be "source" or "organism"? 1090 self._write_multi_line( 1091 "OS", self._get_annotation_str(record, "organism")) 1092 try: 1093 # List of strings 1094 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1095 except KeyError: 1096 taxonomy = "." 1097 self._write_multi_line("OC", taxonomy) 1098 handle.write("XX\n") 1099 1100 if "references" in record.annotations: 1101 self._write_references(record) 1102 1103 if "comment" in record.annotations: 1104 self._write_comment(record) 1105 1106 handle.write(self.FEATURE_HEADER) 1107 rec_length = len(record) 1108 for feature in record.features: 1109 self._write_feature(feature, rec_length) 1110 1111 self._write_sequence(record) 1112 handle.write("//\n")
1113 1114
1115 -class ImgtWriter(EmblWriter):
1116 HEADER_WIDTH = 5 1117 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1118 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1119 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1120 FEATURE_HEADER = "FH Key Location/Qualifiers\n"
1121 1122 if __name__ == "__main__": 1123 print("Quick self test") 1124 import os 1125 from Bio._py3k import StringIO 1126
1127 - def compare_record(old, new):
1128 if old.id != new.id and old.name != new.name: 1129 raise ValueError("'%s' or '%s' vs '%s' or '%s' records" 1130 % (old.id, old.name, new.id, new.name)) 1131 if len(old.seq) != len(new.seq): 1132 raise ValueError("%i vs %i" % (len(old.seq), len(new.seq))) 1133 if str(old.seq).upper() != str(new.seq).upper(): 1134 if len(old.seq) < 200: 1135 raise ValueError("'%s' vs '%s'" % (old.seq, new.seq)) 1136 else: 1137 raise ValueError( 1138 "'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100])) 1139 if old.features and new.features: 1140 return compare_features(old.features, new.features) 1141 # Just insist on at least one word in common: 1142 if (old.description or new.description) \ 1143 and not set(old.description.split()).intersection(new.description.split()): 1144 raise ValueError("%s versus %s" 1145 % (repr(old.description), repr(new.description))) 1146 # TODO - check annotation 1147 if "contig" in old.annotations: 1148 assert old.annotations["contig"] == \ 1149 new.annotations["contig"] 1150 return True
1151
1152 - def compare_records(old_list, new_list):
1153 """Check two lists of SeqRecords agree, raises a ValueError if mismatch.""" 1154 if len(old_list) != len(new_list): 1155 raise ValueError( 1156 "%i vs %i records" % (len(old_list), len(new_list))) 1157 for old, new in zip(old_list, new_list): 1158 if not compare_record(old, new): 1159 return False 1160 return True
1161
1162 - def compare_feature(old, new, ignore_sub_features=False):
1163 """Check two SeqFeatures agree.""" 1164 if old.type != new.type: 1165 raise ValueError("Type %s versus %s" % (old.type, new.type)) 1166 if old.location.nofuzzy_start != new.location.nofuzzy_start \ 1167 or old.location.nofuzzy_end != new.location.nofuzzy_end: 1168 raise ValueError("%s versus %s:\n%s\nvs:\n%s" 1169 % (old.location, new.location, str(old), str(new))) 1170 if old.strand != new.strand: 1171 raise ValueError( 1172 "Different strand:\n%s\nvs:\n%s" % (str(old), str(new))) 1173 if old.location.start != new.location.start: 1174 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" 1175 % (old.location.start, new.location.start, str(old), str(new))) 1176 if old.location.end != new.location.end: 1177 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" 1178 % (old.location.end, new.location.end, str(old), str(new))) 1179 if not ignore_sub_features: 1180 if len(old.sub_features) != len(new.sub_features): 1181 raise ValueError("Different sub features") 1182 for a, b in zip(old.sub_features, new.sub_features): 1183 if not compare_feature(a, b): 1184 return False 1185 # This only checks key shared qualifiers 1186 # Would a white list be easier? 1187 # for key in ["name", "gene", "translation", "codon_table", "codon_start", "locus_tag"]: 1188 for key in set(old.qualifiers).intersection(new.qualifiers): 1189 if key in ["db_xref", "protein_id", "product", "note"]: 1190 # EMBL and GenBank files are use different references/notes/etc 1191 continue 1192 if old.qualifiers[key] != new.qualifiers[key]: 1193 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" 1194 % (key, old.qualifiers[key], new.qualifiers[key])) 1195 return True
1196
1197 - def compare_features(old_list, new_list, ignore_sub_features=False):
1198 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch.""" 1199 if len(old_list) != len(new_list): 1200 raise ValueError( 1201 "%i vs %i features" % (len(old_list), len(new_list))) 1202 for old, new in zip(old_list, new_list): 1203 # This assumes they are in the same order 1204 if not compare_feature(old, new, ignore_sub_features): 1205 return False 1206 return True
1207
1208 - def check_genbank_writer(records):
1209 handle = StringIO() 1210 GenBankWriter(handle).write_file(records) 1211 handle.seek(0) 1212 1213 records2 = list(GenBankIterator(handle)) 1214 assert compare_records(records, records2)
1215
1216 - def check_embl_writer(records):
1217 handle = StringIO() 1218 try: 1219 EmblWriter(handle).write_file(records) 1220 except ValueError as err: 1221 print(err) 1222 return 1223 handle.seek(0) 1224 1225 records2 = list(EmblIterator(handle)) 1226 assert compare_records(records, records2)
1227 1228 for filename in os.listdir("../../Tests/GenBank"): 1229 if not filename.endswith(".gbk") and not filename.endswith(".gb"): 1230 continue 1231 print(filename) 1232 1233 with open("../../Tests/GenBank/%s" % filename) as handle: 1234 records = list(GenBankIterator(handle)) 1235 1236 check_genbank_writer(records) 1237 check_embl_writer(records) 1238 1239 for filename in os.listdir("../../Tests/EMBL"): 1240 if not filename.endswith(".embl"): 1241 continue 1242 print(filename) 1243 1244 with open("../../Tests/EMBL/%s" % filename) as handle: 1245 records = list(EmblIterator(handle)) 1246 1247 check_genbank_writer(records) 1248 check_embl_writer(records) 1249 1250 from Bio import SeqIO 1251 for filename in os.listdir("../../Tests/SwissProt"): 1252 if not filename.startswith("sp"): 1253 continue 1254 print(filename) 1255 1256 with open("../../Tests/SwissProt/%s" % filename) as handle: 1257 records = list(SeqIO.parse(handle, "swiss")) 1258 1259 check_genbank_writer(records) 1260