Package Bio :: Package SeqIO :: Module InsdcIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.InsdcIO

   1  # Copyright 2007-2011 by Peter Cock.  All rights reserved. 
   2  # 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package.. 
   6   
   7  """Bio.SeqIO support for the "genbank" and "embl" file formats. 
   8   
   9  You are expected to use this module via the Bio.SeqIO functions. 
  10  Note that internally this module calls Bio.GenBank to do the actual 
  11  parsing of GenBank, EMBL and IMGT files. 
  12   
  13  See also: 
  14   
  15  International Nucleotide Sequence Database Collaboration 
  16  http://www.insdc.org/ 
  17   
  18  GenBank 
  19  http://www.ncbi.nlm.nih.gov/Genbank/ 
  20   
  21  EMBL Nucleotide Sequence Database 
  22  http://www.ebi.ac.uk/embl/ 
  23   
  24  DDBJ (DNA Data Bank of Japan) 
  25  http://www.ddbj.nig.ac.jp/ 
  26   
  27  IMGT (use a variant of EMBL format with longer feature indents) 
  28  http://imgt.cines.fr/download/LIGM-DB/userman_doc.html 
  29  http://imgt.cines.fr/download/LIGM-DB/ftable_doc.html 
  30  http://www.ebi.ac.uk/imgt/hla/docs/manual.html 
  31   
  32  """ 
  33   
  34  from __future__ import print_function 
  35   
  36  from Bio.Seq import UnknownSeq 
  37  from Bio.GenBank.Scanner import GenBankScanner, EmblScanner, _ImgtScanner 
  38  from Bio import Alphabet 
  39  from .Interfaces import SequentialSequenceWriter 
  40  from Bio import SeqFeature 
  41   
  42  from Bio._py3k import _is_int_or_long 
  43  from Bio._py3k import basestring 
  44   
  45  # NOTE 
  46  # ==== 
  47  # The "brains" for parsing GenBank, EMBL and IMGT files (and any 
  48  # other flat file variants from the INSDC in future) is in 
  49  # Bio.GenBank.Scanner (plus the _FeatureConsumer in Bio.GenBank) 
  50  # However, all the writing code is in this file. 
  51   
  52   
53 -def GenBankIterator(handle):
54 """Breaks up a Genbank file into SeqRecord objects. 55 56 Every section from the LOCUS line to the terminating // becomes 57 a single SeqRecord with associated annotation and features. 58 59 Note that for genomes or chromosomes, there is typically only 60 one record.""" 61 #This calls a generator function: 62 return GenBankScanner(debug=0).parse_records(handle)
63 64
65 -def EmblIterator(handle):
66 """Breaks up an EMBL file into SeqRecord objects. 67 68 Every section from the LOCUS line to the terminating // becomes 69 a single SeqRecord with associated annotation and features. 70 71 Note that for genomes or chromosomes, there is typically only 72 one record.""" 73 #This calls a generator function: 74 return EmblScanner(debug=0).parse_records(handle)
75 76
77 -def ImgtIterator(handle):
78 """Breaks up an IMGT file into SeqRecord objects. 79 80 Every section from the LOCUS line to the terminating // becomes 81 a single SeqRecord with associated annotation and features. 82 83 Note that for genomes or chromosomes, there is typically only 84 one record.""" 85 #This calls a generator function: 86 return _ImgtScanner(debug=0).parse_records(handle)
87 88
89 -def GenBankCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
90 """Breaks up a Genbank file into SeqRecord objects for each CDS feature. 91 92 Every section from the LOCUS line to the terminating // can contain 93 many CDS features. These are returned as with the stated amino acid 94 translation sequence (if given). 95 """ 96 #This calls a generator function: 97 return GenBankScanner(debug=0).parse_cds_features(handle, alphabet)
98 99
100 -def EmblCdsFeatureIterator(handle, alphabet=Alphabet.generic_protein):
101 """Breaks up a EMBL file into SeqRecord objects for each CDS feature. 102 103 Every section from the LOCUS line to the terminating // can contain 104 many CDS features. These are returned as with the stated amino acid 105 translation sequence (if given). 106 """ 107 #This calls a generator function: 108 return EmblScanner(debug=0).parse_cds_features(handle, alphabet)
109 110
111 -def _insdc_feature_position_string(pos, offset=0):
112 """Build a GenBank/EMBL position string (PRIVATE). 113 114 Use offset=1 to add one to convert a start position from python counting. 115 """ 116 if isinstance(pos, SeqFeature.ExactPosition): 117 return "%i" % (pos.position + offset) 118 elif isinstance(pos, SeqFeature.WithinPosition): 119 return "(%i.%i)" % (pos.position + offset, 120 pos.position + pos.extension + offset) 121 elif isinstance(pos, SeqFeature.BetweenPosition): 122 return "(%i^%i)" % (pos.position + offset, 123 pos.position + pos.extension + offset) 124 elif isinstance(pos, SeqFeature.BeforePosition): 125 return "<%i" % (pos.position + offset) 126 elif isinstance(pos, SeqFeature.AfterPosition): 127 return ">%i" % (pos.position + offset) 128 elif isinstance(pos, SeqFeature.OneOfPosition): 129 return "one-of(%s)" \ 130 % ",".join(_insdc_feature_position_string(p, offset) 131 for p in pos.position_choices) 132 elif isinstance(pos, SeqFeature.AbstractPosition): 133 raise NotImplementedError("Please report this as a bug in Biopython.") 134 else: 135 raise ValueError("Expected a SeqFeature position object.")
136 137
138 -def _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length):
139 if location.ref: 140 ref = "%s:" % location.ref 141 else: 142 ref = "" 143 assert not location.ref_db 144 if isinstance(location.start, SeqFeature.ExactPosition) \ 145 and isinstance(location.end, SeqFeature.ExactPosition) \ 146 and location.start.position == location.end.position: 147 #Special case, for 12:12 return 12^13 148 #(a zero length slice, meaning the point between two letters) 149 if location.end.position == rec_length: 150 #Very special case, for a between position at the end of a 151 #sequence (used on some circular genomes, Bug 3098) we have 152 #N:N so return N^1 153 return "%s%i^1" % (ref, rec_length) 154 else: 155 return "%s%i^%i" % (ref, location.end.position, 156 location.end.position + 1) 157 if isinstance(location.start, SeqFeature.ExactPosition) \ 158 and isinstance(location.end, SeqFeature.ExactPosition) \ 159 and location.start.position + 1 == location.end.position: 160 #Special case, for 11:12 return 12 rather than 12..12 161 #(a length one slice, meaning a single letter) 162 return "%s%i" % (ref, location.end.position) 163 elif isinstance(location.start, SeqFeature.UnknownPosition) \ 164 or isinstance(location.end, SeqFeature.UnknownPosition): 165 #Special case for features from SwissProt/UniProt files 166 if isinstance(location.start, SeqFeature.UnknownPosition) \ 167 and isinstance(location.end, SeqFeature.UnknownPosition): 168 #import warnings 169 #warnings.warn("Feature with unknown location") 170 #return "?" 171 raise ValueError("Feature with unknown location") 172 elif isinstance(location.start, SeqFeature.UnknownPosition): 173 #Treat the unknown start position as a BeforePosition 174 return "%s<%i..%s" \ 175 % (ref, 176 location.nofuzzy_end, 177 _insdc_feature_position_string(location.end)) 178 else: 179 #Treat the unknown end position as an AfterPosition 180 return "%s%s..>%i" \ 181 % (ref, 182 _insdc_feature_position_string(location.start), 183 location.nofuzzy_start) 184 else: 185 #Typical case, e.g. 12..15 gets mapped to 11:15 186 return ref \ 187 + _insdc_feature_position_string(location.start, +1) \ 188 + ".." + \ 189 _insdc_feature_position_string(location.end)
190 191
192 -def _insdc_location_string(location, rec_length):
193 """Build a GenBank/EMBL location from a (Compound) FeatureLocation (PRIVATE). 194 195 There is a choice of how to show joins on the reverse complement strand, 196 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 197 "join(complement(20,100),complement(1,10))" instead (but appears to have 198 now adopted the GenBank convention). Notice that the order of the entries 199 is reversed! This function therefore uses the first form. In this situation 200 we expect the CompoundFeatureLocation and its parts to all be marked as 201 strand == -1, and to be in the order 19:100 then 0:10. 202 """ 203 try: 204 parts = location.parts 205 #CompoundFeatureLocation 206 if location.strand == -1: 207 #Special case, put complement outside the join/order/... and reverse order 208 return "complement(%s(%s))" % (location.operator, 209 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(p, rec_length) \ 210 for p in parts[::-1])) 211 else: 212 return "%s(%s)" % (location.operator, 213 ",".join(_insdc_location_string(p, rec_length) for p in parts)) 214 except AttributeError: 215 #Simple FeatureLocation 216 loc = _insdc_location_string_ignoring_strand_and_subfeatures(location, rec_length) 217 if location.strand == -1: 218 return "complement(%s)" % loc 219 else: 220 return loc
221
222 -def _insdc_feature_location_string(feature, rec_length):
223 """Build a GenBank/EMBL location string from a SeqFeature (PRIVATE, OBSOLETE). 224 225 There is a choice of how to show joins on the reverse complement strand, 226 GenBank used "complement(join(1,10),(20,100))" while EMBL used to use 227 "join(complement(20,100),complement(1,10))" instead (but appears to have 228 now adopted the GenBank convention). Notice that the order of the entries 229 is reversed! This function therefore uses the first form. In this situation 230 we expect the parent feature and the two children to all be marked as 231 strand == -1, and in the order 0:10 then 19:100. 232 233 Also need to consider dual-strand examples like these from the Arabidopsis 234 thaliana chloroplast NC_000932: join(complement(69611..69724),139856..140650) 235 gene ArthCp047, GeneID:844801 or its CDS (protein NP_051038.1 GI:7525057) 236 which is further complicated by a splice: 237 join(complement(69611..69724),139856..140087,140625..140650) 238 239 For this mixed strand feature, the parent SeqFeature should have 240 no strand (either 0 or None) while the child features should have either 241 strand +1 or -1 as appropriate, and be listed in the order given here. 242 """ 243 #Using private variable to avoid deprecation warning 244 if not feature._sub_features: 245 #Non-recursive. 246 #assert feature.location_operator == "", \ 247 # "%s has no subfeatures but location_operator %s" \ 248 # % (repr(feature), feature.location_operator) 249 location = _insdc_location_string_ignoring_strand_and_subfeatures( 250 feature.location, rec_length) 251 if feature.strand == -1: 252 location = "complement(%s)" % location 253 return location 254 # As noted above, treat reverse complement strand features carefully: 255 if feature.strand == -1: 256 for f in feature._sub_features: 257 if f.strand != -1: 258 raise ValueError("Inconsistent strands: %r for parent, %r for child" 259 % (feature.strand, f.strand)) 260 return "complement(%s(%s))" \ 261 % (feature.location_operator, 262 ",".join(_insdc_location_string_ignoring_strand_and_subfeatures(f.location, rec_length) 263 for f in feature._sub_features)) 264 #if feature.strand == +1: 265 # for f in feature.sub_features: 266 # assert f.strand == +1 267 #This covers typical forward strand features, and also an evil mixed strand: 268 assert feature.location_operator != "" 269 return "%s(%s)" % (feature.location_operator, 270 ",".join(_insdc_feature_location_string(f, rec_length) 271 for f in feature._sub_features))
272 273
274 -class _InsdcWriter(SequentialSequenceWriter):
275 """Base class for GenBank and EMBL writers (PRIVATE).""" 276 MAX_WIDTH = 80 277 QUALIFIER_INDENT = 21 278 QUALIFIER_INDENT_STR = " " * QUALIFIER_INDENT 279 QUALIFIER_INDENT_TMP = " %s " # 21 if %s is empty 280 FTQUAL_NO_QUOTE = ("anticodon", "citation", "codon_start", "compare", 281 "direction", "estimated_length", "mod_base", "number", 282 "rpt_type", "rpt_unit_range", "tag_peptide", 283 "transl_except", "transl_table") 284
285 - def _write_feature_qualifier(self, key, value=None, quote=None):
286 if not value: 287 self.handle.write("%s/%s\n" % (self.QUALIFIER_INDENT_STR, key)) 288 return 289 #Quick hack with no line wrapping, may be useful for testing: 290 #self.handle.write('%s/%s="%s"\n' % (self.QUALIFIER_INDENT_STR, key, value)) 291 if quote is None: 292 #Try to mimic unwritten rules about when quotes can be left out: 293 if _is_int_or_long(value) or key in self.FTQUAL_NO_QUOTE: 294 quote = False 295 else: 296 quote = True 297 if quote: 298 line = '%s/%s="%s"' % (self.QUALIFIER_INDENT_STR, key, value) 299 else: 300 line = '%s/%s=%s' % (self.QUALIFIER_INDENT_STR, key, value) 301 if len(line) <= self.MAX_WIDTH: 302 self.handle.write(line + "\n") 303 return 304 while line.lstrip(): 305 if len(line) <= self.MAX_WIDTH: 306 self.handle.write(line + "\n") 307 return 308 #Insert line break... 309 for index in range(min(len(line) - 1, self.MAX_WIDTH), 310 self.QUALIFIER_INDENT + 1, -1): 311 if line[index] == " ": 312 break 313 if line[index] != " ": 314 #No nice place to break... 315 index = self.MAX_WIDTH 316 assert index <= self.MAX_WIDTH 317 self.handle.write(line[:index] + "\n") 318 line = self.QUALIFIER_INDENT_STR + line[index:].lstrip()
319
320 - def _wrap_location(self, location):
321 """Split a feature location into lines (break at commas).""" 322 #TODO - Rewrite this not to recurse! 323 length = self.MAX_WIDTH - self.QUALIFIER_INDENT 324 if len(location) <= length: 325 return location 326 index = location[:length].rfind(",") 327 if index == -1: 328 #No good place to split (!) 329 import warnings 330 warnings.warn("Couldn't split location:\n%s" % location) 331 return location 332 return location[:index + 1] + "\n" + \ 333 self.QUALIFIER_INDENT_STR + \ 334 self._wrap_location(location[index + 1:])
335
336 - def _write_feature(self, feature, record_length):
337 """Write a single SeqFeature object to features table.""" 338 assert feature.type, feature 339 location = _insdc_location_string(feature.location, record_length) 340 f_type = feature.type.replace(" ", "_") 341 line = (self.QUALIFIER_INDENT_TMP % f_type)[:self.QUALIFIER_INDENT] \ 342 + self._wrap_location(location) + "\n" 343 self.handle.write(line) 344 #Now the qualifiers... 345 for key, values in feature.qualifiers.items(): 346 if isinstance(values, list) or isinstance(values, tuple): 347 for value in values: 348 self._write_feature_qualifier(key, value) 349 elif values: 350 #String, int, etc 351 self._write_feature_qualifier(key, values) 352 else: 353 #e.g. a /pseudo entry 354 self._write_feature_qualifier(key)
355
356 - def _get_annotation_str(self, record, key, default=".", just_first=False):
357 """Get an annotation dictionary entry (as a string). 358 359 Some entries are lists, in which case if just_first=True the first entry 360 is returned. If just_first=False (default) this verifies there is only 361 one entry before returning it.""" 362 try: 363 answer = record.annotations[key] 364 except KeyError: 365 return default 366 if isinstance(answer, list): 367 if not just_first: 368 assert len(answer) == 1 369 return str(answer[0]) 370 else: 371 return str(answer)
372
373 - def _split_multi_line(self, text, max_len):
374 """Returns a list of strings. 375 376 Any single words which are too long get returned as a whole line 377 (e.g. URLs) without an exception or warning. 378 """ 379 #TODO - Do the line spliting while preserving white space? 380 text = text.strip() 381 if len(text) <= max_len: 382 return [text] 383 384 words = text.split() 385 text = "" 386 while words and len(text) + 1 + len(words[0]) <= max_len: 387 text += " " + words.pop(0) 388 text = text.strip() 389 #assert len(text) <= max_len 390 answer = [text] 391 while words: 392 text = words.pop(0) 393 while words and len(text) + 1 + len(words[0]) <= max_len: 394 text += " " + words.pop(0) 395 text = text.strip() 396 #assert len(text) <= max_len 397 answer.append(text) 398 assert not words 399 return answer
400
401 - def _split_contig(self, record, max_len):
402 """Returns a list of strings, splits on commas.""" 403 #TODO - Merge this with _write_multi_line method? 404 #It would need the addition of the comma splitting logic... 405 #are there any other cases where that would be sensible? 406 contig = record.annotations.get("contig", "") 407 if isinstance(contig, list) or isinstance(contig, tuple): 408 contig = "".join(contig) 409 contig = self.clean(contig) 410 answer = [] 411 while contig: 412 if len(contig) > max_len: 413 #Split lines at the commas 414 pos = contig[:max_len - 1].rfind(",") 415 if pos == -1: 416 raise ValueError("Could not break up CONTIG") 417 text, contig = contig[:pos + 1], contig[pos + 1:] 418 else: 419 text, contig = contig, "" 420 answer.append(text) 421 return answer
422 423
424 -class GenBankWriter(_InsdcWriter):
425 HEADER_WIDTH = 12 426 QUALIFIER_INDENT = 21 427
428 - def _write_single_line(self, tag, text):
429 """Used in the 'header' of each GenBank record.""" 430 assert len(tag) < self.HEADER_WIDTH 431 if len(text) > self.MAX_WIDTH - self.HEADER_WIDTH: 432 import warnings 433 from Bio import BiopythonWarning 434 warnings.warn("Annotation %r too long for %r line" % (text, tag), 435 BiopythonWarning) 436 self.handle.write("%s%s\n" % (tag.ljust(self.HEADER_WIDTH), 437 text.replace("\n", " ")))
438
439 - def _write_multi_line(self, tag, text):
440 """Used in the 'header' of each GenBank record.""" 441 #TODO - Do the line spliting while preserving white space? 442 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 443 lines = self._split_multi_line(text, max_len) 444 self._write_single_line(tag, lines[0]) 445 for line in lines[1:]: 446 self._write_single_line("", line)
447
448 - def _write_multi_entries(self, tag, text_list):
449 #used for DBLINK and any similar later line types. 450 #If the list of strings is empty, nothing is written. 451 for i, text in enumerate(text_list): 452 if i == 0: 453 self._write_single_line(tag, text) 454 else: 455 self._write_single_line("", text)
456
457 - def _get_date(self, record):
458 default = "01-JAN-1980" 459 try: 460 date = record.annotations["date"] 461 except KeyError: 462 return default 463 #Cope with a list of one string: 464 if isinstance(date, list) and len(date) == 1: 465 date = date[0] 466 #TODO - allow a Python date object 467 if not isinstance(date, basestring) or len(date) != 11 \ 468 or date[2] != "-" or date[6] != "-" \ 469 or not date[:2].isdigit() or not date[7:].isdigit() \ 470 or int(date[:2]) > 31 \ 471 or date[3:6] not in ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", 472 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]: 473 #TODO - Check is a valid date (e.g. not 31 Feb) 474 return default 475 return date
476
477 - def _get_data_division(self, record):
478 try: 479 division = record.annotations["data_file_division"] 480 except KeyError: 481 division = "UNK" 482 if division in ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", 483 "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", 484 "GSS", "HTG", "HTC", "ENV", "CON"]: 485 #Good, already GenBank style 486 # PRI - primate sequences 487 # ROD - rodent sequences 488 # MAM - other mammalian sequences 489 # VRT - other vertebrate sequences 490 # INV - invertebrate sequences 491 # PLN - plant, fungal, and algal sequences 492 # BCT - bacterial sequences [plus archea] 493 # VRL - viral sequences 494 # PHG - bacteriophage sequences 495 # SYN - synthetic sequences 496 # UNA - unannotated sequences 497 # EST - EST sequences (expressed sequence tags) 498 # PAT - patent sequences 499 # STS - STS sequences (sequence tagged sites) 500 # GSS - GSS sequences (genome survey sequences) 501 # HTG - HTGS sequences (high throughput genomic sequences) 502 # HTC - HTC sequences (high throughput cDNA sequences) 503 # ENV - Environmental sampling sequences 504 # CON - Constructed sequences 505 # 506 #(plus UNK for unknown) 507 pass 508 else: 509 #See if this is in EMBL style: 510 # Division Code 511 # ----------------- ---- 512 # Bacteriophage PHG - common 513 # Environmental Sample ENV - common 514 # Fungal FUN - map to PLN (plants + fungal) 515 # Human HUM - map to PRI (primates) 516 # Invertebrate INV - common 517 # Other Mammal MAM - common 518 # Other Vertebrate VRT - common 519 # Mus musculus MUS - map to ROD (rodent) 520 # Plant PLN - common 521 # Prokaryote PRO - map to BCT (poor name) 522 # Other Rodent ROD - common 523 # Synthetic SYN - common 524 # Transgenic TGN - ??? map to SYN ??? 525 # Unclassified UNC - map to UNK 526 # Viral VRL - common 527 # 528 #(plus XXX for submiting which we can map to UNK) 529 embl_to_gbk = {"FUN": "PLN", 530 "HUM": "PRI", 531 "MUS": "ROD", 532 "PRO": "BCT", 533 "UNC": "UNK", 534 "XXX": "UNK", 535 } 536 try: 537 division = embl_to_gbk[division] 538 except KeyError: 539 division = "UNK" 540 assert len(division) == 3 541 return division
542
543 - def _write_the_first_line(self, record):
544 """Write the LOCUS line.""" 545 546 locus = record.name 547 if not locus or locus == "<unknown name>": 548 locus = record.id 549 if not locus or locus == "<unknown id>": 550 locus = self._get_annotation_str( 551 record, "accession", just_first=True) 552 if len(locus) > 16: 553 raise ValueError("Locus identifier %r is too long" % str(locus)) 554 555 if len(record) > 99999999999: 556 #Currently GenBank only officially support up to 350000, but 557 #the length field can take eleven digits 558 raise ValueError("Sequence too long!") 559 560 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 561 a = Alphabet._get_base_alphabet(record.seq.alphabet) 562 if not isinstance(a, Alphabet.Alphabet): 563 raise TypeError("Invalid alphabet") 564 elif isinstance(a, Alphabet.ProteinAlphabet): 565 units = "aa" 566 elif isinstance(a, Alphabet.NucleotideAlphabet): 567 units = "bp" 568 else: 569 #Must be something like NucleotideAlphabet or 570 #just the generic Alphabet (default for fasta files) 571 raise ValueError("Need a Nucleotide or Protein alphabet") 572 573 #Get the molecule type 574 #TODO - record this explicitly in the parser? 575 if isinstance(a, Alphabet.ProteinAlphabet): 576 mol_type = "" 577 elif isinstance(a, Alphabet.DNAAlphabet): 578 mol_type = "DNA" 579 elif isinstance(a, Alphabet.RNAAlphabet): 580 mol_type = "RNA" 581 else: 582 #Must be something like NucleotideAlphabet or 583 #just the generic Alphabet (default for fasta files) 584 raise ValueError("Need a DNA, RNA or Protein alphabet") 585 586 division = self._get_data_division(record) 587 588 assert len(units) == 2 589 assert len(division) == 3 590 #TODO - date 591 #TODO - mol_type 592 line = "LOCUS %s %s %s %s %s %s\n" \ 593 % (locus.ljust(16), 594 str(len(record)).rjust(11), 595 units, 596 mol_type.ljust(6), 597 division, 598 self._get_date(record)) 599 assert len(line) == 79 + 1, repr(line) # plus one for new line 600 601 assert line[12:28].rstrip() == locus, \ 602 'LOCUS line does not contain the locus at the expected position:\n' + line 603 assert line[28:29] == " " 604 assert line[29:40].lstrip() == str(len(record)), \ 605 'LOCUS line does not contain the length at the expected position:\n' + line 606 607 #Tests copied from Bio.GenBank.Scanner 608 assert line[40:44] in [' bp ', ' aa '], \ 609 'LOCUS line does not contain size units at expected position:\n' + \ 610 line 611 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 612 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 613 assert line[47:54].strip() == "" \ 614 or 'DNA' in line[47:54].strip() \ 615 or 'RNA' in line[47:54].strip(), \ 616 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 617 assert line[54:55] == ' ', \ 618 'LOCUS line does not contain space at position 55:\n' + line 619 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 620 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 621 assert line[63:64] == ' ', \ 622 'LOCUS line does not contain space at position 64:\n' + line 623 assert line[67:68] == ' ', \ 624 'LOCUS line does not contain space at position 68:\n' + line 625 assert line[70:71] == '-', \ 626 'LOCUS line does not contain - at position 71 in date:\n' + line 627 assert line[74:75] == '-', \ 628 'LOCUS line does not contain - at position 75 in date:\n' + line 629 630 self.handle.write(line)
631
632 - def _write_references(self, record):
633 number = 0 634 for ref in record.annotations["references"]: 635 if not isinstance(ref, SeqFeature.Reference): 636 continue 637 number += 1 638 data = str(number) 639 #TODO - support more complex record reference locations? 640 if ref.location and len(ref.location) == 1: 641 a = Alphabet._get_base_alphabet(record.seq.alphabet) 642 if isinstance(a, Alphabet.ProteinAlphabet): 643 units = "residues" 644 else: 645 units = "bases" 646 data += " (%s %i to %i)" % (units, 647 ref.location[0].nofuzzy_start + 1, 648 ref.location[0].nofuzzy_end) 649 self._write_single_line("REFERENCE", data) 650 if ref.authors: 651 #We store the AUTHORS data as a single string 652 self._write_multi_line(" AUTHORS", ref.authors) 653 if ref.consrtm: 654 #We store the consortium as a single string 655 self._write_multi_line(" CONSRTM", ref.consrtm) 656 if ref.title: 657 #We store the title as a single string 658 self._write_multi_line(" TITLE", ref.title) 659 if ref.journal: 660 #We store this as a single string - holds the journal name, 661 #volume, year, and page numbers of the citation 662 self._write_multi_line(" JOURNAL", ref.journal) 663 if ref.medline_id: 664 #This line type is obsolete and was removed from the GenBank 665 #flatfile format in April 2005. Should we write it? 666 #Note this has a two space indent: 667 self._write_multi_line(" MEDLINE", ref.medline_id) 668 if ref.pubmed_id: 669 #Note this has a THREE space indent: 670 self._write_multi_line(" PUBMED", ref.pubmed_id) 671 if ref.comment: 672 self._write_multi_line(" REMARK", ref.comment)
673
674 - def _write_comment(self, record):
675 #This is a bit complicated due to the range of possible 676 #ways people might have done their annotation... 677 #Currently the parser uses a single string with newlines. 678 #A list of lines is also reasonable. 679 #A single (long) string is perhaps the most natural of all. 680 #This means we may need to deal with line wrapping. 681 comment = record.annotations["comment"] 682 if isinstance(comment, basestring): 683 lines = comment.split("\n") 684 elif isinstance(comment, list) or isinstance(comment, tuple): 685 lines = comment 686 else: 687 raise ValueError("Could not understand comment annotation") 688 self._write_multi_line("COMMENT", lines[0]) 689 for line in lines[1:]: 690 self._write_multi_line("", line)
691
692 - def _write_contig(self, record):
693 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 694 lines = self._split_contig(record, max_len) 695 self._write_single_line("CONTIG", lines[0]) 696 for text in lines[1:]: 697 self._write_single_line("", text)
698
699 - def _write_sequence(self, record):
700 #Loosely based on code from Howard Salis 701 #TODO - Force lower case? 702 LETTERS_PER_LINE = 60 703 SEQUENCE_INDENT = 9 704 705 if isinstance(record.seq, UnknownSeq): 706 #We have already recorded the length, and there is no need 707 #to record a long sequence of NNNNNNN...NNN or whatever. 708 if "contig" in record.annotations: 709 self._write_contig(record) 710 else: 711 self.handle.write("ORIGIN\n") 712 return 713 714 #Catches sequence being None: 715 data = self._get_seq_string(record).lower() 716 seq_len = len(data) 717 self.handle.write("ORIGIN\n") 718 for line_number in range(0, seq_len, LETTERS_PER_LINE): 719 self.handle.write(str(line_number + 1).rjust(SEQUENCE_INDENT)) 720 for words in range(line_number, 721 min(line_number + LETTERS_PER_LINE, seq_len), 10): 722 self.handle.write(" %s" % data[words:words + 10]) 723 self.handle.write("\n")
724
725 - def write_record(self, record):
726 """Write a single record to the output file.""" 727 handle = self.handle 728 self._write_the_first_line(record) 729 730 accession = self._get_annotation_str(record, "accession", 731 record.id.split(".", 1)[0], 732 just_first=True) 733 acc_with_version = accession 734 if record.id.startswith(accession + "."): 735 try: 736 acc_with_version = "%s.%i" \ 737 % (accession, 738 int(record.id.split(".", 1)[1])) 739 except ValueError: 740 pass 741 gi = self._get_annotation_str(record, "gi", just_first=True) 742 743 descr = record.description 744 if descr == "<unknown description>": 745 descr = "." 746 self._write_multi_line("DEFINITION", descr) 747 748 self._write_single_line("ACCESSION", accession) 749 if gi != ".": 750 self._write_single_line("VERSION", "%s GI:%s" 751 % (acc_with_version, gi)) 752 else: 753 self._write_single_line("VERSION", "%s" % (acc_with_version)) 754 755 #The NCBI only expect two types of link so far, 756 #e.g. "Project:28471" and "Trace Assembly Archive:123456" 757 #TODO - Filter the dbxrefs list to just these? 758 self._write_multi_entries("DBLINK", record.dbxrefs) 759 760 try: 761 #List of strings 762 #Keywords should be given separated with semi colons, 763 keywords = "; ".join(record.annotations["keywords"]) 764 #with a trailing period: 765 if not keywords.endswith("."): 766 keywords += "." 767 except KeyError: 768 #If no keywords, there should be just a period: 769 keywords = "." 770 self._write_multi_line("KEYWORDS", keywords) 771 772 if "segment" in record.annotations: 773 #Deal with SEGMENT line found only in segmented records, 774 #e.g. AH000819 775 segment = record.annotations["segment"] 776 if isinstance(segment, list): 777 assert len(segment) == 1, segment 778 segment = segment[0] 779 self._write_single_line("SEGMENT", segment) 780 781 self._write_multi_line("SOURCE", 782 self._get_annotation_str(record, "source")) 783 #The ORGANISM line MUST be a single line, as any continuation is the taxonomy 784 org = self._get_annotation_str(record, "organism") 785 if len(org) > self.MAX_WIDTH - self.HEADER_WIDTH: 786 org = org[:self.MAX_WIDTH - self.HEADER_WIDTH - 4] + "..." 787 self._write_single_line(" ORGANISM", org) 788 try: 789 #List of strings 790 #Taxonomy should be given separated with semi colons, 791 taxonomy = "; ".join(record.annotations["taxonomy"]) 792 #with a trailing period: 793 if not taxonomy.endswith("."): 794 taxonomy += "." 795 except KeyError: 796 taxonomy = "." 797 self._write_multi_line("", taxonomy) 798 799 if "references" in record.annotations: 800 self._write_references(record) 801 802 if "comment" in record.annotations: 803 self._write_comment(record) 804 805 handle.write("FEATURES Location/Qualifiers\n") 806 rec_length = len(record) 807 for feature in record.features: 808 self._write_feature(feature, rec_length) 809 self._write_sequence(record) 810 handle.write("//\n")
811 812
813 -class EmblWriter(_InsdcWriter):
814 HEADER_WIDTH = 5 815 QUALIFIER_INDENT = 21 816 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 817 QUALIFIER_INDENT_TMP = "FT %s " # 21 if %s is empty 818 FEATURE_HEADER = "FH Key Location/Qualifiers\n" 819
820 - def _write_contig(self, record):
821 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 822 lines = self._split_contig(record, max_len) 823 for text in lines: 824 self._write_single_line("CO", text)
825
826 - def _write_sequence(self, record):
827 LETTERS_PER_BLOCK = 10 828 BLOCKS_PER_LINE = 6 829 LETTERS_PER_LINE = LETTERS_PER_BLOCK * BLOCKS_PER_LINE 830 POSITION_PADDING = 10 831 handle = self.handle # save looking up this multiple times 832 833 if isinstance(record.seq, UnknownSeq): 834 #We have already recorded the length, and there is no need 835 #to record a long sequence of NNNNNNN...NNN or whatever. 836 if "contig" in record.annotations: 837 self._write_contig(record) 838 else: 839 #TODO - Can the sequence just be left out as in GenBank files? 840 handle.write("SQ \n") 841 return 842 843 #Catches sequence being None 844 data = self._get_seq_string(record).lower() 845 seq_len = len(data) 846 847 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 848 a = Alphabet._get_base_alphabet(record.seq.alphabet) 849 if isinstance(a, Alphabet.DNAAlphabet): 850 #TODO - What if we have RNA? 851 a_count = data.count('A') + data.count('a') 852 c_count = data.count('C') + data.count('c') 853 g_count = data.count('G') + data.count('g') 854 t_count = data.count('T') + data.count('t') 855 other = seq_len - (a_count + c_count + g_count + t_count) 856 handle.write("SQ Sequence %i BP; %i A; %i C; %i G; %i T; %i other;\n" 857 % (seq_len, a_count, c_count, g_count, t_count, other)) 858 else: 859 handle.write("SQ \n") 860 861 for line_number in range(0, seq_len // LETTERS_PER_LINE): 862 handle.write(" ") # Just four, not five 863 for block in range(BLOCKS_PER_LINE): 864 index = LETTERS_PER_LINE * line_number + \ 865 LETTERS_PER_BLOCK * block 866 handle.write((" %s" % data[index:index + LETTERS_PER_BLOCK])) 867 handle.write(str((line_number + 1) 868 * LETTERS_PER_LINE).rjust(POSITION_PADDING)) 869 handle.write("\n") 870 if seq_len % LETTERS_PER_LINE: 871 #Final (partial) line 872 line_number = (seq_len // LETTERS_PER_LINE) 873 handle.write(" ") # Just four, not five 874 for block in range(BLOCKS_PER_LINE): 875 index = LETTERS_PER_LINE * line_number + \ 876 LETTERS_PER_BLOCK * block 877 handle.write( 878 (" %s" % data[index:index + LETTERS_PER_BLOCK]).ljust(11)) 879 handle.write(str(seq_len).rjust(POSITION_PADDING)) 880 handle.write("\n")
881
882 - def _write_single_line(self, tag, text):
883 assert len(tag) == 2 884 line = tag + " " + text 885 if len(text) > self.MAX_WIDTH: 886 import warnings 887 warnings.warn("Line %r too long" % line) 888 self.handle.write(line + "\n")
889
890 - def _write_multi_line(self, tag, text):
891 max_len = self.MAX_WIDTH - self.HEADER_WIDTH 892 lines = self._split_multi_line(text, max_len) 893 for line in lines: 894 self._write_single_line(tag, line)
895
896 - def _write_the_first_lines(self, record):
897 """Write the ID and AC lines.""" 898 if "." in record.id and record.id.rsplit(".", 1)[1].isdigit(): 899 version = "SV " + record.id.rsplit(".", 1)[1] 900 accession = self._get_annotation_str(record, "accession", 901 record.id.rsplit(".", 1)[0], 902 just_first=True) 903 else: 904 version = "" 905 accession = self._get_annotation_str(record, "accession", 906 record.id, 907 just_first=True) 908 909 if ";" in accession: 910 raise ValueError("Cannot have semi-colon in EMBL accession, %s" 911 % repr(str(accession))) 912 if " " in accession: 913 #This is out of practicallity... might it be allowed? 914 raise ValueError("Cannot have spaces in EMBL accession, %s" 915 % repr(str(accession))) 916 917 #Get the molecule type 918 #TODO - record this explicitly in the parser? 919 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 920 a = Alphabet._get_base_alphabet(record.seq.alphabet) 921 if not isinstance(a, Alphabet.Alphabet): 922 raise TypeError("Invalid alphabet") 923 elif isinstance(a, Alphabet.DNAAlphabet): 924 mol_type = "DNA" 925 units = "BP" 926 elif isinstance(a, Alphabet.RNAAlphabet): 927 mol_type = "RNA" 928 units = "BP" 929 elif isinstance(a, Alphabet.ProteinAlphabet): 930 mol_type = "PROTEIN" 931 units = "AA" 932 else: 933 #Must be something like NucleotideAlphabet 934 raise ValueError("Need a DNA, RNA or Protein alphabet") 935 936 #Get the taxonomy division 937 division = self._get_data_division(record) 938 939 #TODO - Full ID line 940 handle = self.handle 941 #ID <1>; SV <2>; <3>; <4>; <5>; <6>; <7> BP. 942 #1. Primary accession number 943 #2. Sequence version number 944 #3. Topology: 'circular' or 'linear' 945 #4. Molecule type 946 #5. Data class 947 #6. Taxonomic division 948 #7. Sequence length 949 self._write_single_line("ID", "%s; %s; ; %s; ; %s; %i %s." 950 % (accession, version, mol_type, 951 division, len(record), units)) 952 handle.write("XX\n") 953 self._write_single_line("AC", accession + ";") 954 handle.write("XX\n")
955
956 - def _get_data_division(self, record):
957 try: 958 division = record.annotations["data_file_division"] 959 except KeyError: 960 division = "UNC" 961 if division in ["PHG", "ENV", "FUN", "HUM", "INV", "MAM", "VRT", 962 "MUS", "PLN", "PRO", "ROD", "SYN", "TGN", "UNC", 963 "VRL", "XXX"]: 964 #Good, already EMBL style 965 # Division Code 966 # ----------------- ---- 967 # Bacteriophage PHG 968 # Environmental Sample ENV 969 # Fungal FUN 970 # Human HUM 971 # Invertebrate INV 972 # Other Mammal MAM 973 # Other Vertebrate VRT 974 # Mus musculus MUS 975 # Plant PLN 976 # Prokaryote PRO 977 # Other Rodent ROD 978 # Synthetic SYN 979 # Transgenic TGN 980 # Unclassified UNC (i.e. unknown) 981 # Viral VRL 982 # 983 #(plus XXX used for submiting data to EMBL) 984 pass 985 else: 986 #See if this is in GenBank style & can be converted. 987 #Generally a problem as the GenBank groups are wider 988 #than those of EMBL. Note that GenBank use "BCT" for 989 #both bacteria and acherea thus this maps to EMBL's 990 #"PRO" nicely. 991 gbk_to_embl = {"BCT": "PRO", 992 "UNK": "UNC", 993 } 994 try: 995 division = gbk_to_embl[division] 996 except KeyError: 997 division = "UNC" 998 assert len(division) == 3 999 return division
1000
1001 - def _write_keywords(self, record):
1002 #Put the keywords right after DE line. 1003 #Each 'keyword' can have multiple words and spaces, but we 1004 #must not split any 'keyword' between lines. 1005 #TODO - Combine short keywords onto one line 1006 for keyword in record.annotations["keywords"]: 1007 self._write_single_line("KW", keyword) 1008 self.handle.write("XX\n")
1009
1010 - def _write_references(self, record):
1011 #The order should be RN, RC, RP, RX, RG, RA, RT, RL 1012 number = 0 1013 for ref in record.annotations["references"]: 1014 if not isinstance(ref, SeqFeature.Reference): 1015 continue 1016 number += 1 1017 self._write_single_line("RN", "[%i]" % number) 1018 #TODO - support for RC line (needed in parser too) 1019 #TODO - support more complex record reference locations? 1020 if ref.location and len(ref.location) == 1: 1021 self._write_single_line( 1022 "RP", "%i-%i" % (ref.location[0].nofuzzy_start + 1, 1023 ref.location[0].nofuzzy_end)) 1024 #TODO - record any DOI or AGRICOLA identifier in the reference object? 1025 if ref.pubmed_id: 1026 self._write_single_line("RX", "PUBMED; %s." % ref.pubmed_id) 1027 if ref.consrtm: 1028 self._write_single_line("RG", "%s" % ref.consrtm) 1029 if ref.authors: 1030 #We store the AUTHORS data as a single string 1031 self._write_multi_line("RA", ref.authors + ";") 1032 if ref.title: 1033 #We store the title as a single string 1034 self._write_multi_line("RT", '"%s";' % ref.title) 1035 if ref.journal: 1036 #We store this as a single string - holds the journal name, 1037 #volume, year, and page numbers of the citation 1038 self._write_multi_line("RL", ref.journal) 1039 self.handle.write("XX\n")
1040
1041 - def _write_comment(self, record):
1042 #This is a bit complicated due to the range of possible 1043 #ways people might have done their annotation... 1044 #Currently the parser uses a single string with newlines. 1045 #A list of lines is also reasonable. 1046 #A single (long) string is perhaps the most natural of all. 1047 #This means we may need to deal with line wrapping. 1048 comment = record.annotations["comment"] 1049 if isinstance(comment, basestring): 1050 lines = comment.split("\n") 1051 elif isinstance(comment, list) or isinstance(comment, tuple): 1052 lines = comment 1053 else: 1054 raise ValueError("Could not understand comment annotation") 1055 #TODO - Merge this with the GenBank comment code? 1056 if not lines: 1057 return 1058 for line in lines: 1059 self._write_multi_line("CC", line) 1060 self.handle.write("XX\n")
1061
1062 - def write_record(self, record):
1063 """Write a single record to the output file.""" 1064 1065 handle = self.handle 1066 self._write_the_first_lines(record) 1067 1068 #PR line (0 or 1 lines only), project identifier 1069 for xref in record.dbxrefs: 1070 if xref.startswith("Project:"): 1071 self._write_single_line("PR", xref + ";") 1072 handle.write("XX\n") 1073 break 1074 1075 #TODO - DT lines (date) 1076 1077 descr = record.description 1078 if descr == "<unknown description>": 1079 descr = "." 1080 self._write_multi_line("DE", descr) 1081 handle.write("XX\n") 1082 1083 if "keywords" in record.annotations: 1084 self._write_keywords(record) 1085 1086 #Should this be "source" or "organism"? 1087 self._write_multi_line( 1088 "OS", self._get_annotation_str(record, "organism")) 1089 try: 1090 #List of strings 1091 taxonomy = "; ".join(record.annotations["taxonomy"]) + "." 1092 except KeyError: 1093 taxonomy = "." 1094 self._write_multi_line("OC", taxonomy) 1095 handle.write("XX\n") 1096 1097 if "references" in record.annotations: 1098 self._write_references(record) 1099 1100 if "comment" in record.annotations: 1101 self._write_comment(record) 1102 1103 handle.write(self.FEATURE_HEADER) 1104 rec_length = len(record) 1105 for feature in record.features: 1106 self._write_feature(feature, rec_length) 1107 1108 self._write_sequence(record) 1109 handle.write("//\n")
1110 1111
1112 -class ImgtWriter(EmblWriter):
1113 HEADER_WIDTH = 5 1114 QUALIFIER_INDENT = 25 # Not 21 as in EMBL 1115 QUALIFIER_INDENT_STR = "FT" + " " * (QUALIFIER_INDENT - 2) 1116 QUALIFIER_INDENT_TMP = "FT %s " # 25 if %s is empty 1117 FEATURE_HEADER = "FH Key Location/Qualifiers\n"
1118 1119 if __name__ == "__main__": 1120 print("Quick self test") 1121 import os 1122 from Bio._py3k import StringIO 1123
1124 - def compare_record(old, new):
1125 if old.id != new.id and old.name != new.name: 1126 raise ValueError("'%s' or '%s' vs '%s' or '%s' records" 1127 % (old.id, old.name, new.id, new.name)) 1128 if len(old.seq) != len(new.seq): 1129 raise ValueError("%i vs %i" % (len(old.seq), len(new.seq))) 1130 if str(old.seq).upper() != str(new.seq).upper(): 1131 if len(old.seq) < 200: 1132 raise ValueError("'%s' vs '%s'" % (old.seq, new.seq)) 1133 else: 1134 raise ValueError( 1135 "'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100])) 1136 if old.features and new.features: 1137 return compare_features(old.features, new.features) 1138 #Just insist on at least one word in common: 1139 if (old.description or new.description) \ 1140 and not set(old.description.split()).intersection(new.description.split()): 1141 raise ValueError("%s versus %s" 1142 % (repr(old.description), repr(new.description))) 1143 #TODO - check annotation 1144 if "contig" in old.annotations: 1145 assert old.annotations["contig"] == \ 1146 new.annotations["contig"] 1147 return True
1148
1149 - def compare_records(old_list, new_list):
1150 """Check two lists of SeqRecords agree, raises a ValueError if mismatch.""" 1151 if len(old_list) != len(new_list): 1152 raise ValueError( 1153 "%i vs %i records" % (len(old_list), len(new_list))) 1154 for old, new in zip(old_list, new_list): 1155 if not compare_record(old, new): 1156 return False 1157 return True
1158
1159 - def compare_feature(old, new, ignore_sub_features=False):
1160 """Check two SeqFeatures agree.""" 1161 if old.type != new.type: 1162 raise ValueError("Type %s versus %s" % (old.type, new.type)) 1163 if old.location.nofuzzy_start != new.location.nofuzzy_start \ 1164 or old.location.nofuzzy_end != new.location.nofuzzy_end: 1165 raise ValueError("%s versus %s:\n%s\nvs:\n%s" 1166 % (old.location, new.location, str(old), str(new))) 1167 if old.strand != new.strand: 1168 raise ValueError( 1169 "Different strand:\n%s\nvs:\n%s" % (str(old), str(new))) 1170 if old.location.start != new.location.start: 1171 raise ValueError("Start %s versus %s:\n%s\nvs:\n%s" 1172 % (old.location.start, new.location.start, str(old), str(new))) 1173 if old.location.end != new.location.end: 1174 raise ValueError("End %s versus %s:\n%s\nvs:\n%s" 1175 % (old.location.end, new.location.end, str(old), str(new))) 1176 if not ignore_sub_features: 1177 if len(old.sub_features) != len(new.sub_features): 1178 raise ValueError("Different sub features") 1179 for a, b in zip(old.sub_features, new.sub_features): 1180 if not compare_feature(a, b): 1181 return False 1182 #This only checks key shared qualifiers 1183 #Would a white list be easier? 1184 #for key in ["name", "gene", "translation", "codon_table", "codon_start", "locus_tag"]: 1185 for key in set(old.qualifiers).intersection(new.qualifiers): 1186 if key in ["db_xref", "protein_id", "product", "note"]: 1187 #EMBL and GenBank files are use different references/notes/etc 1188 continue 1189 if old.qualifiers[key] != new.qualifiers[key]: 1190 raise ValueError("Qualifier mis-match for %s:\n%s\n%s" 1191 % (key, old.qualifiers[key], new.qualifiers[key])) 1192 return True
1193
1194 - def compare_features(old_list, new_list, ignore_sub_features=False):
1195 """Check two lists of SeqFeatures agree, raises a ValueError if mismatch.""" 1196 if len(old_list) != len(new_list): 1197 raise ValueError( 1198 "%i vs %i features" % (len(old_list), len(new_list))) 1199 for old, new in zip(old_list, new_list): 1200 #This assumes they are in the same order 1201 if not compare_feature(old, new, ignore_sub_features): 1202 return False 1203 return True
1204
1205 - def check_genbank_writer(records):
1206 handle = StringIO() 1207 GenBankWriter(handle).write_file(records) 1208 handle.seek(0) 1209 1210 records2 = list(GenBankIterator(handle)) 1211 assert compare_records(records, records2)
1212
1213 - def check_embl_writer(records):
1214 handle = StringIO() 1215 try: 1216 EmblWriter(handle).write_file(records) 1217 except ValueError as err: 1218 print(err) 1219 return 1220 handle.seek(0) 1221 1222 records2 = list(EmblIterator(handle)) 1223 assert compare_records(records, records2)
1224 1225 for filename in os.listdir("../../Tests/GenBank"): 1226 if not filename.endswith(".gbk") and not filename.endswith(".gb"): 1227 continue 1228 print(filename) 1229 1230 with open("../../Tests/GenBank/%s" % filename) as handle: 1231 records = list(GenBankIterator(handle)) 1232 1233 check_genbank_writer(records) 1234 check_embl_writer(records) 1235 1236 for filename in os.listdir("../../Tests/EMBL"): 1237 if not filename.endswith(".embl"): 1238 continue 1239 print(filename) 1240 1241 with open("../../Tests/EMBL/%s" % filename) as handle: 1242 records = list(EmblIterator(handle)) 1243 1244 check_genbank_writer(records) 1245 check_embl_writer(records) 1246 1247 from Bio import SeqIO 1248 for filename in os.listdir("../../Tests/SwissProt"): 1249 if not filename.startswith("sp"): 1250 continue 1251 print(filename) 1252 1253 with open("../../Tests/SwissProt/%s" % filename) as handle: 1254 records = list(SeqIO.parse(handle, "swiss")) 1255 1256 check_genbank_writer(records) 1257