
Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
   9  (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files. 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files. 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  from __future__ import print_function 
  30   
  31  import warnings 
  32  import re 
  33  from collections import defaultdict 
  34  from Bio.Seq import Seq 
  35  from Bio.SeqRecord import SeqRecord 
  36  from Bio.Alphabet import generic_protein 
  37  from Bio import BiopythonParserWarning 
  38   
  39   
  40  class InsdcScanner(object):
  41      """Basic functions for breaking up a GenBank/EMBL file into sub sections.
  42  
  43      The International Nucleotide Sequence Database Collaboration (INSDC)
  44      is a collaboration between the DDBJ, EMBL, and GenBank. These organisations all use the
  45      same "Feature Table" layout in their plain text flat file formats.
  46  
  47      However, the header and sequence sections of an EMBL file are very
  48      different in layout to those produced by GenBank/DDBJ."""
  49  
  50      # These constants get redefined with sensible values in the sub classes:
  51      RECORD_START = "XXX"  # "LOCUS       " or "ID   "
  52      HEADER_WIDTH = 3  # 12 or 5
  53      FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
  54      FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
  55      FEATURE_QUALIFIER_INDENT = 0
  56      FEATURE_QUALIFIER_SPACER = ""
  57      SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
  58  
  59      def __init__(self, debug=0):
  60          assert len(self.RECORD_START) == self.HEADER_WIDTH
  61          for marker in self.SEQUENCE_HEADERS:
  62              assert marker == marker.rstrip()
  63          assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT
  64          self.debug = debug
  65          self.line = None
66
  67      def set_handle(self, handle):
  68          self.handle = handle
  69          self.line = ""
70
  71      def find_start(self):
  72          """Read in lines until find the ID/LOCUS line, which is returned.
  73  
  74          Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives)
  75          will be ignored."""
  76          while True:
  77              if self.line:
  78                  line = self.line
  79                  self.line = ""
  80              else:
  81                  line = self.handle.readline()
  82              if not line:
  83                  if self.debug:
  84                      print("End of file")
  85                  return None
  86              if line[:self.HEADER_WIDTH] == self.RECORD_START:
  87                  if self.debug > 1:
  88                      print("Found the start of a record:\n" + line)
  89                  break
  90              line = line.rstrip()
  91              if line == "//":
  92                  if self.debug > 1:
  93                      print("Skipping // marking end of last record")
  94              elif line == "":
  95                  if self.debug > 1:
  96                      print("Skipping blank line before record")
  97              else:
  98                  # Ignore any header before the first ID/LOCUS line.
  99                  if self.debug > 1:
 100                      print("Skipping header line before record:\n" + line)
 101          self.line = line
 102          return line
103
104 - def parse_header(self):
105 """Return list of strings making up the header 106 107 New line characters are removed. 108 109 Assumes you have just read in the ID/LOCUS line. 110 """ 111 assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \ 112 "Not at start of record" 113 114 header_lines = [] 115 while True: 116 line = self.handle.readline() 117 if not line: 118 raise ValueError("Premature end of line during sequence data") 119 line = line.rstrip() 120 if line in self.FEATURE_START_MARKERS: 121 if self.debug: 122 print("Found feature table") 123 break 124 # if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]: 125 # if self.debug : print("Found header table (?)") 126 # break 127 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 128 if self.debug: 129 print("Found start of sequence") 130 break 131 if line == "//": 132 raise ValueError("Premature end of sequence data marker '//' found") 133 header_lines.append(line) 134 self.line = line 135 return header_lines
136
137 - def parse_features(self, skip=False):
138 """Return list of tuples for the features (if present) 139 140 Each feature is returned as a tuple (key, location, qualifiers) 141 where key and location are strings (e.g. "CDS" and 142 "complement(join(490883..490885,1..879))") while qualifiers 143 is a list of two string tuples (feature qualifier keys and values). 144 145 Assumes you have already read to the start of the features table. 146 """ 147 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 148 if self.debug: 149 print("Didn't find any feature table") 150 return [] 151 152 while self.line.rstrip() in self.FEATURE_START_MARKERS: 153 self.line = self.handle.readline() 154 155 features = [] 156 line = self.line 157 while True: 158 if not line: 159 raise ValueError("Premature end of line during features table") 160 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 161 if self.debug: 162 print("Found start of sequence") 163 break 164 line = line.rstrip() 165 if line == "//": 166 raise ValueError("Premature end of features table, marker '//' found") 167 if line in self.FEATURE_END_MARKERS: 168 if self.debug: 169 print("Found end of features") 170 line = self.handle.readline() 171 break 172 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 173 # This is an empty feature line between qualifiers. Empty 174 # feature lines within qualifiers are handled below (ignored). 175 line = self.handle.readline() 176 continue 177 if len(line) < self.FEATURE_QUALIFIER_INDENT: 178 warnings.warn("line too short to contain a feature: %r" % line, 179 BiopythonParserWarning) 180 line = self.handle.readline() 181 continue 182 183 if skip: 184 line = self.handle.readline() 185 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 186 line = self.handle.readline() 187 else: 188 # Build up a list of the lines making up this feature: 189 if line[self.FEATURE_QUALIFIER_INDENT] != " " \ 190 and " " in line[self.FEATURE_QUALIFIER_INDENT:]: 191 # The feature table design enforces a length limit on the feature keys. 192 # Some third party files (e.g. IGMT's EMBL like files) solve this by 193 # over indenting the location and qualifiers. 194 feature_key, line = line[2:].strip().split(None, 1) 195 feature_lines = [line] 196 warnings.warn("Overindented %s feature?" % feature_key, 197 BiopythonParserWarning) 198 else: 199 feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip() 200 feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]] 201 line = self.handle.readline() 202 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 203 or (line != '' and line.rstrip() == ""): # cope with blank lines in the midst of a feature 204 # Use strip to remove any harmless trailing white space AND and leading 205 # white space (e.g. out of spec files with too much indentation) 206 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 207 line = self.handle.readline() 208 features.append(self.parse_feature(feature_key, feature_lines)) 209 self.line = line 210 return features
211
212 - def parse_feature(self, feature_key, lines):
213 r"""Expects a feature as a list of strings, returns a tuple (key, location, qualifiers) 214 215 For example given this GenBank feature:: 216 217 CDS complement(join(490883..490885,1..879)) 218 /locus_tag="NEQ001" 219 /note="conserved hypothetical [Methanococcus jannaschii]; 220 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 221 localization signal; IPR002743: Protein of unknown 222 function DUF57" 223 /codon_start=1 224 /transl_table=11 225 /product="hypothetical protein" 226 /protein_id="NP_963295.1" 227 /db_xref="GI:41614797" 228 /db_xref="GeneID:2732620" 229 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 230 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 231 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 232 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 233 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 234 LNSMGFGFVNTKKNSAR" 235 236 Then should give input key="CDS" and the rest of the data as a list of strings 237 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 238 where the leading spaces and trailing newlines have been removed. 239 240 Returns tuple containing: (key as string, location string, qualifiers as list) 241 as follows for this example: 242 243 key = "CDS", string 244 location = "complement(join(490883..490885,1..879))", string 245 qualifiers = list of string tuples: 246 247 [('locus_tag', '"NEQ001"'), 248 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 249 ('codon_start', '1'), 250 ('transl_table', '11'), 251 ('product', '"hypothetical protein"'), 252 ('protein_id', '"NP_963295.1"'), 253 ('db_xref', '"GI:41614797"'), 254 ('db_xref', '"GeneID:2732620"'), 255 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 256 257 In the above example, the "note" and "translation" were edited for compactness, 258 and they would contain multiple new line characters (displayed above as \n) 259 260 If a qualifier is quoted (in this case, everything except codon_start and 261 transl_table) then the quotes are NOT removed. 262 263 Note that no whitespace is removed. 264 """ 265 # Skip any blank lines 266 iterator = (x for x in lines if x) 267 try: 268 line = next(iterator) 269 270 feature_location = line.strip() 271 while feature_location[-1:] == ",": 272 # Multiline location, still more to come! 273 line = next(iterator) 274 feature_location += line.strip() 275 if feature_location.count("(") > feature_location.count(")"): 276 # Including the prev line in warning would be more explicit, 277 # but this way get one-and-only-one warning shown by default: 278 warnings.warn("Non-standard feature line wrapping (didn't break on comma)?", 279 BiopythonParserWarning) 280 while feature_location[-1:] == "," or feature_location.count("(") > feature_location.count(")"): 281 line = next(iterator) 282 feature_location += line.strip() 283 284 qualifiers = [] 285 286 for line_number, line in enumerate(iterator): 287 # check for extra wrapping of the location closing parentheses 288 if line_number == 0 and line.startswith(")"): 289 feature_location += line.strip() 290 elif line[0] == "/": 291 # New qualifier 292 i = line.find("=") 293 key = line[1:i] # does not work if i==-1 294 value = line[i + 1:] # we ignore 'value' if i==-1 295 if i == -1: 296 # Qualifier with no key, e.g. 
/pseudo 297 key = line[1:] 298 qualifiers.append((key, None)) 299 elif not value: 300 # ApE can output /note= 301 qualifiers.append((key, "")) 302 elif value == '"': 303 # One single quote 304 if self.debug: 305 print("Single quote %s:%s" % (key, value)) 306 # DO NOT remove the quote... 307 qualifiers.append((key, value)) 308 elif value[0] == '"': 309 # Quoted... 310 value_list = [value] 311 while value_list[-1][-1] != '"': 312 value_list.append(next(iterator)) 313 value = '\n'.join(value_list) 314 # DO NOT remove the quotes... 315 qualifiers.append((key, value)) 316 else: 317 # Unquoted 318 # if debug : print("Unquoted line %s:%s" % (key,value)) 319 qualifiers.append((key, value)) 320 else: 321 # Unquoted continuation 322 assert len(qualifiers) > 0 323 assert key == qualifiers[-1][0] 324 # if debug : print("Unquoted Cont %s:%s" % (key, line)) 325 if qualifiers[-1][1] is None: 326 raise StopIteration 327 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 328 return (feature_key, feature_location, qualifiers) 329 except StopIteration: 330 # Bummer 331 raise ValueError("Problem with '%s' feature:\n%s" 332 % (feature_key, "\n".join(lines)))
333 356
 357      def _feed_first_line(self, consumer, line):
 358          """Handle the LOCUS/ID line, passing data to the consumer
 359  
 360          This should be implemented by the EMBL / GenBank specific subclass
 361  
 362          Used by the parse_records() and parse() methods.
 363          """
 364          pass
365
 366      def _feed_header_lines(self, consumer, lines):
 367          """Handle the header lines (list of strings), passing data to the consumer
 368  
 369          This should be implemented by the EMBL / GenBank specific subclass
 370  
 371          Used by the parse_records() and parse() methods.
 372          """
 373          pass
374
 375      def _feed_feature_table(self, consumer, feature_tuples):
 376          """Handle the feature table (list of tuples), passing data to the consumer
 377  
 378          Used by the parse_records() and parse() methods.
 379          """
 380          consumer.start_feature_table()
 381          for feature_key, location_string, qualifiers in feature_tuples:
 382              consumer.feature_key(feature_key)
 383              consumer.location(location_string)
 384              for q_key, q_value in qualifiers:
 385                  if q_value is None:
 386                      consumer.feature_qualifier(q_key, q_value)
 387                  else:
 388                      consumer.feature_qualifier(q_key, q_value.replace("\n", " "))
389
 390      def _feed_misc_lines(self, consumer, lines):
 391          """Handle any lines between features and sequence (list of strings), passing data to the consumer
 392  
 393          This should be implemented by the EMBL / GenBank specific subclass
 394  
 395          Used by the parse_records() and parse() methods.
 396          """
 397          pass
398
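# Architecture note (informal sketch, not taken from the original source): the
# feed() method below drives an event-style consumer object, calling one method
# per parsing event.  A hypothetical do-nothing consumer compatible with those
# calls might look like this:
#
#     class _EchoConsumer(object):
#         """Hypothetical consumer that records each event as (name, args)."""
#         def __init__(self):
#             self.data = None    # feed() sets this to None if no record is found
#             self.events = []
#         def __getattr__(self, name):
#             def handler(*args):
#                 self.events.append((name, args))
#             return handler
#
# The real consumers live in Bio.GenBank (e.g. _FeatureConsumer) and implement
# specific methods such as locus(), size(), feature_key() and record_end().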
399 - def feed(self, handle, consumer, do_features=True):
400 """Feed a set of data into the consumer. 401 402 This method is intended for use with the "old" code in Bio.GenBank 403 404 Arguments: 405 406 - handle - A handle with the information to parse. 407 - consumer - The consumer that should be informed of events. 408 - do_features - Boolean, should the features be parsed? 409 Skipping the features can be much faster. 410 411 Return values: 412 413 - true - Passed a record 414 - false - Did not find a record 415 """ 416 # Should work with both EMBL and GenBank files provided the 417 # equivalent Bio.GenBank._FeatureConsumer methods are called... 418 self.set_handle(handle) 419 if not self.find_start(): 420 # Could not find (another) record 421 consumer.data = None 422 return False 423 424 # We use the above class methods to parse the file into a simplified format. 425 # The first line, header lines and any misc lines after the features will be 426 # dealt with by GenBank / EMBL specific derived classes. 427 428 # First line and header: 429 self._feed_first_line(consumer, self.line) 430 self._feed_header_lines(consumer, self.parse_header()) 431 432 # Features (common to both EMBL and GenBank): 433 if do_features: 434 self._feed_feature_table(consumer, self.parse_features(skip=False)) 435 else: 436 self.parse_features(skip=True) # ignore the data 437 438 # Footer and sequence 439 misc_lines, sequence_string = self.parse_footer() 440 self._feed_misc_lines(consumer, misc_lines) 441 442 consumer.sequence(sequence_string) 443 # Calls to consumer.base_number() do nothing anyway 444 consumer.record_end("//") 445 446 assert self.line == "//" 447 448 # And we are done 449 return True
450
 451      def parse(self, handle, do_features=True):
 452          """Returns a SeqRecord (with SeqFeatures if do_features=True)
 453  
 454          See also the method parse_records() for use on multi-record files.
 455          """
 456          from Bio.GenBank import _FeatureConsumer
 457          from Bio.GenBank.utils import FeatureValueCleaner
 458  
 459          consumer = _FeatureConsumer(use_fuzziness=1,
 460                                      feature_cleaner=FeatureValueCleaner())
 461  
 462          if self.feed(handle, consumer, do_features):
 463              return consumer.data
 464          else:
 465              return None
466
 467      def parse_records(self, handle, do_features=True):
 468          """Returns a SeqRecord object iterator
 469  
 470          Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
 471  
 472          The SeqRecord objects include SeqFeatures if do_features=True
 473  
 474          This method is intended for use in Bio.SeqIO
 475          """
 476          # This is a generator function
 477          while True:
 478              record = self.parse(handle, do_features)
 479              if record is None:
 480                  break
 481              if record.id is None:
 482                  raise ValueError("Failed to parse the record's ID. Invalid ID line?")
 483              if record.name == "<unknown name>":
 484                  raise ValueError("Failed to parse the record's name. Invalid ID line?")
 485              if record.description == "<unknown description>":
 486                  raise ValueError("Failed to parse the record's description")
 487              yield record
488
 489      def parse_cds_features(self, handle,
 490                             alphabet=generic_protein,
 491                             tags2id=('protein_id', 'locus_tag', 'product')):
 492          """Returns SeqRecord object iterator
 493  
 494          Each CDS feature becomes a SeqRecord.
 495  
 496          - alphabet - Used for any sequence found in a translation field.
 497          - tags2id  - Tuple of three strings, the feature keys to use
 498                       for the record id, name and description.
 499  
 500          This method is intended for use in Bio.SeqIO
 501          """
 502          self.set_handle(handle)
 503          while self.find_start():
 504              # Got an EMBL or GenBank record...
 505              self.parse_header()  # ignore header lines!
 506              feature_tuples = self.parse_features()
 507              # self.parse_footer()  # ignore footer lines!
 508              while True:
 509                  line = self.handle.readline()
 510                  if not line:
 511                      break
 512                  if line[:2] == "//":
 513                      break
 514              self.line = line.rstrip()
 515  
 516              # Now go through those features...
 517              for key, location_string, qualifiers in feature_tuples:
 518                  if key == "CDS":
 519                      # Create SeqRecord
 520                      # ================
 521                      # SeqRecord objects cannot be created with annotations, they
 522                      # must be added afterwards.  So create an empty record and
 523                      # then populate it:
 524                      record = SeqRecord(seq=None)
 525                      annotations = record.annotations
 526  
 527                      # Should we add a location object to the annotations?
 528                      # I *think* that only makes sense for SeqFeatures with their
 529                      # sub features...
 530                      annotations['raw_location'] = location_string.replace(' ', '')
 531  
 532                      for (qualifier_name, qualifier_data) in qualifiers:
 533                          if qualifier_data is not None \
 534                                  and qualifier_data[0] == '"' and qualifier_data[-1] == '"':
 535                              # Remove quotes
 536                              qualifier_data = qualifier_data[1:-1]
 537                          # Append the data to the annotation qualifier...
 538                          if qualifier_name == "translation":
 539                              assert record.seq is None, "Multiple translations!"
 540                              record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
 541                          elif qualifier_name == "db_xref":
 542                              # It's a list, possibly empty.  It's safe to extend.
 543                              record.dbxrefs.append(qualifier_data)
 544                          else:
 545                              if qualifier_data is not None:
 546                                  qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
 547                              try:
 548                                  annotations[qualifier_name] += " " + qualifier_data
 549                              except KeyError:
 550                                  # Not an addition to existing data, it's the first bit
 551                                  annotations[qualifier_name] = qualifier_data
 552  
 553                      # Fill in the ID, Name, Description
 554                      # =================================
 555                      try:
 556                          record.id = annotations[tags2id[0]]
 557                      except KeyError:
 558                          pass
 559                      try:
 560                          record.name = annotations[tags2id[1]]
 561                      except KeyError:
 562                          pass
 563                      try:
 564                          record.description = annotations[tags2id[2]]
 565                      except KeyError:
 566                          pass
 567  
 568                      yield record
569 570
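# Usage sketch (an assumption about how Bio.SeqIO wires these scanners up, not
# taken from this file): parse_records() backs the whole-record "genbank" and
# "embl" formats, while parse_cds_features() backs the CDS-centric formats,
# for example:
#
#     from Bio import SeqIO
#     for protein in SeqIO.parse("example.gbk", "genbank-cds"):
#         print(protein.id, len(protein))
#
# Here "genbank-cds" is assumed to be the registered Bio.SeqIO format name.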
 571  class EmblScanner(InsdcScanner):
 572      """For extracting chunks of information in EMBL files"""
 573  
 574      RECORD_START = "ID   "
 575      HEADER_WIDTH = 5
 576      FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
 577      FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
 578      FEATURE_QUALIFIER_INDENT = 21
 579      FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
 580      SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
 581  
 618  
 619      def _feed_first_line(self, consumer, line):
 620          assert line[:self.HEADER_WIDTH].rstrip() == "ID"
 621          if line[self.HEADER_WIDTH:].count(";") == 6:
 622              # Looks like the semi colon separated style introduced in 2006
 623              self._feed_first_line_new(consumer, line)
 624          elif line[self.HEADER_WIDTH:].count(";") == 3:
 625              if line.rstrip().endswith(" SQ"):
 626                  # EMBL-bank patent data
 627                  self._feed_first_line_patents(consumer, line)
 628              else:
 629                  # Looks like the pre 2006 style
 630                  self._feed_first_line_old(consumer, line)
 631          else:
 632              raise ValueError('Did not recognise the ID line layout:\n' + line)
633
634 - def _feed_first_line_patents(self, consumer, line):
635 # Either Non-Redundant Level 1 database records, 636 # ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1> 637 # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ 638 # 639 # Or, Non-Redundant Level 2 database records: 640 # ID <L2-accession>; <molecule type>; <non-redundant level 2>; <cluster size L2> 641 # e.g. ID NRP0000016E; PRT; NR2; 5 SQ 642 fields = line[self.HEADER_WIDTH:].rstrip()[:-3].split(";") 643 assert len(fields) == 4 644 consumer.locus(fields[0]) 645 consumer.residue_type(fields[1]) 646 consumer.data_file_division(fields[2])
647 # TODO - Record cluster size? 648
649 - def _feed_first_line_old(self, consumer, line):
650 # Expects an ID line in the style before 2006, e.g. 651 # ID SC10H5 standard; DNA; PRO; 4870 BP. 652 # ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. 653 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 654 fields = [line[self.HEADER_WIDTH:].split(None, 1)[0]] 655 fields.extend(line[self.HEADER_WIDTH:].split(None, 1)[1].split(";")) 656 fields = [entry.strip() for entry in fields] 657 """ 658 The tokens represent: 659 660 0. Primary accession number 661 (space sep) 662 1. ??? (e.g. standard) 663 (semi-colon) 664 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') 665 3. Taxonomic division (e.g. 'PRO') 666 4. Sequence length (e.g. '4639675 BP.') 667 """ 668 consumer.locus(fields[0]) # Should we also call the accession consumer? 669 consumer.residue_type(fields[2]) 670 consumer.data_file_division(fields[3]) 671 self._feed_seq_length(consumer, fields[4])
672
673 - def _feed_first_line_new(self, consumer, line):
674 # Expects an ID line in the style introduced in 2006, e.g. 675 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 676 # ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. 677 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 678 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 679 assert len(fields) == 7 680 """ 681 The tokens represent: 682 683 0. Primary accession number 684 1. Sequence version number 685 2. Topology: 'circular' or 'linear' 686 3. Molecule type (e.g. 'genomic DNA') 687 4. Data class (e.g. 'STD') 688 5. Taxonomic division (e.g. 'PRO') 689 6. Sequence length (e.g. '4639675 BP.') 690 """ 691 692 consumer.locus(fields[0]) 693 694 # Call the accession consumer now, to make sure we record 695 # something as the record.id, in case there is no AC line 696 consumer.accession(fields[0]) 697 698 # TODO - How to deal with the version field? At the moment the consumer 699 # will try and use this for the ID which isn't ideal for EMBL files. 700 version_parts = fields[1].split() 701 if len(version_parts) == 2 \ 702 and version_parts[0] == "SV" \ 703 and version_parts[1].isdigit(): 704 consumer.version_suffix(version_parts[1]) 705 706 # Based on how the old GenBank parser worked, merge these two: 707 consumer.residue_type(" ".join(fields[2:4])) # TODO - Store as two fields? 708 709 # consumer.xxx(fields[4]) # TODO - What should we do with the data class? 710 711 consumer.data_file_division(fields[5]) 712 713 self._feed_seq_length(consumer, fields[6])
714
 715      def _feed_seq_length(self, consumer, text):
 716          length_parts = text.split()
 717          assert len(length_parts) == 2, "Invalid sequence length string %r" % text
 718          assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."]
 719          consumer.size(length_parts[0])
720
721 - def _feed_header_lines(self, consumer, lines):
722 EMBL_INDENT = self.HEADER_WIDTH 723 EMBL_SPACER = " " * EMBL_INDENT 724 consumer_dict = { 725 'AC': 'accession', 726 'SV': 'version', # SV line removed in June 2006, now part of ID line 727 'DE': 'definition', 728 # 'RN' : 'reference_num', 729 # 'RC' : reference comment... TODO 730 # 'RP' : 'reference_bases', 731 # 'RX' : reference cross reference... DOI or Pubmed 732 'RG': 'consrtm', # optional consortium 733 # 'RA' : 'authors', 734 # 'RT' : 'title', 735 'RL': 'journal', 736 'OS': 'organism', 737 'OC': 'taxonomy', 738 # 'DR' : data reference 739 'CC': 'comment', 740 # 'XX' : splitter 741 } 742 # We have to handle the following specially: 743 # RX (depending on reference type...) 744 for line in lines: 745 line_type = line[:EMBL_INDENT].strip() 746 data = line[EMBL_INDENT:].strip() 747 if line_type == 'XX': 748 pass 749 elif line_type == 'RN': 750 # Reformat reference numbers for the GenBank based consumer 751 # e.g. '[1]' becomes '1' 752 if data[0] == "[" and data[-1] == "]": 753 data = data[1:-1] 754 consumer.reference_num(data) 755 elif line_type == 'RP': 756 # Reformat reference numbers for the GenBank based consumer 757 # e.g. '1-4639675' becomes '(bases 1 to 4639675)' 758 # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)' 759 # Note could be multi-line, and end with a comma 760 parts = [bases.replace("-", " to ").strip() for bases in data.split(",") if bases.strip()] 761 consumer.reference_bases("(bases %s)" % "; ".join(parts)) 762 elif line_type == 'RT': 763 # Remove the enclosing quotes and trailing semi colon. 764 # Note the title can be split over multiple lines. 765 if data.startswith('"'): 766 data = data[1:] 767 if data.endswith('";'): 768 data = data[:-2] 769 consumer.title(data) 770 elif line_type == 'RX': 771 # EMBL support three reference types at the moment: 772 # - PUBMED PUBMED bibliographic database (NLM) 773 # - DOI Digital Object Identifier (International DOI Foundation) 774 # - AGRICOLA US National Agriculture Library (NAL) of the US Department 775 # of Agriculture (USDA) 776 # 777 # Format: 778 # RX resource_identifier; identifier. 779 # 780 # e.g. 781 # RX DOI; 10.1016/0024-3205(83)90010-3. 782 # RX PUBMED; 264242. 783 # 784 # Currently our reference object only supports PUBMED and MEDLINE 785 # (as these were in GenBank files?). 786 key, value = data.split(";", 1) 787 if value.endswith("."): 788 value = value[:-1] 789 value = value.strip() 790 if key == "PUBMED": 791 consumer.pubmed_id(value) 792 # TODO - Handle other reference types (here and in BioSQL bindings) 793 elif line_type == 'CC': 794 # Have to pass a list of strings for this one (not just a string) 795 consumer.comment([data]) 796 elif line_type == 'DR': 797 # Database Cross-reference, format: 798 # DR database_identifier; primary_identifier; secondary_identifier. 799 # 800 # e.g. 801 # DR MGI; 98599; Tcrb-V4. 802 # 803 # TODO - How should we store any secondary identifier? 804 parts = data.rstrip(".").split(";") 805 # Turn it into "database_identifier:primary_identifier" to 806 # mimic the GenBank parser. e.g. "MGI:98599" 807 consumer.dblink("%s:%s" % (parts[0].strip(), 808 parts[1].strip())) 809 elif line_type == 'RA': 810 # Remove trailing ; at end of authors list 811 consumer.authors(data.rstrip(";")) 812 elif line_type == 'PR': 813 # In the EMBL patent files, this is a PR (PRiority) line which 814 # provides the earliest active priority within the family. 815 # The priority number comes first, followed by the priority date. 816 # 817 # e.g. 
818 # PR JP19990377484 16-DEC-1999 819 # 820 # However, in most EMBL files this is a PR (PRoject) line which 821 # gives the BioProject reference number. 822 # 823 # e.g. 824 # PR Project:PRJNA60715; 825 # 826 # In GenBank files this corresponds to the old PROJECT line 827 # which was later replaced with the DBLINK line. 828 if data.startswith("Project:"): 829 # Remove trailing ; at end of the project reference 830 consumer.project(data.rstrip(";")) 831 elif line_type == 'KW': 832 consumer.keywords(data.rstrip(";")) 833 elif line_type in consumer_dict: 834 # Its a semi-automatic entry! 835 getattr(consumer, consumer_dict[line_type])(data) 836 else: 837 if self.debug: 838 print("Ignoring EMBL header line:\n%s" % line)
839
840 - def _feed_misc_lines(self, consumer, lines):
841 # TODO - Should we do something with the information on the SQ line(s)? 842 lines.append("") 843 line_iter = iter(lines) 844 try: 845 for line in line_iter: 846 if line.startswith("CO "): 847 line = line[5:].strip() 848 contig_location = line 849 while True: 850 line = next(line_iter) 851 if not line: 852 break 853 elif line.startswith("CO "): 854 # Don't need to preseve the whitespace here. 855 contig_location += line[5:].strip() 856 else: 857 raise ValueError('Expected CO (contig) continuation line, got:\n' + line) 858 consumer.contig_location(contig_location) 859 if line.startswith("SQ Sequence "): 860 # e.g. 861 # SQ Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other; 862 # 863 # Or, EMBL-bank patent, e.g. 864 # SQ Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5; 865 self._feed_seq_length(consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0]) 866 # TODO - Record the checksum etc? 867 return 868 except StopIteration: 869 raise ValueError("Problem in misc lines before sequence")
870 871
872 -class _ImgtScanner(EmblScanner):
873 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE). 874 875 IMGT files are like EMBL files but in order to allow longer feature types 876 the features should be indented by 25 characters not 21 characters. In 877 practice the IMGT flat files tend to use either 21 or 25 characters, so we 878 must cope with both. 879 880 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank. 881 """ 882 883 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", 884 "FH Key Location/Qualifiers (from EMBL)", 885 "FH Key Location/Qualifiers", 886 "FH"] 887
888 - def parse_features(self, skip=False):
889 """Return list of tuples for the features (if present) 890 891 Each feature is returned as a tuple (key, location, qualifiers) 892 where key and location are strings (e.g. "CDS" and 893 "complement(join(490883..490885,1..879))") while qualifiers 894 is a list of two string tuples (feature qualifier keys and values). 895 896 Assumes you have already read to the start of the features table. 897 """ 898 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 899 if self.debug: 900 print("Didn't find any feature table") 901 return [] 902 903 while self.line.rstrip() in self.FEATURE_START_MARKERS: 904 self.line = self.handle.readline() 905 906 bad_position_re = re.compile(r'([0-9]+)>{1}') 907 908 features = [] 909 line = self.line 910 while True: 911 if not line: 912 raise ValueError("Premature end of line during features table") 913 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 914 if self.debug: 915 print("Found start of sequence") 916 break 917 line = line.rstrip() 918 if line == "//": 919 raise ValueError("Premature end of features table, marker '//' found") 920 if line in self.FEATURE_END_MARKERS: 921 if self.debug: 922 print("Found end of features") 923 line = self.handle.readline() 924 break 925 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 926 # This is an empty feature line between qualifiers. Empty 927 # feature lines within qualifiers are handled below (ignored). 928 line = self.handle.readline() 929 continue 930 931 if skip: 932 line = self.handle.readline() 933 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 934 line = self.handle.readline() 935 else: 936 assert line[:2] == "FT" 937 try: 938 feature_key, location_start = line[2:].strip().split() 939 except ValueError: 940 # e.g. "FT TRANSMEMBRANE-REGION2163..2240\n" 941 # Assume indent of 25 as per IMGT spec, with the location 942 # start in column 26 (one-based). 943 feature_key = line[2:25].strip() 944 location_start = line[25:].strip() 945 feature_lines = [location_start] 946 line = self.handle.readline() 947 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 948 or line.rstrip() == "": # cope with blank lines in the midst of a feature 949 # Use strip to remove any harmless trailing white space AND and leading 950 # white space (copes with 21 or 26 indents and orther variants) 951 assert line[:2] == "FT" 952 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 953 line = self.handle.readline() 954 feature_key, location, qualifiers = \ 955 self.parse_feature(feature_key, feature_lines) 956 # Try to handle known problems with IMGT locations here: 957 if ">" in location: 958 # Nasty hack for common IMGT bug, should be >123 not 123> 959 # in a location string. At least here the meaning is clear, 960 # and since it is so common I don't want to issue a warning 961 # warnings.warn("Feature location %s is invalid, " 962 # "moving greater than sign before position" 963 # % location, BiopythonParserWarning) 964 location = bad_position_re.sub(r'>\1', location) 965 features.append((feature_key, location, qualifiers)) 966 self.line = line 967 return features
968 969
 970  class GenBankScanner(InsdcScanner):
 971      """For extracting chunks of information in GenBank files"""
 972  
 973      RECORD_START = "LOCUS       "
 974      HEADER_WIDTH = 12
 975      FEATURE_START_MARKERS = ["FEATURES             Location/Qualifiers", "FEATURES"]
 976      FEATURE_END_MARKERS = []
 977      FEATURE_QUALIFIER_INDENT = 21
 978      FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
 979      SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]  # trailing spaces removed
 980  
1033  
1034 - def _feed_first_line(self, consumer, line):
1035 """Scan over and parse GenBank LOCUS line (PRIVATE). 1036 1037 This must cope with several variants, primarily the old and new column 1038 based standards from GenBank. Additionally EnsEMBL produces GenBank 1039 files where the LOCUS line is space separated rather that following 1040 the column based layout. 1041 1042 We also try to cope with GenBank like files with partial LOCUS lines. 1043 """ 1044 ##################################### 1045 # LOCUS line # 1046 ##################################### 1047 GENBANK_INDENT = self.HEADER_WIDTH 1048 GENBANK_SPACER = " " * GENBANK_INDENT 1049 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 1050 'LOCUS line does not start correctly:\n' + line 1051 1052 # Have to break up the locus line, and handle the different bits of it. 1053 # There are at least two different versions of the locus line... 1054 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ': 1055 # Old... note we insist on the 55:62 being empty to avoid trying 1056 # to parse space separated LOCUS lines from Ensembl etc, see below. 1057 # 1058 # Positions Contents 1059 # --------- -------- 1060 # 00:06 LOCUS 1061 # 06:12 spaces 1062 # 12:?? Locus name 1063 # ??:?? space 1064 # ??:29 Length of sequence, right-justified 1065 # 29:33 space, bp, space 1066 # 33:41 strand type 1067 # 41:42 space 1068 # 42:51 Blank (implies linear), linear or circular 1069 # 51:52 space 1070 # 52:55 The division code (e.g. BCT, VRL, INV) 1071 # 55:62 space 1072 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1073 # 1074 # assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 1075 # 'LOCUS line does not contain size units at expected position:\n' + line 1076 assert line[41:42] == ' ', \ 1077 'LOCUS line does not contain space at position 42:\n' + line 1078 assert line[42:51].strip() in ['', 'linear', 'circular'], \ 1079 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1080 assert line[51:52] == ' ', \ 1081 'LOCUS line does not contain space at position 52:\n' + line 1082 # assert line[55:62] == ' ', \ 1083 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 1084 if line[62:73].strip(): 1085 assert line[64:65] == '-', \ 1086 'LOCUS line does not contain - at position 65 in date:\n' + line 1087 assert line[68:69] == '-', \ 1088 'LOCUS line does not contain - at position 69 in date:\n' + line 1089 1090 name_and_length_str = line[GENBANK_INDENT:29] 1091 while ' ' in name_and_length_str: 1092 name_and_length_str = name_and_length_str.replace(' ', ' ') 1093 name_and_length = name_and_length_str.split(' ') 1094 assert len(name_and_length) <= 2, \ 1095 'Cannot parse the name and length in the LOCUS line:\n' + line 1096 assert len(name_and_length) != 1, \ 1097 'Name and length collide in the LOCUS line:\n' + line 1098 # Should be possible to split them based on position, if 1099 # a clear definition of the standard exists THAT AGREES with 1100 # existing files. 1101 name, length = name_and_length 1102 if len(name) > 16: 1103 # As long as the sequence is short, can steal its leading spaces 1104 # to extend the name over the current 16 character limit. 1105 # However, that deserves a warning as it is out of spec. 
1106 warnings.warn("GenBank LOCUS line identifier over 16 characters", 1107 BiopythonParserWarning) 1108 consumer.locus(name) 1109 consumer.size(length) 1110 # consumer.residue_type(line[33:41].strip()) 1111 1112 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1113 # Amino acids -> protein (even if there is no residue type given) 1114 # We want to use a protein alphabet in this case, rather than a 1115 # generic one. Not sure if this is the best way to achieve this, 1116 # but it works because the scanner checks for this: 1117 consumer.residue_type("PROTEIN") 1118 else: 1119 consumer.residue_type(line[33:51].strip()) 1120 1121 consumer.data_file_division(line[52:55]) 1122 if line[62:73].strip(): 1123 consumer.date(line[62:73]) 1124 elif line[40:44] in [' bp ', ' aa ', ' rc '] \ 1125 and line[54:64].strip() in ['', 'linear', 'circular']: 1126 # New... linear/circular/big blank test should avoid EnsEMBL style 1127 # LOCUS line being treated like a proper column based LOCUS line. 1128 # 1129 # Positions Contents 1130 # --------- -------- 1131 # 00:06 LOCUS 1132 # 06:12 spaces 1133 # 12:?? Locus name 1134 # ??:?? space 1135 # ??:40 Length of sequence, right-justified 1136 # 40:44 space, bp, space 1137 # 44:47 Blank, ss-, ds-, ms- 1138 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1139 # 54:55 space 1140 # 55:63 Blank (implies linear), linear or circular 1141 # 63:64 space 1142 # 64:67 The division code (e.g. BCT, VRL, INV) 1143 # 67:68 space 1144 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1145 # 1146 assert line[40:44] in [' bp ', ' aa ', ' rc '], \ 1147 'LOCUS line does not contain size units at expected position:\n' + line 1148 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1149 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1150 assert line[47:54].strip() == "" \ 1151 or 'DNA' in line[47:54].strip().upper() \ 1152 or 'RNA' in line[47:54].strip().upper(), \ 1153 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1154 assert line[54:55] == ' ', \ 1155 'LOCUS line does not contain space at position 55:\n' + line 1156 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 1157 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1158 assert line[63:64] == ' ', \ 1159 'LOCUS line does not contain space at position 64:\n' + line 1160 assert line[67:68] == ' ', \ 1161 'LOCUS line does not contain space at position 68:\n' + line 1162 if line[68:79].strip(): 1163 assert line[70:71] == '-', \ 1164 'LOCUS line does not contain - at position 71 in date:\n' + line 1165 assert line[74:75] == '-', \ 1166 'LOCUS line does not contain - at position 75 in date:\n' + line 1167 1168 name_and_length_str = line[GENBANK_INDENT:40] 1169 while ' ' in name_and_length_str: 1170 name_and_length_str = name_and_length_str.replace(' ', ' ') 1171 name_and_length = name_and_length_str.split(' ') 1172 assert len(name_and_length) <= 2, \ 1173 'Cannot parse the name and length in the LOCUS line:\n' + line 1174 assert len(name_and_length) != 1, \ 1175 'Name and length collide in the LOCUS line:\n' + line 1176 # Should be possible to split them based on position, if 1177 # a clear definition of the stand exists THAT AGREES with 1178 # existing files. 
1179 consumer.locus(name_and_length[0]) 1180 consumer.size(name_and_length[1]) 1181 1182 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1183 # Amino acids -> protein (even if there is no residue type given) 1184 # We want to use a protein alphabet in this case, rather than a 1185 # generic one. Not sure if this is the best way to achieve this, 1186 # but it works because the scanner checks for this: 1187 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1188 else: 1189 consumer.residue_type(line[44:63].strip()) 1190 1191 consumer.data_file_division(line[64:67]) 1192 if line[68:79].strip(): 1193 consumer.date(line[68:79]) 1194 elif line[GENBANK_INDENT:].strip().count(" ") == 0: 1195 # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1196 # 1197 # e.g. 1198 # 1199 # "LOCUS U00096" 1200 # 1201 # rather than: 1202 # 1203 # "LOCUS U00096 4639675 bp DNA circular BCT" 1204 # 1205 # Positions Contents 1206 # --------- -------- 1207 # 00:06 LOCUS 1208 # 06:12 spaces 1209 # 12:?? Locus name 1210 if line[GENBANK_INDENT:].strip() != "": 1211 consumer.locus(line[GENBANK_INDENT:].strip()) 1212 else: 1213 # Must just have just "LOCUS ", is this even legitimate? 1214 # We should be able to continue parsing... we need real world testcases! 1215 warnings.warn("Minimal LOCUS line found - is this " 1216 "correct?\n:%r" % line, BiopythonParserWarning) 1217 elif len(line.split()) == 8 and line.split()[3] in ("aa", "bp") and \ 1218 line.split()[5] in ('linear', 'circular'): 1219 # Cope with invalidly spaced GenBank LOCUS lines like 1220 # LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001 1221 splitline = line.split() 1222 consumer.locus(splitline[1]) 1223 consumer.size(splitline[2]) 1224 consumer.residue_type(splitline[4]) 1225 consumer.data_file_division(splitline[6]) 1226 consumer.date(splitline[7]) 1227 warnings.warn("Attempting to parse malformed locus line:\n%r\n" 1228 "Found locus %r size %r residue_type %r\n" 1229 "Some fields may be wrong." % (line, splitline[1], 1230 splitline[2], splitline[4]), BiopythonParserWarning) 1231 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: 1232 # Cope with EnsEMBL genbank files which use space separation rather 1233 # than the expected column based layout. e.g. 1234 # LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1235 # LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1236 # LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1237 # LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1238 # Notice that the 'bp' can occur in the position expected by either 1239 # the old or the new fixed column standards (parsed above). 1240 splitline = line.split() 1241 consumer.locus(splitline[1]) 1242 consumer.size(splitline[2]) 1243 consumer.residue_type(splitline[4]) 1244 consumer.data_file_division(splitline[5]) 1245 consumer.date(splitline[6]) 1246 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: 1247 # Cope with EMBOSS seqret output where it seems the locus id can cause 1248 # the other fields to overflow. We just IGNORE the other fields! 1249 warnings.warn("Malformed LOCUS line found - is this " 1250 "correct?\n:%r" % line, BiopythonParserWarning) 1251 consumer.locus(line.split()[1]) 1252 consumer.size(line.split()[2]) 1253 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: 1254 # Cope with pseudo-GenBank files like this: 1255 # "LOCUS RNA5 complete 1718 bp" 1256 # Treat everything between LOCUS and the size as the identifier. 
1257 warnings.warn("Malformed LOCUS line found - is this " 1258 "correct?\n:%r" % line, BiopythonParserWarning) 1259 consumer.locus(line[5:].rsplit(None, 2)[0].strip()) 1260 consumer.size(line.split()[-2]) 1261 else: 1262 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1263
1264 - def _feed_header_lines(self, consumer, lines):
1265 # Following dictionary maps GenBank lines to the associated 1266 # consumer methods - the special cases like LOCUS where one 1267 # genbank line triggers several consumer calls have to be 1268 # handled individually. 1269 GENBANK_INDENT = self.HEADER_WIDTH 1270 GENBANK_SPACER = " " * GENBANK_INDENT 1271 STRUCTURED_COMMENT_START = "-START##" 1272 STRUCTURED_COMMENT_END = "-END##" 1273 STRUCTURED_COMMENT_DELIM = " :: " 1274 consumer_dict = { 1275 'DEFINITION': 'definition', 1276 'ACCESSION': 'accession', 1277 'NID': 'nid', 1278 'PID': 'pid', 1279 'DBSOURCE': 'db_source', 1280 'KEYWORDS': 'keywords', 1281 'SEGMENT': 'segment', 1282 'SOURCE': 'source', 1283 'AUTHORS': 'authors', 1284 'CONSRTM': 'consrtm', 1285 'PROJECT': 'project', 1286 'TITLE': 'title', 1287 'JOURNAL': 'journal', 1288 'MEDLINE': 'medline_id', 1289 'PUBMED': 'pubmed_id', 1290 'REMARK': 'remark'} 1291 # We have to handle the following specially: 1292 # ORIGIN (locus, size, residue_type, data_file_division and date) 1293 # COMMENT (comment) 1294 # VERSION (version and gi) 1295 # DBLINK (database links like projects, newlines important) 1296 # REFERENCE (eference_num and reference_bases) 1297 # ORGANISM (organism and taxonomy) 1298 lines = [_f for _f in lines if _f] 1299 lines.append("") # helps avoid getting StopIteration all the time 1300 line_iter = iter(lines) 1301 try: 1302 line = next(line_iter) 1303 while True: 1304 if not line: 1305 break 1306 line_type = line[:GENBANK_INDENT].strip() 1307 data = line[GENBANK_INDENT:].strip() 1308 1309 if line_type == 'VERSION': 1310 # Need to call consumer.version(), and maybe also consumer.gi() as well. 1311 # e.g. 1312 # VERSION AC007323.5 GI:6587720 1313 while ' ' in data: 1314 data = data.replace(' ', ' ') 1315 if ' GI:' not in data: 1316 consumer.version(data) 1317 else: 1318 if self.debug: 1319 print("Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]") 1320 consumer.version(data.split(' GI:')[0]) 1321 consumer.gi(data.split(' GI:')[1]) 1322 # Read in the next line! 1323 line = next(line_iter) 1324 elif line_type == 'DBLINK': 1325 # Need to call consumer.dblink() for each line, e.g. 1326 # DBLINK Project: 57779 1327 # BioProject: PRJNA57779 1328 consumer.dblink(data.strip()) 1329 # Read in the next line, and see if its more of the DBLINK section: 1330 while True: 1331 line = next(line_iter) 1332 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1333 # Add this continuation to the data string 1334 consumer.dblink(line[GENBANK_INDENT:].strip()) 1335 else: 1336 # End of the DBLINK, leave this text in the variable "line" 1337 break 1338 elif line_type == 'REFERENCE': 1339 if self.debug > 1: 1340 print("Found reference [" + data + "]") 1341 # Need to call consumer.reference_num() and consumer.reference_bases() 1342 # e.g. 1343 # REFERENCE 1 (bases 1 to 86436) 1344 # 1345 # Note that this can be multiline, see Bug 1968, e.g. 1346 # 1347 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1348 # 28259) 1349 # 1350 # For such cases we will call the consumer once only. 
1351 data = data.strip() 1352 1353 # Read in the next line, and see if its more of the reference: 1354 while True: 1355 line = next(line_iter) 1356 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1357 # Add this continuation to the data string 1358 data += " " + line[GENBANK_INDENT:] 1359 if self.debug > 1: 1360 print("Extended reference text [" + data + "]") 1361 else: 1362 # End of the reference, leave this text in the variable "line" 1363 break 1364 1365 # We now have all the reference line(s) stored in a string, data, 1366 # which we pass to the consumer 1367 while ' ' in data: 1368 data = data.replace(' ', ' ') 1369 if ' ' not in data: 1370 if self.debug > 2: 1371 print('Reference number \"' + data + '\"') 1372 consumer.reference_num(data) 1373 else: 1374 if self.debug > 2: 1375 print('Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"') 1376 consumer.reference_num(data[:data.find(' ')]) 1377 consumer.reference_bases(data[data.find(' ') + 1:]) 1378 elif line_type == 'ORGANISM': 1379 # Typically the first line is the organism, and subsequent lines 1380 # are the taxonomy lineage. However, given longer and longer 1381 # species names (as more and more strains and sub strains get 1382 # sequenced) the oragnism name can now get wrapped onto multiple 1383 # lines. The NCBI say we have to recognise the lineage line by 1384 # the presence of semi-colon delimited entries. In the long term, 1385 # they are considering adding a new keyword (e.g. LINEAGE). 1386 # See Bug 2591 for details. 1387 organism_data = data 1388 lineage_data = "" 1389 while True: 1390 line = next(line_iter) 1391 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1392 if lineage_data or ";" in line: 1393 lineage_data += " " + line[GENBANK_INDENT:] 1394 elif line[GENBANK_INDENT:].strip() == ".": 1395 # No lineage data, just . place holder 1396 pass 1397 else: 1398 organism_data += " " + line[GENBANK_INDENT:].strip() 1399 else: 1400 # End of organism and taxonomy 1401 break 1402 consumer.organism(organism_data) 1403 if lineage_data.strip() == "" and self.debug > 1: 1404 print("Taxonomy line(s) missing or blank") 1405 consumer.taxonomy(lineage_data.strip()) 1406 del organism_data, lineage_data 1407 elif line_type == 'COMMENT': 1408 # A COMMENT can either be plain text or tabular (Structured Comment), 1409 # or contain both. Multiline comments are common. The code calls 1410 # consumer.comment() once with a list where each entry 1411 # is a line. If there's a structured comment consumer.structured_comment() 1412 # is called with a dict of dicts where the secondary key/value pairs are 1413 # the same as those in the structured comment table. The primary key is 1414 # the title or header of the table (e.g. Assembly-Data, FluData). See 1415 # http://www.ncbi.nlm.nih.gov/genbank/structuredcomment 1416 # for more information on Structured Comments. 
1417 data = line[GENBANK_INDENT:] 1418 if self.debug > 1: 1419 print("Found comment") 1420 comment_list = [] 1421 structured_comment_dict = defaultdict(dict) 1422 structured_comment_key = '' 1423 1424 if STRUCTURED_COMMENT_START in data: 1425 structured_comment_key = re.search(r"([^#]+){0}$".format(STRUCTURED_COMMENT_START), data).group(1) 1426 if self.debug > 1: 1427 print("Found Structured Comment") 1428 else: 1429 comment_list.append(data) 1430 1431 while True: 1432 line = next(line_iter) 1433 data = line[GENBANK_INDENT:] 1434 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1435 if STRUCTURED_COMMENT_START in data: 1436 structured_comment_key = re.search(r"([^#]+){0}$".format(STRUCTURED_COMMENT_START), data).group(1) 1437 elif structured_comment_key is not None and STRUCTURED_COMMENT_DELIM in data: 1438 match = re.search(r"(.+?)\s*{0}\s*(.+)".format(STRUCTURED_COMMENT_DELIM), data) 1439 structured_comment_dict[structured_comment_key][match.group(1)] = match.group(2) 1440 if self.debug > 2: 1441 print("Structured Comment continuation [" + data + "]") 1442 elif STRUCTURED_COMMENT_END not in data: 1443 comment_list.append(data) 1444 if self.debug > 2: 1445 print("Comment continuation [" + data + "]") 1446 else: 1447 # End of the comment 1448 break 1449 if comment_list: 1450 consumer.comment(comment_list) 1451 if structured_comment_dict: 1452 consumer.structured_comment(structured_comment_dict) 1453 del comment_list, structured_comment_key, structured_comment_dict 1454 elif line_type in consumer_dict: 1455 # It's a semi-automatic entry! 1456 # Now, this may be a multi line entry... 1457 while True: 1458 line = next(line_iter) 1459 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1460 data += ' ' + line[GENBANK_INDENT:] 1461 else: 1462 # We now have all the data for this entry: 1463 getattr(consumer, consumer_dict[line_type])(data) 1464 # End of continuation - return to top of loop! 1465 break 1466 else: 1467 if self.debug: 1468 print("Ignoring GenBank header line:\n" % line) 1469 # Read in next line 1470 line = next(line_iter) 1471 except StopIteration: 1472 raise ValueError("Problem in header")
1473
1474 - def _feed_misc_lines(self, consumer, lines):
1475 # Deals with a few misc lines between the features and the sequence 1476 GENBANK_INDENT = self.HEADER_WIDTH 1477 GENBANK_SPACER = " " * GENBANK_INDENT 1478 lines.append("") 1479 line_iter = iter(lines) 1480 try: 1481 for line in line_iter: 1482 if line.startswith('BASE COUNT'): 1483 line = line[10:].strip() 1484 if line: 1485 if self.debug: 1486 print("base_count = " + line) 1487 consumer.base_count(line) 1488 if line.startswith('ORIGIN'): 1489 line = line[6:].strip() 1490 if line: 1491 if self.debug: 1492 print("origin_name = " + line) 1493 consumer.origin_name(line) 1494 if line.startswith('WGS '): 1495 line = line[3:].strip() 1496 consumer.wgs(line) 1497 if line.startswith('WGS_SCAFLD'): 1498 line = line[10:].strip() 1499 consumer.add_wgs_scafld(line) 1500 if line.startswith('CONTIG'): 1501 line = line[6:].strip() 1502 contig_location = line 1503 while True: 1504 line = next(line_iter) 1505 if not line: 1506 break 1507 elif line[:GENBANK_INDENT] == GENBANK_SPACER: 1508 # Don't need to preseve the whitespace here. 1509 contig_location += line[GENBANK_INDENT:].rstrip() 1510 elif line.startswith('ORIGIN'): 1511 # Strange, seen this in GenPept files via Entrez gbwithparts 1512 line = line[6:].strip() 1513 if line: 1514 consumer.origin_name(line) 1515 break 1516 else: 1517 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1518 consumer.contig_location(contig_location) 1519 return 1520 except StopIteration: 1521 raise ValueError("Problem in misc lines before sequence")
1522 1523 if __name__ == "__main__": 1524 from Bio._py3k import StringIO 1525 1526 gbk_example = \ 1527 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1528 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1529 (AXL2) and Rev7p (REV7) genes, complete cds. 1530 ACCESSION U49845 1531 VERSION U49845.1 GI:1293613 1532 KEYWORDS . 1533 SOURCE Saccharomyces cerevisiae (baker's yeast) 1534 ORGANISM Saccharomyces cerevisiae 1535 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1536 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1537 REFERENCE 1 (bases 1 to 5028) 1538 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1539 TITLE Cloning and sequence of REV7, a gene whose function is required for 1540 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1541 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1542 PUBMED 7871890 1543 REFERENCE 2 (bases 1 to 5028) 1544 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1545 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1546 plasma membrane glycoprotein 1547 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1548 PUBMED 8846915 1549 REFERENCE 3 (bases 1 to 5028) 1550 AUTHORS Roemer,T. 1551 TITLE Direct Submission 1552 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1553 Haven, CT, USA 1554 FEATURES Location/Qualifiers 1555 source 1..5028 1556 /organism="Saccharomyces cerevisiae" 1557 /db_xref="taxon:4932" 1558 /chromosome="IX" 1559 /map="9" 1560 CDS <1..206 1561 /codon_start=3 1562 /product="TCP1-beta" 1563 /protein_id="AAA98665.1" 1564 /db_xref="GI:1293614" 1565 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1566 AEVLLRVDNIIRARPRTANRQHM" 1567 gene 687..3158 1568 /gene="AXL2" 1569 CDS 687..3158 1570 /gene="AXL2" 1571 /note="plasma membrane glycoprotein" 1572 /codon_start=1 1573 /function="required for axial budding pattern of S. 
1574 cerevisiae" 1575 /product="Axl2p" 1576 /protein_id="AAA98666.1" 1577 /db_xref="GI:1293615" 1578 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1579 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1580 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1581 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1582 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1583 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1584 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1585 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1586 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1587 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1588 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1589 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1590 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1591 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1592 VDFSNKSNVNVGQVKDIHGRIPEML" 1593 gene complement(3300..4037) 1594 /gene="REV7" 1595 CDS complement(3300..4037) 1596 /gene="REV7" 1597 /codon_start=1 1598 /product="Rev7p" 1599 /protein_id="AAA98667.1" 1600 /db_xref="GI:1293616" 1601 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1602 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1603 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1604 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1605 LISGDDKILNGVYSQYEEGESIFGSLF" 1606 ORIGIN 1607 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1608 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1609 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1610 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1611 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1612 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1613 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1614 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1615 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1616 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1617 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1618 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1619 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1620 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1621 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1622 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1623 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1624 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1625 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1626 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1627 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1628 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1629 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1630 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1631 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1632 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1633 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc 
gggtctgtcc 1634 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1635 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1636 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1637 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1638 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1639 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc 1640 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1641 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1642 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1643 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1644 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1645 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1646 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1647 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1648 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1649 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1650 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1651 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1652 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1653 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1654 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1655 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1656 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1657 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1658 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1659 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1660 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1661 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1662 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1663 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1664 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1665 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 1666 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1667 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1668 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1669 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1670 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1671 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1672 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1673 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1674 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1675 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1676 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1677 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1678 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1679 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1680 4381 cttagttcat cttttttcca aaaagcaccc 
aatgataata actaaaatga aaaggatttg 1681 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1682 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1683 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1684 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1685 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1686 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1687 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1688 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1689 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1690 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1691 //""" 1692 1693 # GenBank format protein (aka GenPept) file from: 1694 # http://www.molecularevolution.org/resources/fileformats/ 1695 gbk_example2 = \ 1696 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1697 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1698 ACCESSION AAD51968 1699 VERSION AAD51968.1 GI:5805369 1700 DBSOURCE locus AF171097 accession AF171097.1 1701 KEYWORDS . 1702 SOURCE Yersinia enterocolitica 1703 ORGANISM Yersinia enterocolitica 1704 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1705 Enterobacteriaceae; Yersinia. 1706 REFERENCE 1 (residues 1 to 143) 1707 AUTHORS Revell,P.A. and Miller,V.L. 1708 TITLE A chromosomally encoded regulator is required for expression of the 1709 Yersinia enterocolitica inv gene and for virulence 1710 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1711 MEDLINE 20138369 1712 PUBMED 10672189 1713 REFERENCE 2 (residues 1 to 143) 1714 AUTHORS Revell,P.A. and Miller,V.L. 1715 TITLE Direct Submission 1716 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1717 University School of Medicine, Campus Box 8230, 660 South Euclid, 1718 St. Louis, MO 63110, USA 1719 COMMENT Method: conceptual translation. 1720 FEATURES Location/Qualifiers 1721 source 1..143 1722 /organism="Yersinia enterocolitica" 1723 /mol_type="unassigned DNA" 1724 /strain="JB580v" 1725 /serotype="O:8" 1726 /db_xref="taxon:630" 1727 Protein 1..143 1728 /product="transcriptional regulator RovA" 1729 /name="regulates inv expression" 1730 CDS 1..143 1731 /gene="rovA" 1732 /coded_by="AF171097.1:380..811" 1733 /note="regulator of virulence" 1734 /transl_table=11 1735 ORIGIN 1736 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1737 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1738 121 deiellsgli dklerniiql qsk 1739 // 1740 """ 1741 1742 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1743 XX 1744 AC X56734; S46826; 1745 XX 1746 DT 12-SEP-1991 (Rel. 29, Created) 1747 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 1748 XX 1749 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1750 XX 1751 KW beta-glucosidase. 1752 XX 1753 OS Trifolium repens (white clover) 1754 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1755 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1756 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1757 XX 1758 RN [5] 1759 RP 1-1859 1760 RX PUBMED; 1907511. 1761 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1762 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1763 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1764 RL Plant Mol. Biol. 
17(2):209-219(1991). 1765 XX 1766 RN [6] 1767 RP 1-1859 1768 RA Hughes M.A.; 1769 RT ; 1770 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1771 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1772 RL Upon Tyne, NE2 4HH, UK 1773 XX 1774 FH Key Location/Qualifiers 1775 FH 1776 FT source 1..1859 1777 FT /organism="Trifolium repens" 1778 FT /mol_type="mRNA" 1779 FT /clone_lib="lambda gt10" 1780 FT /clone="TRE361" 1781 FT /tissue_type="leaves" 1782 FT /db_xref="taxon:3899" 1783 FT CDS 14..1495 1784 FT /product="beta-glucosidase" 1785 FT /EC_number="3.2.1.21" 1786 FT /note="non-cyanogenic" 1787 FT /db_xref="GOA:P26204" 1788 FT /db_xref="InterPro:IPR001360" 1789 FT /db_xref="InterPro:IPR013781" 1790 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1791 FT /protein_id="CAA40058.1" 1792 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1793 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1794 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1795 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1796 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1797 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1798 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1799 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1800 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1801 FT mRNA 1..1859 1802 FT /experiment="experimental evidence, no additional details 1803 FT recorded" 1804 XX 1805 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1806 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1807 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1808 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1809 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1810 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1811 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1812 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1813 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1814 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1815 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1816 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1817 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1818 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1819 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1820 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1821 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1822 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1823 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1824 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1825 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1826 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1827 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 1828 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1829 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1830 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1831 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1832 
      ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1833  tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1834  aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1835  agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1836  tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1837  //
1838  """
1839
1840      print("GenBank CDS Iteration")
1841      print("=====================")
1842
1843      g = GenBankScanner()
1844      for record in g.parse_cds_features(StringIO(gbk_example)):
1845          print(record)
1846
1847      g = GenBankScanner()
1848      for record in g.parse_cds_features(StringIO(gbk_example2),
1849                                         tags2id=('gene', 'locus_tag', 'product')):
1850          print(record)
1851
1852      g = GenBankScanner()
1853      for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
1854                                         tags2id=('gene', 'locus_tag', 'product')):
1855          print(record)
1856
1857      print("")
1858      print("GenBank Iteration")
1859      print("=================")
1860      g = GenBankScanner()
1861      for record in g.parse_records(StringIO(gbk_example), do_features=False):
1862          print("%s %s %s" % (record.id, record.name, record.description))
1863          print(record.seq)
1864
1865      g = GenBankScanner()
1866      for record in g.parse_records(StringIO(gbk_example), do_features=True):
1867          print("%s %s %s" % (record.id, record.name, record.description))
1868          print(record.seq)
1869
1870      g = GenBankScanner()
1871      for record in g.parse_records(StringIO(gbk_example2), do_features=False):
1872          print("%s %s %s" % (record.id, record.name, record.description))
1873          print(record.seq)
1874
1875      g = GenBankScanner()
1876      for record in g.parse_records(StringIO(gbk_example2), do_features=True):
1877          print("%s %s %s" % (record.id, record.name, record.description))
1878          print(record.seq)
1879
1880      print("")
1881      print("EMBL CDS Iteration")
1882      print("==================")
1883
1884      e = EmblScanner()
1885      for record in e.parse_cds_features(StringIO(embl_example)):
1886          print(record)
1887
1888      print("")
1889      print("EMBL Iteration")
1890      print("==============")
1891      e = EmblScanner()
1892      for record in e.parse_records(StringIO(embl_example), do_features=True):
1893          print("%s %s %s" % (record.id, record.name, record.description))
1894          print(record.seq)
1895
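    # --- Editorial addition: a minimal usage sketch, not part of the original self-test ---
    # The module docstring notes that Bio.SeqIO uses this scanner internally for GenBank
    # and EMBL files, so the same example strings defined above can also be parsed through
    # the public Bio.SeqIO interface.  This assumes Bio.SeqIO is importable; the format
    # names "genbank" and "embl" are the standard Bio.SeqIO format strings.
    from Bio import SeqIO

    print("")
    print("Bio.SeqIO equivalent (illustrative)")
    print("===================================")
    for record in SeqIO.parse(StringIO(gbk_example), "genbank"):
        # Each parsed record is a SeqRecord whose features were built by this scanner.
        print("%s has %i features" % (record.id, len(record.features)))
    for record in SeqIO.parse(StringIO(embl_example), "embl"):
        print("%s has %i features" % (record.id, len(record.features)))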