Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2017 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
   9  (for use with a event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  from __future__ import print_function 
  30   
  31  import warnings 
  32  import re 
  33  from collections import OrderedDict 
  34  from Bio.Seq import Seq 
  35  from Bio.SeqRecord import SeqRecord 
  36  from Bio.Alphabet import generic_protein 
  37  from Bio import BiopythonParserWarning 
38 39 40 -class InsdcScanner(object):
41 """Basic functions for breaking up a GenBank/EMBL file into sub sections. 42 43 The International Nucleotide Sequence Database Collaboration (INSDC) 44 between the DDBJ, EMBL, and GenBank. These organisations all use the 45 same "Feature Table" layout in their plain text flat file formats. 46 47 However, the header and sequence sections of an EMBL file are very 48 different in layout to those produced by GenBank/DDBJ. 49 """ 50 51 # These constants get redefined with sensible values in the sub classes: 52 RECORD_START = "XXX" # "LOCUS " or "ID " 53 HEADER_WIDTH = 3 # 12 or 5 54 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] 55 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] 56 FEATURE_QUALIFIER_INDENT = 0 57 FEATURE_QUALIFIER_SPACER = "" 58 SEQUENCE_HEADERS = ["XXX"] # with right hand side spaces removed 59
60 - def __init__(self, debug=0):
61 """Initialize.""" 62 assert len(self.RECORD_START) == self.HEADER_WIDTH 63 for marker in self.SEQUENCE_HEADERS: 64 assert marker == marker.rstrip() 65 assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT 66 self.debug = debug 67 self.handle = None 68 self.line = None
69
70 - def set_handle(self, handle):
71 """Set the handle attribute.""" 72 self.handle = handle 73 self.line = ""
74
75 - def find_start(self):
76 """Read in lines until find the ID/LOCUS line, which is returned. 77 78 Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives) 79 will we ignored. 80 """ 81 while True: 82 if self.line: 83 line = self.line 84 self.line = "" 85 else: 86 line = self.handle.readline() 87 if not line: 88 if self.debug: 89 print("End of file") 90 return None 91 if line[:self.HEADER_WIDTH] == self.RECORD_START: 92 if self.debug > 1: 93 print("Found the start of a record:\n" + line) 94 break 95 line = line.rstrip() 96 if line == "//": 97 if self.debug > 1: 98 print("Skipping // marking end of last record") 99 elif line == "": 100 if self.debug > 1: 101 print("Skipping blank line before record") 102 else: 103 # Ignore any header before the first ID/LOCUS line. 104 if self.debug > 1: 105 print("Skipping header line before record:\n" + line) 106 self.line = line 107 return line
108
109 - def parse_header(self):
110 """Return list of strings making up the header. 111 112 New line characters are removed. 113 114 Assumes you have just read in the ID/LOCUS line. 115 """ 116 assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \ 117 "Not at start of record" 118 119 header_lines = [] 120 while True: 121 line = self.handle.readline() 122 if not line: 123 raise ValueError("Premature end of line during sequence data") 124 line = line.rstrip() 125 if line in self.FEATURE_START_MARKERS: 126 if self.debug: 127 print("Found feature table") 128 break 129 # if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]: 130 # if self.debug : print("Found header table (?)") 131 # break 132 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 133 if self.debug: 134 print("Found start of sequence") 135 break 136 if line == "//": 137 raise ValueError("Premature end of sequence data marker '//' found") 138 header_lines.append(line) 139 self.line = line 140 return header_lines
141
142 - def parse_features(self, skip=False):
143 """Return list of tuples for the features (if present). 144 145 Each feature is returned as a tuple (key, location, qualifiers) 146 where key and location are strings (e.g. "CDS" and 147 "complement(join(490883..490885,1..879))") while qualifiers 148 is a list of two string tuples (feature qualifier keys and values). 149 150 Assumes you have already read to the start of the features table. 151 """ 152 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 153 if self.debug: 154 print("Didn't find any feature table") 155 return [] 156 157 while self.line.rstrip() in self.FEATURE_START_MARKERS: 158 self.line = self.handle.readline() 159 160 features = [] 161 line = self.line 162 while True: 163 if not line: 164 raise ValueError("Premature end of line during features table") 165 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 166 if self.debug: 167 print("Found start of sequence") 168 break 169 line = line.rstrip() 170 if line == "//": 171 raise ValueError("Premature end of features table, marker '//' found") 172 if line in self.FEATURE_END_MARKERS: 173 if self.debug: 174 print("Found end of features") 175 line = self.handle.readline() 176 break 177 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 178 # This is an empty feature line between qualifiers. Empty 179 # feature lines within qualifiers are handled below (ignored). 180 line = self.handle.readline() 181 continue 182 if len(line) < self.FEATURE_QUALIFIER_INDENT: 183 warnings.warn("line too short to contain a feature: %r" % line, 184 BiopythonParserWarning) 185 line = self.handle.readline() 186 continue 187 188 if skip: 189 line = self.handle.readline() 190 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 191 line = self.handle.readline() 192 else: 193 # Build up a list of the lines making up this feature: 194 if line[self.FEATURE_QUALIFIER_INDENT] != " " \ 195 and " " in line[self.FEATURE_QUALIFIER_INDENT:]: 196 # The feature table design enforces a length limit on the feature keys. 197 # Some third party files (e.g. IGMT's EMBL like files) solve this by 198 # over indenting the location and qualifiers. 199 feature_key, line = line[2:].strip().split(None, 1) 200 feature_lines = [line] 201 warnings.warn("Overindented %s feature?" % feature_key, 202 BiopythonParserWarning) 203 else: 204 feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip() 205 feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]] 206 line = self.handle.readline() 207 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 208 or (line != '' and line.rstrip() == ""): # cope with blank lines in the midst of a feature 209 # Use strip to remove any harmless trailing white space AND and leading 210 # white space (e.g. out of spec files with too much indentation) 211 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 212 line = self.handle.readline() 213 features.append(self.parse_feature(feature_key, feature_lines)) 214 self.line = line 215 return features
216
217 - def parse_feature(self, feature_key, lines):
218 r"""Parse a feature given as a list of strings into a tuple. 219 220 Expects a feature as a list of strings, returns a tuple (key, location, 221 qualifiers) 222 223 For example given this GenBank feature:: 224 225 CDS complement(join(490883..490885,1..879)) 226 /locus_tag="NEQ001" 227 /note="conserved hypothetical [Methanococcus jannaschii]; 228 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 229 localization signal; IPR002743: Protein of unknown 230 function DUF57" 231 /codon_start=1 232 /transl_table=11 233 /product="hypothetical protein" 234 /protein_id="NP_963295.1" 235 /db_xref="GI:41614797" 236 /db_xref="GeneID:2732620" 237 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 238 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 239 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 240 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 241 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 242 LNSMGFGFVNTKKNSAR" 243 244 Then should give input key="CDS" and the rest of the data as a list of strings 245 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 246 where the leading spaces and trailing newlines have been removed. 247 248 Returns tuple containing: (key as string, location string, qualifiers as list) 249 as follows for this example: 250 251 key = "CDS", string 252 location = "complement(join(490883..490885,1..879))", string 253 qualifiers = list of string tuples: 254 255 [('locus_tag', '"NEQ001"'), 256 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 257 ('codon_start', '1'), 258 ('transl_table', '11'), 259 ('product', '"hypothetical protein"'), 260 ('protein_id', '"NP_963295.1"'), 261 ('db_xref', '"GI:41614797"'), 262 ('db_xref', '"GeneID:2732620"'), 263 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 264 265 In the above example, the "note" and "translation" were edited for compactness, 266 and they would contain multiple new line characters (displayed above as \n) 267 268 If a qualifier is quoted (in this case, everything except codon_start and 269 transl_table) then the quotes are NOT removed. 270 271 Note that no whitespace is removed. 272 """ 273 # Skip any blank lines 274 iterator = (x for x in lines if x) 275 try: 276 line = next(iterator) 277 278 feature_location = line.strip() 279 while feature_location[-1:] == ",": 280 # Multiline location, still more to come! 281 line = next(iterator) 282 feature_location += line.strip() 283 if feature_location.count("(") > feature_location.count(")"): 284 # Including the prev line in warning would be more explicit, 285 # but this way get one-and-only-one warning shown by default: 286 warnings.warn("Non-standard feature line wrapping (didn't break on comma)?", 287 BiopythonParserWarning) 288 while feature_location[-1:] == "," or feature_location.count("(") > feature_location.count(")"): 289 line = next(iterator) 290 feature_location += line.strip() 291 292 qualifiers = [] 293 294 for line_number, line in enumerate(iterator): 295 # check for extra wrapping of the location closing parentheses 296 if line_number == 0 and line.startswith(")"): 297 feature_location += line.strip() 298 elif line[0] == "/": 299 # New qualifier 300 i = line.find("=") 301 key = line[1:i] # does not work if i==-1 302 value = line[i + 1:] # we ignore 'value' if i==-1 303 if i == -1: 304 # Qualifier with no key, e.g. /pseudo 305 key = line[1:] 306 qualifiers.append((key, None)) 307 elif not value: 308 # ApE can output /note= 309 qualifiers.append((key, "")) 310 elif value == '"': 311 # One single quote 312 if self.debug: 313 print("Single quote %s:%s" % (key, value)) 314 # DO NOT remove the quote... 315 qualifiers.append((key, value)) 316 elif value[0] == '"': 317 # Quoted... 318 value_list = [value] 319 while value_list[-1][-1] != '"': 320 value_list.append(next(iterator)) 321 value = '\n'.join(value_list) 322 # DO NOT remove the quotes... 323 qualifiers.append((key, value)) 324 else: 325 # Unquoted 326 # if debug : print("Unquoted line %s:%s" % (key,value)) 327 qualifiers.append((key, value)) 328 else: 329 # Unquoted continuation 330 assert len(qualifiers) > 0 331 assert key == qualifiers[-1][0] 332 # if debug : print("Unquoted Cont %s:%s" % (key, line)) 333 if qualifiers[-1][1] is None: 334 raise StopIteration 335 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 336 return feature_key, feature_location, qualifiers 337 except StopIteration: 338 # Bummer 339 raise ValueError("Problem with '%s' feature:\n%s" 340 % (feature_key, "\n".join(lines)))
341 364
365 - def _feed_first_line(self, consumer, line):
366 """Handle the LOCUS/ID line, passing data to the comsumer (PRIVATE). 367 368 This should be implemented by the EMBL / GenBank specific subclass 369 370 Used by the parse_records() and parse() methods. 371 """ 372 pass
373
374 - def _feed_header_lines(self, consumer, lines):
375 """Handle the header lines (list of strings), passing data to the comsumer (PRIVATE). 376 377 This should be implemented by the EMBL / GenBank specific subclass 378 379 Used by the parse_records() and parse() methods. 380 """ 381 pass
382 383 @staticmethod
384 - def _feed_feature_table(consumer, feature_tuples):
385 """Handle the feature table (list of tuples), passing data to the comsumer (PRIVATE). 386 387 Used by the parse_records() and parse() methods. 388 """ 389 consumer.start_feature_table() 390 for feature_key, location_string, qualifiers in feature_tuples: 391 consumer.feature_key(feature_key) 392 consumer.location(location_string) 393 for q_key, q_value in qualifiers: 394 if q_value is None: 395 consumer.feature_qualifier(q_key, q_value) 396 else: 397 consumer.feature_qualifier(q_key, q_value.replace("\n", " "))
398
399 - def _feed_misc_lines(self, consumer, lines):
400 """Handle any lines between features and sequence (list of strings), passing data to the consumer (PRIVATE). 401 402 This should be implemented by the EMBL / GenBank specific subclass 403 404 Used by the parse_records() and parse() methods. 405 """ 406 pass
407
408 - def feed(self, handle, consumer, do_features=True):
409 """Feed a set of data into the consumer. 410 411 This method is intended for use with the "old" code in Bio.GenBank 412 413 Arguments: 414 - handle - A handle with the information to parse. 415 - consumer - The consumer that should be informed of events. 416 - do_features - Boolean, should the features be parsed? 417 Skipping the features can be much faster. 418 419 Return values: 420 - true - Passed a record 421 - false - Did not find a record 422 423 """ 424 # Should work with both EMBL and GenBank files provided the 425 # equivalent Bio.GenBank._FeatureConsumer methods are called... 426 self.set_handle(handle) 427 if not self.find_start(): 428 # Could not find (another) record 429 consumer.data = None 430 return False 431 432 # We use the above class methods to parse the file into a simplified format. 433 # The first line, header lines and any misc lines after the features will be 434 # dealt with by GenBank / EMBL specific derived classes. 435 436 # First line and header: 437 self._feed_first_line(consumer, self.line) 438 self._feed_header_lines(consumer, self.parse_header()) 439 440 # Features (common to both EMBL and GenBank): 441 if do_features: 442 self._feed_feature_table(consumer, self.parse_features(skip=False)) 443 else: 444 self.parse_features(skip=True) # ignore the data 445 446 # Footer and sequence 447 misc_lines, sequence_string = self.parse_footer() 448 self._feed_misc_lines(consumer, misc_lines) 449 450 consumer.sequence(sequence_string) 451 # Calls to consumer.base_number() do nothing anyway 452 consumer.record_end("//") 453 454 assert self.line == "//" 455 456 # And we are done 457 return True
458
459 - def parse(self, handle, do_features=True):
460 """Return a SeqRecord (with SeqFeatures if do_features=True). 461 462 See also the method parse_records() for use on multi-record files. 463 """ 464 from Bio.GenBank import _FeatureConsumer 465 from Bio.GenBank.utils import FeatureValueCleaner 466 467 consumer = _FeatureConsumer(use_fuzziness=1, 468 feature_cleaner=FeatureValueCleaner()) 469 470 if self.feed(handle, consumer, do_features): 471 return consumer.data 472 else: 473 return None
474
475 - def parse_records(self, handle, do_features=True):
476 """Parse records, return a SeqRecord object iterator. 477 478 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord 479 480 The SeqRecord objects include SeqFeatures if do_features=True 481 482 This method is intended for use in Bio.SeqIO 483 """ 484 # This is a generator function 485 while True: 486 record = self.parse(handle, do_features) 487 if record is None: 488 break 489 if record.id is None: 490 raise ValueError("Failed to parse the record's ID. Invalid ID line?") 491 if record.name == "<unknown name>": 492 raise ValueError("Failed to parse the record's name. Invalid ID line?") 493 if record.description == "<unknown description>": 494 raise ValueError("Failed to parse the record's description") 495 yield record
496
497 - def parse_cds_features(self, handle, 498 alphabet=generic_protein, 499 tags2id=('protein_id', 'locus_tag', 'product')):
500 """Parse CDS features, return SeqRecord object iterator. 501 502 Each CDS feature becomes a SeqRecord. 503 504 Arguments: 505 - alphabet - Used for any sequence found in a translation field. 506 - tags2id - Tupple of three strings, the feature keys to use 507 for the record id, name and description, 508 509 This method is intended for use in Bio.SeqIO 510 """ 511 self.set_handle(handle) 512 while self.find_start(): 513 # Got an EMBL or GenBank record... 514 self.parse_header() # ignore header lines! 515 feature_tuples = self.parse_features() 516 # self.parse_footer() # ignore footer lines! 517 while True: 518 line = self.handle.readline() 519 if not line: 520 break 521 if line[:2] == "//": 522 break 523 self.line = line.rstrip() 524 525 # Now go though those features... 526 for key, location_string, qualifiers in feature_tuples: 527 if key == "CDS": 528 # Create SeqRecord 529 # ================ 530 # SeqRecord objects cannot be created with annotations, they 531 # must be added afterwards. So create an empty record and 532 # then populate it: 533 record = SeqRecord(seq=None) 534 annotations = record.annotations 535 536 # Should we add a location object to the annotations? 537 # I *think* that only makes sense for SeqFeatures with their 538 # sub features... 539 annotations['raw_location'] = location_string.replace(' ', '') 540 541 for (qualifier_name, qualifier_data) in qualifiers: 542 if qualifier_data is not None \ 543 and qualifier_data[0] == '"' and qualifier_data[-1] == '"': 544 # Remove quotes 545 qualifier_data = qualifier_data[1:-1] 546 # Append the data to the annotation qualifier... 547 if qualifier_name == "translation": 548 assert record.seq is None, "Multiple translations!" 549 record.seq = Seq(qualifier_data.replace("\n", ""), alphabet) 550 elif qualifier_name == "db_xref": 551 # its a list, possibly empty. Its safe to extend 552 record.dbxrefs.append(qualifier_data) 553 else: 554 if qualifier_data is not None: 555 qualifier_data = qualifier_data.replace("\n", " ").replace(" ", " ") 556 try: 557 annotations[qualifier_name] += " " + qualifier_data 558 except KeyError: 559 # Not an addition to existing data, its the first bit 560 annotations[qualifier_name] = qualifier_data 561 562 # Fill in the ID, Name, Description 563 # ================================= 564 try: 565 record.id = annotations[tags2id[0]] 566 except KeyError: 567 pass 568 try: 569 record.name = annotations[tags2id[1]] 570 except KeyError: 571 pass 572 try: 573 record.description = annotations[tags2id[2]] 574 except KeyError: 575 pass 576 577 yield record
578
579 580 -class EmblScanner(InsdcScanner):
581 """For extracting chunks of information in EMBL files.""" 582 583 RECORD_START = "ID " 584 HEADER_WIDTH = 5 585 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", "FH"] 586 FEATURE_END_MARKERS = ["XX"] # XX can also mark the end of many things! 587 FEATURE_QUALIFIER_INDENT = 21 588 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2) 589 SEQUENCE_HEADERS = ["SQ", "CO"] # Remove trailing spaces 590 591 EMBL_INDENT = HEADER_WIDTH 592 EMBL_SPACER = " " * EMBL_INDENT 593 630
631 - def _feed_first_line(self, consumer, line):
632 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 633 if line[self.HEADER_WIDTH:].count(";") == 6: 634 # Looks like the semi colon separated style introduced in 2006 635 self._feed_first_line_new(consumer, line) 636 elif line[self.HEADER_WIDTH:].count(";") == 3: 637 if line.rstrip().endswith(" SQ"): 638 # EMBL-bank patent data 639 self._feed_first_line_patents(consumer, line) 640 else: 641 # Looks like the pre 2006 style 642 self._feed_first_line_old(consumer, line) 643 elif line[self.HEADER_WIDTH:].count(";") == 2: 644 # Looks like KIKO patent data 645 self._feed_first_line_patents_kipo(consumer, line) 646 else: 647 raise ValueError('Did not recognise the ID line layout:\n' + line)
648
649 - def _feed_first_line_patents(self, consumer, line):
650 # Old style EMBL patent records where ID line ended SQ 651 # Not 100% sure that PRT here is really molecule type and 652 # not the data file division... 653 # 654 # Either Non-Redundant Level 1 database records, 655 # ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1> 656 # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ 657 # 658 # Or, Non-Redundant Level 2 database records: 659 # ID <L2-accession>; <molecule type>; <non-redundant level 2>; <cluster size L2> 660 # e.g. ID NRP0000016E; PRT; NR2; 5 SQ 661 # e.g. ID NRP_AX000635; PRT; NR1; 15 SQ 662 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip()[:-3].split(";")] 663 assert len(fields) == 4 664 consumer.locus(fields[0]) 665 consumer.residue_type(fields[1]) # semi-redundant 666 consumer.data_file_division(fields[2])
667 # TODO - Record cluster size? 668
669 - def _feed_first_line_patents_kipo(self, consumer, line):
670 # EMBL format patent sequence from KIPO, e.g. 671 # ftp://ftp.ebi.ac.uk/pub/databases/patentdata/kipo_prt.dat.gz 672 # 673 # e.g. ID DI500001 STANDARD; PRT; 111 AA. 674 # 675 # This follows the style of _feed_first_line_old 676 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 677 fields = [line[self.HEADER_WIDTH:].split(None, 1)[0]] 678 fields.extend(line[self.HEADER_WIDTH:].split(None, 1)[1].split(";")) 679 fields = [entry.strip() for entry in fields] 680 """ 681 The tokens represent: 682 683 0. Primary accession number 684 (space sep) 685 1. ??? (e.g. standard) 686 (semi-colon) 687 2. Molecule type (protein)? Division? Always 'PRT' 688 3. Sequence length (e.g. '111 AA.') 689 """ 690 consumer.locus(fields[0]) # Should we also call the accession consumer? 691 # consumer.molecule_type(fields[2]) 692 self._feed_seq_length(consumer, fields[3])
693
694 - def _feed_first_line_old(self, consumer, line):
695 # Expects an ID line in the style before 2006, e.g. 696 # ID SC10H5 standard; DNA; PRO; 4870 BP. 697 # ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. 698 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 699 fields = [line[self.HEADER_WIDTH:].split(None, 1)[0]] 700 fields.extend(line[self.HEADER_WIDTH:].split(None, 1)[1].split(";")) 701 fields = [entry.strip() for entry in fields] 702 """ 703 The tokens represent: 704 705 0. Primary accession number 706 (space sep) 707 1. ??? (e.g. standard) 708 (semi-colon) 709 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') 710 3. Taxonomic division (e.g. 'PRO') 711 4. Sequence length (e.g. '4639675 BP.') 712 713 """ 714 consumer.locus(fields[0]) # Should we also call the accession consumer? 715 consumer.residue_type(fields[2]) 716 if "circular" in fields[2]: 717 consumer.topology("circular") 718 consumer.molecule_type(fields[2].replace("circular", "").strip()) 719 elif "linear" in fields[2]: 720 consumer.topology("linear") 721 consumer.molecule_type(fields[2].replace("linear", "").strip()) 722 else: 723 consumer.molecule_type(fields[2].strip()) 724 consumer.data_file_division(fields[3]) 725 self._feed_seq_length(consumer, fields[4])
726
727 - def _feed_first_line_new(self, consumer, line):
728 # Expects an ID line in the style introduced in 2006, e.g. 729 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 730 # ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. 731 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 732 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 733 assert len(fields) == 7 734 """ 735 The tokens represent: 736 737 0. Primary accession number 738 1. Sequence version number 739 2. Topology: 'circular' or 'linear' 740 3. Molecule type (e.g. 'genomic DNA') 741 4. Data class (e.g. 'STD') 742 5. Taxonomic division (e.g. 'PRO') 743 6. Sequence length (e.g. '4639675 BP.') 744 745 """ 746 747 consumer.locus(fields[0]) 748 749 # Call the accession consumer now, to make sure we record 750 # something as the record.id, in case there is no AC line 751 consumer.accession(fields[0]) 752 753 # TODO - How to deal with the version field? At the moment the consumer 754 # will try and use this for the ID which isn't ideal for EMBL files. 755 version_parts = fields[1].split() 756 if len(version_parts) == 2 \ 757 and version_parts[0] == "SV" \ 758 and version_parts[1].isdigit(): 759 consumer.version_suffix(version_parts[1]) 760 761 # Based on how the old GenBank parser worked, merge these two: 762 consumer.residue_type(" ".join(fields[2:4])) # Semi-obsolete 763 764 consumer.topology(fields[2]) 765 consumer.molecule_type(fields[3]) 766 767 # consumer.xxx(fields[4]) # TODO - What should we do with the data class? 768 769 consumer.data_file_division(fields[5]) 770 771 self._feed_seq_length(consumer, fields[6])
772 773 @staticmethod
774 - def _feed_seq_length(consumer, text):
775 length_parts = text.split() 776 assert len(length_parts) == 2, "Invalid sequence length string %r" % text 777 assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."] 778 consumer.size(length_parts[0])
779
780 - def _feed_header_lines(self, consumer, lines):
781 consumer_dict = { 782 'AC': 'accession', 783 'SV': 'version', # SV line removed in June 2006, now part of ID line 784 'DE': 'definition', 785 # 'RN' : 'reference_num', 786 # 'RC' : reference comment... TODO 787 # 'RP' : 'reference_bases', 788 # 'RX' : reference cross reference... DOI or Pubmed 789 'RG': 'consrtm', # optional consortium 790 # 'RA' : 'authors', 791 # 'RT' : 'title', 792 'RL': 'journal', 793 'OS': 'organism', 794 'OC': 'taxonomy', 795 # 'DR' : data reference 796 'CC': 'comment', 797 # 'XX' : splitter 798 } 799 # We have to handle the following specially: 800 # RX (depending on reference type...) 801 for line in lines: 802 line_type = line[:self.EMBL_INDENT].strip() 803 data = line[self.EMBL_INDENT:].strip() 804 if line_type == 'XX': 805 pass 806 elif line_type == 'RN': 807 # Reformat reference numbers for the GenBank based consumer 808 # e.g. '[1]' becomes '1' 809 if data[0] == "[" and data[-1] == "]": 810 data = data[1:-1] 811 consumer.reference_num(data) 812 elif line_type == 'RP': 813 if data.strip() == "[-]": 814 # Patent EMBL files from KIPO just use: RN [-] 815 pass 816 else: 817 # Reformat reference numbers for the GenBank based consumer 818 # e.g. '1-4639675' becomes '(bases 1 to 4639675)' 819 # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)' 820 # Note could be multi-line, and end with a comma 821 parts = [bases.replace("-", " to ").strip() for bases in data.split(",") if bases.strip()] 822 consumer.reference_bases("(bases %s)" % "; ".join(parts)) 823 elif line_type == 'RT': 824 # Remove the enclosing quotes and trailing semi colon. 825 # Note the title can be split over multiple lines. 826 if data.startswith('"'): 827 data = data[1:] 828 if data.endswith('";'): 829 data = data[:-2] 830 consumer.title(data) 831 elif line_type == 'RX': 832 # EMBL support three reference types at the moment: 833 # - PUBMED PUBMED bibliographic database (NLM) 834 # - DOI Digital Object Identifier (International DOI Foundation) 835 # - AGRICOLA US National Agriculture Library (NAL) of the US Department 836 # of Agriculture (USDA) 837 # 838 # Format: 839 # RX resource_identifier; identifier. 840 # 841 # e.g. 842 # RX DOI; 10.1016/0024-3205(83)90010-3. 843 # RX PUBMED; 264242. 844 # 845 # Currently our reference object only supports PUBMED and MEDLINE 846 # (as these were in GenBank files?). 847 key, value = data.split(";", 1) 848 if value.endswith("."): 849 value = value[:-1] 850 value = value.strip() 851 if key == "PUBMED": 852 consumer.pubmed_id(value) 853 # TODO - Handle other reference types (here and in BioSQL bindings) 854 elif line_type == 'CC': 855 # Have to pass a list of strings for this one (not just a string) 856 consumer.comment([data]) 857 elif line_type == 'DR': 858 # Database Cross-reference, format: 859 # DR database_identifier; primary_identifier; secondary_identifier. 860 # 861 # e.g. 862 # DR MGI; 98599; Tcrb-V4. 863 # 864 # TODO - How should we store any secondary identifier? 865 parts = data.rstrip(".").split(";") 866 # Turn it into "database_identifier:primary_identifier" to 867 # mimic the GenBank parser. e.g. "MGI:98599" 868 consumer.dblink("%s:%s" % (parts[0].strip(), 869 parts[1].strip())) 870 elif line_type == 'RA': 871 # Remove trailing ; at end of authors list 872 consumer.authors(data.rstrip(";")) 873 elif line_type == 'PR': 874 # In the EMBL patent files, this is a PR (PRiority) line which 875 # provides the earliest active priority within the family. 876 # The priority number comes first, followed by the priority date. 877 # 878 # e.g. 879 # PR JP19990377484 16-DEC-1999 880 # 881 # However, in most EMBL files this is a PR (PRoject) line which 882 # gives the BioProject reference number. 883 # 884 # e.g. 885 # PR Project:PRJNA60715; 886 # 887 # In GenBank files this corresponds to the old PROJECT line 888 # which was later replaced with the DBLINK line. 889 if data.startswith("Project:"): 890 # Remove trailing ; at end of the project reference 891 consumer.project(data.rstrip(";")) 892 elif line_type == 'KW': 893 consumer.keywords(data.rstrip(";")) 894 elif line_type in consumer_dict: 895 # Its a semi-automatic entry! 896 getattr(consumer, consumer_dict[line_type])(data) 897 else: 898 if self.debug: 899 print("Ignoring EMBL header line:\n%s" % line)
900
901 - def _feed_misc_lines(self, consumer, lines):
902 # TODO - Should we do something with the information on the SQ line(s)? 903 lines.append("") 904 line_iter = iter(lines) 905 try: 906 for line in line_iter: 907 if line.startswith("CO "): 908 line = line[5:].strip() 909 contig_location = line 910 while True: 911 line = next(line_iter) 912 if not line: 913 break 914 elif line.startswith("CO "): 915 # Don't need to preseve the whitespace here. 916 contig_location += line[5:].strip() 917 else: 918 raise ValueError('Expected CO (contig) continuation line, got:\n' + line) 919 consumer.contig_location(contig_location) 920 if line.startswith("SQ Sequence "): 921 # e.g. 922 # SQ Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other; 923 # 924 # Or, EMBL-bank patent, e.g. 925 # SQ Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5; 926 self._feed_seq_length(consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0]) 927 # TODO - Record the checksum etc? 928 return 929 except StopIteration: 930 raise ValueError("Problem in misc lines before sequence")
931
932 933 -class _ImgtScanner(EmblScanner):
934 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE). 935 936 IMGT files are like EMBL files but in order to allow longer feature types 937 the features should be indented by 25 characters not 21 characters. In 938 practice the IMGT flat files tend to use either 21 or 25 characters, so we 939 must cope with both. 940 941 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank. 942 """ 943 944 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", 945 "FH Key Location/Qualifiers (from EMBL)", 946 "FH Key Location/Qualifiers", 947 "FH"] 948
949 - def _feed_first_line(self, consumer, line):
950 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 951 if line[self.HEADER_WIDTH:].count(";") != 5: 952 # Assume its an older EMBL-like line, 953 return EmblScanner._feed_first_line(self, consumer, line) 954 # Otherwise assume its the new (circa 2016) IMGT style 955 # as used in the IPD-IMGT/HLA Database 956 # 957 # https://github.com/ANHIG/IMGTHLA/ 958 # 959 # The key changes post 3.16 are the addition of an SV value 960 # to the ID line, these additions should make the format more 961 # similar to the ENA style. 962 # 963 # ID HLA00001 standard; DNA; HUM; 3503 BP. 964 # 965 # becomes 966 # 967 # ID HLA00001; SV 1; standard; DNA; HUM; 3503 BP. 968 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 969 assert len(fields) == 6 970 """ 971 The tokens represent: 972 973 0. Primary accession number (eg 'HLA00001') 974 1. Sequence version number (eg 'SV 1') 975 2. ??? eg 'standard' 976 3. Molecule type (e.g. 'DNA') 977 4. Taxonomic division (e.g. 'HUM') 978 5. Sequence length (e.g. '3503 BP.') 979 """ 980 consumer.locus(fields[0]) 981 982 # See TODO on the EMBL _feed_first_line_new about version field 983 version_parts = fields[1].split() 984 if len(version_parts) == 2 \ 985 and version_parts[0] == "SV" \ 986 and version_parts[1].isdigit(): 987 consumer.version_suffix(version_parts[1]) 988 989 consumer.residue_type(fields[3]) 990 if "circular" in fields[3]: 991 consumer.topology("circular") 992 consumer.molecule_type(fields[3].replace("circular", "").strip()) 993 elif "linear" in fields[3]: 994 consumer.topology("linear") 995 consumer.molecule_type(fields[3].replace("linear", "").strip()) 996 else: 997 consumer.molecule_type(fields[3].strip()) 998 consumer.data_file_division(fields[4]) 999 self._feed_seq_length(consumer, fields[5])
1000
1001 - def parse_features(self, skip=False):
1002 """Return list of tuples for the features (if present). 1003 1004 Each feature is returned as a tuple (key, location, qualifiers) 1005 where key and location are strings (e.g. "CDS" and 1006 "complement(join(490883..490885,1..879))") while qualifiers 1007 is a list of two string tuples (feature qualifier keys and values). 1008 1009 Assumes you have already read to the start of the features table. 1010 """ 1011 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 1012 if self.debug: 1013 print("Didn't find any feature table") 1014 return [] 1015 1016 while self.line.rstrip() in self.FEATURE_START_MARKERS: 1017 self.line = self.handle.readline() 1018 1019 bad_position_re = re.compile(r'([0-9]+)>') 1020 1021 features = [] 1022 line = self.line 1023 while True: 1024 if not line: 1025 raise ValueError("Premature end of line during features table") 1026 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 1027 if self.debug: 1028 print("Found start of sequence") 1029 break 1030 line = line.rstrip() 1031 if line == "//": 1032 raise ValueError("Premature end of features table, marker '//' found") 1033 if line in self.FEATURE_END_MARKERS: 1034 if self.debug: 1035 print("Found end of features") 1036 line = self.handle.readline() 1037 break 1038 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 1039 # This is an empty feature line between qualifiers. Empty 1040 # feature lines within qualifiers are handled below (ignored). 1041 line = self.handle.readline() 1042 continue 1043 1044 if skip: 1045 line = self.handle.readline() 1046 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 1047 line = self.handle.readline() 1048 else: 1049 assert line[:2] == "FT" 1050 try: 1051 feature_key, location_start = line[2:].strip().split() 1052 except ValueError: 1053 # e.g. "FT TRANSMEMBRANE-REGION2163..2240\n" 1054 # Assume indent of 25 as per IMGT spec, with the location 1055 # start in column 26 (one-based). 1056 feature_key = line[2:25].strip() 1057 location_start = line[25:].strip() 1058 feature_lines = [location_start] 1059 line = self.handle.readline() 1060 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 1061 or line.rstrip() == "": # cope with blank lines in the midst of a feature 1062 # Use strip to remove any harmless trailing white space AND and leading 1063 # white space (copes with 21 or 26 indents and orther variants) 1064 assert line[:2] == "FT" 1065 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 1066 line = self.handle.readline() 1067 feature_key, location, qualifiers = \ 1068 self.parse_feature(feature_key, feature_lines) 1069 # Try to handle known problems with IMGT locations here: 1070 if ">" in location: 1071 # Nasty hack for common IMGT bug, should be >123 not 123> 1072 # in a location string. At least here the meaning is clear, 1073 # and since it is so common I don't want to issue a warning 1074 # warnings.warn("Feature location %s is invalid, " 1075 # "moving greater than sign before position" 1076 # % location, BiopythonParserWarning) 1077 location = bad_position_re.sub(r'>\1', location) 1078 features.append((feature_key, location, qualifiers)) 1079 self.line = line 1080 return features
1081
1082 1083 -class GenBankScanner(InsdcScanner):
1084 """For extracting chunks of information in GenBank files.""" 1085 1086 RECORD_START = "LOCUS " 1087 HEADER_WIDTH = 12 1088 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"] 1089 FEATURE_END_MARKERS = [] 1090 FEATURE_QUALIFIER_INDENT = 21 1091 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 1092 SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 1093 1094 GENBANK_INDENT = HEADER_WIDTH 1095 GENBANK_SPACER = " " * GENBANK_INDENT 1096 1097 STRUCTURED_COMMENT_START = "-START##" 1098 STRUCTURED_COMMENT_END = "-END##" 1099 STRUCTURED_COMMENT_DELIM = " :: " 1100 1153
1154 - def _feed_first_line(self, consumer, line):
1155 """Scan over and parse GenBank LOCUS line (PRIVATE). 1156 1157 This must cope with several variants, primarily the old and new column 1158 based standards from GenBank. Additionally EnsEMBL produces GenBank 1159 files where the LOCUS line is space separated rather that following 1160 the column based layout. 1161 1162 We also try to cope with GenBank like files with partial LOCUS lines. 1163 """ 1164 ##################################### 1165 # LOCUS line # 1166 ##################################### 1167 assert line[0:self.GENBANK_INDENT] == 'LOCUS ', \ 1168 'LOCUS line does not start correctly:\n' + line 1169 1170 # Have to break up the locus line, and handle the different bits of it. 1171 # There are at least two different versions of the locus line... 1172 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ': 1173 # Old... note we insist on the 55:62 being empty to avoid trying 1174 # to parse space separated LOCUS lines from Ensembl etc, see below. 1175 # 1176 # Positions Contents 1177 # --------- -------- 1178 # 00:06 LOCUS 1179 # 06:12 spaces 1180 # 12:?? Locus name 1181 # ??:?? space 1182 # ??:29 Length of sequence, right-justified 1183 # 29:33 space, bp, space 1184 # 33:41 strand type / molecule type, e.g. DNA 1185 # 41:42 space 1186 # 42:51 Blank (implies linear), linear or circular 1187 # 51:52 space 1188 # 52:55 The division code (e.g. BCT, VRL, INV) 1189 # 55:62 space 1190 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1191 # 1192 # assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 1193 # 'LOCUS line does not contain size units at expected position:\n' + line 1194 assert line[41:42] == ' ', \ 1195 'LOCUS line does not contain space at position 42:\n' + line 1196 assert line[42:51].strip() in ['', 'linear', 'circular'], \ 1197 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1198 assert line[51:52] == ' ', \ 1199 'LOCUS line does not contain space at position 52:\n' + line 1200 # assert line[55:62] == ' ', \ 1201 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 1202 if line[62:73].strip(): 1203 assert line[64:65] == '-', \ 1204 'LOCUS line does not contain - at position 65 in date:\n' + line 1205 assert line[68:69] == '-', \ 1206 'LOCUS line does not contain - at position 69 in date:\n' + line 1207 1208 name_and_length_str = line[self.GENBANK_INDENT:29] 1209 while ' ' in name_and_length_str: 1210 name_and_length_str = name_and_length_str.replace(' ', ' ') 1211 name_and_length = name_and_length_str.split(' ') 1212 assert len(name_and_length) <= 2, \ 1213 'Cannot parse the name and length in the LOCUS line:\n' + line 1214 assert len(name_and_length) != 1, \ 1215 'Name and length collide in the LOCUS line:\n' + line 1216 # Should be possible to split them based on position, if 1217 # a clear definition of the standard exists THAT AGREES with 1218 # existing files. 1219 name, length = name_and_length 1220 if len(name) > 16: 1221 # As long as the sequence is short, can steal its leading spaces 1222 # to extend the name over the current 16 character limit. 1223 # However, that deserves a warning as it is out of spec. 1224 warnings.warn("GenBank LOCUS line identifier over 16 characters", 1225 BiopythonParserWarning) 1226 consumer.locus(name) 1227 consumer.size(length) 1228 # consumer.residue_type(line[33:41].strip()) 1229 1230 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1231 # Amino acids -> protein (even if there is no residue type given) 1232 # We want to use a protein alphabet in this case, rather than a 1233 # generic one. Not sure if this is the best way to achieve this, 1234 # but it works because the scanner checks for this: 1235 consumer.residue_type("PROTEIN") 1236 else: 1237 consumer.residue_type(line[33:51].strip()) 1238 1239 consumer.molecule_type(line[33:41].strip()) 1240 consumer.topology(line[42:51].strip()) 1241 consumer.data_file_division(line[52:55]) 1242 if line[62:73].strip(): 1243 consumer.date(line[62:73]) 1244 elif line[40:44] in [' bp ', ' aa ', ' rc '] \ 1245 and line[54:64].strip() in ['', 'linear', 'circular']: 1246 # New... linear/circular/big blank test should avoid EnsEMBL style 1247 # LOCUS line being treated like a proper column based LOCUS line. 1248 # 1249 # Positions Contents 1250 # --------- -------- 1251 # 00:06 LOCUS 1252 # 06:12 spaces 1253 # 12:?? Locus name 1254 # ??:?? space 1255 # ??:40 Length of sequence, right-justified 1256 # 40:44 space, bp, space 1257 # 44:47 Blank, ss-, ds-, ms- 1258 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1259 # 54:55 space 1260 # 55:63 Blank (implies linear), linear or circular 1261 # 63:64 space 1262 # 64:67 The division code (e.g. BCT, VRL, INV) 1263 # 67:68 space 1264 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1265 # 1266 assert line[40:44] in [' bp ', ' aa ', ' rc '], \ 1267 'LOCUS line does not contain size units at expected position:\n' + line 1268 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1269 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1270 assert line[47:54].strip() == "" \ 1271 or 'DNA' in line[47:54].strip().upper() \ 1272 or 'RNA' in line[47:54].strip().upper(), \ 1273 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1274 assert line[54:55] == ' ', \ 1275 'LOCUS line does not contain space at position 55:\n' + line 1276 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 1277 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1278 assert line[63:64] == ' ', \ 1279 'LOCUS line does not contain space at position 64:\n' + line 1280 assert line[67:68] == ' ', \ 1281 'LOCUS line does not contain space at position 68:\n' + line 1282 if line[68:79].strip(): 1283 assert line[70:71] == '-', \ 1284 'LOCUS line does not contain - at position 71 in date:\n' + line 1285 assert line[74:75] == '-', \ 1286 'LOCUS line does not contain - at position 75 in date:\n' + line 1287 1288 name_and_length_str = line[self.GENBANK_INDENT:40] 1289 while ' ' in name_and_length_str: 1290 name_and_length_str = name_and_length_str.replace(' ', ' ') 1291 name_and_length = name_and_length_str.split(' ') 1292 assert len(name_and_length) <= 2, \ 1293 'Cannot parse the name and length in the LOCUS line:\n' + line 1294 assert len(name_and_length) != 1, \ 1295 'Name and length collide in the LOCUS line:\n' + line 1296 # Should be possible to split them based on position, if 1297 # a clear definition of the stand exists THAT AGREES with 1298 # existing files. 1299 consumer.locus(name_and_length[0]) 1300 consumer.size(name_and_length[1]) 1301 1302 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1303 # Amino acids -> protein (even if there is no residue type given) 1304 # We want to use a protein alphabet in this case, rather than a 1305 # generic one. Not sure if this is the best way to achieve this, 1306 # but it works because the scanner checks for this: 1307 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1308 else: 1309 consumer.residue_type(line[44:63].strip()) 1310 1311 consumer.molecule_type(line[44:54].strip()) 1312 consumer.topology(line[55:63].strip()) 1313 consumer.data_file_division(line[64:67]) 1314 if line[68:79].strip(): 1315 consumer.date(line[68:79]) 1316 elif line[self.GENBANK_INDENT:].strip().count(" ") == 0: 1317 # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1318 # 1319 # e.g. 1320 # 1321 # "LOCUS U00096" 1322 # 1323 # rather than: 1324 # 1325 # "LOCUS U00096 4639675 bp DNA circular BCT" 1326 # 1327 # Positions Contents 1328 # --------- -------- 1329 # 00:06 LOCUS 1330 # 06:12 spaces 1331 # 12:?? Locus name 1332 if line[self.GENBANK_INDENT:].strip() != "": 1333 consumer.locus(line[self.GENBANK_INDENT:].strip()) 1334 else: 1335 # Must just have just "LOCUS ", is this even legitimate? 1336 # We should be able to continue parsing... we need real world testcases! 1337 warnings.warn("Minimal LOCUS line found - is this " 1338 "correct?\n:%r" % line, BiopythonParserWarning) 1339 elif len(line.split()) == 8 and line.split()[3] in ("aa", "bp") \ 1340 and line.split()[5] in ('linear', 'circular'): 1341 # Cope with invalidly spaced GenBank LOCUS lines like 1342 # LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001 1343 splitline = line.split() 1344 consumer.locus(splitline[1]) 1345 consumer.size(splitline[2]) 1346 consumer.residue_type(splitline[4]) 1347 consumer.topology(splitline[5]) 1348 consumer.data_file_division(splitline[6]) 1349 consumer.date(splitline[7]) 1350 warnings.warn("Attempting to parse malformed locus line:\n%r\n" 1351 "Found locus %r size %r residue_type %r\n" 1352 "Some fields may be wrong." 1353 % (line, splitline[1], splitline[2], splitline[4]), 1354 BiopythonParserWarning) 1355 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: 1356 # Cope with EnsEMBL genbank files which use space separation rather 1357 # than the expected column based layout. e.g. 1358 # LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1359 # LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1360 # LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1361 # LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1362 # Notice that the 'bp' can occur in the position expected by either 1363 # the old or the new fixed column standards (parsed above). 1364 splitline = line.split() 1365 consumer.locus(splitline[1]) 1366 consumer.size(splitline[2]) 1367 consumer.residue_type(splitline[4]) 1368 consumer.data_file_division(splitline[5]) 1369 consumer.date(splitline[6]) 1370 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: 1371 # Cope with EMBOSS seqret output where it seems the locus id can cause 1372 # the other fields to overflow. We just IGNORE the other fields! 1373 warnings.warn("Malformed LOCUS line found - is this " 1374 "correct?\n:%r" % line, BiopythonParserWarning) 1375 consumer.locus(line.split()[1]) 1376 consumer.size(line.split()[2]) 1377 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: 1378 # Cope with pseudo-GenBank files like this: 1379 # "LOCUS RNA5 complete 1718 bp" 1380 # Treat everything between LOCUS and the size as the identifier. 1381 warnings.warn("Malformed LOCUS line found - is this " 1382 "correct?\n:%r" % line, BiopythonParserWarning) 1383 consumer.locus(line[5:].rsplit(None, 2)[0].strip()) 1384 consumer.size(line.split()[-2]) 1385 else: 1386 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1387
1388 - def _feed_header_lines(self, consumer, lines):
1389 # Following dictionary maps GenBank lines to the associated 1390 # consumer methods - the special cases like LOCUS where one 1391 # genbank line triggers several consumer calls have to be 1392 # handled individually. 1393 consumer_dict = { 1394 'DEFINITION': 'definition', 1395 'ACCESSION': 'accession', 1396 'NID': 'nid', 1397 'PID': 'pid', 1398 'DBSOURCE': 'db_source', 1399 'KEYWORDS': 'keywords', 1400 'SEGMENT': 'segment', 1401 'SOURCE': 'source', 1402 'AUTHORS': 'authors', 1403 'CONSRTM': 'consrtm', 1404 'PROJECT': 'project', 1405 'TITLE': 'title', 1406 'JOURNAL': 'journal', 1407 'MEDLINE': 'medline_id', 1408 'PUBMED': 'pubmed_id', 1409 'REMARK': 'remark'} 1410 # We have to handle the following specially: 1411 # ORIGIN (locus, size, residue_type, data_file_division and date) 1412 # COMMENT (comment) 1413 # VERSION (version and gi) 1414 # DBLINK (database links like projects, newlines important) 1415 # REFERENCE (eference_num and reference_bases) 1416 # ORGANISM (organism and taxonomy) 1417 lines = [_f for _f in lines if _f] 1418 lines.append("") # helps avoid getting StopIteration all the time 1419 line_iter = iter(lines) 1420 try: 1421 line = next(line_iter) 1422 while True: 1423 if not line: 1424 break 1425 line_type = line[:self.GENBANK_INDENT].strip() 1426 data = line[self.GENBANK_INDENT:].strip() 1427 1428 if line_type == 'VERSION': 1429 # Need to call consumer.version(), and maybe also consumer.gi() as well. 1430 # e.g. 1431 # VERSION AC007323.5 GI:6587720 1432 while ' ' in data: 1433 data = data.replace(' ', ' ') 1434 if ' GI:' not in data: 1435 consumer.version(data) 1436 else: 1437 if self.debug: 1438 print("Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]") 1439 consumer.version(data.split(' GI:')[0]) 1440 consumer.gi(data.split(' GI:')[1]) 1441 # Read in the next line! 1442 line = next(line_iter) 1443 elif line_type == 'DBLINK': 1444 # Need to call consumer.dblink() for each line, e.g. 1445 # DBLINK Project: 57779 1446 # BioProject: PRJNA57779 1447 consumer.dblink(data.strip()) 1448 # Read in the next line, and see if its more of the DBLINK section: 1449 while True: 1450 line = next(line_iter) 1451 if line[:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1452 # Add this continuation to the data string 1453 consumer.dblink(line[self.GENBANK_INDENT:].strip()) 1454 else: 1455 # End of the DBLINK, leave this text in the variable "line" 1456 break 1457 elif line_type == 'REFERENCE': 1458 if self.debug > 1: 1459 print("Found reference [" + data + "]") 1460 # Need to call consumer.reference_num() and consumer.reference_bases() 1461 # e.g. 1462 # REFERENCE 1 (bases 1 to 86436) 1463 # 1464 # Note that this can be multiline, see Bug 1968, e.g. 1465 # 1466 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1467 # 28259) 1468 # 1469 # For such cases we will call the consumer once only. 1470 data = data.strip() 1471 1472 # Read in the next line, and see if its more of the reference: 1473 while True: 1474 line = next(line_iter) 1475 if line[:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1476 # Add this continuation to the data string 1477 data += " " + line[self.GENBANK_INDENT:] 1478 if self.debug > 1: 1479 print("Extended reference text [" + data + "]") 1480 else: 1481 # End of the reference, leave this text in the variable "line" 1482 break 1483 1484 # We now have all the reference line(s) stored in a string, data, 1485 # which we pass to the consumer 1486 while ' ' in data: 1487 data = data.replace(' ', ' ') 1488 if ' ' not in data: 1489 if self.debug > 2: 1490 print('Reference number \"' + data + '\"') 1491 consumer.reference_num(data) 1492 else: 1493 if self.debug > 2: 1494 print('Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"') 1495 consumer.reference_num(data[:data.find(' ')]) 1496 consumer.reference_bases(data[data.find(' ') + 1:]) 1497 elif line_type == 'ORGANISM': 1498 # Typically the first line is the organism, and subsequent lines 1499 # are the taxonomy lineage. However, given longer and longer 1500 # species names (as more and more strains and sub strains get 1501 # sequenced) the oragnism name can now get wrapped onto multiple 1502 # lines. The NCBI say we have to recognise the lineage line by 1503 # the presence of semi-colon delimited entries. In the long term, 1504 # they are considering adding a new keyword (e.g. LINEAGE). 1505 # See Bug 2591 for details. 1506 organism_data = data 1507 lineage_data = "" 1508 while True: 1509 line = next(line_iter) 1510 if line[0:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1511 if lineage_data or ";" in line: 1512 lineage_data += " " + line[self.GENBANK_INDENT:] 1513 elif line[self.GENBANK_INDENT:].strip() == ".": 1514 # No lineage data, just . place holder 1515 pass 1516 else: 1517 organism_data += " " + line[self.GENBANK_INDENT:].strip() 1518 else: 1519 # End of organism and taxonomy 1520 break 1521 consumer.organism(organism_data) 1522 if lineage_data.strip() == "" and self.debug > 1: 1523 print("Taxonomy line(s) missing or blank") 1524 consumer.taxonomy(lineage_data.strip()) 1525 del organism_data, lineage_data 1526 elif line_type == 'COMMENT': 1527 # A COMMENT can either be plain text or tabular (Structured Comment), 1528 # or contain both. Multi-line comments are common. The code calls 1529 # consumer.comment() once with a list where each entry 1530 # is a line. If there's a structured comment consumer.structured_comment() 1531 # is called with a dict of dicts where the secondary key/value pairs are 1532 # the same as those in the structured comment table. The primary key is 1533 # the title or header of the table (e.g. Assembly-Data, FluData). See 1534 # http://www.ncbi.nlm.nih.gov/genbank/structuredcomment 1535 # for more information on Structured Comments. 1536 data = line[self.GENBANK_INDENT:] 1537 if self.debug > 1: 1538 print("Found comment") 1539 comment_list = [] 1540 structured_comment_dict = OrderedDict() 1541 regex = r"([^#]+){0}$".format(self.STRUCTURED_COMMENT_START) 1542 structured_comment_key = re.search(regex, data) 1543 if structured_comment_key is not None: 1544 structured_comment_key = structured_comment_key.group(1) 1545 if self.debug > 1: 1546 print("Found Structured Comment") 1547 else: 1548 comment_list.append(data) 1549 1550 while True: 1551 line = next(line_iter) 1552 data = line[self.GENBANK_INDENT:] 1553 if line[0:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1554 if self.STRUCTURED_COMMENT_START in data: 1555 regex = r"([^#]+){0}$".format(self.STRUCTURED_COMMENT_START) 1556 structured_comment_key = re.search(regex, data) 1557 if structured_comment_key is not None: 1558 structured_comment_key = structured_comment_key.group(1) 1559 else: 1560 comment_list.append(data) 1561 elif structured_comment_key is not None and self.STRUCTURED_COMMENT_DELIM in data: 1562 match = re.search(r"(.+?)\s*{0}\s*(.+)".format(self.STRUCTURED_COMMENT_DELIM), data) 1563 structured_comment_dict.setdefault(structured_comment_key, OrderedDict()) 1564 structured_comment_dict[structured_comment_key][match.group(1)] = match.group(2) 1565 if self.debug > 2: 1566 print("Structured Comment continuation [" + data + "]") 1567 elif self.STRUCTURED_COMMENT_END not in data: 1568 comment_list.append(data) 1569 if self.debug > 2: 1570 print("Comment continuation [" + data + "]") 1571 else: 1572 # End of the comment 1573 break 1574 if comment_list: 1575 consumer.comment(comment_list) 1576 if structured_comment_dict: 1577 consumer.structured_comment(structured_comment_dict) 1578 del comment_list, structured_comment_key, structured_comment_dict 1579 elif line_type in consumer_dict: 1580 # It's a semi-automatic entry! 1581 # Now, this may be a multi line entry... 1582 while True: 1583 line = next(line_iter) 1584 if line[0:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1585 data += ' ' + line[self.GENBANK_INDENT:] 1586 else: 1587 # We now have all the data for this entry: 1588 1589 # The DEFINITION field must ends with a period 1590 # # see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt [3.4.5] 1591 # and discussion https://github.com/biopython/biopython/pull/616 1592 # We consider this period belong to the syntax, not to the data 1593 # So remove it if it exist 1594 if line_type == 'DEFINITION' and data.endswith('.'): 1595 data = data[:-1] 1596 getattr(consumer, consumer_dict[line_type])(data) 1597 # End of continuation - return to top of loop! 1598 break 1599 else: 1600 if self.debug: 1601 print("Ignoring GenBank header line:\n" % line) 1602 # Read in next line 1603 line = next(line_iter) 1604 except StopIteration: 1605 raise ValueError("Problem in header")
1606
1607 - def _feed_misc_lines(self, consumer, lines):
1608 # Deals with a few misc lines between the features and the sequence 1609 lines.append("") 1610 line_iter = iter(lines) 1611 try: 1612 for line in line_iter: 1613 if line.startswith('BASE COUNT'): 1614 line = line[10:].strip() 1615 if line: 1616 if self.debug: 1617 print("base_count = " + line) 1618 consumer.base_count(line) 1619 if line.startswith('ORIGIN'): 1620 line = line[6:].strip() 1621 if line: 1622 if self.debug: 1623 print("origin_name = " + line) 1624 consumer.origin_name(line) 1625 if line.startswith('WGS '): 1626 line = line[3:].strip() 1627 consumer.wgs(line) 1628 if line.startswith('WGS_SCAFLD'): 1629 line = line[10:].strip() 1630 consumer.add_wgs_scafld(line) 1631 if line.startswith('CONTIG'): 1632 line = line[6:].strip() 1633 contig_location = line 1634 while True: 1635 line = next(line_iter) 1636 if not line: 1637 break 1638 elif line[:self.GENBANK_INDENT] == self.GENBANK_SPACER: 1639 # Don't need to preseve the whitespace here. 1640 contig_location += line[self.GENBANK_INDENT:].rstrip() 1641 elif line.startswith('ORIGIN'): 1642 # Strange, seen this in GenPept files via Entrez gbwithparts 1643 line = line[6:].strip() 1644 if line: 1645 consumer.origin_name(line) 1646 break 1647 else: 1648 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1649 consumer.contig_location(contig_location) 1650 return 1651 except StopIteration: 1652 raise ValueError("Problem in misc lines before sequence")
1653 1654 1655 if __name__ == "__main__": 1656 from Bio._py3k import StringIO 1657 1658 gbk_example = \ 1659 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1660 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1661 (AXL2) and Rev7p (REV7) genes, complete cds. 1662 ACCESSION U49845 1663 VERSION U49845.1 GI:1293613 1664 KEYWORDS . 1665 SOURCE Saccharomyces cerevisiae (baker's yeast) 1666 ORGANISM Saccharomyces cerevisiae 1667 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1668 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1669 REFERENCE 1 (bases 1 to 5028) 1670 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1671 TITLE Cloning and sequence of REV7, a gene whose function is required for 1672 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1673 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1674 PUBMED 7871890 1675 REFERENCE 2 (bases 1 to 5028) 1676 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1677 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1678 plasma membrane glycoprotein 1679 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1680 PUBMED 8846915 1681 REFERENCE 3 (bases 1 to 5028) 1682 AUTHORS Roemer,T. 1683 TITLE Direct Submission 1684 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1685 Haven, CT, USA 1686 FEATURES Location/Qualifiers 1687 source 1..5028 1688 /organism="Saccharomyces cerevisiae" 1689 /db_xref="taxon:4932" 1690 /chromosome="IX" 1691 /map="9" 1692 CDS <1..206 1693 /codon_start=3 1694 /product="TCP1-beta" 1695 /protein_id="AAA98665.1" 1696 /db_xref="GI:1293614" 1697 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1698 AEVLLRVDNIIRARPRTANRQHM" 1699 gene 687..3158 1700 /gene="AXL2" 1701 CDS 687..3158 1702 /gene="AXL2" 1703 /note="plasma membrane glycoprotein" 1704 /codon_start=1 1705 /function="required for axial budding pattern of S. 1706 cerevisiae" 1707 /product="Axl2p" 1708 /protein_id="AAA98666.1" 1709 /db_xref="GI:1293615" 1710 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1711 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1712 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1713 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1714 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1715 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1716 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1717 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1718 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1719 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1720 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1721 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1722 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1723 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1724 VDFSNKSNVNVGQVKDIHGRIPEML" 1725 gene complement(3300..4037) 1726 /gene="REV7" 1727 CDS complement(3300..4037) 1728 /gene="REV7" 1729 /codon_start=1 1730 /product="Rev7p" 1731 /protein_id="AAA98667.1" 1732 /db_xref="GI:1293616" 1733 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1734 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1735 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1736 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1737 LISGDDKILNGVYSQYEEGESIFGSLF" 1738 ORIGIN 1739 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1740 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1741 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1742 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1743 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1744 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1745 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1746 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1747 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1748 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1749 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1750 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1751 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1752 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1753 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1754 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1755 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1756 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1757 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1758 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1759 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1760 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1761 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1762 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1763 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1764 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1765 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1766 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1767 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1768 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1769 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1770 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1771 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc 1772 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1773 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1774 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1775 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1776 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1777 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1778 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1779 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1780 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1781 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1782 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1783 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1784 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1785 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1786 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1787 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1788 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1789 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1790 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1791 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1792 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1793 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1794 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1795 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1796 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1797 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 1798 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1799 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1800 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1801 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1802 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1803 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1804 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1805 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1806 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1807 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1808 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1809 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1810 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1811 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1812 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1813 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1814 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1815 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1816 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1817 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1818 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1819 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1820 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1821 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1822 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1823 //""" 1824 1825 # GenBank format protein (aka GenPept) file from: 1826 # http://www.molecularevolution.org/resources/fileformats/ 1827 gbk_example2 = \ 1828 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1829 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1830 ACCESSION AAD51968 1831 VERSION AAD51968.1 GI:5805369 1832 DBSOURCE locus AF171097 accession AF171097.1 1833 KEYWORDS . 1834 SOURCE Yersinia enterocolitica 1835 ORGANISM Yersinia enterocolitica 1836 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1837 Enterobacteriaceae; Yersinia. 1838 REFERENCE 1 (residues 1 to 143) 1839 AUTHORS Revell,P.A. and Miller,V.L. 1840 TITLE A chromosomally encoded regulator is required for expression of the 1841 Yersinia enterocolitica inv gene and for virulence 1842 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1843 MEDLINE 20138369 1844 PUBMED 10672189 1845 REFERENCE 2 (residues 1 to 143) 1846 AUTHORS Revell,P.A. and Miller,V.L. 1847 TITLE Direct Submission 1848 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1849 University School of Medicine, Campus Box 8230, 660 South Euclid, 1850 St. Louis, MO 63110, USA 1851 COMMENT Method: conceptual translation. 1852 FEATURES Location/Qualifiers 1853 source 1..143 1854 /organism="Yersinia enterocolitica" 1855 /mol_type="unassigned DNA" 1856 /strain="JB580v" 1857 /serotype="O:8" 1858 /db_xref="taxon:630" 1859 Protein 1..143 1860 /product="transcriptional regulator RovA" 1861 /name="regulates inv expression" 1862 CDS 1..143 1863 /gene="rovA" 1864 /coded_by="AF171097.1:380..811" 1865 /note="regulator of virulence" 1866 /transl_table=11 1867 ORIGIN 1868 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1869 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1870 121 deiellsgli dklerniiql qsk 1871 // 1872 """ 1873 1874 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1875 XX 1876 AC X56734; S46826; 1877 XX 1878 DT 12-SEP-1991 (Rel. 29, Created) 1879 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 1880 XX 1881 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1882 XX 1883 KW beta-glucosidase. 1884 XX 1885 OS Trifolium repens (white clover) 1886 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1887 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1888 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1889 XX 1890 RN [5] 1891 RP 1-1859 1892 RX PUBMED; 1907511. 1893 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1894 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1895 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1896 RL Plant Mol. Biol. 17(2):209-219(1991). 1897 XX 1898 RN [6] 1899 RP 1-1859 1900 RA Hughes M.A.; 1901 RT ; 1902 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1903 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1904 RL Upon Tyne, NE2 4HH, UK 1905 XX 1906 FH Key Location/Qualifiers 1907 FH 1908 FT source 1..1859 1909 FT /organism="Trifolium repens" 1910 FT /mol_type="mRNA" 1911 FT /clone_lib="lambda gt10" 1912 FT /clone="TRE361" 1913 FT /tissue_type="leaves" 1914 FT /db_xref="taxon:3899" 1915 FT CDS 14..1495 1916 FT /product="beta-glucosidase" 1917 FT /EC_number="3.2.1.21" 1918 FT /note="non-cyanogenic" 1919 FT /db_xref="GOA:P26204" 1920 FT /db_xref="InterPro:IPR001360" 1921 FT /db_xref="InterPro:IPR013781" 1922 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1923 FT /protein_id="CAA40058.1" 1924 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1925 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1926 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1927 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1928 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1929 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1930 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1931 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1932 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1933 FT mRNA 1..1859 1934 FT /experiment="experimental evidence, no additional details 1935 FT recorded" 1936 XX 1937 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1938 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1939 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1940 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1941 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1942 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1943 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1944 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1945 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1946 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1947 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1948 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1949 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1950 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1951 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1952 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1953 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1954 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1955 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1956 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1957 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1958 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1959 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 1960 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1961 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1962 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1963 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1964 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1965 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1966 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1967 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1968 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1969 // 1970 """ 1971 1972 print("GenBank CDS Iteration") 1973 print("=====================") 1974 1975 g = GenBankScanner() 1976 for record in g.parse_cds_features(StringIO(gbk_example)): 1977 print(record) 1978 1979 g = GenBankScanner() 1980 for record in g.parse_cds_features(StringIO(gbk_example2), 1981 tags2id=('gene', 'locus_tag', 'product')): 1982 print(record) 1983 1984 g = GenBankScanner() 1985 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1986 tags2id=('gene', 'locus_tag', 'product')): 1987 print(record) 1988 1989 print("") 1990 print("GenBank Iteration") 1991 print("=================") 1992 g = GenBankScanner() 1993 for record in g.parse_records(StringIO(gbk_example), do_features=False): 1994 print("%s %s %s" % (record.id, record.name, record.description)) 1995 print(record.seq) 1996 1997 g = GenBankScanner() 1998 for record in g.parse_records(StringIO(gbk_example), do_features=True): 1999 print("%s %s %s" % (record.id, record.name, record.description)) 2000 print(record.seq) 2001 2002 g = GenBankScanner() 2003 for record in g.parse_records(StringIO(gbk_example2), do_features=False): 2004 print("%s %s %s" % (record.id, record.name, record.description)) 2005 print(record.seq) 2006 2007 g = GenBankScanner() 2008 for record in g.parse_records(StringIO(gbk_example2), do_features=True): 2009 print("%s %s %s" % (record.id, record.name, record.description)) 2010 print(record.seq) 2011 2012 print("") 2013 print("EMBL CDS Iteration") 2014 print("==================") 2015 2016 e = EmblScanner() 2017 for record in e.parse_cds_features(StringIO(embl_example)): 2018 print(record) 2019 2020 print("") 2021 print("EMBL Iteration") 2022 print("==============") 2023 e = EmblScanner() 2024 for record in e.parse_records(StringIO(embl_example), do_features=True): 2025 print("%s %s %s" % (record.id, record.name, record.description)) 2026 print(record.seq) 2027