Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
   9  (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  from __future__ import print_function 
  30   
  31  import warnings 
  32  import re 
  33  from Bio.Seq import Seq 
  34  from Bio.SeqRecord import SeqRecord 
  35  from Bio.Alphabet import generic_protein 
  36  from Bio import BiopythonParserWarning 
  37   
  38  __docformat__ = "restructuredtext en" 
  39   
  40   
class InsdcScanner(object):
    """Shared machinery for splitting a GenBank/EMBL file into sub sections.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    is a partnership between the DDBJ, EMBL, and GenBank.  These
    organisations all use the same "Feature Table" layout in their plain
    text flat file formats.

    However, the header and sequence sections of an EMBL file are laid out
    very differently from those produced by GenBank/DDBJ.
    """

    # Placeholder values only: each sub class redefines these constants
    # with sensible values for its own file format.
    RECORD_START = "XXX"  # "LOCUS" or "ID", padded out to HEADER_WIDTH
    HEADER_WIDTH = 3  # 12 or 5
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
def __init__(self, debug=0):
    """Sanity check the layout constants, then reset the scanner state.

    Arguments:

     - debug - Stored unchanged as ``self.debug``.  The other methods
       print progress messages when it is non-zero, and extra detail when
       it is greater than one.

    The assertions guard against a sub class defining inconsistent
    constants: the record start marker must fill the header column
    exactly, sequence header names must carry no trailing white space,
    and the qualifier spacer must be as wide as the qualifier indent.
    """
    assert self.HEADER_WIDTH == len(self.RECORD_START)
    assert all(name == name.rstrip() for name in self.SEQUENCE_HEADERS)
    assert self.FEATURE_QUALIFIER_INDENT == len(self.FEATURE_QUALIFIER_SPACER)
    # No handle yet; set_handle() replaces this with an empty string.
    self.line = None
    self.debug = debug
67
def set_handle(self, handle):
    """Attach the handle to read from and forget any buffered line."""
    self.line = ""
    self.handle = handle
71
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, and return it.

    A line already buffered in ``self.line`` is examined before anything
    is read from the handle.  Blank lines, the ``//`` marking the end of
    the previous record, and any preamble (such as the header used by the
    NCBI on ``*.seq.gz`` archives) will be ignored.

    Returns the record's first line exactly as read (it is also stored in
    ``self.line``), or None if the end of the file is reached first.
    """
    def chatter(message):
        # Skipped lines are only reported at the most verbose debug level.
        if self.debug > 1:
            print(message)

    while True:
        if self.line:
            # Use up the buffered line first.
            candidate, self.line = self.line, ""
        else:
            candidate = self.handle.readline()
        if not candidate:
            if self.debug:
                print("End of file")
            return None
        if candidate[:self.HEADER_WIDTH] == self.RECORD_START:
            chatter("Found the start of a record:\n" + candidate)
            self.line = candidate
            return candidate
        stripped = candidate.rstrip()
        if stripped == "//":
            chatter("Skipping // marking end of last record")
        elif stripped == "":
            chatter("Skipping blank line before record")
        else:
            # Ignore any header before the first ID/LOCUS line.
            chatter("Skipping header line before record:\n" + stripped)
104
def parse_header(self):
    """Return the list of strings making up the header.

    Trailing white space, including the new line characters, is removed
    from each line.

    Assumes you have just read in the ID/LOCUS line.  Reading stops at the
    first line which is a feature table start marker or which begins with
    a sequence header; that line is left in ``self.line`` (right stripped)
    and is not part of the returned list.

    Raises ValueError if the file ends, or a ``//`` line is found, first.
    """
    width = self.HEADER_WIDTH
    assert self.line[:width] == self.RECORD_START, \
        "Not at start of record"

    collected = []
    while True:
        raw = self.handle.readline()
        if not raw:
            raise ValueError("Premature end of line during sequence data")
        current = raw.rstrip()
        at_features = current in self.FEATURE_START_MARKERS
        if at_features or current[:width].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found feature table" if at_features
                      else "Found start of sequence")
            break
        if current == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        collected.append(current)
    self.line = current
    return collected
137
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present).

    Each feature is returned as built by ``self.parse_feature``, i.e. a
    tuple (key, location, qualifiers) where key and location are strings
    (e.g. "CDS" and "complement(join(490883..490885,1..879))") while
    qualifiers is a list of two string tuples (feature qualifier keys and
    values).

    Assumes you have already read to the start of the features table, so
    that ``self.line`` holds one of FEATURE_START_MARKERS.  If it does
    not, an empty list is returned and nothing is read.

    Arguments:

     - skip - If true, the feature lines are read and thrown away and the
       returned list is empty.

    On return ``self.line`` holds the first line after the table, as read
    from the handle.

    Raises ValueError if the file ends, or a ``//`` record terminator is
    found, before the end of the feature table.
    """
    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER

    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Step over the table header line(s).
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            # This is an empty feature line between qualifiers. Empty
            # feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue
        if len(line) <= indent:
            # The location starts at index ``indent``, so the line must be
            # strictly longer than that.  (Testing with "<" let a line of
            # exactly ``indent`` characters through, and the indexing
            # below then raised IndexError.)
            warnings.warn("line too short to contain a feature: %r" % line,
                          BiopythonParserWarning)
            line = self.handle.readline()
            continue

        if skip:
            # Discard this feature line and all of its qualifier lines.
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            # Build up a list of the lines making up this feature:
            if line[indent] != " " and " " in line[indent:]:
                # The feature table design enforces a length limit on the
                # feature keys.  Some third party files (e.g. IGMT's EMBL
                # like files) solve this by over indenting the location
                # and qualifiers.
                feature_key, line = line[2:].strip().split(None, 1)
                feature_lines = [line]
                warnings.warn("Overindented %s feature?" % feature_key,
                              BiopythonParserWarning)
            else:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
            line = self.handle.readline()
            # The explicit test against '' stops blank line tolerance from
            # swallowing the end of the file.
            while line[:indent] == spacer \
                    or (line != '' and line.rstrip() == ""):
                # Use strip to remove any harmless trailing white space
                # AND any leading white space (e.g. out of spec files with
                # too much indentation)
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
212
213 - def parse_feature(self, feature_key, lines):
214 r"""Expects a feature as a list of strings, returns a tuple (key, location, qualifiers) 215 216 For example given this GenBank feature:: 217 218 CDS complement(join(490883..490885,1..879)) 219 /locus_tag="NEQ001" 220 /note="conserved hypothetical [Methanococcus jannaschii]; 221 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 222 localization signal; IPR002743: Protein of unknown 223 function DUF57" 224 /codon_start=1 225 /transl_table=11 226 /product="hypothetical protein" 227 /protein_id="NP_963295.1" 228 /db_xref="GI:41614797" 229 /db_xref="GeneID:2732620" 230 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 231 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 232 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 233 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 234 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 235 LNSMGFGFVNTKKNSAR" 236 237 Then should give input key="CDS" and the rest of the data as a list of strings 238 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 239 where the leading spaces and trailing newlines have been removed. 
240 241 Returns tuple containing: (key as string, location string, qualifiers as list) 242 as follows for this example: 243 244 key = "CDS", string 245 location = "complement(join(490883..490885,1..879))", string 246 qualifiers = list of string tuples: 247 248 [('locus_tag', '"NEQ001"'), 249 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 250 ('codon_start', '1'), 251 ('transl_table', '11'), 252 ('product', '"hypothetical protein"'), 253 ('protein_id', '"NP_963295.1"'), 254 ('db_xref', '"GI:41614797"'), 255 ('db_xref', '"GeneID:2732620"'), 256 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 257 258 In the above example, the "note" and "translation" were edited for compactness, 259 and they would contain multiple new line characters (displayed above as \n) 260 261 If a qualifier is quoted (in this case, everything except codon_start and 262 transl_table) then the quotes are NOT removed. 263 264 Note that no whitespace is removed. 265 """ 266 # Skip any blank lines 267 iterator = (x for x in lines if x) 268 try: 269 line = next(iterator) 270 271 feature_location = line.strip() 272 while feature_location[-1:] == ",": 273 # Multiline location, still more to come! 
274 line = next(iterator) 275 feature_location += line.strip() 276 if feature_location.count("(") > feature_location.count(")"): 277 # Including the prev line in warning would be more explicit, 278 # but this way get one-and-only-one warning shown by default: 279 warnings.warn("Non-standard feature line wrapping (didn't break on comma)?", 280 BiopythonParserWarning) 281 while feature_location[-1:] == "," or feature_location.count("(") > feature_location.count(")"): 282 line = next(iterator) 283 feature_location += line.strip() 284 285 qualifiers = [] 286 287 for line_number, line in enumerate(iterator): 288 # check for extra wrapping of the location closing parentheses 289 if line_number == 0 and line.startswith(")"): 290 feature_location += line.strip() 291 elif line[0] == "/": 292 # New qualifier 293 i = line.find("=") 294 key = line[1:i] # does not work if i==-1 295 value = line[i + 1:] # we ignore 'value' if i==-1 296 if i == -1: 297 # Qualifier with no key, e.g. /pseudo 298 key = line[1:] 299 qualifiers.append((key, None)) 300 elif not value: 301 # ApE can output /note= 302 qualifiers.append((key, "")) 303 elif value == '"': 304 # One single quote 305 if self.debug: 306 print("Single quote %s:%s" % (key, value)) 307 # DO NOT remove the quote... 308 qualifiers.append((key, value)) 309 elif value[0] == '"': 310 # Quoted... 311 value_list = [value] 312 while value_list[-1][-1] != '"': 313 value_list.append(next(iterator)) 314 value = '\n'.join(value_list) 315 # DO NOT remove the quotes... 
316 qualifiers.append((key, value)) 317 else: 318 # Unquoted 319 # if debug : print("Unquoted line %s:%s" % (key,value)) 320 qualifiers.append((key, value)) 321 else: 322 # Unquoted continuation 323 assert len(qualifiers) > 0 324 assert key == qualifiers[-1][0] 325 # if debug : print("Unquoted Cont %s:%s" % (key, line)) 326 if qualifiers[-1][1] is None: 327 raise StopIteration 328 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 329 return (feature_key, feature_location, qualifiers) 330 except StopIteration: 331 # Bummer 332 raise ValueError("Problem with '%s' feature:\n%s" 333 % (feature_key, "\n".join(lines)))
334 357
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer.

    Does nothing in this base class; this should be implemented by the
    EMBL / GenBank specific subclass.

    Called by the feed() method with the line found by find_start().
    """
    return None
366
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer.

    Does nothing in this base class; this should be implemented by the
    EMBL / GenBank specific subclass.

    Called by the feed() method with the result of parse_header().
    """
    return None
375
def _feed_feature_table(self, consumer, feature_tuples):
    """Handle the feature table (list of tuples), passing data to the consumer.

    ``feature_tuples`` holds (key, location string, qualifiers) tuples as
    returned by parse_features().  Any new line characters inside a
    qualifier value are replaced by spaces before the value is handed
    over; a value of None (qualifier without a value) is passed on as is.

    Called by the feed() method.
    """
    consumer.start_feature_table()
    for feature_key, location_string, qualifiers in feature_tuples:
        consumer.feature_key(feature_key)
        consumer.location(location_string)
        for name, raw_value in qualifiers:
            flattened = None if raw_value is None else raw_value.replace("\n", " ")
            consumer.feature_qualifier(name, flattened)
390
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence, passing data to the consumer.

    ``lines`` is a list of strings.  Does nothing in this base class; this
    should be implemented by the EMBL / GenBank specific subclass.

    Called by the feed() method with the misc lines from parse_footer().
    """
    return None
399
def feed(self, handle, consumer, do_features=True):
    """Feed one record from the handle into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:

     - handle - A handle with the information to parse.
     - consumer - The consumer that should be informed of events.
     - do_features - Boolean, should the features be parsed?
       Skipping the features can be much faster.

    Return values:

     - true  - Passed a record
     - false - Did not find a record (``consumer.data`` is set to None)
    """
    # Should work with both EMBL and GenBank files provided the
    # equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        # Could not find (another) record
        consumer.data = None
        return False

    # The generic methods of this class break the record into sections;
    # the first line, header lines and any misc lines after the features
    # are then interpreted by GenBank / EMBL specific derived classes.

    # First line, then header.  The order matters, as parse_header()
    # overwrites self.line.
    self._feed_first_line(consumer, self.line)
    self._feed_header_lines(consumer, self.parse_header())

    # Features (common to both EMBL and GenBank).  When skipping, the
    # table is still read so that the handle ends up in the right place.
    features = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, features)

    # Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)
    consumer.sequence(sequence_string)
    # Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    # And we are done
    return True
451
def parse(self, handle, do_features=True):
    """Return a SeqRecord (with SeqFeatures if do_features=True).

    A fresh Bio.GenBank._FeatureConsumer is fed via the feed() method; its
    ``data`` attribute is returned, or None if feed() found no record.

    See also the method parse_records() for use on multi-record files.
    """
    # Imported here rather than at the top of the module.
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    cleaner = FeatureValueCleaner()
    consumer = _FeatureConsumer(use_fuzziness=1, feature_cleaner=cleaner)
    if not self.feed(handle, consumer, do_features):
        return None
    return consumer.data
467
def parse_records(self, handle, do_features=True):
    """Return a SeqRecord object iterator.

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    Iteration stops once parse() returns None.  ValueError is raised for a
    record whose id is None, or whose name or description still holds the
    "<unknown ...>" placeholder text.

    This method is intended for use in Bio.SeqIO
    """
    # This is a generator function
    record = self.parse(handle, do_features)
    while record is not None:
        if record.id is None:
            raise ValueError("Failed to parse the record's ID. Invalid ID line?")
        if record.name == "<unknown name>":
            raise ValueError("Failed to parse the record's name. Invalid ID line?")
        if record.description == "<unknown description>":
            raise ValueError("Failed to parse the record's description")
        yield record
        record = self.parse(handle, do_features)
489
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id', 'locus_tag', 'product')):
    """Return a SeqRecord object iterator, one record per CDS feature.

    Arguments:

     - handle - A handle with the EMBL or GenBank record(s) to parse.
     - alphabet - Used for any sequence found in a translation field.
     - tags2id - Tuple of three strings, the feature keys to use
       for the record id, name and description.

    For every CDS feature the location string (with spaces removed) is
    stored in the record's annotations as 'raw_location', db_xref values
    are appended to ``record.dbxrefs``, the translation becomes the
    record's sequence, and every other qualifier is stored in the
    annotations (repeated qualifiers are joined with a space).  Enclosing
    quotes are removed from qualifier values.

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        # Got an EMBL or GenBank record...
        self.parse_header()  # ignore header lines!
        feature_tuples = self.parse_features()
        # Ignore the footer and sequence: read on to the "//" record
        # terminator (or the end of the file).
        while True:
            line = self.handle.readline()
            if not line or line[:2] == "//":
                break
        self.line = line.rstrip()

        # Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue

            # SeqRecord objects cannot be created with annotations, they
            # must be added afterwards.  So create an empty record and
            # then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations
            annotations['raw_location'] = location_string.replace(' ', '')

            for qualifier_name, qualifier_data in qualifiers:
                # Slices rather than indexing, as parse_feature() returns
                # an empty string for a qualifier like "/note=" and
                # indexing that would raise IndexError.
                if qualifier_data is not None \
                        and qualifier_data[:1] == '"' \
                        and qualifier_data[-1:] == '"':
                    # Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    # It's a list, possibly empty, so safe to append to.
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Unwrap multi-line values, and collapse the
                        # double spaces which that can leave behind.
                        qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
                    previous = annotations.get(qualifier_name)
                    if previous is None:
                        # Not an addition to existing data, it's the first
                        # bit (or only a valueless flag was seen so far).
                        annotations[qualifier_name] = qualifier_data
                    elif qualifier_data is not None:
                        annotations[qualifier_name] = previous + " " + qualifier_data
                    # else: a repeated valueless qualifier adds nothing.

            # Fill in the ID, Name, Description from whichever of the
            # requested qualifiers were present:
            for position, attribute in enumerate(("id", "name", "description")):
                try:
                    setattr(record, attribute, annotations[tags2id[position]])
                except KeyError:
                    pass

            yield record
570 571
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files."""

    HEADER_WIDTH = 5
    FEATURE_QUALIFIER_INDENT = 21
    # The two letter line code padded out to fill the header column; the
    # base class asserts len(RECORD_START) == HEADER_WIDTH.
    RECORD_START = "ID".ljust(HEADER_WIDTH)
    # In the full "FH" header line, "Location/Qualifiers" starts straight
    # after the qualifier indent.
    FEATURE_START_MARKERS = ["FH   Key".ljust(FEATURE_QUALIFIER_INDENT) + "Location/Qualifiers",
                             "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
    SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Work out which style of EMBL ID line this is, and parse it.

    The layout is recognised from the number of semi colons after the
    line code: six for the style introduced in 2006, three for either the
    pre 2006 style or (if the line ends with " SQ") EMBL-bank patent data.

    Raises ValueError for any other layout.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semicolons = line[self.HEADER_WIDTH:].count(";")
    if semicolons == 6:
        # Looks like the semi colon separated style introduced in 2006
        parser = self._feed_first_line_new
    elif semicolons != 3:
        raise ValueError('Did not recognise the ID line layout:\n' + line)
    elif line.rstrip().endswith(" SQ"):
        # EMBL-bank patent data
        parser = self._feed_first_line_patents
    else:
        # Looks like the pre 2006 style
        parser = self._feed_first_line_old
    parser(consumer, line)
632
def _feed_first_line_patents(self, consumer, line):
    """Parse the ID line of an EMBL-bank patent (non-redundant) record.

    Either Non-Redundant Level 1 database records::

        ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1>

    e.g. "ID   NRP_AX000635; PRT; NR1; 15    SQ".  Or, Non-Redundant Level 2
    database records::

        ID <L2-accession>; <molecule type>; <non-redundant level 2>; <cluster size L2>

    e.g. "ID   NRP0000016E; PRT; NR2; 5    SQ".

    The first three fields are passed to the consumer as the locus,
    residue type and data file division.
    """
    # Drop the line code and the trailing " SQ", then split the fields.
    # Each field is stripped, as the semi colons are followed by spaces
    # which would otherwise be passed on to the consumer (the parsers for
    # the other two ID line styles strip their fields in the same way).
    fields = [entry.strip() for entry
              in line[self.HEADER_WIDTH:].rstrip()[:-3].split(";")]
    assert len(fields) == 4
    accession, molecule_type, division, _cluster_size = fields
    consumer.locus(accession)
    consumer.residue_type(molecule_type)
    consumer.data_file_division(division)
    # TODO - Record cluster size?
def _feed_first_line_old(self, consumer, line):
    """Parse an ID line in the style used before 2006.

    For example::

        ID   SC10H5 standard; DNA; PRO; 4870 BP.
        ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.

    The tokens represent:

     0. Primary accession number
        (space sep)
     1. ??? (e.g. standard)
        (semi-colon)
     2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
     3. Taxonomic division (e.g. 'PRO')
     4. Sequence length (e.g. '4639675 BP.')
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    # The accession is white space separated from the rest, which is
    # semi colon separated.
    name_and_rest = line[self.HEADER_WIDTH:].split(None, 1)
    raw_fields = [name_and_rest[0]] + name_and_rest[1].split(";")
    fields = [entry.strip() for entry in raw_fields]
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
671
def _feed_first_line_new(self, consumer, line):
    """Parse an ID line in the style introduced in 2006.

    For example::

        ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.

    The tokens represent:

     0. Primary accession number
     1. Sequence version number
     2. Topology: 'circular' or 'linear'
     3. Molecule type (e.g. 'genomic DNA')
     4. Data class (e.g. 'STD')
     5. Taxonomic division (e.g. 'PRO')
     6. Sequence length (e.g. '4639675 BP.')
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
    assert len(fields) == 7
    accession, version, topology, molecule, _data_class, division, length = fields

    consumer.locus(accession)

    # Call the accession consumer now, to make sure we record
    # something as the record.id, in case there is no AC line
    consumer.accession(accession)

    # TODO - How to deal with the version field?  At the moment the
    # consumer will try and use this for the ID which isn't ideal for
    # EMBL files.
    version_parts = version.split()
    if len(version_parts) == 2:
        label, number = version_parts
        if label == "SV" and number.isdigit():
            consumer.version_suffix(number)

    # Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join((topology, molecule)))  # TODO - Store as two fields?

    # TODO - What should we do with the data class?

    consumer.data_file_division(division)

    self._feed_seq_length(consumer, length)
713
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length string like '1859 BP.' to the consumer.

    ``text`` must hold exactly two white space separated parts, the second
    being BP or AA (in any case, with an optional trailing full stop).
    The first part is passed to ``consumer.size`` as a string.
    """
    length_parts = text.split()
    assert len(length_parts) == 2, "Invalid sequence length string %r" % text
    amount, unit = length_parts
    assert unit.upper() in ["BP", "BP.", "AA", "AA."]
    consumer.size(amount)
719
def _feed_header_lines(self, consumer, lines):
    """Pass the EMBL header lines (list of strings) on to the consumer.

    Each line is split into its line code (the first HEADER_WIDTH
    characters) and its data.  Most line codes are reformatted a little to
    suit the GenBank based consumer; unrecognised codes are ignored (and
    reported if debugging is on).
    """
    width = self.HEADER_WIDTH
    # Line codes whose data is passed unchanged to the named consumer
    # method:
    simple_handlers = {
        'AC': 'accession',
        'SV': 'version',  # SV line removed in June 2006, now part of ID line
        'DE': 'definition',
        'RG': 'consrtm',  # optional consortium
        'RL': 'journal',
        'OS': 'organism',
        'OC': 'taxonomy',
        'CC': 'comment',
    }
    # All the codes tested explicitly below take priority over the above
    # (so 'CC' is always handled by its own branch).
    for line in lines:
        code = line[:width].strip()
        data = line[width:].strip()
        if code == 'XX':
            # Splitter line, nothing to do
            continue
        if code == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            if data[0] == "[" and data[-1] == "]":
                data = data[1:-1]
            consumer.reference_num(data)
        elif code == 'RP':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
            # Note could be multi-line, and end with a comma
            ranges = []
            for bases in data.split(","):
                if bases.strip():
                    ranges.append(bases.replace("-", " to ").strip())
            consumer.reference_bases("(bases %s)" % "; ".join(ranges))
        elif code == 'RT':
            # Remove the enclosing quotes and trailing semi colon.
            # Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif code == 'RX':
            # EMBL support three reference types at the moment:
            # - PUBMED    PUBMED bibliographic database (NLM)
            # - DOI       Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
            #             of Agriculture (USDA)
            #
            # Format:
            # RX  resource_identifier; identifier.
            #
            # e.g.
            # RX   DOI; 10.1016/0024-3205(83)90010-3.
            # RX   PUBMED; 264242.
            #
            # Only PUBMED is passed on to the consumer here.
            key, value = data.split(";", 1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            # TODO - Handle other reference types (here and in BioSQL bindings)
        elif code == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif code == 'DR':
            # Database Cross-reference, format:
            # DR   database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR   MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            # Turn it into "database_identifier:primary_identifier" to
            # mimic the GenBank parser. e.g. "MGI:98599"
            consumer.dblink("%s:%s" % (parts[0].strip(),
                                       parts[1].strip()))
        elif code == 'RA':
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif code == 'PR':
            # Remove trailing ; at end of the project reference
            # In GenBank files this corresponds to the old PROJECT
            # line which is being replaced with the DBLINK line.
            consumer.project(data.rstrip(";"))
        elif code == 'KW':
            consumer.keywords(data.rstrip(";"))
        elif code in simple_handlers:
            # Its a semi-automatic entry!
            getattr(consumer, simple_handlers[code])(data)
        elif self.debug:
            print("Ignoring EMBL header line:\n%s" % line)
824
def _feed_misc_lines(self, consumer, lines):
    """Handle the EMBL lines between the features and the sequence.

    ``lines`` is a list of strings (which is left unchanged).  Two kinds
    of line are acted on:

     - A run of CO (contig) lines: their data is concatenated and passed
       to ``consumer.contig_location``.
     - The SQ line: the sequence length is passed on via
       ``_feed_seq_length`` and processing stops.

    Raises ValueError if a run of CO lines is followed directly by a non
    blank line which is not another CO line.
    """
    # TODO - Should we do something with the information on the SQ line(s)?
    # These prefixes must agree with the fixed column slices used below,
    # i.e. the five character line code column and the fourteen characters
    # of "SQ   Sequence ".
    contig_prefix = "CO   "
    sequence_prefix = "SQ   Sequence "
    # The trailing blank entry guarantees that a run of CO lines is
    # terminated.  Work on a copy so the caller's list is not altered.
    line_iter = iter(list(lines) + [""])
    try:
        for line in line_iter:
            if line.startswith(contig_prefix):
                contig_location = line[len(contig_prefix):].strip()
                while True:
                    line = next(line_iter)
                    if not line:
                        break
                    elif line.startswith(contig_prefix):
                        # Don't need to preserve the whitespace here.
                        contig_location += line[len(contig_prefix):].strip()
                    else:
                        raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
                consumer.contig_location(contig_location)
            if line.startswith(sequence_prefix):
                # e.g.
                # SQ   Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other;
                #
                # Or, EMBL-bank patent, e.g.
                # SQ   Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5;
                length_text = line[len(sequence_prefix):].rstrip().rstrip(";").split(";", 1)[0]
                self._feed_seq_length(consumer, length_text)
                # TODO - Record the checksum etc?
                return
    except StopIteration:
        raise ValueError("Problem in misc lines before sequence")
855 856
class _ImgtScanner(EmblScanner):
    """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    """

    # Header lines for both layouts: "Location/Qualifiers" straight after
    # a 21 character indent (as in EMBL files) or after a 25 character
    # indent (as per the IMGT layout described above).
    FEATURE_START_MARKERS = ["FH   Key".ljust(21) + "Location/Qualifiers",
                             "FH   Key".ljust(21) + "Location/Qualifiers (from EMBL)",
                             "FH   Key".ljust(25) + "Location/Qualifiers",
                             "FH"]
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present).

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table, so
    that ``self.line`` holds one of FEATURE_START_MARKERS.  If it does
    not, an empty list is returned and nothing is read.

    Arguments:

     - skip - If true, the feature lines are read and thrown away and the
       returned list is empty.

    Unlike the generic version, the feature key and the start of the
    location are separated on white space (falling back on the 25
    character IMGT indent), and locations written like ``123>`` are
    rewritten as ``>123``.

    Raises ValueError if the file ends, or a ``//`` record terminator is
    found, before the end of the feature table.
    """
    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER

    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    # Step over the table header line(s).
    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    bad_position_re = re.compile(r'([0-9]+)>{1}')

    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            # This is an empty feature line between qualifiers. Empty
            # feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue

        if skip:
            # Discard this feature line and all of its qualifier lines.
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            assert line[:2] == "FT"
            try:
                feature_key, location_start = line[2:].strip().split()
            except ValueError:
                # e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
                # Assume indent of 25 as per IMGT spec, with the location
                # start in column 26 (one-based).
                feature_key = line[2:25].strip()
                location_start = line[25:].strip()
            feature_lines = [location_start]
            line = self.handle.readline()
            # Qualifier lines start with the spacer.  Blank lines in the
            # midst of a feature are tolerated too; the explicit test
            # against "" makes the end of the file fall through to the
            # "Premature end" ValueError above.  (Asserting that every
            # line taken here starts with "FT" made blank lines, and the
            # end of the file, fail with an AssertionError instead.)
            while line[:indent] == spacer \
                    or (line != "" and line.rstrip() == ""):
                # Use strip to remove any harmless trailing white space
                # AND any leading white space (copes with 21 or 26
                # indents and other variants)
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            feature_key, location, qualifiers = \
                self.parse_feature(feature_key, feature_lines)
            # Try to handle known problems with IMGT locations here:
            if ">" in location:
                # Nasty hack for common IMGT bug, should be >123 not 123>
                # in a location string. At least here the meaning is
                # clear, and since it is so common no warning is issued.
                location = bad_position_re.sub(r'>\1', location)
            features.append((feature_key, location, qualifiers))
    self.line = line
    return features
953 954
955 -class GenBankScanner(InsdcScanner):
956 """For extracting chunks of information in GenBank files""" 957 958 RECORD_START = "LOCUS " 959 HEADER_WIDTH = 12 960 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"] 961 FEATURE_END_MARKERS = [] 962 FEATURE_QUALIFIER_INDENT = 21 963 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 964 SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 965 1018
1019 - def _feed_first_line(self, consumer, line):
1020 """Scan over and parse GenBank LOCUS line (PRIVATE). 1021 1022 This must cope with several variants, primarily the old and new column 1023 based standards from GenBank. Additionally EnsEMBL produces GenBank 1024 files where the LOCUS line is space separated rather that following 1025 the column based layout. 1026 1027 We also try to cope with GenBank like files with partial LOCUS lines. 1028 """ 1029 ##################################### 1030 # LOCUS line # 1031 ##################################### 1032 GENBANK_INDENT = self.HEADER_WIDTH 1033 GENBANK_SPACER = " " * GENBANK_INDENT 1034 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 1035 'LOCUS line does not start correctly:\n' + line 1036 1037 # Have to break up the locus line, and handle the different bits of it. 1038 # There are at least two different versions of the locus line... 1039 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ': 1040 # Old... note we insist on the 55:62 being empty to avoid trying 1041 # to parse space separated LOCUS lines from Ensembl etc, see below. 1042 # 1043 # Positions Contents 1044 # --------- -------- 1045 # 00:06 LOCUS 1046 # 06:12 spaces 1047 # 12:?? Locus name 1048 # ??:?? space 1049 # ??:29 Length of sequence, right-justified 1050 # 29:33 space, bp, space 1051 # 33:41 strand type 1052 # 41:42 space 1053 # 42:51 Blank (implies linear), linear or circular 1054 # 51:52 space 1055 # 52:55 The division code (e.g. 
BCT, VRL, INV) 1056 # 55:62 space 1057 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1058 # 1059 # assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 1060 # 'LOCUS line does not contain size units at expected position:\n' + line 1061 assert line[41:42] == ' ', \ 1062 'LOCUS line does not contain space at position 42:\n' + line 1063 assert line[42:51].strip() in ['', 'linear', 'circular'], \ 1064 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1065 assert line[51:52] == ' ', \ 1066 'LOCUS line does not contain space at position 52:\n' + line 1067 # assert line[55:62] == ' ', \ 1068 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 1069 if line[62:73].strip(): 1070 assert line[64:65] == '-', \ 1071 'LOCUS line does not contain - at position 65 in date:\n' + line 1072 assert line[68:69] == '-', \ 1073 'LOCUS line does not contain - at position 69 in date:\n' + line 1074 1075 name_and_length_str = line[GENBANK_INDENT:29] 1076 while ' ' in name_and_length_str: 1077 name_and_length_str = name_and_length_str.replace(' ', ' ') 1078 name_and_length = name_and_length_str.split(' ') 1079 assert len(name_and_length) <= 2, \ 1080 'Cannot parse the name and length in the LOCUS line:\n' + line 1081 assert len(name_and_length) != 1, \ 1082 'Name and length collide in the LOCUS line:\n' + line 1083 # Should be possible to split them based on position, if 1084 # a clear definition of the standard exists THAT AGREES with 1085 # existing files. 1086 consumer.locus(name_and_length[0]) 1087 consumer.size(name_and_length[1]) 1088 # consumer.residue_type(line[33:41].strip()) 1089 1090 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1091 # Amino acids -> protein (even if there is no residue type given) 1092 # We want to use a protein alphabet in this case, rather than a 1093 # generic one. 
Not sure if this is the best way to achieve this, 1094 # but it works because the scanner checks for this: 1095 consumer.residue_type("PROTEIN") 1096 else: 1097 consumer.residue_type(line[33:51].strip()) 1098 1099 consumer.data_file_division(line[52:55]) 1100 if line[62:73].strip(): 1101 consumer.date(line[62:73]) 1102 elif line[40:44] in [' bp ', ' aa ', ' rc '] \ 1103 and line[54:64].strip() in ['', 'linear', 'circular']: 1104 # New... linear/circular/big blank test should avoid EnsEMBL style 1105 # LOCUS line being treated like a proper column based LOCUS line. 1106 # 1107 # Positions Contents 1108 # --------- -------- 1109 # 00:06 LOCUS 1110 # 06:12 spaces 1111 # 12:?? Locus name 1112 # ??:?? space 1113 # ??:40 Length of sequence, right-justified 1114 # 40:44 space, bp, space 1115 # 44:47 Blank, ss-, ds-, ms- 1116 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1117 # 54:55 space 1118 # 55:63 Blank (implies linear), linear or circular 1119 # 63:64 space 1120 # 64:67 The division code (e.g. 
BCT, VRL, INV) 1121 # 67:68 space 1122 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1123 # 1124 assert line[40:44] in [' bp ', ' aa ', ' rc '], \ 1125 'LOCUS line does not contain size units at expected position:\n' + line 1126 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1127 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1128 assert line[47:54].strip() == "" \ 1129 or 'DNA' in line[47:54].strip().upper() \ 1130 or 'RNA' in line[47:54].strip().upper(), \ 1131 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1132 assert line[54:55] == ' ', \ 1133 'LOCUS line does not contain space at position 55:\n' + line 1134 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 1135 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1136 assert line[63:64] == ' ', \ 1137 'LOCUS line does not contain space at position 64:\n' + line 1138 assert line[67:68] == ' ', \ 1139 'LOCUS line does not contain space at position 68:\n' + line 1140 if line[68:79].strip(): 1141 assert line[70:71] == '-', \ 1142 'LOCUS line does not contain - at position 71 in date:\n' + line 1143 assert line[74:75] == '-', \ 1144 'LOCUS line does not contain - at position 75 in date:\n' + line 1145 1146 name_and_length_str = line[GENBANK_INDENT:40] 1147 while ' ' in name_and_length_str: 1148 name_and_length_str = name_and_length_str.replace(' ', ' ') 1149 name_and_length = name_and_length_str.split(' ') 1150 assert len(name_and_length) <= 2, \ 1151 'Cannot parse the name and length in the LOCUS line:\n' + line 1152 assert len(name_and_length) != 1, \ 1153 'Name and length collide in the LOCUS line:\n' + line 1154 # Should be possible to split them based on position, if 1155 # a clear definition of the stand exists THAT AGREES with 1156 # existing files. 
1157 consumer.locus(name_and_length[0]) 1158 consumer.size(name_and_length[1]) 1159 1160 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1161 # Amino acids -> protein (even if there is no residue type given) 1162 # We want to use a protein alphabet in this case, rather than a 1163 # generic one. Not sure if this is the best way to achieve this, 1164 # but it works because the scanner checks for this: 1165 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1166 else: 1167 consumer.residue_type(line[44:63].strip()) 1168 1169 consumer.data_file_division(line[64:67]) 1170 if line[68:79].strip(): 1171 consumer.date(line[68:79]) 1172 elif line[GENBANK_INDENT:].strip().count(" ") == 0: 1173 # Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1174 # 1175 # e.g. 1176 # 1177 # "LOCUS U00096" 1178 # 1179 # rather than: 1180 # 1181 # "LOCUS U00096 4639675 bp DNA circular BCT" 1182 # 1183 # Positions Contents 1184 # --------- -------- 1185 # 00:06 LOCUS 1186 # 06:12 spaces 1187 # 12:?? Locus name 1188 if line[GENBANK_INDENT:].strip() != "": 1189 consumer.locus(line[GENBANK_INDENT:].strip()) 1190 else: 1191 # Must just have just "LOCUS ", is this even legitimate? 1192 # We should be able to continue parsing... we need real world testcases! 1193 warnings.warn("Minimal LOCUS line found - is this " 1194 "correct?\n:%r" % line, BiopythonParserWarning) 1195 elif len(line.split()) == 8 and line.split()[3] in ("aa", "bp") and \ 1196 line.split()[5] in ('linear', 'circular'): 1197 # Cope with invalidly spaced GenBank LOCUS lines like 1198 # LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001 1199 splitline = line.split() 1200 consumer.locus(splitline[1]) 1201 consumer.size(splitline[2]) 1202 consumer.residue_type(splitline[4]) 1203 consumer.data_file_division(splitline[6]) 1204 consumer.date(splitline[7]) 1205 warnings.warn("Attempting to parse malformed locus line:\n%r\n" 1206 "Found locus %r size %r residue_type %r\n" 1207 "Some fields may be wrong." 
% (line, splitline[1], 1208 splitline[2], splitline[4]), BiopythonParserWarning) 1209 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: 1210 # Cope with EnsEMBL genbank files which use space separation rather 1211 # than the expected column based layout. e.g. 1212 # LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1213 # LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1214 # LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1215 # LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1216 # Notice that the 'bp' can occur in the position expected by either 1217 # the old or the new fixed column standards (parsed above). 1218 splitline = line.split() 1219 consumer.locus(splitline[1]) 1220 consumer.size(splitline[2]) 1221 consumer.residue_type(splitline[4]) 1222 consumer.data_file_division(splitline[5]) 1223 consumer.date(splitline[6]) 1224 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: 1225 # Cope with EMBOSS seqret output where it seems the locus id can cause 1226 # the other fields to overflow. We just IGNORE the other fields! 1227 warnings.warn("Malformed LOCUS line found - is this " 1228 "correct?\n:%r" % line, BiopythonParserWarning) 1229 consumer.locus(line.split()[1]) 1230 consumer.size(line.split()[2]) 1231 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: 1232 # Cope with pseudo-GenBank files like this: 1233 # "LOCUS RNA5 complete 1718 bp" 1234 # Treat everything between LOCUS and the size as the identifier. 1235 warnings.warn("Malformed LOCUS line found - is this " 1236 "correct?\n:%r" % line, BiopythonParserWarning) 1237 consumer.locus(line[5:].rsplit(None, 2)[0].strip()) 1238 consumer.size(line.split()[-2]) 1239 else: 1240 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1241
1242 - def _feed_header_lines(self, consumer, lines):
1243 # Following dictionary maps GenBank lines to the associated 1244 # consumer methods - the special cases like LOCUS where one 1245 # genbank line triggers several consumer calls have to be 1246 # handled individually. 1247 GENBANK_INDENT = self.HEADER_WIDTH 1248 GENBANK_SPACER = " " * GENBANK_INDENT 1249 consumer_dict = { 1250 'DEFINITION': 'definition', 1251 'ACCESSION': 'accession', 1252 'NID': 'nid', 1253 'PID': 'pid', 1254 'DBSOURCE': 'db_source', 1255 'KEYWORDS': 'keywords', 1256 'SEGMENT': 'segment', 1257 'SOURCE': 'source', 1258 'AUTHORS': 'authors', 1259 'CONSRTM': 'consrtm', 1260 'PROJECT': 'project', 1261 'DBLINK': 'dblink', 1262 'TITLE': 'title', 1263 'JOURNAL': 'journal', 1264 'MEDLINE': 'medline_id', 1265 'PUBMED': 'pubmed_id', 1266 'REMARK': 'remark'} 1267 # We have to handle the following specially: 1268 # ORIGIN (locus, size, residue_type, data_file_division and date) 1269 # COMMENT (comment) 1270 # VERSION (version and gi) 1271 # REFERENCE (eference_num and reference_bases) 1272 # ORGANISM (organism and taxonomy) 1273 lines = [_f for _f in lines if _f] 1274 lines.append("") # helps avoid getting StopIteration all the time 1275 line_iter = iter(lines) 1276 try: 1277 line = next(line_iter) 1278 while True: 1279 if not line: 1280 break 1281 line_type = line[:GENBANK_INDENT].strip() 1282 data = line[GENBANK_INDENT:].strip() 1283 1284 if line_type == 'VERSION': 1285 # Need to call consumer.version(), and maybe also consumer.gi() as well. 1286 # e.g. 1287 # VERSION AC007323.5 GI:6587720 1288 while ' ' in data: 1289 data = data.replace(' ', ' ') 1290 if ' GI:' not in data: 1291 consumer.version(data) 1292 else: 1293 if self.debug: 1294 print("Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]") 1295 consumer.version(data.split(' GI:')[0]) 1296 consumer.gi(data.split(' GI:')[1]) 1297 # Read in the next line! 
1298 line = next(line_iter) 1299 elif line_type == 'REFERENCE': 1300 if self.debug > 1: 1301 print("Found reference [" + data + "]") 1302 # Need to call consumer.reference_num() and consumer.reference_bases() 1303 # e.g. 1304 # REFERENCE 1 (bases 1 to 86436) 1305 # 1306 # Note that this can be multiline, see Bug 1968, e.g. 1307 # 1308 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1309 # 28259) 1310 # 1311 # For such cases we will call the consumer once only. 1312 data = data.strip() 1313 1314 # Read in the next line, and see if its more of the reference: 1315 while True: 1316 line = next(line_iter) 1317 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1318 # Add this continuation to the data string 1319 data += " " + line[GENBANK_INDENT:] 1320 if self.debug > 1: 1321 print("Extended reference text [" + data + "]") 1322 else: 1323 # End of the reference, leave this text in the variable "line" 1324 break 1325 1326 # We now have all the reference line(s) stored in a string, data, 1327 # which we pass to the consumer 1328 while ' ' in data: 1329 data = data.replace(' ', ' ') 1330 if ' ' not in data: 1331 if self.debug > 2: 1332 print('Reference number \"' + data + '\"') 1333 consumer.reference_num(data) 1334 else: 1335 if self.debug > 2: 1336 print('Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"') 1337 consumer.reference_num(data[:data.find(' ')]) 1338 consumer.reference_bases(data[data.find(' ') + 1:]) 1339 elif line_type == 'ORGANISM': 1340 # Typically the first line is the organism, and subsequent lines 1341 # are the taxonomy lineage. However, given longer and longer 1342 # species names (as more and more strains and sub strains get 1343 # sequenced) the oragnism name can now get wrapped onto multiple 1344 # lines. The NCBI say we have to recognise the lineage line by 1345 # the presence of semi-colon delimited entries. In the long term, 1346 # they are considering adding a new keyword (e.g. 
LINEAGE). 1347 # See Bug 2591 for details. 1348 organism_data = data 1349 lineage_data = "" 1350 while True: 1351 line = next(line_iter) 1352 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1353 if lineage_data or ";" in line: 1354 lineage_data += " " + line[GENBANK_INDENT:] 1355 elif line[GENBANK_INDENT:].strip() == ".": 1356 # No lineage data, just . place holder 1357 pass 1358 else: 1359 organism_data += " " + line[GENBANK_INDENT:].strip() 1360 else: 1361 # End of organism and taxonomy 1362 break 1363 consumer.organism(organism_data) 1364 if lineage_data.strip() == "" and self.debug > 1: 1365 print("Taxonomy line(s) missing or blank") 1366 consumer.taxonomy(lineage_data.strip()) 1367 del organism_data, lineage_data 1368 elif line_type == 'COMMENT': 1369 if self.debug > 1: 1370 print("Found comment") 1371 # This can be multiline, and should call consumer.comment() once 1372 # with a list where each entry is a line. 1373 comment_list = [] 1374 comment_list.append(data) 1375 while True: 1376 line = next(line_iter) 1377 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1378 data = line[GENBANK_INDENT:] 1379 comment_list.append(data) 1380 if self.debug > 2: 1381 print("Comment continuation [" + data + "]") 1382 else: 1383 # End of the comment 1384 break 1385 consumer.comment(comment_list) 1386 del comment_list 1387 elif line_type in consumer_dict: 1388 # Its a semi-automatic entry! 1389 # Now, this may be a multi line entry... 1390 while True: 1391 line = next(line_iter) 1392 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1393 data += ' ' + line[GENBANK_INDENT:] 1394 else: 1395 # We now have all the data for this entry: 1396 getattr(consumer, consumer_dict[line_type])(data) 1397 # End of continuation - return to top of loop! 1398 break 1399 else: 1400 if self.debug: 1401 print("Ignoring GenBank header line:\n" % line) 1402 # Read in next line 1403 line = next(line_iter) 1404 except StopIteration: 1405 raise ValueError("Problem in header")
1406
1407 - def _feed_misc_lines(self, consumer, lines):
1408 # Deals with a few misc lines between the features and the sequence 1409 GENBANK_INDENT = self.HEADER_WIDTH 1410 GENBANK_SPACER = " " * GENBANK_INDENT 1411 lines.append("") 1412 line_iter = iter(lines) 1413 try: 1414 for line in line_iter: 1415 if line.startswith('BASE COUNT'): 1416 line = line[10:].strip() 1417 if line: 1418 if self.debug: 1419 print("base_count = " + line) 1420 consumer.base_count(line) 1421 if line.startswith('ORIGIN'): 1422 line = line[6:].strip() 1423 if line: 1424 if self.debug: 1425 print("origin_name = " + line) 1426 consumer.origin_name(line) 1427 if line.startswith('WGS '): 1428 line = line[3:].strip() 1429 consumer.wgs(line) 1430 if line.startswith('WGS_SCAFLD'): 1431 line = line[10:].strip() 1432 consumer.add_wgs_scafld(line) 1433 if line.startswith('CONTIG'): 1434 line = line[6:].strip() 1435 contig_location = line 1436 while True: 1437 line = next(line_iter) 1438 if not line: 1439 break 1440 elif line[:GENBANK_INDENT] == GENBANK_SPACER: 1441 # Don't need to preseve the whitespace here. 1442 contig_location += line[GENBANK_INDENT:].rstrip() 1443 elif line.startswith('ORIGIN'): 1444 # Strange, seen this in GenPept files via Entrez gbwithparts 1445 line = line[6:].strip() 1446 if line: 1447 consumer.origin_name(line) 1448 break 1449 else: 1450 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1451 consumer.contig_location(contig_location) 1452 return 1453 except StopIteration: 1454 raise ValueError("Problem in misc lines before sequence")
1455 1456 if __name__ == "__main__": 1457 from Bio._py3k import StringIO 1458 1459 gbk_example = \ 1460 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1461 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1462 (AXL2) and Rev7p (REV7) genes, complete cds. 1463 ACCESSION U49845 1464 VERSION U49845.1 GI:1293613 1465 KEYWORDS . 1466 SOURCE Saccharomyces cerevisiae (baker's yeast) 1467 ORGANISM Saccharomyces cerevisiae 1468 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1469 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1470 REFERENCE 1 (bases 1 to 5028) 1471 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1472 TITLE Cloning and sequence of REV7, a gene whose function is required for 1473 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1474 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1475 PUBMED 7871890 1476 REFERENCE 2 (bases 1 to 5028) 1477 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1478 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1479 plasma membrane glycoprotein 1480 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1481 PUBMED 8846915 1482 REFERENCE 3 (bases 1 to 5028) 1483 AUTHORS Roemer,T. 1484 TITLE Direct Submission 1485 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1486 Haven, CT, USA 1487 FEATURES Location/Qualifiers 1488 source 1..5028 1489 /organism="Saccharomyces cerevisiae" 1490 /db_xref="taxon:4932" 1491 /chromosome="IX" 1492 /map="9" 1493 CDS <1..206 1494 /codon_start=3 1495 /product="TCP1-beta" 1496 /protein_id="AAA98665.1" 1497 /db_xref="GI:1293614" 1498 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1499 AEVLLRVDNIIRARPRTANRQHM" 1500 gene 687..3158 1501 /gene="AXL2" 1502 CDS 687..3158 1503 /gene="AXL2" 1504 /note="plasma membrane glycoprotein" 1505 /codon_start=1 1506 /function="required for axial budding pattern of S. 
1507 cerevisiae" 1508 /product="Axl2p" 1509 /protein_id="AAA98666.1" 1510 /db_xref="GI:1293615" 1511 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1512 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1513 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1514 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1515 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1516 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1517 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1518 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1519 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1520 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1521 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1522 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1523 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1524 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1525 VDFSNKSNVNVGQVKDIHGRIPEML" 1526 gene complement(3300..4037) 1527 /gene="REV7" 1528 CDS complement(3300..4037) 1529 /gene="REV7" 1530 /codon_start=1 1531 /product="Rev7p" 1532 /protein_id="AAA98667.1" 1533 /db_xref="GI:1293616" 1534 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1535 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1536 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1537 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1538 LISGDDKILNGVYSQYEEGESIFGSLF" 1539 ORIGIN 1540 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1541 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1542 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1543 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1544 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1545 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1546 361 attttggcaa 
cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1547 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1548 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1549 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1550 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1551 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1552 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1553 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1554 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1555 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1556 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1557 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1558 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1559 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1560 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1561 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1562 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1563 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1564 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1565 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1566 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1567 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1568 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1569 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1570 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1571 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1572 1921 agaatttcga caagctttca ttaggtttga aagcgaacca 
aggttcacaa tctcaagagc 1573 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1574 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1575 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1576 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1577 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1578 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1579 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1580 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1581 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1582 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1583 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1584 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1585 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1586 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1587 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1588 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1589 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1590 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1591 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1592 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1593 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1594 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1595 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1596 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1597 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1598 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 
1599 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1600 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1601 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1602 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1603 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1604 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1605 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1606 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1607 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1608 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1609 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1610 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1611 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1612 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1613 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1614 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1615 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1616 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1617 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1618 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1619 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1620 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1621 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1622 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1623 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1624 //""" 1625 1626 # GenBank format protein (aka GenPept) file from: 1627 # 
http://www.molecularevolution.org/resources/fileformats/ 1628 gbk_example2 = \ 1629 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1630 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1631 ACCESSION AAD51968 1632 VERSION AAD51968.1 GI:5805369 1633 DBSOURCE locus AF171097 accession AF171097.1 1634 KEYWORDS . 1635 SOURCE Yersinia enterocolitica 1636 ORGANISM Yersinia enterocolitica 1637 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1638 Enterobacteriaceae; Yersinia. 1639 REFERENCE 1 (residues 1 to 143) 1640 AUTHORS Revell,P.A. and Miller,V.L. 1641 TITLE A chromosomally encoded regulator is required for expression of the 1642 Yersinia enterocolitica inv gene and for virulence 1643 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1644 MEDLINE 20138369 1645 PUBMED 10672189 1646 REFERENCE 2 (residues 1 to 143) 1647 AUTHORS Revell,P.A. and Miller,V.L. 1648 TITLE Direct Submission 1649 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1650 University School of Medicine, Campus Box 8230, 660 South Euclid, 1651 St. Louis, MO 63110, USA 1652 COMMENT Method: conceptual translation. 1653 FEATURES Location/Qualifiers 1654 source 1..143 1655 /organism="Yersinia enterocolitica" 1656 /mol_type="unassigned DNA" 1657 /strain="JB580v" 1658 /serotype="O:8" 1659 /db_xref="taxon:630" 1660 Protein 1..143 1661 /product="transcriptional regulator RovA" 1662 /name="regulates inv expression" 1663 CDS 1..143 1664 /gene="rovA" 1665 /coded_by="AF171097.1:380..811" 1666 /note="regulator of virulence" 1667 /transl_table=11 1668 ORIGIN 1669 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1670 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1671 121 deiellsgli dklerniiql qsk 1672 // 1673 """ 1674 1675 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1676 XX 1677 AC X56734; S46826; 1678 XX 1679 DT 12-SEP-1991 (Rel. 29, Created) 1680 DT 25-NOV-2005 (Rel. 
85, Last updated, Version 11) 1681 XX 1682 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1683 XX 1684 KW beta-glucosidase. 1685 XX 1686 OS Trifolium repens (white clover) 1687 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1688 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1689 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1690 XX 1691 RN [5] 1692 RP 1-1859 1693 RX PUBMED; 1907511. 1694 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1695 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1696 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1697 RL Plant Mol. Biol. 17(2):209-219(1991). 1698 XX 1699 RN [6] 1700 RP 1-1859 1701 RA Hughes M.A.; 1702 RT ; 1703 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1704 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1705 RL Upon Tyne, NE2 4HH, UK 1706 XX 1707 FH Key Location/Qualifiers 1708 FH 1709 FT source 1..1859 1710 FT /organism="Trifolium repens" 1711 FT /mol_type="mRNA" 1712 FT /clone_lib="lambda gt10" 1713 FT /clone="TRE361" 1714 FT /tissue_type="leaves" 1715 FT /db_xref="taxon:3899" 1716 FT CDS 14..1495 1717 FT /product="beta-glucosidase" 1718 FT /EC_number="3.2.1.21" 1719 FT /note="non-cyanogenic" 1720 FT /db_xref="GOA:P26204" 1721 FT /db_xref="InterPro:IPR001360" 1722 FT /db_xref="InterPro:IPR013781" 1723 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1724 FT /protein_id="CAA40058.1" 1725 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1726 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1727 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1728 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1729 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1730 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1731 FT 
IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1732 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1733 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1734 FT mRNA 1..1859 1735 FT /experiment="experimental evidence, no additional details 1736 FT recorded" 1737 XX 1738 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1739 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1740 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1741 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1742 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1743 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1744 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1745 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1746 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1747 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1748 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1749 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1750 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1751 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1752 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1753 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1754 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1755 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1756 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1757 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1758 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1759 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1760 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 
1320 1761 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1762 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1763 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1764 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1765 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1766 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1767 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1768 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1769 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1770 // 1771 """ 1772 1773 print("GenBank CDS Iteration") 1774 print("=====================") 1775 1776 g = GenBankScanner() 1777 for record in g.parse_cds_features(StringIO(gbk_example)): 1778 print(record) 1779 1780 g = GenBankScanner() 1781 for record in g.parse_cds_features(StringIO(gbk_example2), 1782 tags2id=('gene', 'locus_tag', 'product')): 1783 print(record) 1784 1785 g = GenBankScanner() 1786 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1787 tags2id=('gene', 'locus_tag', 'product')): 1788 print(record) 1789 1790 print("") 1791 print("GenBank Iteration") 1792 print("=================") 1793 g = GenBankScanner() 1794 for record in g.parse_records(StringIO(gbk_example), do_features=False): 1795 print("%s %s %s" % (record.id, record.name, record.description)) 1796 print(record.seq) 1797 1798 g = GenBankScanner() 1799 for record in g.parse_records(StringIO(gbk_example), do_features=True): 1800 print("%s %s %s" % (record.id, record.name, record.description)) 1801 print(record.seq) 1802 1803 g = GenBankScanner() 1804 for record in g.parse_records(StringIO(gbk_example2), do_features=False): 1805 print("%s %s %s" % (record.id, record.name, record.description)) 1806 print(record.seq) 1807 1808 g = GenBankScanner() 1809 for record in 
g.parse_records(StringIO(gbk_example2), do_features=True): 1810 print("%s %s %s" % (record.id, record.name, record.description)) 1811 print(record.seq) 1812 1813 print("") 1814 print("EMBL CDS Iteration") 1815 print("==================") 1816 1817 e = EmblScanner() 1818 for record in e.parse_cds_features(StringIO(embl_example)): 1819 print(record) 1820 1821 print("") 1822 print("EMBL Iteration") 1823 print("==============") 1824 e = EmblScanner() 1825 for record in e.parse_records(StringIO(embl_example), do_features=True): 1826 print("%s %s %s" % (record.id, record.name, record.description)) 1827 print(record.seq) 1828