Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
    9  (for use with an event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  from __future__ import print_function 
  30   
  31  import warnings 
  32  import re 
  33  from Bio.Seq import Seq 
  34  from Bio.SeqRecord import SeqRecord 
  35  from Bio.Alphabet import generic_protein 
  36  from Bio import BiopythonParserWarning 
  37   
  38   
class InsdcScanner(object):
    """Basic functions for breaking up a GenBank/EMBL file into sub sections.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    is between the DDBJ, EMBL, and GenBank.  These organisations all use
    the same "Feature Table" layout in their plain text flat file formats.

    However, the header and sequence sections of an EMBL file are very
    different in layout to those produced by GenBank/DDBJ.
    """

    # These placeholder constants get redefined with sensible values in the
    # sub classes.  __init__ asserts that RECORD_START is exactly
    # HEADER_WIDTH characters long and that FEATURE_QUALIFIER_SPACER is
    # exactly FEATURE_QUALIFIER_INDENT characters long.
    RECORD_START = "XXX"  # the space padded "LOCUS" or "ID" line prefix
    HEADER_WIDTH = 3  # 12 or 5
    FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
    FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
    FEATURE_QUALIFIER_INDENT = 0
    FEATURE_QUALIFIER_SPACER = ""
    SEQUENCE_HEADERS = ["XXX"]  # with right hand side spaces removed
def __init__(self, debug=0):
    """Check the class level layout constants and reset the scanner state.

    Arguments:
     - debug - verbosity level; the scanning methods print progress
       messages when this is non-zero.

    The assertions guard against a subclass defining inconsistent
    constants (e.g. a RECORD_START which is not HEADER_WIDTH wide).
    """
    assert len(self.RECORD_START) == self.HEADER_WIDTH
    for marker in self.SEQUENCE_HEADERS:
        # The markers are compared against right stripped text.
        assert marker == marker.rstrip()
    assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT
    self.debug = debug
    self.handle = None  # no input yet, see set_handle()
    self.line = None
65
def set_handle(self, handle):
    """Attach the handle to read from and clear the cached current line."""
    self.handle = handle
    self.line = ""
69
def find_start(self):
    """Read in lines until the ID/LOCUS line is found, which is returned.

    Any preamble (such as the header used by the NCBI on *.seq.gz archives)
    will be ignored, as will blank lines and any "//" line.

    The line found is also stored as self.line.  Returns None if the end
    of the file is reached without finding the start of a record.
    """
    while True:
        if self.line:
            # Use up the line cached by an earlier step before reading on.
            line, self.line = self.line, ""
        else:
            line = self.handle.readline()
        if not line:
            if self.debug:
                print("End of file")
            return None
        if line[:self.HEADER_WIDTH] == self.RECORD_START:
            if self.debug > 1:
                print("Found the start of a record:\n" + line)
            break
        # Anything else is skipped; the branches only differ in what is
        # reported when debugging.
        if self.debug > 1:
            stripped = line.rstrip()
            if stripped == "//":
                print("Skipping // marking end of last record")
            elif stripped == "":
                print("Skipping blank line before record")
            else:
                #Ignore any header before the first ID/LOCUS line.
                print("Skipping header line before record:\n" + stripped)
    self.line = line
    return line
102
def parse_header(self):
    """Return list of strings making up the header.

    New line characters are removed.

    Assumes you have just read in the ID/LOCUS line (held in self.line).
    Reading stops at the first feature table marker or sequence header
    line; that line is stored (right stripped) as self.line and is not
    included in the returned list.

    Raises ValueError if the file ends, or a "//" line is found, before
    the feature table or sequence is reached.
    """
    assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
        "Not at start of record"

    header_lines = []
    while True:
        line = self.handle.readline()
        if not line:
            # Ran out of file while still inside the header.
            raise ValueError("Premature end of file during header")
        line = line.rstrip()
        if line in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Found feature table")
            break
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        if line == "//":
            raise ValueError("Premature end of sequence data marker '//' found")
        header_lines.append(line)
    self.line = line
    return header_lines
135
def parse_features(self, skip=False):
    """Return list of tuples for the features (if present).

    Each feature is returned as a tuple (key, location, qualifiers)
    where key and location are strings (e.g. "CDS" and
    "complement(join(490883..490885,1..879))") while qualifiers
    is a list of two string tuples (feature qualifier keys and values).

    Assumes you have already read to the start of the features table,
    i.e. self.line holds one of the FEATURE_START_MARKERS.  If it does
    not, an empty list is returned and nothing is read.

    If skip is true the feature lines are read but not parsed, and the
    returned list is empty.

    On return self.line holds the line which ended the table.  Raises
    ValueError if the file ends, or a "//" line is found, inside the
    table.
    """
    if self.line.rstrip() not in self.FEATURE_START_MARKERS:
        if self.debug:
            print("Didn't find any feature table")
        return []

    while self.line.rstrip() in self.FEATURE_START_MARKERS:
        self.line = self.handle.readline()

    indent = self.FEATURE_QUALIFIER_INDENT
    spacer = self.FEATURE_QUALIFIER_SPACER
    features = []
    line = self.line
    while True:
        if not line:
            raise ValueError("Premature end of line during features table")
        if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
            if self.debug:
                print("Found start of sequence")
            break
        line = line.rstrip()
        if line == "//":
            raise ValueError("Premature end of features table, marker '//' found")
        if line in self.FEATURE_END_MARKERS:
            if self.debug:
                print("Found end of features")
            line = self.handle.readline()
            break
        if line[2:indent].strip() == "":
            #This is an empty feature line between qualifiers. Empty
            #feature lines within qualifiers are handled below (ignored).
            line = self.handle.readline()
            continue
        if len(line) <= indent:
            # Nothing at all in the location column.  Using "<=" (not "<")
            # matters: line[indent] is indexed below, which would fail for
            # a line exactly ``indent`` characters long.
            warnings.warn("line too short to contain a feature: %r" % line,
                          BiopythonParserWarning)
            line = self.handle.readline()
            continue

        if skip:
            line = self.handle.readline()
            while line[:indent] == spacer:
                line = self.handle.readline()
        else:
            #Build up a list of the lines making up this feature:
            if line[indent] != " " and " " in line[indent:]:
                #The feature table design enforces a length limit on the feature keys.
                #Some third party files (e.g. IGMT's EMBL like files) solve this by
                #over indenting the location and qualifiers.
                feature_key, line = line[2:].strip().split(None, 1)
                feature_lines = [line]
                warnings.warn("Overindented %s feature?" % feature_key,
                              BiopythonParserWarning)
            else:
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
            line = self.handle.readline()
            # Cope with blank lines in the midst of a feature.  The explicit
            # line != "" test is essential: readline() returns "" at the end
            # of the file, and treating that as a blank line loops forever.
            while line[:indent] == spacer \
                    or (line != "" and line.rstrip() == ""):
                #Use strip to remove any harmless trailing white space AND and leading
                #white space (e.g. out of spec files with too much indentation)
                feature_lines.append(line[indent:].strip())
                line = self.handle.readline()
            features.append(self.parse_feature(feature_key, feature_lines))
    self.line = line
    return features
210
def parse_feature(self, feature_key, lines):
    """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)

    For example given this GenBank feature:

         CDS             complement(join(490883..490885,1..879))
                         /locus_tag="NEQ001"
                         /note="conserved hypothetical [Methanococcus jannaschii];
                         COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                         localization signal; IPR002743: Protein of unknown
                         function DUF57"
                         /codon_start=1
                         /transl_table=11
                         /product="hypothetical protein"
                         /protein_id="NP_963295.1"
                         /db_xref="GI:41614797"
                         /db_xref="GeneID:2732620"
                         /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                         EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                         KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                         IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                         EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                         LNSMGFGFVNTKKNSAR"

    Then should give input key="CDS" and the rest of the data as a list of strings
    lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
    where the leading spaces and trailing newlines have been removed.

    Returns tuple containing: (key as string, location string, qualifiers as list)
    as follows for this example:

    key = "CDS", string
    location = "complement(join(490883..490885,1..879))", string
    qualifiers = list of string tuples:

    [('locus_tag', '"NEQ001"'),
     ('note', '"conserved hypothetical [Methanococcus jannaschii];\\nCOG1583:..."'),
     ('codon_start', '1'),
     ('transl_table', '11'),
     ('product', '"hypothetical protein"'),
     ('protein_id', '"NP_963295.1"'),
     ('db_xref', '"GI:41614797"'),
     ('db_xref', '"GeneID:2732620"'),
     ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\\nEKYFNFT..."')]

    In the above example, the "note" and "translation" were edited for compactness,
    and they would contain multiple new line characters (displayed above as \\n)

    If a qualifier is quoted (in this case, everything except codon_start and
    transl_table) then the quotes are NOT removed.  A qualifier with no
    "=" (e.g. /pseudo) gets the value None.

    Note that no whitespace is removed from the qualifier values.

    Raises ValueError if the lines cannot be interpreted (no location, an
    unterminated quoted value, or a continuation line with no qualifier
    value to continue).
    """
    #Skip any blank lines
    iterator = (x for x in lines if x)
    try:
        line = next(iterator)

        feature_location = line.strip()
        while feature_location[-1:] == ",":
            #Multiline location, still more to come!
            line = next(iterator)
            feature_location += line.strip()

        qualifiers = []

        for line_number, line in enumerate(iterator):
            # check for extra wrapping of the location closing parentheses
            if line_number == 0 and line.startswith(")"):
                feature_location += line.strip()
            elif line[0] == "/":
                #New qualifier
                i = line.find("=")
                if i == -1:
                    #Qualifier with no value, e.g. /pseudo
                    qualifiers.append((line[1:], None))
                    continue
                key = line[1:i]
                value = line[i + 1:]
                if value == '"':
                    #One single quote
                    if self.debug:
                        print("Single quote %s:%s" % (key, value))
                elif value.startswith('"'):
                    #Quoted, so gather lines up to the closing quote
                    value_list = [value]
                    while value_list[-1][-1] != '"':
                        value_list.append(next(iterator))
                    value = '\n'.join(value_list)
                #DO NOT remove the quotes.  An empty value (ApE can
                #output /note=) is stored as the empty string.
                qualifiers.append((key, value))
            else:
                #Unquoted continuation - only valid if there is a previous
                #qualifier which has a value.  StopIteration is used as the
                #internal "give up" signal and reported as ValueError below.
                if not qualifiers or qualifiers[-1][1] is None:
                    raise StopIteration
                key, value = qualifiers[-1]
                qualifiers[-1] = (key, value + "\n" + line)
        return (feature_key, feature_location, qualifiers)
    except StopIteration:
        #Bummer
        raise ValueError("Problem with '%s' feature:\n%s"
                         % (feature_key, "\n".join(lines)))
324 347
def _feed_first_line(self, consumer, line):
    """Handle the LOCUS/ID line, passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass;
    this base class version does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
356
def _feed_header_lines(self, consumer, lines):
    """Handle the header lines (list of strings), passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass;
    this base class version does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
365
def _feed_feature_table(self, consumer, feature_tuples):
    """Handle the feature table (list of tuples), passing data to the consumer.

    Each tuple is (key, location string, qualifiers) where qualifiers is
    a list of (key, value) pairs.  New lines inside a qualifier value are
    replaced with spaces; a value of None is passed on unchanged.

    Used by the parse_records() and parse() methods.
    """
    consumer.start_feature_table()
    for feature_key, location_string, qualifiers in feature_tuples:
        consumer.feature_key(feature_key)
        consumer.location(location_string)
        for q_key, q_value in qualifiers:
            if q_value is not None:
                q_value = q_value.replace("\n", " ")
            consumer.feature_qualifier(q_key, q_value)
380
def _feed_misc_lines(self, consumer, lines):
    """Handle any lines between features and sequence (list of strings), passing data to the consumer.

    This should be implemented by the EMBL / GenBank specific subclass;
    this base class version does nothing.

    Used by the parse_records() and parse() methods.
    """
    pass
389
def feed(self, handle, consumer, do_features=True):
    """Feed a set of data into the consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    true  - Passed a record
    false - Did not find a record (consumer.data is then set to None)
    """
    #Should work with both EMBL and GenBank files provided the
    #equivalent Bio.GenBank._FeatureConsumer methods are called...
    self.set_handle(handle)
    if not self.find_start():
        #Could not find (another) record
        consumer.data = None
        return False

    #The methods of this class break the file into a simplified format.
    #The first line, header lines and any misc lines after the features
    #are dealt with by the GenBank / EMBL specific derived classes.

    #First line and header:
    self._feed_first_line(consumer, self.line)
    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    #Features (common to both EMBL and GenBank):
    if do_features:
        feature_tuples = self.parse_features(skip=False)
        self._feed_feature_table(consumer, feature_tuples)
    else:
        self.parse_features(skip=True)  # ignore the data

    #Footer and sequence
    misc_lines, sequence_string = self.parse_footer()
    self._feed_misc_lines(consumer, misc_lines)

    consumer.sequence(sequence_string)
    #Calls to consumer.base_number() do nothing anyway
    consumer.record_end("//")

    assert self.line == "//"

    #And we are done
    return True
439
def parse(self, handle, do_features=True):
    """Returns a SeqRecord (with SeqFeatures if do_features=True)

    Returns None if no record could be found in the handle.

    See also the method parse_records() for use on multi-record files.
    """
    # Imported here rather than at the top of the module, as in the
    # original code (presumably to avoid a circular import - confirm).
    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    consumer = _FeatureConsumer(use_fuzziness=1,
                                feature_cleaner=FeatureValueCleaner())

    if not self.feed(handle, consumer, do_features):
        return None
    return consumer.data
455
def parse_records(self, handle, do_features=True):
    """Returns a SeqRecord object iterator

    Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

    The SeqRecord objects include SeqFeatures if do_features=True

    Raises ValueError if a record is left without an id, or with the
    placeholder name or description.

    This method is intended for use in Bio.SeqIO
    """
    #This is a generator function
    while True:
        record = self.parse(handle, do_features)
        if record is None:
            # No further record found
            return
        if record.id is None:
            raise ValueError("Failed to parse the record's ID. Invalid ID line?")
        if record.name == "<unknown name>":
            raise ValueError("Failed to parse the record's name. Invalid ID line?")
        if record.description == "<unknown description>":
            raise ValueError("Failed to parse the record's description")
        yield record
477
def parse_cds_features(self, handle,
                       alphabet=generic_protein,
                       tags2id=('protein_id', 'locus_tag', 'product')):
    """Returns SeqRecord object iterator

    Each CDS feature becomes a SeqRecord.

    alphabet - Used for any sequence found in a translation field.
    tags2id  - Tuple of three strings, the feature keys to use
               for the record id, name and description,

    This method is intended for use in Bio.SeqIO
    """
    self.set_handle(handle)
    while self.find_start():
        #Got an EMBL or GenBank record...
        self.parse_header()  # ignore header lines!
        feature_tuples = self.parse_features()
        #Ignore the footer and sequence, reading up to the "//" line
        #(or the end of the file):
        while True:
            line = self.handle.readline()
            if not line:
                break
            if line[:2] == "//":
                break
        self.line = line.rstrip()

        #Now go though those features...
        for key, location_string, qualifiers in feature_tuples:
            if key != "CDS":
                continue
            #SeqRecord objects cannot be created with annotations, they
            #must be added afterwards. So create an empty record and
            #then populate it:
            record = SeqRecord(seq=None)
            annotations = record.annotations

            #Should we add a location object to the annotations?
            #I *think* that only makes sense for SeqFeatures with their
            #sub features...
            annotations['raw_location'] = location_string.replace(' ', '')

            for (qualifier_name, qualifier_data) in qualifiers:
                # startswith/endswith (rather than indexing) so that an
                # empty value such as /note= does not raise IndexError.
                if qualifier_data is not None \
                        and qualifier_data.startswith('"') \
                        and qualifier_data.endswith('"'):
                    #Remove quotes
                    qualifier_data = qualifier_data[1:-1]
                #Append the data to the annotation qualifier...
                if qualifier_name == "translation":
                    assert record.seq is None, "Multiple translations!"
                    record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
                elif qualifier_name == "db_xref":
                    #its a list, possibly empty.  Its safe to extend
                    record.dbxrefs.append(qualifier_data)
                else:
                    if qualifier_data is not None:
                        # Join wrapped lines, then squeeze double spaces.
                        qualifier_data = qualifier_data.replace("\n", " ").replace("  ", " ")
                    try:
                        annotations[qualifier_name] += " " + qualifier_data
                    except KeyError:
                        #Not an addition to existing data, its the first bit
                        annotations[qualifier_name] = qualifier_data

            #Fill in the ID, Name, Description
            #=================================
            for attribute, tag in zip(("id", "name", "description"), tags2id):
                if tag in annotations:
                    setattr(record, attribute, annotations[tag])

            yield record
558 559
class EmblScanner(InsdcScanner):
    """For extracting chunks of information in EMBL files"""

    # "ID" padded with spaces to HEADER_WIDTH characters.
    RECORD_START = "ID   "
    HEADER_WIDTH = 5
    # In the full marker the location column starts at offset
    # FEATURE_QUALIFIER_INDENT (21).
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers", "FH"]
    FEATURE_END_MARKERS = ["XX"]  # XX can also mark the end of many things!
    FEATURE_QUALIFIER_INDENT = 21
    FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
    SEQUENCE_HEADERS = ["SQ", "CO"]  # Remove trailing spaces
def _feed_first_line(self, consumer, line):
    """Handle the EMBL ID line, dispatching on its layout.

    The layout is recognised from the number of semi-colons after the
    line prefix: six for the style introduced in 2006, three for either
    the pre 2006 style or (if the line ends with " SQ") EMBL-bank patent
    data.  Raises ValueError for any other layout.
    """
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    semicolons = line[self.HEADER_WIDTH:].count(";")
    if semicolons == 6:
        #Looks like the semi colon separated style introduced in 2006
        self._feed_first_line_new(consumer, line)
    elif semicolons == 3:
        if line.rstrip().endswith(" SQ"):
            #EMBL-bank patent data
            self._feed_first_line_patents(consumer, line)
        else:
            #Looks like the pre 2006 style
            self._feed_first_line_old(consumer, line)
    else:
        raise ValueError('Did not recognise the ID line layout:\n' + line)
620
def _feed_first_line_patents(self, consumer, line):
    """Handle an EMBL-bank patent style ID line (one ending with " SQ").

    The first three semi-colon separated fields are passed to the
    consumer as the locus, residue type and data file division.
    """
    #Either Non-Redundant Level 1 database records,
    #ID <accession>; <molecule type>; <non-redundant level 1>; <cluster size L1>
    #e.g. ID   NRP_AX000635; PRT; NR1; 15 SQ
    #
    #Or, Non-Redundant Level 2 database records:
    #ID <L2-accession>; <molecule type>; <non-redundant level 2>; <cluster size L2>
    #e.g. ID   NRP0000016E; PRT; NR2; 5 SQ
    #
    #Drop the prefix and the trailing " SQ", then strip each field so the
    #space after each semi-colon is not passed on to the consumer.
    text = line[self.HEADER_WIDTH:].rstrip()[:-3]
    fields = [data.strip() for data in text.split(";")]
    assert len(fields) == 4
    consumer.locus(fields[0])
    consumer.residue_type(fields[1])
    consumer.data_file_division(fields[2])
    #TODO - Record cluster size?
def _feed_first_line_old(self, consumer, line):
    """Handle an ID line in the style used before 2006.

    The locus, residue type, division and sequence length are passed on
    to the consumer (the length via _feed_seq_length).
    """
    #Expects an ID line in the style before 2006, e.g.
    #ID   SC10H5 standard; DNA; PRO; 4870 BP.
    #ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    parts = line[self.HEADER_WIDTH:].split(None, 1)
    fields = [parts[0]]
    fields.extend(parts[1].split(";"))
    fields = [entry.strip() for entry in fields]
    # The tokens represent:
    #    0. Primary accession number
    #    (space sep)
    #    1. ??? (e.g. standard)
    #    (semi-colon)
    #    2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
    #    3. Taxonomic division (e.g. 'PRO')
    #    4. Sequence length (e.g. '4639675 BP.')
    consumer.locus(fields[0])  # Should we also call the accession consumer?
    consumer.residue_type(fields[2])
    consumer.data_file_division(fields[3])
    self._feed_seq_length(consumer, fields[4])
658
def _feed_first_line_new(self, consumer, line):
    """Handle an ID line in the semi-colon separated style introduced in 2006.

    The locus, accession, version suffix (if valid), residue type,
    division and sequence length are passed on to the consumer.
    """
    #Expects an ID line in the style introduced in 2006, e.g.
    #ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
    #ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.
    assert line[:self.HEADER_WIDTH].rstrip() == "ID"
    fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
    assert len(fields) == 7
    # The tokens represent:
    #    0. Primary accession number
    #    1. Sequence version number
    #    2. Topology: 'circular' or 'linear'
    #    3. Molecule type (e.g. 'genomic DNA')
    #    4. Data class (e.g. 'STD')
    #    5. Taxonomic division (e.g. 'PRO')
    #    6. Sequence length (e.g. '4639675 BP.')
    accession, version, topology, molecule, data_class, division, length = fields

    consumer.locus(accession)

    #Call the accession consumer now, to make sure we record
    #something as the record.id, in case there is no AC line
    consumer.accession(accession)

    #TODO - How to deal with the version field?  At the moment the consumer
    #will try and use this for the ID which isn't ideal for EMBL files.
    version_parts = version.split()
    if len(version_parts) == 2 \
            and version_parts[0] == "SV" \
            and version_parts[1].isdigit():
        consumer.version_suffix(version_parts[1])

    #Based on how the old GenBank parser worked, merge these two:
    consumer.residue_type(" ".join([topology, molecule]))  # TODO - Store as two fields?

    #TODO - What should we do with the data class?

    consumer.data_file_division(division)

    self._feed_seq_length(consumer, length)
699
def _feed_seq_length(self, consumer, text):
    """Pass the number from a length string like '4870 BP.' to consumer.size().

    The text must be exactly two white space separated words, the second
    being BP or AA (any case, optional trailing full stop).
    """
    length_parts = text.split()
    assert len(length_parts) == 2, "Invalid sequence length string %r" % text
    assert length_parts[1].upper() in ["BP", "BP.", "AA", "AA."]
    consumer.size(length_parts[0])
705
def _feed_header_lines(self, consumer, lines):
    """Handle the EMBL header lines (list of strings), passing data to the consumer.

    Each line is split into its line type code (the first HEADER_WIDTH
    characters) and its data.  Several line types are reformatted for the
    GenBank based consumer first; line types not handled here are
    ignored (and reported if debugging is enabled).
    """
    EMBL_INDENT = self.HEADER_WIDTH
    #Line types where the data is passed unchanged to the named consumer
    #method:
    consumer_dict = {
        'AC': 'accession',
        'SV': 'version',  # SV line removed in June 2006, now part of ID line
        'DE': 'definition',
        #'RN' : 'reference_num',
        #'RC' : reference comment... TODO
        #'RP' : 'reference_bases',
        #'RX' : reference cross reference... DOI or Pubmed
        'RG': 'consrtm',  # optional consortium
        #'RA' : 'authors',
        #'RT' : 'title',
        'RL': 'journal',
        'OS': 'organism',
        'OC': 'taxonomy',
        #'DR' : data reference
        'CC': 'comment',
        #'XX' : splitter
    }
    #We have to handle the following specially:
    #RX (depending on reference type...)
    for line in lines:
        line_type = line[:EMBL_INDENT].strip()
        data = line[EMBL_INDENT:].strip()
        if line_type == 'XX':
            pass
        elif line_type == 'RN':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '[1]' becomes '1'
            # (startswith/endswith so that an empty RN line is safe)
            if data.startswith("[") and data.endswith("]"):
                data = data[1:-1]
            consumer.reference_num(data)
        elif line_type == 'RP':
            # Reformat reference numbers for the GenBank based consumer
            # e.g. '1-4639675' becomes '(bases 1 to 4639675)'
            # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)'
            # Note could be multi-line, and end with a comma
            parts = [bases.replace("-", " to ").strip()
                     for bases in data.split(",") if bases.strip()]
            consumer.reference_bases("(bases %s)" % "; ".join(parts))
        elif line_type == 'RT':
            #Remove the enclosing quotes and trailing semi colon.
            #Note the title can be split over multiple lines.
            if data.startswith('"'):
                data = data[1:]
            if data.endswith('";'):
                data = data[:-2]
            consumer.title(data)
        elif line_type == 'RX':
            # EMBL support three reference types at the moment:
            # - PUBMED    PUBMED bibliographic database (NLM)
            # - DOI       Digital Object Identifier (International DOI Foundation)
            # - AGRICOLA  US National Agriculture Library (NAL) of the US Department
            #             of Agriculture (USDA)
            #
            # Format:
            # RX  resource_identifier; identifier.
            #
            # e.g.
            # RX   DOI; 10.1016/0024-3205(83)90010-3.
            # RX   PUBMED; 264242.
            #
            # Currently our reference object only supports PUBMED and MEDLINE
            # (as these were in GenBank files?).
            key, value = data.split(";", 1)
            if value.endswith("."):
                value = value[:-1]
            value = value.strip()
            if key == "PUBMED":
                consumer.pubmed_id(value)
            #TODO - Handle other reference types (here and in BioSQL bindings)
        elif line_type == 'CC':
            # Have to pass a list of strings for this one (not just a string)
            consumer.comment([data])
        elif line_type == 'DR':
            # Database Cross-reference, format:
            # DR   database_identifier; primary_identifier; secondary_identifier.
            #
            # e.g.
            # DR   MGI; 98599; Tcrb-V4.
            #
            # TODO - How should we store any secondary identifier?
            parts = data.rstrip(".").split(";")
            #Turn it into "database_identifier:primary_identifier" to
            #mimic the GenBank parser. e.g. "MGI:98599"
            consumer.dblink("%s:%s" % (parts[0].strip(),
                                       parts[1].strip()))
        elif line_type == 'RA':
            # Remove trailing ; at end of authors list
            consumer.authors(data.rstrip(";"))
        elif line_type == 'PR':
            # Remove trailing ; at end of the project reference
            # In GenBank files this corresponds to the old PROJECT
            # line which is being replaced with the DBLINK line.
            consumer.project(data.rstrip(";"))
        elif line_type == 'KW':
            consumer.keywords(data.rstrip(";"))
        elif line_type in consumer_dict:
            #Its a semi-automatic entry!
            getattr(consumer, consumer_dict[line_type])(data)
        else:
            if self.debug:
                print("Ignoring EMBL header line:\n%s" % line)
810
def _feed_misc_lines(self, consumer, lines):
    """Handle the lines between the features and the sequence (list of strings).

    Any run of "CO" (contig) lines is merged and passed to
    consumer.contig_location(), and the length on an "SQ   Sequence" line
    is passed on via _feed_seq_length.  The list given is not modified.

    Raises ValueError if a CO line is followed by a non-blank line which
    is not another CO line.
    """
    #TODO - Should we do something with the information on the SQ line(s)?
    #Iterate over a copy with an empty string added as a sentinel, so a
    #trailing run of CO lines is always terminated.
    line_iter = iter(list(lines) + [""])
    try:
        for line in line_iter:
            if line.startswith("CO   "):
                line = line[5:].strip()
                contig_location = line
                while True:
                    line = next(line_iter)
                    if not line:
                        break
                    elif line.startswith("CO   "):
                        #Don't need to preseve the whitespace here.
                        contig_location += line[5:].strip()
                    else:
                        raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
                consumer.contig_location(contig_location)
            if line.startswith("SQ   Sequence "):
                #e.g.
                #SQ   Sequence 219 BP; 82 A; 48 C; 33 G; 45 T; 11 other;
                #
                #Or, EMBL-bank patent, e.g.
                #SQ   Sequence 465 AA; 3963407aa91d3a0d622fec679a4524e0; MD5;
                #
                #The prefix is 14 characters long, hence the slice.
                self._feed_seq_length(consumer, line[14:].rstrip().rstrip(";").split(";", 1)[0])
                #TODO - Record the checksum etc?
        return
    except StopIteration:
        raise ValueError("Problem in misc lines before sequence")
841 842
class _ImgtScanner(EmblScanner):
    """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    """

    # The first two markers have the location column at offset 21, the
    # third at offset 25 (reconstructed from the two indents described
    # above - confirm against real IMGT files).
    FEATURE_START_MARKERS = ["FH   Key             Location/Qualifiers",
                             "FH   Key             Location/Qualifiers (from EMBL)",
                             "FH   Key                 Location/Qualifiers",
                             "FH"]

    def parse_features(self, skip=False):
        """Return list of tuples for the features (if present)

        Each feature is returned as a tuple (key, location, qualifiers)
        where key and location are strings (e.g. "CDS" and
        "complement(join(490883..490885,1..879))") while qualifiers
        is a list of two string tuples (feature qualifier keys and values).

        Assumes you have already read to the start of the features table.
        If skip is true the feature lines are read but not parsed, and an
        empty list is returned.

        Raises ValueError if the file ends, or a "//" line is found,
        inside the table.
        """
        if self.line.rstrip() not in self.FEATURE_START_MARKERS:
            if self.debug:
                print("Didn't find any feature table")
            return []

        while self.line.rstrip() in self.FEATURE_START_MARKERS:
            self.line = self.handle.readline()

        bad_position_re = re.compile(r'([0-9]+)>{1}')

        indent = self.FEATURE_QUALIFIER_INDENT
        spacer = self.FEATURE_QUALIFIER_SPACER
        features = []
        line = self.line
        while True:
            if not line:
                raise ValueError("Premature end of line during features table")
            if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
                if self.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError("Premature end of features table, marker '//' found")
            if line in self.FEATURE_END_MARKERS:
                if self.debug:
                    print("Found end of features")
                line = self.handle.readline()
                break
            if line[2:indent].strip() == "":
                #This is an empty feature line between qualifiers. Empty
                #feature lines within qualifiers are handled below (ignored).
                line = self.handle.readline()
                continue

            if skip:
                line = self.handle.readline()
                while line[:indent] == spacer:
                    line = self.handle.readline()
            else:
                assert line[:2] == "FT"
                try:
                    feature_key, location_start = line[2:].strip().split()
                except ValueError:
                    #e.g. "FT   TRANSMEMBRANE-REGION2163..2240\n"
                    #Assume indent of 25 as per IMGT spec, with the location
                    #start in column 26 (one-based).
                    feature_key = line[2:25].strip()
                    location_start = line[25:].strip()
                feature_lines = [location_start]
                line = self.handle.readline()
                # Cope with blank lines in the midst of a feature.  The
                # explicit line != "" test is essential: readline() returns
                # "" at the end of the file, and treating that as a blank
                # line would loop forever.
                while line[:indent] == spacer \
                        or (line != "" and line.rstrip() == ""):
                    #Use strip to remove any harmless trailing white space AND and leading
                    #white space (copes with 21 or 26 indents and orther variants)
                    #Blank lines are tolerated by the loop condition, so
                    #only non-blank lines must carry the FT prefix.
                    assert line[:2] == "FT" or line.strip() == ""
                    feature_lines.append(line[indent:].strip())
                    line = self.handle.readline()
                feature_key, location, qualifiers = \
                    self.parse_feature(feature_key, feature_lines)
                #Try to handle known problems with IMGT locations here:
                if ">" in location:
                    #Nasty hack for common IMGT bug, should be >123 not 123>
                    #in a location string. At least here the meaning is clear,
                    #and since it is so common I don't want to issue a warning
                    location = bad_position_re.sub(r'>\1', location)
                features.append((feature_key, location, qualifiers))
        self.line = line
        return features
939 940
941 -class GenBankScanner(InsdcScanner):
942 """For extracting chunks of information in GenBank files""" 943 944 RECORD_START = "LOCUS " 945 HEADER_WIDTH = 12 946 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"] 947 FEATURE_END_MARKERS = [] 948 FEATURE_QUALIFIER_INDENT = 21 949 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 950 SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 951 1004
1005 - def _feed_first_line(self, consumer, line):
1006 """Scan over and parse GenBank LOCUS line (PRIVATE). 1007 1008 This must cope with several variants, primarily the old and new column 1009 based standards from GenBank. Additionally EnsEMBL produces GenBank 1010 files where the LOCUS line is space separated rather that following 1011 the column based layout. 1012 1013 We also try to cope with GenBank like files with partial LOCUS lines. 1014 """ 1015 ##################################### 1016 # LOCUS line # 1017 ##################################### 1018 GENBANK_INDENT = self.HEADER_WIDTH 1019 GENBANK_SPACER = " " * GENBANK_INDENT 1020 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 1021 'LOCUS line does not start correctly:\n' + line 1022 1023 #Have to break up the locus line, and handle the different bits of it. 1024 #There are at least two different versions of the locus line... 1025 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ': 1026 #Old... note we insist on the 55:62 being empty to avoid trying 1027 #to parse space separated LOCUS lines from Ensembl etc, see below. 1028 # 1029 # Positions Contents 1030 # --------- -------- 1031 # 00:06 LOCUS 1032 # 06:12 spaces 1033 # 12:?? Locus name 1034 # ??:?? space 1035 # ??:29 Length of sequence, right-justified 1036 # 29:33 space, bp, space 1037 # 33:41 strand type 1038 # 41:42 space 1039 # 42:51 Blank (implies linear), linear or circular 1040 # 51:52 space 1041 # 52:55 The division code (e.g. 
BCT, VRL, INV) 1042 # 55:62 space 1043 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1044 # 1045 #assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 1046 # 'LOCUS line does not contain size units at expected position:\n' + line 1047 assert line[41:42] == ' ', \ 1048 'LOCUS line does not contain space at position 42:\n' + line 1049 assert line[42:51].strip() in ['', 'linear', 'circular'], \ 1050 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1051 assert line[51:52] == ' ', \ 1052 'LOCUS line does not contain space at position 52:\n' + line 1053 #assert line[55:62] == ' ', \ 1054 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 1055 if line[62:73].strip(): 1056 assert line[64:65] == '-', \ 1057 'LOCUS line does not contain - at position 65 in date:\n' + line 1058 assert line[68:69] == '-', \ 1059 'LOCUS line does not contain - at position 69 in date:\n' + line 1060 1061 name_and_length_str = line[GENBANK_INDENT:29] 1062 while ' ' in name_and_length_str: 1063 name_and_length_str = name_and_length_str.replace(' ', ' ') 1064 name_and_length = name_and_length_str.split(' ') 1065 assert len(name_and_length) <= 2, \ 1066 'Cannot parse the name and length in the LOCUS line:\n' + line 1067 assert len(name_and_length) != 1, \ 1068 'Name and length collide in the LOCUS line:\n' + line 1069 #Should be possible to split them based on position, if 1070 #a clear definition of the standard exists THAT AGREES with 1071 #existing files. 1072 consumer.locus(name_and_length[0]) 1073 consumer.size(name_and_length[1]) 1074 #consumer.residue_type(line[33:41].strip()) 1075 1076 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1077 #Amino acids -> protein (even if there is no residue type given) 1078 #We want to use a protein alphabet in this case, rather than a 1079 #generic one. 
Not sure if this is the best way to achieve this, 1080 #but it works because the scanner checks for this: 1081 consumer.residue_type("PROTEIN") 1082 else: 1083 consumer.residue_type(line[33:51].strip()) 1084 1085 consumer.data_file_division(line[52:55]) 1086 if line[62:73].strip(): 1087 consumer.date(line[62:73]) 1088 elif line[40:44] in [' bp ', ' aa ', ' rc '] \ 1089 and line[54:64].strip() in ['', 'linear', 'circular']: 1090 #New... linear/circular/big blank test should avoid EnsEMBL style 1091 #LOCUS line being treated like a proper column based LOCUS line. 1092 # 1093 # Positions Contents 1094 # --------- -------- 1095 # 00:06 LOCUS 1096 # 06:12 spaces 1097 # 12:?? Locus name 1098 # ??:?? space 1099 # ??:40 Length of sequence, right-justified 1100 # 40:44 space, bp, space 1101 # 44:47 Blank, ss-, ds-, ms- 1102 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1103 # 54:55 space 1104 # 55:63 Blank (implies linear), linear or circular 1105 # 63:64 space 1106 # 64:67 The division code (e.g. 
BCT, VRL, INV) 1107 # 67:68 space 1108 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1109 # 1110 assert line[40:44] in [' bp ', ' aa ', ' rc '], \ 1111 'LOCUS line does not contain size units at expected position:\n' + line 1112 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1113 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1114 assert line[47:54].strip() == "" \ 1115 or 'DNA' in line[47:54].strip().upper() \ 1116 or 'RNA' in line[47:54].strip().upper(), \ 1117 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1118 assert line[54:55] == ' ', \ 1119 'LOCUS line does not contain space at position 55:\n' + line 1120 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 1121 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1122 assert line[63:64] == ' ', \ 1123 'LOCUS line does not contain space at position 64:\n' + line 1124 assert line[67:68] == ' ', \ 1125 'LOCUS line does not contain space at position 68:\n' + line 1126 if line[68:79].strip(): 1127 assert line[70:71] == '-', \ 1128 'LOCUS line does not contain - at position 71 in date:\n' + line 1129 assert line[74:75] == '-', \ 1130 'LOCUS line does not contain - at position 75 in date:\n' + line 1131 1132 name_and_length_str = line[GENBANK_INDENT:40] 1133 while ' ' in name_and_length_str: 1134 name_and_length_str = name_and_length_str.replace(' ', ' ') 1135 name_and_length = name_and_length_str.split(' ') 1136 assert len(name_and_length) <= 2, \ 1137 'Cannot parse the name and length in the LOCUS line:\n' + line 1138 assert len(name_and_length) != 1, \ 1139 'Name and length collide in the LOCUS line:\n' + line 1140 #Should be possible to split them based on position, if 1141 #a clear definition of the stand exists THAT AGREES with 1142 #existing files. 
1143 consumer.locus(name_and_length[0]) 1144 consumer.size(name_and_length[1]) 1145 1146 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1147 #Amino acids -> protein (even if there is no residue type given) 1148 #We want to use a protein alphabet in this case, rather than a 1149 #generic one. Not sure if this is the best way to achieve this, 1150 #but it works because the scanner checks for this: 1151 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1152 else: 1153 consumer.residue_type(line[44:63].strip()) 1154 1155 consumer.data_file_division(line[64:67]) 1156 if line[68:79].strip(): 1157 consumer.date(line[68:79]) 1158 elif line[GENBANK_INDENT:].strip().count(" ") == 0: 1159 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1160 # 1161 #e.g. 1162 # 1163 # "LOCUS U00096" 1164 # 1165 #rather than: 1166 # 1167 # "LOCUS U00096 4639675 bp DNA circular BCT" 1168 # 1169 # Positions Contents 1170 # --------- -------- 1171 # 00:06 LOCUS 1172 # 06:12 spaces 1173 # 12:?? Locus name 1174 if line[GENBANK_INDENT:].strip() != "": 1175 consumer.locus(line[GENBANK_INDENT:].strip()) 1176 else: 1177 #Must just have just "LOCUS ", is this even legitimate? 1178 #We should be able to continue parsing... we need real world testcases! 1179 warnings.warn("Minimal LOCUS line found - is this " 1180 "correct?\n:%r" % line, BiopythonParserWarning) 1181 elif len(line.split()) == 8 and line.split()[3] in ("aa", "bp") and \ 1182 line.split()[5] in ('linear', 'circular'): 1183 # Cope with invalidly spaced GenBank LOCUS lines like 1184 #LOCUS AB070938 6497 bp DNA linear BCT 11-OCT-2001 1185 splitline = line.split() 1186 consumer.locus(splitline[1]) 1187 consumer.size(splitline[2]) 1188 consumer.residue_type(splitline[4]) 1189 consumer.data_file_division(splitline[6]) 1190 consumer.date(splitline[7]) 1191 warnings.warn("Attempting to parse malformed locus line:\n%r\n" 1192 "Found locus %r size %r residue_type %r\n" 1193 "Some fields may be wrong." 
% (line, splitline[1], 1194 splitline[2], splitline[4]), BiopythonParserWarning) 1195 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: 1196 #Cope with EnsEMBL genbank files which use space separation rather 1197 #than the expected column based layout. e.g. 1198 #LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1199 #LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1200 #LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1201 #LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1202 #Notice that the 'bp' can occur in the position expected by either 1203 #the old or the new fixed column standards (parsed above). 1204 splitline = line.split() 1205 consumer.locus(splitline[1]) 1206 consumer.size(splitline[2]) 1207 consumer.residue_type(splitline[4]) 1208 consumer.data_file_division(splitline[5]) 1209 consumer.date(splitline[6]) 1210 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: 1211 #Cope with EMBOSS seqret output where it seems the locus id can cause 1212 #the other fields to overflow. We just IGNORE the other fields! 1213 warnings.warn("Malformed LOCUS line found - is this " 1214 "correct?\n:%r" % line, BiopythonParserWarning) 1215 consumer.locus(line.split()[1]) 1216 consumer.size(line.split()[2]) 1217 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: 1218 #Cope with pseudo-GenBank files like this: 1219 # "LOCUS RNA5 complete 1718 bp" 1220 #Treat everything between LOCUS and the size as the identifier. 1221 warnings.warn("Malformed LOCUS line found - is this " 1222 "correct?\n:%r" % line, BiopythonParserWarning) 1223 consumer.locus(line[5:].rsplit(None, 2)[0].strip()) 1224 consumer.size(line.split()[-2]) 1225 else: 1226 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1227
1228 - def _feed_header_lines(self, consumer, lines):
1229 #Following dictionary maps GenBank lines to the associated 1230 #consumer methods - the special cases like LOCUS where one 1231 #genbank line triggers several consumer calls have to be 1232 #handled individually. 1233 GENBANK_INDENT = self.HEADER_WIDTH 1234 GENBANK_SPACER = " " * GENBANK_INDENT 1235 consumer_dict = { 1236 'DEFINITION': 'definition', 1237 'ACCESSION': 'accession', 1238 'NID': 'nid', 1239 'PID': 'pid', 1240 'DBSOURCE': 'db_source', 1241 'KEYWORDS': 'keywords', 1242 'SEGMENT': 'segment', 1243 'SOURCE': 'source', 1244 'AUTHORS': 'authors', 1245 'CONSRTM': 'consrtm', 1246 'PROJECT': 'project', 1247 'DBLINK': 'dblink', 1248 'TITLE': 'title', 1249 'JOURNAL': 'journal', 1250 'MEDLINE': 'medline_id', 1251 'PUBMED': 'pubmed_id', 1252 'REMARK': 'remark'} 1253 #We have to handle the following specially: 1254 #ORIGIN (locus, size, residue_type, data_file_division and date) 1255 #COMMENT (comment) 1256 #VERSION (version and gi) 1257 #REFERENCE (eference_num and reference_bases) 1258 #ORGANISM (organism and taxonomy) 1259 lines = [_f for _f in lines if _f] 1260 lines.append("") # helps avoid getting StopIteration all the time 1261 line_iter = iter(lines) 1262 try: 1263 line = next(line_iter) 1264 while True: 1265 if not line: 1266 break 1267 line_type = line[:GENBANK_INDENT].strip() 1268 data = line[GENBANK_INDENT:].strip() 1269 1270 if line_type == 'VERSION': 1271 #Need to call consumer.version(), and maybe also consumer.gi() as well. 1272 #e.g. 1273 # VERSION AC007323.5 GI:6587720 1274 while ' ' in data: 1275 data = data.replace(' ', ' ') 1276 if ' GI:' not in data: 1277 consumer.version(data) 1278 else: 1279 if self.debug: 1280 print("Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]") 1281 consumer.version(data.split(' GI:')[0]) 1282 consumer.gi(data.split(' GI:')[1]) 1283 #Read in the next line! 
1284 line = next(line_iter) 1285 elif line_type == 'REFERENCE': 1286 if self.debug > 1: 1287 print("Found reference [" + data + "]") 1288 #Need to call consumer.reference_num() and consumer.reference_bases() 1289 #e.g. 1290 # REFERENCE 1 (bases 1 to 86436) 1291 # 1292 #Note that this can be multiline, see Bug 1968, e.g. 1293 # 1294 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1295 # 28259) 1296 # 1297 #For such cases we will call the consumer once only. 1298 data = data.strip() 1299 1300 #Read in the next line, and see if its more of the reference: 1301 while True: 1302 line = next(line_iter) 1303 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1304 #Add this continuation to the data string 1305 data += " " + line[GENBANK_INDENT:] 1306 if self.debug > 1: 1307 print("Extended reference text [" + data + "]") 1308 else: 1309 #End of the reference, leave this text in the variable "line" 1310 break 1311 1312 #We now have all the reference line(s) stored in a string, data, 1313 #which we pass to the consumer 1314 while ' ' in data: 1315 data = data.replace(' ', ' ') 1316 if ' ' not in data: 1317 if self.debug > 2: 1318 print('Reference number \"' + data + '\"') 1319 consumer.reference_num(data) 1320 else: 1321 if self.debug > 2: 1322 print('Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"') 1323 consumer.reference_num(data[:data.find(' ')]) 1324 consumer.reference_bases(data[data.find(' ') + 1:]) 1325 elif line_type == 'ORGANISM': 1326 #Typically the first line is the organism, and subsequent lines 1327 #are the taxonomy lineage. However, given longer and longer 1328 #species names (as more and more strains and sub strains get 1329 #sequenced) the oragnism name can now get wrapped onto multiple 1330 #lines. The NCBI say we have to recognise the lineage line by 1331 #the presence of semi-colon delimited entries. In the long term, 1332 #they are considering adding a new keyword (e.g. LINEAGE). 
1333 #See Bug 2591 for details. 1334 organism_data = data 1335 lineage_data = "" 1336 while True: 1337 line = next(line_iter) 1338 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1339 if lineage_data or ";" in line: 1340 lineage_data += " " + line[GENBANK_INDENT:] 1341 else: 1342 organism_data += " " + line[GENBANK_INDENT:].strip() 1343 else: 1344 #End of organism and taxonomy 1345 break 1346 consumer.organism(organism_data) 1347 if lineage_data.strip() == "" and self.debug > 1: 1348 print("Taxonomy line(s) missing or blank") 1349 consumer.taxonomy(lineage_data.strip()) 1350 del organism_data, lineage_data 1351 elif line_type == 'COMMENT': 1352 if self.debug > 1: 1353 print("Found comment") 1354 #This can be multiline, and should call consumer.comment() once 1355 #with a list where each entry is a line. 1356 comment_list = [] 1357 comment_list.append(data) 1358 while True: 1359 line = next(line_iter) 1360 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1361 data = line[GENBANK_INDENT:] 1362 comment_list.append(data) 1363 if self.debug > 2: 1364 print("Comment continuation [" + data + "]") 1365 else: 1366 #End of the comment 1367 break 1368 consumer.comment(comment_list) 1369 del comment_list 1370 elif line_type in consumer_dict: 1371 #Its a semi-automatic entry! 1372 #Now, this may be a multi line entry... 1373 while True: 1374 line = next(line_iter) 1375 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1376 data += ' ' + line[GENBANK_INDENT:] 1377 else: 1378 #We now have all the data for this entry: 1379 getattr(consumer, consumer_dict[line_type])(data) 1380 #End of continuation - return to top of loop! 1381 break 1382 else: 1383 if self.debug: 1384 print("Ignoring GenBank header line:\n" % line) 1385 #Read in next line 1386 line = next(line_iter) 1387 except StopIteration: 1388 raise ValueError("Problem in header")
1389
1390 - def _feed_misc_lines(self, consumer, lines):
1391 #Deals with a few misc lines between the features and the sequence 1392 GENBANK_INDENT = self.HEADER_WIDTH 1393 GENBANK_SPACER = " " * GENBANK_INDENT 1394 lines.append("") 1395 line_iter = iter(lines) 1396 try: 1397 for line in line_iter: 1398 if line.startswith('BASE COUNT'): 1399 line = line[10:].strip() 1400 if line: 1401 if self.debug: 1402 print("base_count = " + line) 1403 consumer.base_count(line) 1404 if line.startswith('ORIGIN'): 1405 line = line[6:].strip() 1406 if line: 1407 if self.debug: 1408 print("origin_name = " + line) 1409 consumer.origin_name(line) 1410 if line.startswith('WGS '): 1411 line = line[3:].strip() 1412 consumer.wgs(line) 1413 if line.startswith('WGS_SCAFLD'): 1414 line = line[10:].strip() 1415 consumer.add_wgs_scafld(line) 1416 if line.startswith('CONTIG'): 1417 line = line[6:].strip() 1418 contig_location = line 1419 while True: 1420 line = next(line_iter) 1421 if not line: 1422 break 1423 elif line[:GENBANK_INDENT] == GENBANK_SPACER: 1424 #Don't need to preseve the whitespace here. 1425 contig_location += line[GENBANK_INDENT:].rstrip() 1426 elif line.startswith('ORIGIN'): 1427 #Strange, seen this in GenPept files via Entrez gbwithparts 1428 line = line[6:].strip() 1429 if line: 1430 consumer.origin_name(line) 1431 break 1432 else: 1433 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1434 consumer.contig_location(contig_location) 1435 return 1436 except StopIteration: 1437 raise ValueError("Problem in misc lines before sequence")
1438 1439 if __name__ == "__main__": 1440 from Bio._py3k import StringIO 1441 1442 gbk_example = \ 1443 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1444 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1445 (AXL2) and Rev7p (REV7) genes, complete cds. 1446 ACCESSION U49845 1447 VERSION U49845.1 GI:1293613 1448 KEYWORDS . 1449 SOURCE Saccharomyces cerevisiae (baker's yeast) 1450 ORGANISM Saccharomyces cerevisiae 1451 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1452 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1453 REFERENCE 1 (bases 1 to 5028) 1454 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1455 TITLE Cloning and sequence of REV7, a gene whose function is required for 1456 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1457 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1458 PUBMED 7871890 1459 REFERENCE 2 (bases 1 to 5028) 1460 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1461 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1462 plasma membrane glycoprotein 1463 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1464 PUBMED 8846915 1465 REFERENCE 3 (bases 1 to 5028) 1466 AUTHORS Roemer,T. 1467 TITLE Direct Submission 1468 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1469 Haven, CT, USA 1470 FEATURES Location/Qualifiers 1471 source 1..5028 1472 /organism="Saccharomyces cerevisiae" 1473 /db_xref="taxon:4932" 1474 /chromosome="IX" 1475 /map="9" 1476 CDS <1..206 1477 /codon_start=3 1478 /product="TCP1-beta" 1479 /protein_id="AAA98665.1" 1480 /db_xref="GI:1293614" 1481 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1482 AEVLLRVDNIIRARPRTANRQHM" 1483 gene 687..3158 1484 /gene="AXL2" 1485 CDS 687..3158 1486 /gene="AXL2" 1487 /note="plasma membrane glycoprotein" 1488 /codon_start=1 1489 /function="required for axial budding pattern of S. 
1490 cerevisiae" 1491 /product="Axl2p" 1492 /protein_id="AAA98666.1" 1493 /db_xref="GI:1293615" 1494 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1495 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1496 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1497 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1498 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1499 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1500 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1501 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1502 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1503 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1504 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1505 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1506 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1507 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1508 VDFSNKSNVNVGQVKDIHGRIPEML" 1509 gene complement(3300..4037) 1510 /gene="REV7" 1511 CDS complement(3300..4037) 1512 /gene="REV7" 1513 /codon_start=1 1514 /product="Rev7p" 1515 /protein_id="AAA98667.1" 1516 /db_xref="GI:1293616" 1517 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1518 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1519 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1520 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1521 LISGDDKILNGVYSQYEEGESIFGSLF" 1522 ORIGIN 1523 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1524 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1525 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1526 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1527 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1528 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1529 361 attttggcaa 
cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1530 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1531 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1532 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1533 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1534 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1535 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1536 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1537 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1538 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1539 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1540 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1541 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1542 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1543 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1544 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1545 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1546 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1547 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1548 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1549 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1550 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1551 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1552 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1553 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1554 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1555 1921 agaatttcga caagctttca ttaggtttga aagcgaacca 
aggttcacaa tctcaagagc 1556 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1557 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1558 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1559 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1560 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1561 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1562 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1563 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1564 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1565 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1566 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1567 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1568 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1569 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1570 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1571 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1572 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1573 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1574 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1575 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1576 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1577 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1578 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1579 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1580 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1581 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 
1582 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1583 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1584 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1585 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1586 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1587 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1588 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1589 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1590 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1591 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1592 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1593 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1594 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1595 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1596 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1597 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1598 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1599 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1600 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1601 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1602 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1603 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1604 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1605 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1606 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1607 //""" 1608 1609 # GenBank format protein (aka GenPept) file from: 1610 # 
http://www.molecularevolution.org/resources/fileformats/ 1611 gbk_example2 = \ 1612 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1613 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1614 ACCESSION AAD51968 1615 VERSION AAD51968.1 GI:5805369 1616 DBSOURCE locus AF171097 accession AF171097.1 1617 KEYWORDS . 1618 SOURCE Yersinia enterocolitica 1619 ORGANISM Yersinia enterocolitica 1620 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1621 Enterobacteriaceae; Yersinia. 1622 REFERENCE 1 (residues 1 to 143) 1623 AUTHORS Revell,P.A. and Miller,V.L. 1624 TITLE A chromosomally encoded regulator is required for expression of the 1625 Yersinia enterocolitica inv gene and for virulence 1626 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1627 MEDLINE 20138369 1628 PUBMED 10672189 1629 REFERENCE 2 (residues 1 to 143) 1630 AUTHORS Revell,P.A. and Miller,V.L. 1631 TITLE Direct Submission 1632 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1633 University School of Medicine, Campus Box 8230, 660 South Euclid, 1634 St. Louis, MO 63110, USA 1635 COMMENT Method: conceptual translation. 1636 FEATURES Location/Qualifiers 1637 source 1..143 1638 /organism="Yersinia enterocolitica" 1639 /mol_type="unassigned DNA" 1640 /strain="JB580v" 1641 /serotype="O:8" 1642 /db_xref="taxon:630" 1643 Protein 1..143 1644 /product="transcriptional regulator RovA" 1645 /name="regulates inv expression" 1646 CDS 1..143 1647 /gene="rovA" 1648 /coded_by="AF171097.1:380..811" 1649 /note="regulator of virulence" 1650 /transl_table=11 1651 ORIGIN 1652 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1653 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1654 121 deiellsgli dklerniiql qsk 1655 // 1656 """ 1657 1658 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1659 XX 1660 AC X56734; S46826; 1661 XX 1662 DT 12-SEP-1991 (Rel. 29, Created) 1663 DT 25-NOV-2005 (Rel. 
85, Last updated, Version 11) 1664 XX 1665 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1666 XX 1667 KW beta-glucosidase. 1668 XX 1669 OS Trifolium repens (white clover) 1670 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1671 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1672 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1673 XX 1674 RN [5] 1675 RP 1-1859 1676 RX PUBMED; 1907511. 1677 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1678 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1679 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1680 RL Plant Mol. Biol. 17(2):209-219(1991). 1681 XX 1682 RN [6] 1683 RP 1-1859 1684 RA Hughes M.A.; 1685 RT ; 1686 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1687 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1688 RL Upon Tyne, NE2 4HH, UK 1689 XX 1690 FH Key Location/Qualifiers 1691 FH 1692 FT source 1..1859 1693 FT /organism="Trifolium repens" 1694 FT /mol_type="mRNA" 1695 FT /clone_lib="lambda gt10" 1696 FT /clone="TRE361" 1697 FT /tissue_type="leaves" 1698 FT /db_xref="taxon:3899" 1699 FT CDS 14..1495 1700 FT /product="beta-glucosidase" 1701 FT /EC_number="3.2.1.21" 1702 FT /note="non-cyanogenic" 1703 FT /db_xref="GOA:P26204" 1704 FT /db_xref="InterPro:IPR001360" 1705 FT /db_xref="InterPro:IPR013781" 1706 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1707 FT /protein_id="CAA40058.1" 1708 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1709 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1710 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1711 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1712 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1713 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1714 FT 
IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1715 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1716 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1717 FT mRNA 1..1859 1718 FT /experiment="experimental evidence, no additional details 1719 FT recorded" 1720 XX 1721 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1722 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1723 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1724 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1725 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1726 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1727 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1728 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1729 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1730 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1731 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1732 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1733 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1734 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1735 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1736 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1737 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1738 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1739 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1740 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1741 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1742 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1743 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 
1320 1744 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1745 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1746 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1747 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1748 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1749 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1750 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1751 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1752 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1753 // 1754 """ 1755 1756 print("GenBank CDS Iteration") 1757 print("=====================") 1758 1759 g = GenBankScanner() 1760 for record in g.parse_cds_features(StringIO(gbk_example)): 1761 print(record) 1762 1763 g = GenBankScanner() 1764 for record in g.parse_cds_features(StringIO(gbk_example2), 1765 tags2id=('gene', 'locus_tag', 'product')): 1766 print(record) 1767 1768 g = GenBankScanner() 1769 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1770 tags2id=('gene', 'locus_tag', 'product')): 1771 print(record) 1772 1773 print("") 1774 print("GenBank Iteration") 1775 print("=================") 1776 g = GenBankScanner() 1777 for record in g.parse_records(StringIO(gbk_example), do_features=False): 1778 print("%s %s %s" % (record.id, record.name, record.description)) 1779 print(record.seq) 1780 1781 g = GenBankScanner() 1782 for record in g.parse_records(StringIO(gbk_example), do_features=True): 1783 print("%s %s %s" % (record.id, record.name, record.description)) 1784 print(record.seq) 1785 1786 g = GenBankScanner() 1787 for record in g.parse_records(StringIO(gbk_example2), do_features=False): 1788 print("%s %s %s" % (record.id, record.name, record.description)) 1789 print(record.seq) 1790 1791 g = GenBankScanner() 1792 for record in 
g.parse_records(StringIO(gbk_example2), do_features=True): 1793 print("%s %s %s" % (record.id, record.name, record.description)) 1794 print(record.seq) 1795 1796 print("") 1797 print("EMBL CDS Iteration") 1798 print("==================") 1799 1800 e = EmblScanner() 1801 for record in e.parse_cds_features(StringIO(embl_example)): 1802 print(record) 1803 1804 print("") 1805 print("EMBL Iteration") 1806 print("==============") 1807 e = EmblScanner() 1808 for record in e.parse_records(StringIO(embl_example), do_features=True): 1809 print("%s %s %s" % (record.id, record.name, record.description)) 1810 print(record.seq) 1811