Package Bio :: Package GenBank :: Module Scanner
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Scanner

   1  # Copyright 2007-2010 by Peter Cock.  All rights reserved. 
   2  # Revisions copyright 2010 by Uri Laserson.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  """Internal code for parsing GenBank and EMBL files (PRIVATE). 
   7   
   8  This code is NOT intended for direct use.  It provides a basic scanner 
   9  (for use with a event consumer such as Bio.GenBank._FeatureConsumer) 
  10  to parse a GenBank or EMBL file (with their shared INSDC feature table). 
  11   
  12  It is used by Bio.GenBank to parse GenBank files 
  13  It is also used by Bio.SeqIO to parse GenBank and EMBL files 
  14   
  15  Feature Table Documentation: 
  16  http://www.insdc.org/files/feature_table.html 
  17  http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html 
  18  ftp://ftp.ncbi.nih.gov/genbank/docs/ 
  19  """ 
  20  # 17-MAR-2009: added wgs, wgs_scafld for GenBank whole genome shotgun master records. 
  21  # These are GenBank files that summarize the content of a project, and provide lists of 
  22  # scaffold and contig files in the project. These will be in annotations['wgs'] and 
  23  # annotations['wgs_scafld']. These GenBank files do not have sequences. See 
  24  # http://groups.google.com/group/bionet.molbio.genbank/browse_thread/thread/51fb88bf39e7dc36 
  25  # http://is.gd/nNgk 
  26  # for more details of this format, and an example. 
  27  # Added by Ying Huang & Iddo Friedberg 
  28   
  29  import warnings 
  30  import re 
  31  from Bio.Seq import Seq 
  32  from Bio.SeqRecord import SeqRecord 
  33  from Bio.Alphabet import generic_protein 
  34  from Bio import BiopythonParserWarning 
  35   
  36   
37 -class InsdcScanner(object):
38 """Basic functions for breaking up a GenBank/EMBL file into sub sections. 39 40 The International Nucleotide Sequence Database Collaboration (INSDC) 41 between the DDBJ, EMBL, and GenBank. These organisations all use the 42 same "Feature Table" layout in their plain text flat file formats. 43 44 However, the header and sequence sections of an EMBL file are very 45 different in layout to those produced by GenBank/DDBJ.""" 46 47 #These constants get redefined with sensible values in the sub classes: 48 RECORD_START = "XXX" # "LOCUS " or "ID " 49 HEADER_WIDTH = 3 # 12 or 5 50 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"] 51 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"] 52 FEATURE_QUALIFIER_INDENT = 0 53 FEATURE_QUALIFIER_SPACER = "" 54 SEQUENCE_HEADERS = ["XXX"] # with right hand side spaces removed 55
56 - def __init__(self, debug=0):
57 assert len(self.RECORD_START) == self.HEADER_WIDTH 58 for marker in self.SEQUENCE_HEADERS: 59 assert marker == marker.rstrip() 60 assert len(self.FEATURE_QUALIFIER_SPACER) == self.FEATURE_QUALIFIER_INDENT 61 self.debug = debug 62 self.line = None
63
64 - def set_handle(self, handle):
65 self.handle = handle 66 self.line = ""
67
68 - def find_start(self):
69 """Read in lines until find the ID/LOCUS line, which is returned. 70 71 Any preamble (such as the header used by the NCBI on *.seq.gz archives) 72 will we ignored.""" 73 while True: 74 if self.line: 75 line = self.line 76 self.line = "" 77 else: 78 line = self.handle.readline() 79 if not line: 80 if self.debug: 81 print "End of file" 82 return None 83 if line[:self.HEADER_WIDTH] == self.RECORD_START: 84 if self.debug > 1: 85 print "Found the start of a record:\n" + line 86 break 87 line = line.rstrip() 88 if line == "//": 89 if self.debug > 1: 90 print "Skipping // marking end of last record" 91 elif line == "": 92 if self.debug > 1: 93 print "Skipping blank line before record" 94 else: 95 #Ignore any header before the first ID/LOCUS line. 96 if self.debug > 1: 97 print "Skipping header line before record:\n" + line 98 self.line = line 99 return line
100
101 - def parse_header(self):
102 """Return list of strings making up the header 103 104 New line characters are removed. 105 106 Assumes you have just read in the ID/LOCUS line. 107 """ 108 assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \ 109 "Not at start of record" 110 111 header_lines = [] 112 while True: 113 line = self.handle.readline() 114 if not line: 115 raise ValueError("Premature end of line during sequence data") 116 line = line.rstrip() 117 if line in self.FEATURE_START_MARKERS: 118 if self.debug: 119 print "Found header table" 120 break 121 #if line[:self.HEADER_WIDTH]==self.FEATURE_START_MARKER[:self.HEADER_WIDTH]: 122 # if self.debug : print "Found header table (?)" 123 # break 124 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 125 if self.debug: 126 print "Found start of sequence" 127 break 128 if line == "//": 129 raise ValueError("Premature end of sequence data marker '//' found") 130 header_lines.append(line) 131 self.line = line 132 return header_lines
133
134 - def parse_features(self, skip=False):
135 """Return list of tuples for the features (if present) 136 137 Each feature is returned as a tuple (key, location, qualifiers) 138 where key and location are strings (e.g. "CDS" and 139 "complement(join(490883..490885,1..879))") while qualifiers 140 is a list of two string tuples (feature qualifier keys and values). 141 142 Assumes you have already read to the start of the features table. 143 """ 144 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 145 if self.debug: 146 print "Didn't find any feature table" 147 return [] 148 149 while self.line.rstrip() in self.FEATURE_START_MARKERS: 150 self.line = self.handle.readline() 151 152 features = [] 153 line = self.line 154 while True: 155 if not line: 156 raise ValueError("Premature end of line during features table") 157 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 158 if self.debug: 159 print "Found start of sequence" 160 break 161 line = line.rstrip() 162 if line == "//": 163 raise ValueError("Premature end of features table, marker '//' found") 164 if line in self.FEATURE_END_MARKERS: 165 if self.debug: 166 print "Found end of features" 167 line = self.handle.readline() 168 break 169 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 170 #This is an empty feature line between qualifiers. Empty 171 #feature lines within qualifiers are handled below (ignored). 172 line = self.handle.readline() 173 continue 174 175 if skip: 176 line = self.handle.readline() 177 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 178 line = self.handle.readline() 179 else: 180 #Build up a list of the lines making up this feature: 181 if line[self.FEATURE_QUALIFIER_INDENT] != " " \ 182 and " " in line[self.FEATURE_QUALIFIER_INDENT:]: 183 #The feature table design enforces a length limit on the feature keys. 184 #Some third party files (e.g. IGMT's EMBL like files) solve this by 185 #over indenting the location and qualifiers. 186 feature_key, line = line[2:].strip().split(None, 1) 187 feature_lines = [line] 188 warnings.warn("Overindented %s feature?" % feature_key) 189 else: 190 feature_key = line[2:self.FEATURE_QUALIFIER_INDENT].strip() 191 feature_lines = [line[self.FEATURE_QUALIFIER_INDENT:]] 192 line = self.handle.readline() 193 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 194 or line.rstrip() == "": # cope with blank lines in the midst of a feature 195 #Use strip to remove any harmless trailing white space AND and leading 196 #white space (e.g. out of spec files with too much intentation) 197 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 198 line = self.handle.readline() 199 features.append(self.parse_feature(feature_key, feature_lines)) 200 self.line = line 201 return features
202
203 - def parse_feature(self, feature_key, lines):
204 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers) 205 206 For example given this GenBank feature: 207 208 CDS complement(join(490883..490885,1..879)) 209 /locus_tag="NEQ001" 210 /note="conserved hypothetical [Methanococcus jannaschii]; 211 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear 212 localization signal; IPR002743: Protein of unknown 213 function DUF57" 214 /codon_start=1 215 /transl_table=11 216 /product="hypothetical protein" 217 /protein_id="NP_963295.1" 218 /db_xref="GI:41614797" 219 /db_xref="GeneID:2732620" 220 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK 221 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK 222 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP 223 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE 224 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS 225 LNSMGFGFVNTKKNSAR" 226 227 Then should give input key="CDS" and the rest of the data as a list of strings 228 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"] 229 where the leading spaces and trailing newlines have been removed. 230 231 Returns tuple containing: (key as string, location string, qualifiers as list) 232 as follows for this example: 233 234 key = "CDS", string 235 location = "complement(join(490883..490885,1..879))", string 236 qualifiers = list of string tuples: 237 238 [('locus_tag', '"NEQ001"'), 239 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'), 240 ('codon_start', '1'), 241 ('transl_table', '11'), 242 ('product', '"hypothetical protein"'), 243 ('protein_id', '"NP_963295.1"'), 244 ('db_xref', '"GI:41614797"'), 245 ('db_xref', '"GeneID:2732620"'), 246 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')] 247 248 In the above example, the "note" and "translation" were edited for compactness, 249 and they would contain multiple new line characters (displayed above as \n) 250 251 If a qualifier is quoted (in this case, everything except codon_start and 252 transl_table) then the quotes are NOT removed. 253 254 Note that no whitespace is removed. 255 """ 256 #Skip any blank lines 257 iterator = iter(filter(None, lines)) 258 try: 259 line = iterator.next() 260 261 feature_location = line.strip() 262 while feature_location[-1:] == ",": 263 #Multiline location, still more to come! 264 line = iterator.next() 265 feature_location += line.strip() 266 267 qualifiers = [] 268 269 for line_number, line in enumerate(iterator): 270 # check for extra wrapping of the location closing parentheses 271 if line_number == 0 and line.startswith(")"): 272 feature_location += line.strip() 273 elif line[0] == "/": 274 #New qualifier 275 i = line.find("=") 276 key = line[1:i] # does not work if i==-1 277 value = line[i + 1:] # we ignore 'value' if i==-1 278 if i == -1: 279 #Qualifier with no key, e.g. /pseudo 280 key = line[1:] 281 qualifiers.append((key, None)) 282 elif not value: 283 #ApE can output /note= 284 qualifiers.append((key, "")) 285 elif value == '"': 286 #One single quote 287 if self.debug: 288 print "Single quote %s:%s" % (key, value) 289 #DO NOT remove the quote... 290 qualifiers.append((key, value)) 291 elif value[0] == '"': 292 #Quoted... 293 value_list = [value] 294 while value_list[-1][-1] != '"': 295 value_list.append(iterator.next()) 296 value = '\n'.join(value_list) 297 #DO NOT remove the quotes... 298 qualifiers.append((key, value)) 299 else: 300 #Unquoted 301 #if debug : print "Unquoted line %s:%s" % (key,value) 302 qualifiers.append((key, value)) 303 else: 304 #Unquoted continuation 305 assert len(qualifiers) > 0 306 assert key == qualifiers[-1][0] 307 #if debug : print "Unquoted Cont %s:%s" % (key, line) 308 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line) 309 return (feature_key, feature_location, qualifiers) 310 except StopIteration: 311 #Bummer 312 raise ValueError("Problem with '%s' feature:\n%s" 313 % (feature_key, "\n".join(lines)))
314 337
338 - def _feed_first_line(self, consumer, line):
339 """Handle the LOCUS/ID line, passing data to the comsumer 340 341 This should be implemented by the EMBL / GenBank specific subclass 342 343 Used by the parse_records() and parse() methods. 344 """ 345 pass
346
347 - def _feed_header_lines(self, consumer, lines):
348 """Handle the header lines (list of strings), passing data to the comsumer 349 350 This should be implemented by the EMBL / GenBank specific subclass 351 352 Used by the parse_records() and parse() methods. 353 """ 354 pass
355
356 - def _feed_feature_table(self, consumer, feature_tuples):
357 """Handle the feature table (list of tuples), passing data to the comsumer 358 359 Used by the parse_records() and parse() methods. 360 """ 361 consumer.start_feature_table() 362 for feature_key, location_string, qualifiers in feature_tuples: 363 consumer.feature_key(feature_key) 364 consumer.location(location_string) 365 for q_key, q_value in qualifiers: 366 if q_value is None: 367 consumer.feature_qualifier(q_key, q_value) 368 else: 369 consumer.feature_qualifier(q_key, q_value.replace("\n", " "))
370
371 - def _feed_misc_lines(self, consumer, lines):
372 """Handle any lines between features and sequence (list of strings), passing data to the consumer 373 374 This should be implemented by the EMBL / GenBank specific subclass 375 376 Used by the parse_records() and parse() methods. 377 """ 378 pass
379
380 - def feed(self, handle, consumer, do_features=True):
381 """Feed a set of data into the consumer. 382 383 This method is intended for use with the "old" code in Bio.GenBank 384 385 Arguments: 386 handle - A handle with the information to parse. 387 consumer - The consumer that should be informed of events. 388 do_features - Boolean, should the features be parsed? 389 Skipping the features can be much faster. 390 391 Return values: 392 true - Passed a record 393 false - Did not find a record 394 """ 395 #Should work with both EMBL and GenBank files provided the 396 #equivalent Bio.GenBank._FeatureConsumer methods are called... 397 self.set_handle(handle) 398 if not self.find_start(): 399 #Could not find (another) record 400 consumer.data = None 401 return False 402 403 #We use the above class methods to parse the file into a simplified format. 404 #The first line, header lines and any misc lines after the features will be 405 #dealt with by GenBank / EMBL specific derived classes. 406 407 #First line and header: 408 self._feed_first_line(consumer, self.line) 409 self._feed_header_lines(consumer, self.parse_header()) 410 411 #Features (common to both EMBL and GenBank): 412 if do_features: 413 self._feed_feature_table(consumer, self.parse_features(skip=False)) 414 else: 415 self.parse_features(skip=True) # ignore the data 416 417 #Footer and sequence 418 misc_lines, sequence_string = self.parse_footer() 419 self._feed_misc_lines(consumer, misc_lines) 420 421 consumer.sequence(sequence_string) 422 #Calls to consumer.base_number() do nothing anyway 423 consumer.record_end("//") 424 425 assert self.line == "//" 426 427 #And we are done 428 return True
429
430 - def parse(self, handle, do_features=True):
431 """Returns a SeqRecord (with SeqFeatures if do_features=True) 432 433 See also the method parse_records() for use on multi-record files. 434 """ 435 from Bio.GenBank import _FeatureConsumer 436 from Bio.GenBank.utils import FeatureValueCleaner 437 438 consumer = _FeatureConsumer(use_fuzziness=1, 439 feature_cleaner=FeatureValueCleaner()) 440 441 if self.feed(handle, consumer, do_features): 442 return consumer.data 443 else: 444 return None
445
446 - def parse_records(self, handle, do_features=True):
447 """Returns a SeqRecord object iterator 448 449 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord 450 451 The SeqRecord objects include SeqFeatures if do_features=True 452 453 This method is intended for use in Bio.SeqIO 454 """ 455 #This is a generator function 456 while True: 457 record = self.parse(handle, do_features) 458 if record is None: 459 break 460 if record.id is None: 461 raise ValueError("Failed to parse the record's ID. Invalid ID line?") 462 if record.name == "<unknown name>": 463 raise ValueError("Failed to parse the record's name. Invalid ID line?") 464 if record.description == "<unknown description>": 465 raise ValueError("Failed to parse the record's description") 466 yield record
467
468 - def parse_cds_features(self, handle, 469 alphabet=generic_protein, 470 tags2id=('protein_id', 'locus_tag', 'product')):
471 """Returns SeqRecord object iterator 472 473 Each CDS feature becomes a SeqRecord. 474 475 alphabet - Used for any sequence found in a translation field. 476 tags2id - Tupple of three strings, the feature keys to use 477 for the record id, name and description, 478 479 This method is intended for use in Bio.SeqIO 480 """ 481 self.set_handle(handle) 482 while self.find_start(): 483 #Got an EMBL or GenBank record... 484 self.parse_header() # ignore header lines! 485 feature_tuples = self.parse_features() 486 #self.parse_footer() # ignore footer lines! 487 while True: 488 line = self.handle.readline() 489 if not line: 490 break 491 if line[:2] == "//": break 492 self.line = line.rstrip() 493 494 #Now go though those features... 495 for key, location_string, qualifiers in feature_tuples: 496 if key == "CDS": 497 #Create SeqRecord 498 #================ 499 #SeqRecord objects cannot be created with annotations, they 500 #must be added afterwards. So create an empty record and 501 #then populate it: 502 record = SeqRecord(seq=None) 503 annotations = record.annotations 504 505 #Should we add a location object to the annotations? 506 #I *think* that only makes sense for SeqFeatures with their 507 #sub features... 508 annotations['raw_location'] = location_string.replace(' ', '') 509 510 for (qualifier_name, qualifier_data) in qualifiers: 511 if qualifier_data is not None \ 512 and qualifier_data[0] == '"' and qualifier_data[-1] == '"': 513 #Remove quotes 514 qualifier_data = qualifier_data[1:-1] 515 #Append the data to the annotation qualifier... 516 if qualifier_name == "translation": 517 assert record.seq is None, "Multiple translations!" 518 record.seq = Seq(qualifier_data.replace("\n", ""), alphabet) 519 elif qualifier_name == "db_xref": 520 #its a list, possibly empty. Its safe to extend 521 record.dbxrefs.append(qualifier_data) 522 else: 523 if qualifier_data is not None: 524 qualifier_data = qualifier_data.replace("\n", " ").replace(" ", " ") 525 try: 526 annotations[qualifier_name] += " " + qualifier_data 527 except KeyError: 528 #Not an addition to existing data, its the first bit 529 annotations[qualifier_name] = qualifier_data 530 531 #Fill in the ID, Name, Description 532 #================================= 533 try: 534 record.id = annotations[tags2id[0]] 535 except KeyError: 536 pass 537 try: 538 record.name = annotations[tags2id[1]] 539 except KeyError: 540 pass 541 try: 542 record.description = annotations[tags2id[2]] 543 except KeyError: 544 pass 545 546 yield record
547 548
549 -class EmblScanner(InsdcScanner):
550 """For extracting chunks of information in EMBL files""" 551 552 RECORD_START = "ID " 553 HEADER_WIDTH = 5 554 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", "FH"] 555 FEATURE_END_MARKERS = ["XX"] # XX can also mark the end of many things! 556 FEATURE_QUALIFIER_INDENT = 21 557 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2) 558 SEQUENCE_HEADERS = ["SQ", "CO"] # Remove trailing spaces 559 594
595 - def _feed_first_line(self, consumer, line):
596 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 597 if line[self.HEADER_WIDTH:].count(";") == 6: 598 #Looks like the semi colon separated style introduced in 2006 599 self._feed_first_line_new(consumer, line) 600 elif line[self.HEADER_WIDTH:].count(";") == 3: 601 #Looks like the pre 2006 style 602 self._feed_first_line_old(consumer, line) 603 else: 604 raise ValueError('Did not recognise the ID line layout:\n' + line)
605
606 - def _feed_first_line_old(self, consumer, line):
607 #Expects an ID line in the style before 2006, e.g. 608 #ID SC10H5 standard; DNA; PRO; 4870 BP. 609 #ID BSUB9999 standard; circular DNA; PRO; 4214630 BP. 610 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 611 fields = [line[self.HEADER_WIDTH:].split(None, 1)[0]] 612 fields.extend(line[self.HEADER_WIDTH:].split(None, 1)[1].split(";")) 613 fields = [entry.strip() for entry in fields] 614 """ 615 The tokens represent: 616 0. Primary accession number 617 (space sep) 618 1. ??? (e.g. standard) 619 (semi-colon) 620 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA') 621 3. Taxonomic division (e.g. 'PRO') 622 4. Sequence length (e.g. '4639675 BP.') 623 """ 624 consumer.locus(fields[0]) # Should we also call the accession consumer? 625 consumer.residue_type(fields[2]) 626 consumer.data_file_division(fields[3]) 627 self._feed_seq_length(consumer, fields[4])
628
629 - def _feed_first_line_new(self, consumer, line):
630 #Expects an ID line in the style introduced in 2006, e.g. 631 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 632 #ID CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP. 633 assert line[:self.HEADER_WIDTH].rstrip() == "ID" 634 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")] 635 assert len(fields) == 7 636 """ 637 The tokens represent: 638 0. Primary accession number 639 1. Sequence version number 640 2. Topology: 'circular' or 'linear' 641 3. Molecule type (e.g. 'genomic DNA') 642 4. Data class (e.g. 'STD') 643 5. Taxonomic division (e.g. 'PRO') 644 6. Sequence length (e.g. '4639675 BP.') 645 """ 646 647 consumer.locus(fields[0]) 648 649 #Call the accession consumer now, to make sure we record 650 #something as the record.id, in case there is no AC line 651 consumer.accession(fields[0]) 652 653 #TODO - How to deal with the version field? At the moment the consumer 654 #will try and use this for the ID which isn't ideal for EMBL files. 655 version_parts = fields[1].split() 656 if len(version_parts) == 2 \ 657 and version_parts[0] == "SV" \ 658 and version_parts[1].isdigit(): 659 consumer.version_suffix(version_parts[1]) 660 661 #Based on how the old GenBank parser worked, merge these two: 662 consumer.residue_type(" ".join(fields[2:4])) # TODO - Store as two fields? 663 664 #consumer.xxx(fields[4]) #TODO - What should we do with the data class? 665 666 consumer.data_file_division(fields[5]) 667 668 self._feed_seq_length(consumer, fields[6])
669
670 - def _feed_seq_length(self, consumer, text):
671 length_parts = text.split() 672 assert len(length_parts) == 2 673 assert length_parts[1].upper() in ["BP", "BP.", "AA."] 674 consumer.size(length_parts[0])
675
676 - def _feed_header_lines(self, consumer, lines):
677 EMBL_INDENT = self.HEADER_WIDTH 678 EMBL_SPACER = " " * EMBL_INDENT 679 consumer_dict = { 680 'AC': 'accession', 681 'SV': 'version', # SV line removed in June 2006, now part of ID line 682 'DE': 'definition', 683 #'RN' : 'reference_num', 684 #'RC' : reference comment... TODO 685 #'RP' : 'reference_bases', 686 #'RX' : reference cross reference... DOI or Pubmed 687 'RG': 'consrtm', # optional consortium 688 #'RA' : 'authors', 689 #'RT' : 'title', 690 'RL': 'journal', 691 'OS': 'organism', 692 'OC': 'taxonomy', 693 #'DR' : data reference 694 'CC': 'comment', 695 #'XX' : splitter 696 } 697 #We have to handle the following specially: 698 #RX (depending on reference type...) 699 for line in lines: 700 line_type = line[:EMBL_INDENT].strip() 701 data = line[EMBL_INDENT:].strip() 702 if line_type == 'XX': 703 pass 704 elif line_type == 'RN': 705 # Reformat reference numbers for the GenBank based consumer 706 # e.g. '[1]' becomes '1' 707 if data[0] == "[" and data[-1] == "]": 708 data = data[1:-1] 709 consumer.reference_num(data) 710 elif line_type == 'RP': 711 # Reformat reference numbers for the GenBank based consumer 712 # e.g. '1-4639675' becomes '(bases 1 to 4639675)' 713 # and '160-550, 904-1055' becomes '(bases 160 to 550; 904 to 1055)' 714 # Note could be multi-line, and end with a comma 715 parts = [bases.replace("-", " to ").strip() for bases in data.split(",") if bases.strip()] 716 consumer.reference_bases("(bases %s)" % "; ".join(parts)) 717 elif line_type == 'RT': 718 #Remove the enclosing quotes and trailing semi colon. 719 #Note the title can be split over multiple lines. 720 if data.startswith('"'): 721 data = data[1:] 722 if data.endswith('";'): 723 data = data[:-2] 724 consumer.title(data) 725 elif line_type == 'RX': 726 # EMBL support three reference types at the moment: 727 # - PUBMED PUBMED bibliographic database (NLM) 728 # - DOI Digital Object Identifier (International DOI Foundation) 729 # - AGRICOLA US National Agriculture Library (NAL) of the US Department 730 # of Agriculture (USDA) 731 # 732 # Format: 733 # RX resource_identifier; identifier. 734 # 735 # e.g. 736 # RX DOI; 10.1016/0024-3205(83)90010-3. 737 # RX PUBMED; 264242. 738 # 739 # Currently our reference object only supports PUBMED and MEDLINE 740 # (as these were in GenBank files?). 741 key, value = data.split(";", 1) 742 if value.endswith("."): 743 value = value[:-1] 744 value = value.strip() 745 if key == "PUBMED": 746 consumer.pubmed_id(value) 747 #TODO - Handle other reference types (here and in BioSQL bindings) 748 elif line_type == 'CC': 749 # Have to pass a list of strings for this one (not just a string) 750 consumer.comment([data]) 751 elif line_type == 'DR': 752 # Database Cross-reference, format: 753 # DR database_identifier; primary_identifier; secondary_identifier. 754 # 755 # e.g. 756 # DR MGI; 98599; Tcrb-V4. 757 # 758 # TODO - How should we store any secondary identifier? 759 parts = data.rstrip(".").split(";") 760 #Turn it into "database_identifier:primary_identifier" to 761 #mimic the GenBank parser. e.g. "MGI:98599" 762 consumer.dblink("%s:%s" % (parts[0].strip(), 763 parts[1].strip())) 764 elif line_type == 'RA': 765 # Remove trailing ; at end of authors list 766 consumer.authors(data.rstrip(";")) 767 elif line_type == 'PR': 768 # Remove trailing ; at end of the project reference 769 # In GenBank files this corresponds to the old PROJECT 770 # line which is being replaced with the DBLINK line. 771 consumer.project(data.rstrip(";")) 772 elif line_type in consumer_dict: 773 #Its a semi-automatic entry! 774 getattr(consumer, consumer_dict[line_type])(data) 775 else: 776 if self.debug: 777 print "Ignoring EMBL header line:\n%s" % line
778
779 - def _feed_misc_lines(self, consumer, lines):
780 #TODO - Should we do something with the information on the SQ line(s)? 781 lines.append("") 782 line_iter = iter(lines) 783 try: 784 for line in line_iter: 785 if line.startswith("CO "): 786 line = line[5:].strip() 787 contig_location = line 788 while True: 789 line = line_iter.next() 790 if not line: 791 break 792 elif line.startswith("CO "): 793 #Don't need to preseve the whitespace here. 794 contig_location += line[5:].strip() 795 else: 796 raise ValueError('Expected CO (contig) continuation line, got:\n' + line) 797 consumer.contig_location(contig_location) 798 return 799 except StopIteration: 800 raise ValueError("Problem in misc lines before sequence")
801 802
803 -class _ImgtScanner(EmblScanner):
804 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE). 805 806 IMGT files are like EMBL files but in order to allow longer feature types 807 the features should be indented by 25 characters not 21 characters. In 808 practice the IMGT flat files tend to use either 21 or 25 characters, so we 809 must cope with both. 810 811 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank. 812 """ 813 814 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", 815 "FH Key Location/Qualifiers (from EMBL)", 816 "FH Key Location/Qualifiers", 817 "FH"] 818
819 - def parse_features(self, skip=False):
820 """Return list of tuples for the features (if present) 821 822 Each feature is returned as a tuple (key, location, qualifiers) 823 where key and location are strings (e.g. "CDS" and 824 "complement(join(490883..490885,1..879))") while qualifiers 825 is a list of two string tuples (feature qualifier keys and values). 826 827 Assumes you have already read to the start of the features table. 828 """ 829 if self.line.rstrip() not in self.FEATURE_START_MARKERS: 830 if self.debug: 831 print "Didn't find any feature table" 832 return [] 833 834 while self.line.rstrip() in self.FEATURE_START_MARKERS: 835 self.line = self.handle.readline() 836 837 bad_position_re = re.compile(r'([0-9]+)>{1}') 838 839 features = [] 840 line = self.line 841 while True: 842 if not line: 843 raise ValueError("Premature end of line during features table") 844 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS: 845 if self.debug: 846 print "Found start of sequence" 847 break 848 line = line.rstrip() 849 if line == "//": 850 raise ValueError("Premature end of features table, marker '//' found") 851 if line in self.FEATURE_END_MARKERS: 852 if self.debug: 853 print "Found end of features" 854 line = self.handle.readline() 855 break 856 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "": 857 #This is an empty feature line between qualifiers. Empty 858 #feature lines within qualifiers are handled below (ignored). 859 line = self.handle.readline() 860 continue 861 862 if skip: 863 line = self.handle.readline() 864 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER: 865 line = self.handle.readline() 866 else: 867 assert line[:2] == "FT" 868 try: 869 feature_key, location_start = line[2:].strip().split() 870 except ValueError: 871 #e.g. "FT TRANSMEMBRANE-REGION2163..2240\n" 872 #Assume indent of 25 as per IMGT spec, with the location 873 #start in column 26 (one-based). 874 feature_key = line[2:25].strip() 875 location_start = line[25:].strip() 876 feature_lines = [location_start] 877 line = self.handle.readline() 878 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \ 879 or line.rstrip() == "": # cope with blank lines in the midst of a feature 880 #Use strip to remove any harmless trailing white space AND and leading 881 #white space (copes with 21 or 26 indents and orther variants) 882 assert line[:2] == "FT" 883 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip()) 884 line = self.handle.readline() 885 feature_key, location, qualifiers = \ 886 self.parse_feature(feature_key, feature_lines) 887 #Try to handle known problems with IMGT locations here: 888 if ">" in location: 889 #Nasty hack for common IMGT bug, should be >123 not 123> 890 #in a location string. At least here the meaning is clear, 891 #and since it is so common I don't want to issue a warning 892 #warnings.warn("Feature location %s is invalid, " 893 # "moving greater than sign before position" 894 # % location) 895 location = bad_position_re.sub(r'>\1', location) 896 features.append((feature_key, location, qualifiers)) 897 self.line = line 898 return features
899 900
901 -class GenBankScanner(InsdcScanner):
902 """For extracting chunks of information in GenBank files""" 903 904 RECORD_START = "LOCUS " 905 HEADER_WIDTH = 12 906 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"] 907 FEATURE_END_MARKERS = [] 908 FEATURE_QUALIFIER_INDENT = 21 909 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 910 SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"] # trailing spaces removed 911 961
962 - def _feed_first_line(self, consumer, line):
963 """Scan over and parse GenBank LOCUS line (PRIVATE). 964 965 This must cope with several variants, primarily the old and new column 966 based standards from GenBank. Additionally EnsEMBL produces GenBank 967 files where the LOCUS line is space separated rather that following 968 the column based layout. 969 970 We also try to cope with GenBank like files with partial LOCUS lines. 971 """ 972 ##################################### 973 # LOCUS line # 974 ##################################### 975 GENBANK_INDENT = self.HEADER_WIDTH 976 GENBANK_SPACER = " " * GENBANK_INDENT 977 assert line[0:GENBANK_INDENT] == 'LOCUS ', \ 978 'LOCUS line does not start correctly:\n' + line 979 980 #Have to break up the locus line, and handle the different bits of it. 981 #There are at least two different versions of the locus line... 982 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ': 983 #Old... note we insist on the 55:62 being empty to avoid trying 984 #to parse space separated LOCUS lines from Ensembl etc, see below. 985 # 986 # Positions Contents 987 # --------- -------- 988 # 00:06 LOCUS 989 # 06:12 spaces 990 # 12:?? Locus name 991 # ??:?? space 992 # ??:29 Length of sequence, right-justified 993 # 29:33 space, bp, space 994 # 33:41 strand type 995 # 41:42 space 996 # 42:51 Blank (implies linear), linear or circular 997 # 51:52 space 998 # 52:55 The division code (e.g. BCT, VRL, INV) 999 # 55:62 space 1000 # 62:73 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1001 # 1002 #assert line[29:33] in [' bp ', ' aa ',' rc '] , \ 1003 # 'LOCUS line does not contain size units at expected position:\n' + line 1004 assert line[41:42] == ' ', \ 1005 'LOCUS line does not contain space at position 42:\n' + line 1006 assert line[42:51].strip() in ['', 'linear', 'circular'], \ 1007 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1008 assert line[51:52] == ' ', \ 1009 'LOCUS line does not contain space at position 52:\n' + line 1010 #assert line[55:62] == ' ', \ 1011 # 'LOCUS line does not contain spaces from position 56 to 62:\n' + line 1012 if line[62:73].strip(): 1013 assert line[64:65] == '-', \ 1014 'LOCUS line does not contain - at position 65 in date:\n' + line 1015 assert line[68:69] == '-', \ 1016 'LOCUS line does not contain - at position 69 in date:\n' + line 1017 1018 name_and_length_str = line[GENBANK_INDENT:29] 1019 while ' ' in name_and_length_str: 1020 name_and_length_str = name_and_length_str.replace(' ', ' ') 1021 name_and_length = name_and_length_str.split(' ') 1022 assert len(name_and_length) <= 2, \ 1023 'Cannot parse the name and length in the LOCUS line:\n' + line 1024 assert len(name_and_length) != 1, \ 1025 'Name and length collide in the LOCUS line:\n' + line 1026 #Should be possible to split them based on position, if 1027 #a clear definition of the standard exists THAT AGREES with 1028 #existing files. 1029 consumer.locus(name_and_length[0]) 1030 consumer.size(name_and_length[1]) 1031 #consumer.residue_type(line[33:41].strip()) 1032 1033 if line[33:51].strip() == "" and line[29:33] == ' aa ': 1034 #Amino acids -> protein (even if there is no residue type given) 1035 #We want to use a protein alphabet in this case, rather than a 1036 #generic one. Not sure if this is the best way to achieve this, 1037 #but it works because the scanner checks for this: 1038 consumer.residue_type("PROTEIN") 1039 else: 1040 consumer.residue_type(line[33:51].strip()) 1041 1042 consumer.data_file_division(line[52:55]) 1043 if line[62:73].strip(): 1044 consumer.date(line[62:73]) 1045 elif line[40:44] in [' bp ', ' aa ', ' rc '] \ 1046 and line[54:64].strip() in ['', 'linear', 'circular']: 1047 #New... linear/circular/big blank test should avoid EnsEMBL style 1048 #LOCUS line being treated like a proper column based LOCUS line. 1049 # 1050 # Positions Contents 1051 # --------- -------- 1052 # 00:06 LOCUS 1053 # 06:12 spaces 1054 # 12:?? Locus name 1055 # ??:?? space 1056 # ??:40 Length of sequence, right-justified 1057 # 40:44 space, bp, space 1058 # 44:47 Blank, ss-, ds-, ms- 1059 # 47:54 Blank, DNA, RNA, tRNA, mRNA, uRNA, snRNA, cDNA 1060 # 54:55 space 1061 # 55:63 Blank (implies linear), linear or circular 1062 # 63:64 space 1063 # 64:67 The division code (e.g. BCT, VRL, INV) 1064 # 67:68 space 1065 # 68:79 Date, in the form dd-MMM-yyyy (e.g., 15-MAR-1991) 1066 # 1067 assert line[40:44] in [' bp ', ' aa ', ' rc '], \ 1068 'LOCUS line does not contain size units at expected position:\n' + line 1069 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \ 1070 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line 1071 assert line[47:54].strip() == "" \ 1072 or 'DNA' in line[47:54].strip().upper() \ 1073 or 'RNA' in line[47:54].strip().upper(), \ 1074 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line 1075 assert line[54:55] == ' ', \ 1076 'LOCUS line does not contain space at position 55:\n' + line 1077 assert line[55:63].strip() in ['', 'linear', 'circular'], \ 1078 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line 1079 assert line[63:64] == ' ', \ 1080 'LOCUS line does not contain space at position 64:\n' + line 1081 assert line[67:68] == ' ', \ 1082 'LOCUS line does not contain space at position 68:\n' + line 1083 if line[68:79].strip(): 1084 assert line[70:71] == '-', \ 1085 'LOCUS line does not contain - at position 71 in date:\n' + line 1086 assert line[74:75] == '-', \ 1087 'LOCUS line does not contain - at position 75 in date:\n' + line 1088 1089 name_and_length_str = line[GENBANK_INDENT:40] 1090 while ' ' in name_and_length_str: 1091 name_and_length_str = name_and_length_str.replace(' ', ' ') 1092 name_and_length = name_and_length_str.split(' ') 1093 assert len(name_and_length) <= 2, \ 1094 'Cannot parse the name and length in the LOCUS line:\n' + line 1095 assert len(name_and_length) != 1, \ 1096 'Name and length collide in the LOCUS line:\n' + line 1097 #Should be possible to split them based on position, if 1098 #a clear definition of the stand exists THAT AGREES with 1099 #existing files. 1100 consumer.locus(name_and_length[0]) 1101 consumer.size(name_and_length[1]) 1102 1103 if line[44:54].strip() == "" and line[40:44] == ' aa ': 1104 #Amino acids -> protein (even if there is no residue type given) 1105 #We want to use a protein alphabet in this case, rather than a 1106 #generic one. Not sure if this is the best way to achieve this, 1107 #but it works because the scanner checks for this: 1108 consumer.residue_type(("PROTEIN " + line[54:63]).strip()) 1109 else: 1110 consumer.residue_type(line[44:63].strip()) 1111 1112 consumer.data_file_division(line[64:67]) 1113 if line[68:79].strip(): 1114 consumer.date(line[68:79]) 1115 elif line[GENBANK_INDENT:].strip().count(" ") == 0: 1116 #Truncated LOCUS line, as produced by some EMBOSS tools - see bug 1762 1117 # 1118 #e.g. 1119 # 1120 # "LOCUS U00096" 1121 # 1122 #rather than: 1123 # 1124 # "LOCUS U00096 4639675 bp DNA circular BCT" 1125 # 1126 # Positions Contents 1127 # --------- -------- 1128 # 00:06 LOCUS 1129 # 06:12 spaces 1130 # 12:?? Locus name 1131 if line[GENBANK_INDENT:].strip() != "": 1132 consumer.locus(line[GENBANK_INDENT:].strip()) 1133 else: 1134 #Must just have just "LOCUS ", is this even legitimate? 1135 #We should be able to continue parsing... we need real world testcases! 1136 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line) 1137 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]: 1138 #Cope with EnsEMBL genbank files which use space separation rather 1139 #than the expected column based layout. e.g. 1140 #LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011 1141 #LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011 1142 #LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011 1143 #LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011 1144 #Notice that the 'bp' can occur in the position expected by either 1145 #the old or the new fixed column standards (parsed above). 1146 splitline = line.split() 1147 consumer.locus(splitline[1]) 1148 consumer.size(splitline[2]) 1149 consumer.residue_type(splitline[4]) 1150 consumer.data_file_division(splitline[5]) 1151 consumer.date(splitline[6]) 1152 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]: 1153 #Cope with EMBOSS seqret output where it seems the locus id can cause 1154 #the other fields to overflow. We just IGNORE the other fields! 1155 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1156 consumer.locus(line.split()[1]) 1157 consumer.size(line.split()[2]) 1158 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]: 1159 #Cope with pseudo-GenBank files like this: 1160 # "LOCUS RNA5 complete 1718 bp" 1161 #Treat everything between LOCUS and the size as the identifier. 1162 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line) 1163 consumer.locus(line[5:].rsplit(None, 2)[0].strip()) 1164 consumer.size(line.split()[-2]) 1165 else: 1166 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1167
1168 - def _feed_header_lines(self, consumer, lines):
1169 #Following dictionary maps GenBank lines to the associated 1170 #consumer methods - the special cases like LOCUS where one 1171 #genbank line triggers several consumer calls have to be 1172 #handled individually. 1173 GENBANK_INDENT = self.HEADER_WIDTH 1174 GENBANK_SPACER = " " * GENBANK_INDENT 1175 consumer_dict = { 1176 'DEFINITION': 'definition', 1177 'ACCESSION': 'accession', 1178 'NID': 'nid', 1179 'PID': 'pid', 1180 'DBSOURCE': 'db_source', 1181 'KEYWORDS': 'keywords', 1182 'SEGMENT': 'segment', 1183 'SOURCE': 'source', 1184 'AUTHORS': 'authors', 1185 'CONSRTM': 'consrtm', 1186 'PROJECT': 'project', 1187 'DBLINK': 'dblink', 1188 'TITLE': 'title', 1189 'JOURNAL': 'journal', 1190 'MEDLINE': 'medline_id', 1191 'PUBMED': 'pubmed_id', 1192 'REMARK': 'remark'} 1193 #We have to handle the following specially: 1194 #ORIGIN (locus, size, residue_type, data_file_division and date) 1195 #COMMENT (comment) 1196 #VERSION (version and gi) 1197 #REFERENCE (eference_num and reference_bases) 1198 #ORGANISM (organism and taxonomy) 1199 lines = filter(None, lines) 1200 lines.append("") # helps avoid getting StopIteration all the time 1201 line_iter = iter(lines) 1202 try: 1203 line = line_iter.next() 1204 while True: 1205 if not line: 1206 break 1207 line_type = line[:GENBANK_INDENT].strip() 1208 data = line[GENBANK_INDENT:].strip() 1209 1210 if line_type == 'VERSION': 1211 #Need to call consumer.version(), and maybe also consumer.gi() as well. 1212 #e.g. 1213 # VERSION AC007323.5 GI:6587720 1214 while ' ' in data: 1215 data = data.replace(' ', ' ') 1216 if ' GI:' not in data: 1217 consumer.version(data) 1218 else: 1219 if self.debug: 1220 print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]" 1221 consumer.version(data.split(' GI:')[0]) 1222 consumer.gi(data.split(' GI:')[1]) 1223 #Read in the next line! 1224 line = line_iter.next() 1225 elif line_type == 'REFERENCE': 1226 if self.debug > 1: 1227 print "Found reference [" + data + "]" 1228 #Need to call consumer.reference_num() and consumer.reference_bases() 1229 #e.g. 1230 # REFERENCE 1 (bases 1 to 86436) 1231 # 1232 #Note that this can be multiline, see Bug 1968, e.g. 1233 # 1234 # REFERENCE 42 (bases 1517 to 1696; 3932 to 4112; 17880 to 17975; 21142 to 1235 # 28259) 1236 # 1237 #For such cases we will call the consumer once only. 1238 data = data.strip() 1239 1240 #Read in the next line, and see if its more of the reference: 1241 while True: 1242 line = line_iter.next() 1243 if line[:GENBANK_INDENT] == GENBANK_SPACER: 1244 #Add this continuation to the data string 1245 data += " " + line[GENBANK_INDENT:] 1246 if self.debug > 1: 1247 print "Extended reference text [" + data + "]" 1248 else: 1249 #End of the reference, leave this text in the variable "line" 1250 break 1251 1252 #We now have all the reference line(s) stored in a string, data, 1253 #which we pass to the consumer 1254 while ' ' in data: 1255 data = data.replace(' ', ' ') 1256 if ' ' not in data: 1257 if self.debug > 2: 1258 print 'Reference number \"' + data + '\"' 1259 consumer.reference_num(data) 1260 else: 1261 if self.debug > 2: 1262 print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"' 1263 consumer.reference_num(data[:data.find(' ')]) 1264 consumer.reference_bases(data[data.find(' ') + 1:]) 1265 elif line_type == 'ORGANISM': 1266 #Typically the first line is the organism, and subsequent lines 1267 #are the taxonomy lineage. However, given longer and longer 1268 #species names (as more and more strains and sub strains get 1269 #sequenced) the oragnism name can now get wrapped onto multiple 1270 #lines. The NCBI say we have to recognise the lineage line by 1271 #the presence of semi-colon delimited entries. In the long term, 1272 #they are considering adding a new keyword (e.g. LINEAGE). 1273 #See Bug 2591 for details. 1274 organism_data = data 1275 lineage_data = "" 1276 while True: 1277 line = line_iter.next() 1278 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1279 if lineage_data or ";" in line: 1280 lineage_data += " " + line[GENBANK_INDENT:] 1281 else: 1282 organism_data += " " + line[GENBANK_INDENT:].strip() 1283 else: 1284 #End of organism and taxonomy 1285 break 1286 consumer.organism(organism_data) 1287 if lineage_data.strip() == "" and self.debug > 1: 1288 print "Taxonomy line(s) missing or blank" 1289 consumer.taxonomy(lineage_data.strip()) 1290 del organism_data, lineage_data 1291 elif line_type == 'COMMENT': 1292 if self.debug > 1: 1293 print "Found comment" 1294 #This can be multiline, and should call consumer.comment() once 1295 #with a list where each entry is a line. 1296 comment_list = [] 1297 comment_list.append(data) 1298 while True: 1299 line = line_iter.next() 1300 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1301 data = line[GENBANK_INDENT:] 1302 comment_list.append(data) 1303 if self.debug > 2: 1304 print "Comment continuation [" + data + "]" 1305 else: 1306 #End of the comment 1307 break 1308 consumer.comment(comment_list) 1309 del comment_list 1310 elif line_type in consumer_dict: 1311 #Its a semi-automatic entry! 1312 #Now, this may be a multi line entry... 1313 while True: 1314 line = line_iter.next() 1315 if line[0:GENBANK_INDENT] == GENBANK_SPACER: 1316 data += ' ' + line[GENBANK_INDENT:] 1317 else: 1318 #We now have all the data for this entry: 1319 getattr(consumer, consumer_dict[line_type])(data) 1320 #End of continuation - return to top of loop! 1321 break 1322 else: 1323 if self.debug: 1324 print "Ignoring GenBank header line:\n" % line 1325 #Read in next line 1326 line = line_iter.next() 1327 except StopIteration: 1328 raise ValueError("Problem in header")
1329
1330 - def _feed_misc_lines(self, consumer, lines):
1331 #Deals with a few misc lines between the features and the sequence 1332 GENBANK_INDENT = self.HEADER_WIDTH 1333 GENBANK_SPACER = " " * GENBANK_INDENT 1334 lines.append("") 1335 line_iter = iter(lines) 1336 try: 1337 for line in line_iter: 1338 if line.startswith('BASE COUNT'): 1339 line = line[10:].strip() 1340 if line: 1341 if self.debug: 1342 print "base_count = " + line 1343 consumer.base_count(line) 1344 if line.startswith('ORIGIN'): 1345 line = line[6:].strip() 1346 if line: 1347 if self.debug: 1348 print "origin_name = " + line 1349 consumer.origin_name(line) 1350 if line.startswith('WGS '): 1351 line = line[3:].strip() 1352 consumer.wgs(line) 1353 if line.startswith('WGS_SCAFLD'): 1354 line = line[10:].strip() 1355 consumer.add_wgs_scafld(line) 1356 if line.startswith('CONTIG'): 1357 line = line[6:].strip() 1358 contig_location = line 1359 while True: 1360 line = line_iter.next() 1361 if not line: 1362 break 1363 elif line[:GENBANK_INDENT] == GENBANK_SPACER: 1364 #Don't need to preseve the whitespace here. 1365 contig_location += line[GENBANK_INDENT:].rstrip() 1366 else: 1367 raise ValueError('Expected CONTIG continuation line, got:\n' + line) 1368 consumer.contig_location(contig_location) 1369 return 1370 except StopIteration: 1371 raise ValueError("Problem in misc lines before sequence")
1372 1373 if __name__ == "__main__": 1374 from StringIO import StringIO 1375 1376 gbk_example = \ 1377 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999 1378 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p 1379 (AXL2) and Rev7p (REV7) genes, complete cds. 1380 ACCESSION U49845 1381 VERSION U49845.1 GI:1293613 1382 KEYWORDS . 1383 SOURCE Saccharomyces cerevisiae (baker's yeast) 1384 ORGANISM Saccharomyces cerevisiae 1385 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; 1386 Saccharomycetales; Saccharomycetaceae; Saccharomyces. 1387 REFERENCE 1 (bases 1 to 5028) 1388 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W. 1389 TITLE Cloning and sequence of REV7, a gene whose function is required for 1390 DNA damage-induced mutagenesis in Saccharomyces cerevisiae 1391 JOURNAL Yeast 10 (11), 1503-1509 (1994) 1392 PUBMED 7871890 1393 REFERENCE 2 (bases 1 to 5028) 1394 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M. 1395 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel 1396 plasma membrane glycoprotein 1397 JOURNAL Genes Dev. 10 (7), 777-793 (1996) 1398 PUBMED 8846915 1399 REFERENCE 3 (bases 1 to 5028) 1400 AUTHORS Roemer,T. 1401 TITLE Direct Submission 1402 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New 1403 Haven, CT, USA 1404 FEATURES Location/Qualifiers 1405 source 1..5028 1406 /organism="Saccharomyces cerevisiae" 1407 /db_xref="taxon:4932" 1408 /chromosome="IX" 1409 /map="9" 1410 CDS <1..206 1411 /codon_start=3 1412 /product="TCP1-beta" 1413 /protein_id="AAA98665.1" 1414 /db_xref="GI:1293614" 1415 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA 1416 AEVLLRVDNIIRARPRTANRQHM" 1417 gene 687..3158 1418 /gene="AXL2" 1419 CDS 687..3158 1420 /gene="AXL2" 1421 /note="plasma membrane glycoprotein" 1422 /codon_start=1 1423 /function="required for axial budding pattern of S. 1424 cerevisiae" 1425 /product="Axl2p" 1426 /protein_id="AAA98666.1" 1427 /db_xref="GI:1293615" 1428 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF 1429 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN 1430 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE 1431 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE 1432 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV 1433 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG 1434 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ 1435 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA 1436 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA 1437 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN 1438 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ 1439 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS 1440 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK 1441 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL 1442 VDFSNKSNVNVGQVKDIHGRIPEML" 1443 gene complement(3300..4037) 1444 /gene="REV7" 1445 CDS complement(3300..4037) 1446 /gene="REV7" 1447 /codon_start=1 1448 /product="Rev7p" 1449 /protein_id="AAA98667.1" 1450 /db_xref="GI:1293616" 1451 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ 1452 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD 1453 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR 1454 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK 1455 LISGDDKILNGVYSQYEEGESIFGSLF" 1456 ORIGIN 1457 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg 1458 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct 1459 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa 1460 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg 1461 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa 1462 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa 1463 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat 1464 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga 1465 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc 1466 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga 1467 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta 1468 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag 1469 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa 1470 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata 1471 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga 1472 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac 1473 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg 1474 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc 1475 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa 1476 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca 1477 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac 1478 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa 1479 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag 1480 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct 1481 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac 1482 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa 1483 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc 1484 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata 1485 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca 1486 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc 1487 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc 1488 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca 1489 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc 1490 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg 1491 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt 1492 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc 1493 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg 1494 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca 1495 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata 1496 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg 1497 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga 1498 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt 1499 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat 1500 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt 1501 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc 1502 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag 1503 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta 1504 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa 1505 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact 1506 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt 1507 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa 1508 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag 1509 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct 1510 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt 1511 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact 1512 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa 1513 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg 1514 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt 1515 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc 1516 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca 1517 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc 1518 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc 1519 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat 1520 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa 1521 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga 1522 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat 1523 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc 1524 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc 1525 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa 1526 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg 1527 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc 1528 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt 1529 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg 1530 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg 1531 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt 1532 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt 1533 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat 1534 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc 1535 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct 1536 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta 1537 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac 1538 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct 1539 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct 1540 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc 1541 //""" 1542 1543 # GenBank format protein (aka GenPept) file from: 1544 # http://www.molecularevolution.org/resources/fileformats/ 1545 gbk_example2 = \ 1546 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001 1547 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica]. 1548 ACCESSION AAD51968 1549 VERSION AAD51968.1 GI:5805369 1550 DBSOURCE locus AF171097 accession AF171097.1 1551 KEYWORDS . 1552 SOURCE Yersinia enterocolitica 1553 ORGANISM Yersinia enterocolitica 1554 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; 1555 Enterobacteriaceae; Yersinia. 1556 REFERENCE 1 (residues 1 to 143) 1557 AUTHORS Revell,P.A. and Miller,V.L. 1558 TITLE A chromosomally encoded regulator is required for expression of the 1559 Yersinia enterocolitica inv gene and for virulence 1560 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000) 1561 MEDLINE 20138369 1562 PUBMED 10672189 1563 REFERENCE 2 (residues 1 to 143) 1564 AUTHORS Revell,P.A. and Miller,V.L. 1565 TITLE Direct Submission 1566 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington 1567 University School of Medicine, Campus Box 8230, 660 South Euclid, 1568 St. Louis, MO 63110, USA 1569 COMMENT Method: conceptual translation. 1570 FEATURES Location/Qualifiers 1571 source 1..143 1572 /organism="Yersinia enterocolitica" 1573 /mol_type="unassigned DNA" 1574 /strain="JB580v" 1575 /serotype="O:8" 1576 /db_xref="taxon:630" 1577 Protein 1..143 1578 /product="transcriptional regulator RovA" 1579 /name="regulates inv expression" 1580 CDS 1..143 1581 /gene="rovA" 1582 /coded_by="AF171097.1:380..811" 1583 /note="regulator of virulence" 1584 /transl_table=11 1585 ORIGIN 1586 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq 1587 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp 1588 121 deiellsgli dklerniiql qsk 1589 // 1590 """ 1591 1592 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 1593 XX 1594 AC X56734; S46826; 1595 XX 1596 DT 12-SEP-1991 (Rel. 29, Created) 1597 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11) 1598 XX 1599 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase 1600 XX 1601 KW beta-glucosidase. 1602 XX 1603 OS Trifolium repens (white clover) 1604 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; 1605 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids; 1606 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium. 1607 XX 1608 RN [5] 1609 RP 1-1859 1610 RX PUBMED; 1907511. 1611 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.; 1612 RT "Nucleotide and derived amino acid sequence of the cyanogenic 1613 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)"; 1614 RL Plant Mol. Biol. 17(2):209-219(1991). 1615 XX 1616 RN [6] 1617 RP 1-1859 1618 RA Hughes M.A.; 1619 RT ; 1620 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases. 1621 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle 1622 RL Upon Tyne, NE2 4HH, UK 1623 XX 1624 FH Key Location/Qualifiers 1625 FH 1626 FT source 1..1859 1627 FT /organism="Trifolium repens" 1628 FT /mol_type="mRNA" 1629 FT /clone_lib="lambda gt10" 1630 FT /clone="TRE361" 1631 FT /tissue_type="leaves" 1632 FT /db_xref="taxon:3899" 1633 FT CDS 14..1495 1634 FT /product="beta-glucosidase" 1635 FT /EC_number="3.2.1.21" 1636 FT /note="non-cyanogenic" 1637 FT /db_xref="GOA:P26204" 1638 FT /db_xref="InterPro:IPR001360" 1639 FT /db_xref="InterPro:IPR013781" 1640 FT /db_xref="UniProtKB/Swiss-Prot:P26204" 1641 FT /protein_id="CAA40058.1" 1642 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI 1643 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK 1644 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ 1645 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR 1646 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD 1647 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF 1648 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ 1649 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA 1650 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD" 1651 FT mRNA 1..1859 1652 FT /experiment="experimental evidence, no additional details 1653 FT recorded" 1654 XX 1655 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other; 1656 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60 1657 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120 1658 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180 1659 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240 1660 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300 1661 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360 1662 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420 1663 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480 1664 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540 1665 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600 1666 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660 1667 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720 1668 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780 1669 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840 1670 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900 1671 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960 1672 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020 1673 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080 1674 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140 1675 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200 1676 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260 1677 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320 1678 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380 1679 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440 1680 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500 1681 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560 1682 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620 1683 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680 1684 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740 1685 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800 1686 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859 1687 // 1688 """ 1689 1690 print "GenBank CDS Iteration" 1691 print "=====================" 1692 1693 g = GenBankScanner() 1694 for record in g.parse_cds_features(StringIO(gbk_example)): 1695 print record 1696 1697 g = GenBankScanner() 1698 for record in g.parse_cds_features(StringIO(gbk_example2), 1699 tags2id=('gene', 'locus_tag', 'product')): 1700 print record 1701 1702 g = GenBankScanner() 1703 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2), 1704 tags2id=('gene', 'locus_tag', 'product')): 1705 print record 1706 1707 print 1708 print "GenBank Iteration" 1709 print "=================" 1710 g = GenBankScanner() 1711 for record in g.parse_records(StringIO(gbk_example), do_features=False): 1712 print record.id, record.name, record.description 1713 print record.seq 1714 1715 g = GenBankScanner() 1716 for record in g.parse_records(StringIO(gbk_example), do_features=True): 1717 print record.id, record.name, record.description 1718 print record.seq 1719 1720 g = GenBankScanner() 1721 for record in g.parse_records(StringIO(gbk_example2), do_features=False): 1722 print record.id, record.name, record.description 1723 print record.seq 1724 1725 g = GenBankScanner() 1726 for record in g.parse_records(StringIO(gbk_example2), do_features=True): 1727 print record.id, record.name, record.description 1728 print record.seq 1729 1730 print 1731 print "EMBL CDS Iteration" 1732 print "==================" 1733 1734 e = EmblScanner() 1735 for record in e.parse_cds_features(StringIO(embl_example)): 1736 print record 1737 1738 print 1739 print "EMBL Iteration" 1740 print "==============" 1741 e = EmblScanner() 1742 for record in e.parse_records(StringIO(embl_example), do_features=True): 1743 print record.id, record.name, record.description 1744 print record.seq 1745