1
2
3
4
5
6 """Internal code for parsing GenBank and EMBL files (PRIVATE).
7
8 This code is NOT intended for direct use. It provides a basic scanner
9 (for use with a event consumer such as Bio.GenBank._FeatureConsumer)
10 to parse a GenBank or EMBL file (with their shared INSDC feature table).
11
12 It is used by Bio.GenBank to parse GenBank files
13 It is also used by Bio.SeqIO to parse GenBank and EMBL files
14
15 Feature Table Documentation:
16 http://www.insdc.org/files/feature_table.html
17 http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
18 ftp://ftp.ncbi.nih.gov/genbank/docs/
19 """
20
21
22
23
24
25
26
27
28
29 import warnings
30 import re
31 from Bio.Seq import Seq
32 from Bio.SeqRecord import SeqRecord
33 from Bio.Alphabet import generic_protein
34 from Bio import BiopythonParserWarning
35
36
38 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
39
40 The International Nucleotide Sequence Database Collaboration (INSDC)
41 between the DDBJ, EMBL, and GenBank. These organisations all use the
42 same "Feature Table" layout in their plain text flat file formats.
43
44 However, the header and sequence sections of an EMBL file are very
45 different in layout to those produced by GenBank/DDBJ."""
46
47
48 RECORD_START = "XXX"
49 HEADER_WIDTH = 3
50 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
51 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
52 FEATURE_QUALIFIER_INDENT = 0
53 FEATURE_QUALIFIER_SPACER = ""
54 SEQUENCE_HEADERS = ["XXX"]
55
63
67
69 """Read in lines until find the ID/LOCUS line, which is returned.
70
71 Any preamble (such as the header used by the NCBI on *.seq.gz archives)
72 will we ignored."""
73 while True:
74 if self.line:
75 line = self.line
76 self.line = ""
77 else:
78 line = self.handle.readline()
79 if not line:
80 if self.debug:
81 print "End of file"
82 return None
83 if line[:self.HEADER_WIDTH] == self.RECORD_START:
84 if self.debug > 1:
85 print "Found the start of a record:\n" + line
86 break
87 line = line.rstrip()
88 if line == "//":
89 if self.debug > 1:
90 print "Skipping // marking end of last record"
91 elif line == "":
92 if self.debug > 1:
93 print "Skipping blank line before record"
94 else:
95
96 if self.debug > 1:
97 print "Skipping header line before record:\n" + line
98 self.line = line
99 return line
100
102 """Return list of strings making up the header
103
104 New line characters are removed.
105
106 Assumes you have just read in the ID/LOCUS line.
107 """
108 assert self.line[:self.HEADER_WIDTH] == self.RECORD_START, \
109 "Not at start of record"
110
111 header_lines = []
112 while True:
113 line = self.handle.readline()
114 if not line:
115 raise ValueError("Premature end of line during sequence data")
116 line = line.rstrip()
117 if line in self.FEATURE_START_MARKERS:
118 if self.debug:
119 print "Found header table"
120 break
121
122
123
124 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
125 if self.debug:
126 print "Found start of sequence"
127 break
128 if line == "//":
129 raise ValueError("Premature end of sequence data marker '//' found")
130 header_lines.append(line)
131 self.line = line
132 return header_lines
133
202
204 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)
205
206 For example given this GenBank feature:
207
208 CDS complement(join(490883..490885,1..879))
209 /locus_tag="NEQ001"
210 /note="conserved hypothetical [Methanococcus jannaschii];
211 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
212 localization signal; IPR002743: Protein of unknown
213 function DUF57"
214 /codon_start=1
215 /transl_table=11
216 /product="hypothetical protein"
217 /protein_id="NP_963295.1"
218 /db_xref="GI:41614797"
219 /db_xref="GeneID:2732620"
220 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
221 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
222 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
223 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
224 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
225 LNSMGFGFVNTKKNSAR"
226
227 Then should give input key="CDS" and the rest of the data as a list of strings
228 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
229 where the leading spaces and trailing newlines have been removed.
230
231 Returns tuple containing: (key as string, location string, qualifiers as list)
232 as follows for this example:
233
234 key = "CDS", string
235 location = "complement(join(490883..490885,1..879))", string
236 qualifiers = list of string tuples:
237
238 [('locus_tag', '"NEQ001"'),
239 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
240 ('codon_start', '1'),
241 ('transl_table', '11'),
242 ('product', '"hypothetical protein"'),
243 ('protein_id', '"NP_963295.1"'),
244 ('db_xref', '"GI:41614797"'),
245 ('db_xref', '"GeneID:2732620"'),
246 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
247
248 In the above example, the "note" and "translation" were edited for compactness,
249 and they would contain multiple new line characters (displayed above as \n)
250
251 If a qualifier is quoted (in this case, everything except codon_start and
252 transl_table) then the quotes are NOT removed.
253
254 Note that no whitespace is removed.
255 """
256
257 iterator = iter(filter(None, lines))
258 try:
259 line = iterator.next()
260
261 feature_location = line.strip()
262 while feature_location[-1:] == ",":
263
264 line = iterator.next()
265 feature_location += line.strip()
266
267 qualifiers = []
268
269 for line_number, line in enumerate(iterator):
270
271 if line_number == 0 and line.startswith(")"):
272 feature_location += line.strip()
273 elif line[0] == "/":
274
275 i = line.find("=")
276 key = line[1:i]
277 value = line[i + 1:]
278 if i == -1:
279
280 key = line[1:]
281 qualifiers.append((key, None))
282 elif not value:
283
284 qualifiers.append((key, ""))
285 elif value == '"':
286
287 if self.debug:
288 print "Single quote %s:%s" % (key, value)
289
290 qualifiers.append((key, value))
291 elif value[0] == '"':
292
293 value_list = [value]
294 while value_list[-1][-1] != '"':
295 value_list.append(iterator.next())
296 value = '\n'.join(value_list)
297
298 qualifiers.append((key, value))
299 else:
300
301
302 qualifiers.append((key, value))
303 else:
304
305 assert len(qualifiers) > 0
306 assert key == qualifiers[-1][0]
307
308 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
309 return (feature_key, feature_location, qualifiers)
310 except StopIteration:
311
312 raise ValueError("Problem with '%s' feature:\n%s"
313 % (feature_key, "\n".join(lines)))
314
337
339 """Handle the LOCUS/ID line, passing data to the comsumer
340
341 This should be implemented by the EMBL / GenBank specific subclass
342
343 Used by the parse_records() and parse() methods.
344 """
345 pass
346
348 """Handle the header lines (list of strings), passing data to the comsumer
349
350 This should be implemented by the EMBL / GenBank specific subclass
351
352 Used by the parse_records() and parse() methods.
353 """
354 pass
355
357 """Handle the feature table (list of tuples), passing data to the comsumer
358
359 Used by the parse_records() and parse() methods.
360 """
361 consumer.start_feature_table()
362 for feature_key, location_string, qualifiers in feature_tuples:
363 consumer.feature_key(feature_key)
364 consumer.location(location_string)
365 for q_key, q_value in qualifiers:
366 if q_value is None:
367 consumer.feature_qualifier(q_key, q_value)
368 else:
369 consumer.feature_qualifier(q_key, q_value.replace("\n", " "))
370
372 """Handle any lines between features and sequence (list of strings), passing data to the consumer
373
374 This should be implemented by the EMBL / GenBank specific subclass
375
376 Used by the parse_records() and parse() methods.
377 """
378 pass
379
380 - def feed(self, handle, consumer, do_features=True):
381 """Feed a set of data into the consumer.
382
383 This method is intended for use with the "old" code in Bio.GenBank
384
385 Arguments:
386 handle - A handle with the information to parse.
387 consumer - The consumer that should be informed of events.
388 do_features - Boolean, should the features be parsed?
389 Skipping the features can be much faster.
390
391 Return values:
392 true - Passed a record
393 false - Did not find a record
394 """
395
396
397 self.set_handle(handle)
398 if not self.find_start():
399
400 consumer.data = None
401 return False
402
403
404
405
406
407
408 self._feed_first_line(consumer, self.line)
409 self._feed_header_lines(consumer, self.parse_header())
410
411
412 if do_features:
413 self._feed_feature_table(consumer, self.parse_features(skip=False))
414 else:
415 self.parse_features(skip=True)
416
417
418 misc_lines, sequence_string = self.parse_footer()
419 self._feed_misc_lines(consumer, misc_lines)
420
421 consumer.sequence(sequence_string)
422
423 consumer.record_end("//")
424
425 assert self.line == "//"
426
427
428 return True
429
430 - def parse(self, handle, do_features=True):
445
447 """Returns a SeqRecord object iterator
448
449 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
450
451 The SeqRecord objects include SeqFeatures if do_features=True
452
453 This method is intended for use in Bio.SeqIO
454 """
455
456 while True:
457 record = self.parse(handle, do_features)
458 if record is None:
459 break
460 if record.id is None:
461 raise ValueError("Failed to parse the record's ID. Invalid ID line?")
462 if record.name == "<unknown name>":
463 raise ValueError("Failed to parse the record's name. Invalid ID line?")
464 if record.description == "<unknown description>":
465 raise ValueError("Failed to parse the record's description")
466 yield record
467
471 """Returns SeqRecord object iterator
472
473 Each CDS feature becomes a SeqRecord.
474
475 alphabet - Used for any sequence found in a translation field.
476 tags2id - Tupple of three strings, the feature keys to use
477 for the record id, name and description,
478
479 This method is intended for use in Bio.SeqIO
480 """
481 self.set_handle(handle)
482 while self.find_start():
483
484 self.parse_header()
485 feature_tuples = self.parse_features()
486
487 while True:
488 line = self.handle.readline()
489 if not line:
490 break
491 if line[:2] == "//": break
492 self.line = line.rstrip()
493
494
495 for key, location_string, qualifiers in feature_tuples:
496 if key == "CDS":
497
498
499
500
501
502 record = SeqRecord(seq=None)
503 annotations = record.annotations
504
505
506
507
508 annotations['raw_location'] = location_string.replace(' ', '')
509
510 for (qualifier_name, qualifier_data) in qualifiers:
511 if qualifier_data is not None \
512 and qualifier_data[0] == '"' and qualifier_data[-1] == '"':
513
514 qualifier_data = qualifier_data[1:-1]
515
516 if qualifier_name == "translation":
517 assert record.seq is None, "Multiple translations!"
518 record.seq = Seq(qualifier_data.replace("\n", ""), alphabet)
519 elif qualifier_name == "db_xref":
520
521 record.dbxrefs.append(qualifier_data)
522 else:
523 if qualifier_data is not None:
524 qualifier_data = qualifier_data.replace("\n", " ").replace(" ", " ")
525 try:
526 annotations[qualifier_name] += " " + qualifier_data
527 except KeyError:
528
529 annotations[qualifier_name] = qualifier_data
530
531
532
533 try:
534 record.id = annotations[tags2id[0]]
535 except KeyError:
536 pass
537 try:
538 record.name = annotations[tags2id[1]]
539 except KeyError:
540 pass
541 try:
542 record.description = annotations[tags2id[2]]
543 except KeyError:
544 pass
545
546 yield record
547
548
550 """For extracting chunks of information in EMBL files"""
551
552 RECORD_START = "ID "
553 HEADER_WIDTH = 5
554 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers", "FH"]
555 FEATURE_END_MARKERS = ["XX"]
556 FEATURE_QUALIFIER_INDENT = 21
557 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT - 2)
558 SEQUENCE_HEADERS = ["SQ", "CO"]
559
594
605
607
608
609
610 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
611 fields = [line[self.HEADER_WIDTH:].split(None, 1)[0]]
612 fields.extend(line[self.HEADER_WIDTH:].split(None, 1)[1].split(";"))
613 fields = [entry.strip() for entry in fields]
614 """
615 The tokens represent:
616 0. Primary accession number
617 (space sep)
618 1. ??? (e.g. standard)
619 (semi-colon)
620 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
621 3. Taxonomic division (e.g. 'PRO')
622 4. Sequence length (e.g. '4639675 BP.')
623 """
624 consumer.locus(fields[0])
625 consumer.residue_type(fields[2])
626 consumer.data_file_division(fields[3])
627 self._feed_seq_length(consumer, fields[4])
628
630
631
632
633 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
634 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
635 assert len(fields) == 7
636 """
637 The tokens represent:
638 0. Primary accession number
639 1. Sequence version number
640 2. Topology: 'circular' or 'linear'
641 3. Molecule type (e.g. 'genomic DNA')
642 4. Data class (e.g. 'STD')
643 5. Taxonomic division (e.g. 'PRO')
644 6. Sequence length (e.g. '4639675 BP.')
645 """
646
647 consumer.locus(fields[0])
648
649
650
651 consumer.accession(fields[0])
652
653
654
655 version_parts = fields[1].split()
656 if len(version_parts) == 2 \
657 and version_parts[0] == "SV" \
658 and version_parts[1].isdigit():
659 consumer.version_suffix(version_parts[1])
660
661
662 consumer.residue_type(" ".join(fields[2:4]))
663
664
665
666 consumer.data_file_division(fields[5])
667
668 self._feed_seq_length(consumer, fields[6])
669
671 length_parts = text.split()
672 assert len(length_parts) == 2
673 assert length_parts[1].upper() in ["BP", "BP.", "AA."]
674 consumer.size(length_parts[0])
675
677 EMBL_INDENT = self.HEADER_WIDTH
678 EMBL_SPACER = " " * EMBL_INDENT
679 consumer_dict = {
680 'AC': 'accession',
681 'SV': 'version',
682 'DE': 'definition',
683
684
685
686
687 'RG': 'consrtm',
688
689
690 'RL': 'journal',
691 'OS': 'organism',
692 'OC': 'taxonomy',
693
694 'CC': 'comment',
695
696 }
697
698
699 for line in lines:
700 line_type = line[:EMBL_INDENT].strip()
701 data = line[EMBL_INDENT:].strip()
702 if line_type == 'XX':
703 pass
704 elif line_type == 'RN':
705
706
707 if data[0] == "[" and data[-1] == "]":
708 data = data[1:-1]
709 consumer.reference_num(data)
710 elif line_type == 'RP':
711
712
713
714
715 parts = [bases.replace("-", " to ").strip() for bases in data.split(",") if bases.strip()]
716 consumer.reference_bases("(bases %s)" % "; ".join(parts))
717 elif line_type == 'RT':
718
719
720 if data.startswith('"'):
721 data = data[1:]
722 if data.endswith('";'):
723 data = data[:-2]
724 consumer.title(data)
725 elif line_type == 'RX':
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741 key, value = data.split(";", 1)
742 if value.endswith("."):
743 value = value[:-1]
744 value = value.strip()
745 if key == "PUBMED":
746 consumer.pubmed_id(value)
747
748 elif line_type == 'CC':
749
750 consumer.comment([data])
751 elif line_type == 'DR':
752
753
754
755
756
757
758
759 parts = data.rstrip(".").split(";")
760
761
762 consumer.dblink("%s:%s" % (parts[0].strip(),
763 parts[1].strip()))
764 elif line_type == 'RA':
765
766 consumer.authors(data.rstrip(";"))
767 elif line_type == 'PR':
768
769
770
771 consumer.project(data.rstrip(";"))
772 elif line_type in consumer_dict:
773
774 getattr(consumer, consumer_dict[line_type])(data)
775 else:
776 if self.debug:
777 print "Ignoring EMBL header line:\n%s" % line
778
780
781 lines.append("")
782 line_iter = iter(lines)
783 try:
784 for line in line_iter:
785 if line.startswith("CO "):
786 line = line[5:].strip()
787 contig_location = line
788 while True:
789 line = line_iter.next()
790 if not line:
791 break
792 elif line.startswith("CO "):
793
794 contig_location += line[5:].strip()
795 else:
796 raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
797 consumer.contig_location(contig_location)
798 return
799 except StopIteration:
800 raise ValueError("Problem in misc lines before sequence")
801
802
804 """For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).
805
806 IMGT files are like EMBL files but in order to allow longer feature types
807 the features should be indented by 25 characters not 21 characters. In
808 practice the IMGT flat files tend to use either 21 or 25 characters, so we
809 must cope with both.
810
811 This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
812 """
813
814 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers",
815 "FH Key Location/Qualifiers (from EMBL)",
816 "FH Key Location/Qualifiers",
817 "FH"]
818
820 """Return list of tuples for the features (if present)
821
822 Each feature is returned as a tuple (key, location, qualifiers)
823 where key and location are strings (e.g. "CDS" and
824 "complement(join(490883..490885,1..879))") while qualifiers
825 is a list of two string tuples (feature qualifier keys and values).
826
827 Assumes you have already read to the start of the features table.
828 """
829 if self.line.rstrip() not in self.FEATURE_START_MARKERS:
830 if self.debug:
831 print "Didn't find any feature table"
832 return []
833
834 while self.line.rstrip() in self.FEATURE_START_MARKERS:
835 self.line = self.handle.readline()
836
837 bad_position_re = re.compile(r'([0-9]+)>{1}')
838
839 features = []
840 line = self.line
841 while True:
842 if not line:
843 raise ValueError("Premature end of line during features table")
844 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
845 if self.debug:
846 print "Found start of sequence"
847 break
848 line = line.rstrip()
849 if line == "//":
850 raise ValueError("Premature end of features table, marker '//' found")
851 if line in self.FEATURE_END_MARKERS:
852 if self.debug:
853 print "Found end of features"
854 line = self.handle.readline()
855 break
856 if line[2:self.FEATURE_QUALIFIER_INDENT].strip() == "":
857
858
859 line = self.handle.readline()
860 continue
861
862 if skip:
863 line = self.handle.readline()
864 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER:
865 line = self.handle.readline()
866 else:
867 assert line[:2] == "FT"
868 try:
869 feature_key, location_start = line[2:].strip().split()
870 except ValueError:
871
872
873
874 feature_key = line[2:25].strip()
875 location_start = line[25:].strip()
876 feature_lines = [location_start]
877 line = self.handle.readline()
878 while line[:self.FEATURE_QUALIFIER_INDENT] == self.FEATURE_QUALIFIER_SPACER \
879 or line.rstrip() == "":
880
881
882 assert line[:2] == "FT"
883 feature_lines.append(line[self.FEATURE_QUALIFIER_INDENT:].strip())
884 line = self.handle.readline()
885 feature_key, location, qualifiers = \
886 self.parse_feature(feature_key, feature_lines)
887
888 if ">" in location:
889
890
891
892
893
894
895 location = bad_position_re.sub(r'>\1', location)
896 features.append((feature_key, location, qualifiers))
897 self.line = line
898 return features
899
900
902 """For extracting chunks of information in GenBank files"""
903
904 RECORD_START = "LOCUS "
905 HEADER_WIDTH = 12
906 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers", "FEATURES"]
907 FEATURE_END_MARKERS = []
908 FEATURE_QUALIFIER_INDENT = 21
909 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
910 SEQUENCE_HEADERS = ["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
911
961
963 """Scan over and parse GenBank LOCUS line (PRIVATE).
964
965 This must cope with several variants, primarily the old and new column
966 based standards from GenBank. Additionally EnsEMBL produces GenBank
967 files where the LOCUS line is space separated rather that following
968 the column based layout.
969
970 We also try to cope with GenBank like files with partial LOCUS lines.
971 """
972
973
974
975 GENBANK_INDENT = self.HEADER_WIDTH
976 GENBANK_SPACER = " " * GENBANK_INDENT
977 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
978 'LOCUS line does not start correctly:\n' + line
979
980
981
982 if line[29:33] in [' bp ', ' aa ', ' rc '] and line[55:62] == ' ':
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004 assert line[41:42] == ' ', \
1005 'LOCUS line does not contain space at position 42:\n' + line
1006 assert line[42:51].strip() in ['', 'linear', 'circular'], \
1007 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
1008 assert line[51:52] == ' ', \
1009 'LOCUS line does not contain space at position 52:\n' + line
1010
1011
1012 if line[62:73].strip():
1013 assert line[64:65] == '-', \
1014 'LOCUS line does not contain - at position 65 in date:\n' + line
1015 assert line[68:69] == '-', \
1016 'LOCUS line does not contain - at position 69 in date:\n' + line
1017
1018 name_and_length_str = line[GENBANK_INDENT:29]
1019 while ' ' in name_and_length_str:
1020 name_and_length_str = name_and_length_str.replace(' ', ' ')
1021 name_and_length = name_and_length_str.split(' ')
1022 assert len(name_and_length) <= 2, \
1023 'Cannot parse the name and length in the LOCUS line:\n' + line
1024 assert len(name_and_length) != 1, \
1025 'Name and length collide in the LOCUS line:\n' + line
1026
1027
1028
1029 consumer.locus(name_and_length[0])
1030 consumer.size(name_and_length[1])
1031
1032
1033 if line[33:51].strip() == "" and line[29:33] == ' aa ':
1034
1035
1036
1037
1038 consumer.residue_type("PROTEIN")
1039 else:
1040 consumer.residue_type(line[33:51].strip())
1041
1042 consumer.data_file_division(line[52:55])
1043 if line[62:73].strip():
1044 consumer.date(line[62:73])
1045 elif line[40:44] in [' bp ', ' aa ', ' rc '] \
1046 and line[54:64].strip() in ['', 'linear', 'circular']:
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067 assert line[40:44] in [' bp ', ' aa ', ' rc '], \
1068 'LOCUS line does not contain size units at expected position:\n' + line
1069 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
1070 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
1071 assert line[47:54].strip() == "" \
1072 or 'DNA' in line[47:54].strip().upper() \
1073 or 'RNA' in line[47:54].strip().upper(), \
1074 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
1075 assert line[54:55] == ' ', \
1076 'LOCUS line does not contain space at position 55:\n' + line
1077 assert line[55:63].strip() in ['', 'linear', 'circular'], \
1078 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
1079 assert line[63:64] == ' ', \
1080 'LOCUS line does not contain space at position 64:\n' + line
1081 assert line[67:68] == ' ', \
1082 'LOCUS line does not contain space at position 68:\n' + line
1083 if line[68:79].strip():
1084 assert line[70:71] == '-', \
1085 'LOCUS line does not contain - at position 71 in date:\n' + line
1086 assert line[74:75] == '-', \
1087 'LOCUS line does not contain - at position 75 in date:\n' + line
1088
1089 name_and_length_str = line[GENBANK_INDENT:40]
1090 while ' ' in name_and_length_str:
1091 name_and_length_str = name_and_length_str.replace(' ', ' ')
1092 name_and_length = name_and_length_str.split(' ')
1093 assert len(name_and_length) <= 2, \
1094 'Cannot parse the name and length in the LOCUS line:\n' + line
1095 assert len(name_and_length) != 1, \
1096 'Name and length collide in the LOCUS line:\n' + line
1097
1098
1099
1100 consumer.locus(name_and_length[0])
1101 consumer.size(name_and_length[1])
1102
1103 if line[44:54].strip() == "" and line[40:44] == ' aa ':
1104
1105
1106
1107
1108 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
1109 else:
1110 consumer.residue_type(line[44:63].strip())
1111
1112 consumer.data_file_division(line[64:67])
1113 if line[68:79].strip():
1114 consumer.date(line[68:79])
1115 elif line[GENBANK_INDENT:].strip().count(" ") == 0:
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131 if line[GENBANK_INDENT:].strip() != "":
1132 consumer.locus(line[GENBANK_INDENT:].strip())
1133 else:
1134
1135
1136 warnings.warn("Minimal LOCUS line found - is this correct?\n:%r" % line)
1137 elif len(line.split()) == 7 and line.split()[3] in ["aa", "bp"]:
1138
1139
1140
1141
1142
1143
1144
1145
1146 splitline = line.split()
1147 consumer.locus(splitline[1])
1148 consumer.size(splitline[2])
1149 consumer.residue_type(splitline[4])
1150 consumer.data_file_division(splitline[5])
1151 consumer.date(splitline[6])
1152 elif len(line.split()) >= 4 and line.split()[3] in ["aa", "bp"]:
1153
1154
1155 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1156 consumer.locus(line.split()[1])
1157 consumer.size(line.split()[2])
1158 elif len(line.split()) >= 4 and line.split()[-1] in ["aa", "bp"]:
1159
1160
1161
1162 warnings.warn("Malformed LOCUS line found - is this correct?\n:%r" % line)
1163 consumer.locus(line[5:].rsplit(None, 2)[0].strip())
1164 consumer.size(line.split()[-2])
1165 else:
1166 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1167
1169
1170
1171
1172
1173 GENBANK_INDENT = self.HEADER_WIDTH
1174 GENBANK_SPACER = " " * GENBANK_INDENT
1175 consumer_dict = {
1176 'DEFINITION': 'definition',
1177 'ACCESSION': 'accession',
1178 'NID': 'nid',
1179 'PID': 'pid',
1180 'DBSOURCE': 'db_source',
1181 'KEYWORDS': 'keywords',
1182 'SEGMENT': 'segment',
1183 'SOURCE': 'source',
1184 'AUTHORS': 'authors',
1185 'CONSRTM': 'consrtm',
1186 'PROJECT': 'project',
1187 'DBLINK': 'dblink',
1188 'TITLE': 'title',
1189 'JOURNAL': 'journal',
1190 'MEDLINE': 'medline_id',
1191 'PUBMED': 'pubmed_id',
1192 'REMARK': 'remark'}
1193
1194
1195
1196
1197
1198
1199 lines = filter(None, lines)
1200 lines.append("")
1201 line_iter = iter(lines)
1202 try:
1203 line = line_iter.next()
1204 while True:
1205 if not line:
1206 break
1207 line_type = line[:GENBANK_INDENT].strip()
1208 data = line[GENBANK_INDENT:].strip()
1209
1210 if line_type == 'VERSION':
1211
1212
1213
1214 while ' ' in data:
1215 data = data.replace(' ', ' ')
1216 if ' GI:' not in data:
1217 consumer.version(data)
1218 else:
1219 if self.debug:
1220 print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
1221 consumer.version(data.split(' GI:')[0])
1222 consumer.gi(data.split(' GI:')[1])
1223
1224 line = line_iter.next()
1225 elif line_type == 'REFERENCE':
1226 if self.debug > 1:
1227 print "Found reference [" + data + "]"
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238 data = data.strip()
1239
1240
1241 while True:
1242 line = line_iter.next()
1243 if line[:GENBANK_INDENT] == GENBANK_SPACER:
1244
1245 data += " " + line[GENBANK_INDENT:]
1246 if self.debug > 1:
1247 print "Extended reference text [" + data + "]"
1248 else:
1249
1250 break
1251
1252
1253
1254 while ' ' in data:
1255 data = data.replace(' ', ' ')
1256 if ' ' not in data:
1257 if self.debug > 2:
1258 print 'Reference number \"' + data + '\"'
1259 consumer.reference_num(data)
1260 else:
1261 if self.debug > 2:
1262 print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ') + 1:] + '\"'
1263 consumer.reference_num(data[:data.find(' ')])
1264 consumer.reference_bases(data[data.find(' ') + 1:])
1265 elif line_type == 'ORGANISM':
1266
1267
1268
1269
1270
1271
1272
1273
1274 organism_data = data
1275 lineage_data = ""
1276 while True:
1277 line = line_iter.next()
1278 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1279 if lineage_data or ";" in line:
1280 lineage_data += " " + line[GENBANK_INDENT:]
1281 else:
1282 organism_data += " " + line[GENBANK_INDENT:].strip()
1283 else:
1284
1285 break
1286 consumer.organism(organism_data)
1287 if lineage_data.strip() == "" and self.debug > 1:
1288 print "Taxonomy line(s) missing or blank"
1289 consumer.taxonomy(lineage_data.strip())
1290 del organism_data, lineage_data
1291 elif line_type == 'COMMENT':
1292 if self.debug > 1:
1293 print "Found comment"
1294
1295
1296 comment_list = []
1297 comment_list.append(data)
1298 while True:
1299 line = line_iter.next()
1300 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1301 data = line[GENBANK_INDENT:]
1302 comment_list.append(data)
1303 if self.debug > 2:
1304 print "Comment continuation [" + data + "]"
1305 else:
1306
1307 break
1308 consumer.comment(comment_list)
1309 del comment_list
1310 elif line_type in consumer_dict:
1311
1312
1313 while True:
1314 line = line_iter.next()
1315 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1316 data += ' ' + line[GENBANK_INDENT:]
1317 else:
1318
1319 getattr(consumer, consumer_dict[line_type])(data)
1320
1321 break
1322 else:
1323 if self.debug:
1324 print "Ignoring GenBank header line:\n" % line
1325
1326 line = line_iter.next()
1327 except StopIteration:
1328 raise ValueError("Problem in header")
1329
1372
1373 if __name__ == "__main__":
1374 from StringIO import StringIO
1375
1376 gbk_example = \
1377 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1378 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1379 (AXL2) and Rev7p (REV7) genes, complete cds.
1380 ACCESSION U49845
1381 VERSION U49845.1 GI:1293613
1382 KEYWORDS .
1383 SOURCE Saccharomyces cerevisiae (baker's yeast)
1384 ORGANISM Saccharomyces cerevisiae
1385 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1386 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1387 REFERENCE 1 (bases 1 to 5028)
1388 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1389 TITLE Cloning and sequence of REV7, a gene whose function is required for
1390 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1391 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1392 PUBMED 7871890
1393 REFERENCE 2 (bases 1 to 5028)
1394 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1395 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1396 plasma membrane glycoprotein
1397 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1398 PUBMED 8846915
1399 REFERENCE 3 (bases 1 to 5028)
1400 AUTHORS Roemer,T.
1401 TITLE Direct Submission
1402 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1403 Haven, CT, USA
1404 FEATURES Location/Qualifiers
1405 source 1..5028
1406 /organism="Saccharomyces cerevisiae"
1407 /db_xref="taxon:4932"
1408 /chromosome="IX"
1409 /map="9"
1410 CDS <1..206
1411 /codon_start=3
1412 /product="TCP1-beta"
1413 /protein_id="AAA98665.1"
1414 /db_xref="GI:1293614"
1415 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1416 AEVLLRVDNIIRARPRTANRQHM"
1417 gene 687..3158
1418 /gene="AXL2"
1419 CDS 687..3158
1420 /gene="AXL2"
1421 /note="plasma membrane glycoprotein"
1422 /codon_start=1
1423 /function="required for axial budding pattern of S.
1424 cerevisiae"
1425 /product="Axl2p"
1426 /protein_id="AAA98666.1"
1427 /db_xref="GI:1293615"
1428 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1429 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1430 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1431 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1432 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1433 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1434 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1435 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1436 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1437 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1438 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1439 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1440 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1441 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1442 VDFSNKSNVNVGQVKDIHGRIPEML"
1443 gene complement(3300..4037)
1444 /gene="REV7"
1445 CDS complement(3300..4037)
1446 /gene="REV7"
1447 /codon_start=1
1448 /product="Rev7p"
1449 /protein_id="AAA98667.1"
1450 /db_xref="GI:1293616"
1451 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1452 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1453 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1454 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1455 LISGDDKILNGVYSQYEEGESIFGSLF"
1456 ORIGIN
1457 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1458 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1459 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1460 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1461 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1462 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1463 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1464 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1465 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1466 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1467 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1468 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1469 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1470 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1471 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1472 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1473 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1474 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1475 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1476 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1477 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1478 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1479 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1480 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1481 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1482 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1483 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1484 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1485 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1486 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1487 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1488 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1489 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1490 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1491 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1492 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1493 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1494 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1495 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1496 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1497 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1498 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1499 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1500 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1501 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1502 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1503 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1504 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1505 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1506 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1507 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1508 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1509 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1510 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1511 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1512 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1513 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1514 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1515 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1516 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1517 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1518 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1519 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1520 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1521 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1522 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1523 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1524 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1525 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1526 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1527 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1528 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1529 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1530 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1531 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1532 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1533 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1534 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1535 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1536 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1537 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1538 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1539 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1540 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1541 //"""
1542
1543
1544
1545 gbk_example2 = \
1546 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1547 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1548 ACCESSION AAD51968
1549 VERSION AAD51968.1 GI:5805369
1550 DBSOURCE locus AF171097 accession AF171097.1
1551 KEYWORDS .
1552 SOURCE Yersinia enterocolitica
1553 ORGANISM Yersinia enterocolitica
1554 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1555 Enterobacteriaceae; Yersinia.
1556 REFERENCE 1 (residues 1 to 143)
1557 AUTHORS Revell,P.A. and Miller,V.L.
1558 TITLE A chromosomally encoded regulator is required for expression of the
1559 Yersinia enterocolitica inv gene and for virulence
1560 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1561 MEDLINE 20138369
1562 PUBMED 10672189
1563 REFERENCE 2 (residues 1 to 143)
1564 AUTHORS Revell,P.A. and Miller,V.L.
1565 TITLE Direct Submission
1566 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1567 University School of Medicine, Campus Box 8230, 660 South Euclid,
1568 St. Louis, MO 63110, USA
1569 COMMENT Method: conceptual translation.
1570 FEATURES Location/Qualifiers
1571 source 1..143
1572 /organism="Yersinia enterocolitica"
1573 /mol_type="unassigned DNA"
1574 /strain="JB580v"
1575 /serotype="O:8"
1576 /db_xref="taxon:630"
1577 Protein 1..143
1578 /product="transcriptional regulator RovA"
1579 /name="regulates inv expression"
1580 CDS 1..143
1581 /gene="rovA"
1582 /coded_by="AF171097.1:380..811"
1583 /note="regulator of virulence"
1584 /transl_table=11
1585 ORIGIN
1586 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1587 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1588 121 deiellsgli dklerniiql qsk
1589 //
1590 """
1591
1592 embl_example = """ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1593 XX
1594 AC X56734; S46826;
1595 XX
1596 DT 12-SEP-1991 (Rel. 29, Created)
1597 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1598 XX
1599 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1600 XX
1601 KW beta-glucosidase.
1602 XX
1603 OS Trifolium repens (white clover)
1604 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1605 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1606 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1607 XX
1608 RN [5]
1609 RP 1-1859
1610 RX PUBMED; 1907511.
1611 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1612 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1613 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1614 RL Plant Mol. Biol. 17(2):209-219(1991).
1615 XX
1616 RN [6]
1617 RP 1-1859
1618 RA Hughes M.A.;
1619 RT ;
1620 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1621 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1622 RL Upon Tyne, NE2 4HH, UK
1623 XX
1624 FH Key Location/Qualifiers
1625 FH
1626 FT source 1..1859
1627 FT /organism="Trifolium repens"
1628 FT /mol_type="mRNA"
1629 FT /clone_lib="lambda gt10"
1630 FT /clone="TRE361"
1631 FT /tissue_type="leaves"
1632 FT /db_xref="taxon:3899"
1633 FT CDS 14..1495
1634 FT /product="beta-glucosidase"
1635 FT /EC_number="3.2.1.21"
1636 FT /note="non-cyanogenic"
1637 FT /db_xref="GOA:P26204"
1638 FT /db_xref="InterPro:IPR001360"
1639 FT /db_xref="InterPro:IPR013781"
1640 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1641 FT /protein_id="CAA40058.1"
1642 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1643 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1644 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1645 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1646 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1647 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1648 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1649 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1650 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1651 FT mRNA 1..1859
1652 FT /experiment="experimental evidence, no additional details
1653 FT recorded"
1654 XX
1655 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1656 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1657 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1658 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1659 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1660 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1661 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1662 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1663 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1664 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1665 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1666 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1667 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1668 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1669 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1670 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1671 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1672 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1673 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1674 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1675 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1676 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1677 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1678 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1679 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1680 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1681 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1682 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1683 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1684 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1685 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1686 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1687 //
1688 """
1689
1690 print "GenBank CDS Iteration"
1691 print "====================="
1692
1693 g = GenBankScanner()
1694 for record in g.parse_cds_features(StringIO(gbk_example)):
1695 print record
1696
1697 g = GenBankScanner()
1698 for record in g.parse_cds_features(StringIO(gbk_example2),
1699 tags2id=('gene', 'locus_tag', 'product')):
1700 print record
1701
1702 g = GenBankScanner()
1703 for record in g.parse_cds_features(StringIO(gbk_example + "\n" + gbk_example2),
1704 tags2id=('gene', 'locus_tag', 'product')):
1705 print record
1706
1707 print
1708 print "GenBank Iteration"
1709 print "================="
1710 g = GenBankScanner()
1711 for record in g.parse_records(StringIO(gbk_example), do_features=False):
1712 print record.id, record.name, record.description
1713 print record.seq
1714
1715 g = GenBankScanner()
1716 for record in g.parse_records(StringIO(gbk_example), do_features=True):
1717 print record.id, record.name, record.description
1718 print record.seq
1719
1720 g = GenBankScanner()
1721 for record in g.parse_records(StringIO(gbk_example2), do_features=False):
1722 print record.id, record.name, record.description
1723 print record.seq
1724
1725 g = GenBankScanner()
1726 for record in g.parse_records(StringIO(gbk_example2), do_features=True):
1727 print record.id, record.name, record.description
1728 print record.seq
1729
1730 print
1731 print "EMBL CDS Iteration"
1732 print "=================="
1733
1734 e = EmblScanner()
1735 for record in e.parse_cds_features(StringIO(embl_example)):
1736 print record
1737
1738 print
1739 print "EMBL Iteration"
1740 print "=============="
1741 e = EmblScanner()
1742 for record in e.parse_records(StringIO(embl_example), do_features=True):
1743 print record.id, record.name, record.description
1744 print record.seq
1745