1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Using Bio.GenBank directly to parse GenBank files is only useful if you want
14 to obtain GenBank-specific Record objects, which is a much closer
15 representation to the raw file contents that the SeqRecord alternative from
16 the FeatureParser (used in Bio.SeqIO).
17
18 To use the Bio.GenBank parser, there are two helper functions:
19
20 read Parse a handle containing a single GenBank record
21 as Bio.GenBank specific Record objects.
22 parse Iterate over a handle containing multiple GenBank
23 records as Bio.GenBank specific Record objects.
24
25 The following internal classes are not intended for direct use and may
26 be deprecated in a future release.
27
28 Classes:
29 Iterator Iterate through a file of GenBank entries
30 ErrorFeatureParser Catch errors caused during parsing.
31 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
32 RecordParser Parse GenBank data into a Record object.
33
34 Exceptions:
35 ParserFailureError Exception indicating a failure in the parser (ie.
36 scanner or consumer)
37 LocationParserError Exception indiciating a problem with the spark based
38 location parser.
39
40 """
41 import re
42
43
44 from Bio import SeqFeature
45
46
47 from utils import FeatureValueCleaner
48 from Scanner import GenBankScanner
49
50
51 GENBANK_INDENT = 12
52 GENBANK_SPACER = " " * GENBANK_INDENT
53
54
55 FEATURE_KEY_INDENT = 5
56 FEATURE_QUALIFIER_INDENT = 21
57 FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
58 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
59
60
61 _solo_location = r"[<>]?\d+"
62 _pair_location = r"[<>]?\d+\.\.[<>]?\d+"
63 _between_location = r"\d+\^\d+"
64
65 _within_position = r"\(\d+\.\d+\)"
66 _re_within_position = re.compile(_within_position)
67 _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
68 % (_within_position,_within_position)
69 assert _re_within_position.match("(3.9)")
70 assert re.compile(_within_location).match("(3.9)..10")
71 assert re.compile(_within_location).match("26..(30.33)")
72 assert re.compile(_within_location).match("(13.19)..(20.28)")
73
74 _oneof_position = r"one\-of\(\d+(,\d+)+\)"
75 _re_oneof_position = re.compile(_oneof_position)
76 _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \
77 % (_oneof_position,_oneof_position)
78 assert _re_oneof_position.match("one-of(6,9)")
79 assert re.compile(_oneof_location).match("one-of(6,9)..101")
80 assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
81 assert re.compile(_oneof_location).match("6..one-of(101,104)")
82
83 assert not _re_oneof_position.match("one-of(3)")
84 assert _re_oneof_position.match("one-of(3,6)")
85 assert _re_oneof_position.match("one-of(3,6,9)")
86
87
88 _simple_location = r"\d+\.\.\d+"
89 _re_simple_location = re.compile(r"^%s$" % _simple_location)
90 _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$"
91 % (_simple_location, _simple_location))
92 _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \
93 % (_pair_location, _solo_location, _between_location,
94 _within_location, _oneof_location)
95 _re_complex_location = re.compile(r"^%s$" % _complex_location)
96 _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \
97 % (_complex_location, _complex_location)
98 _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$"
99 % (_possibly_complemented_complex_location,
100 _possibly_complemented_complex_location))
101
102 assert _re_simple_location.match("104..160")
103 assert not _re_simple_location.match("68451760..68452073^68452074")
104 assert not _re_simple_location.match("<104..>160")
105 assert not _re_simple_location.match("104")
106 assert not _re_simple_location.match("<1")
107 assert not _re_simple_location.match(">99999")
108 assert not _re_simple_location.match("join(104..160,320..390,504..579)")
109 assert not _re_simple_compound.match("bond(12,63)")
110 assert _re_simple_compound.match("join(104..160,320..390,504..579)")
111 assert _re_simple_compound.match("order(1..69,1308..1465)")
112 assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
113 assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
114 assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
115 assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)")
116 assert not _re_simple_compound.match("test(1..69,1308..1465)")
117 assert not _re_simple_compound.match("complement(1..69)")
118 assert not _re_simple_compound.match("(1..69)")
119 assert _re_complex_location.match("(3.9)..10")
120 assert _re_complex_location.match("26..(30.33)")
121 assert _re_complex_location.match("(13.19)..(20.28)")
122 assert _re_complex_location.match("41^42")
123 assert _re_complex_location.match("AL121804:41^42")
124 assert _re_complex_location.match("AL121804:41..610")
125 assert _re_complex_location.match("AL121804.2:41..610")
126 assert _re_complex_location.match("one-of(3,6)..101")
127 assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
128 assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)")
129 assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")
130
131
132 assert _re_complex_location.match("NC_016402.1:6618..6676")
133 assert _re_complex_location.match("181647..181905")
134 assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
135 assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
136 assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
137 assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
138 assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)")
139
140
141 -def _pos(pos_str, offset=0):
142 """Build a Position object (PRIVATE).
143
144 For an end position, leave offset as zero (default):
145
146 >>> _pos("5")
147 ExactPosition(5)
148
149 For a start position, set offset to minus one (for Python counting):
150
151 >>> _pos("5", -1)
152 ExactPosition(4)
153
154 This also covers fuzzy positions:
155
156 >>> p = _pos("<5")
157 >>> p
158 BeforePosition(5)
159 >>> print p
160 <5
161 >>> int(p)
162 5
163
164 >>> _pos(">5")
165 AfterPosition(5)
166
167 By default assumes an end position, so note the integer behaviour:
168
169 >>> p = _pos("one-of(5,8,11)")
170 >>> p
171 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
172 >>> print p
173 one-of(5,8,11)
174 >>> int(p)
175 11
176
177 >>> _pos("(8.10)")
178 WithinPosition(10, left=8, right=10)
179
180 Fuzzy start positions:
181
182 >>> p = _pos("<5", -1)
183 >>> p
184 BeforePosition(4)
185 >>> print p
186 <4
187 >>> int(p)
188 4
189
190 Notice how the integer behaviour changes too!
191
192 >>> p = _pos("one-of(5,8,11)", -1)
193 >>> p
194 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
195 >>> print(p)
196 one-of(4,7,10)
197 >>> int(p)
198 4
199
200 """
201 if pos_str.startswith("<"):
202 return SeqFeature.BeforePosition(int(pos_str[1:])+offset)
203 elif pos_str.startswith(">"):
204 return SeqFeature.AfterPosition(int(pos_str[1:])+offset)
205 elif _re_within_position.match(pos_str):
206 s,e = pos_str[1:-1].split(".")
207 s = int(s) + offset
208 e = int(e) + offset
209 if offset == -1:
210 default = s
211 else:
212 default = e
213 return SeqFeature.WithinPosition(default, left=s, right=e)
214 elif _re_oneof_position.match(pos_str):
215 assert pos_str.startswith("one-of(")
216 assert pos_str[-1]==")"
217 parts = [SeqFeature.ExactPosition(int(pos)+offset)
218 for pos in pos_str[7:-1].split(",")]
219 if offset == -1:
220 default = min(int(pos) for pos in parts)
221 else:
222 default = max(int(pos) for pos in parts)
223 return SeqFeature.OneOfPosition(default, choices=parts)
224 else:
225 return SeqFeature.ExactPosition(int(pos_str)+offset)
226
227
228 -def _loc(loc_str, expected_seq_length, strand):
229 """FeatureLocation from non-compound non-complement location (PRIVATE).
230
231 Simple examples,
232
233 >>> _loc("123..456", 1000, +1)
234 FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
235 >>> _loc("<123..>456", 1000, strand = -1)
236 FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)
237
238 A more complex location using within positions,
239
240 >>> _loc("(9.10)..(20.25)", 1000, 1)
241 FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)
242
243 Notice how that will act as though it has overall start 8 and end 25.
244
245 Zero length between feature,
246
247 >>> _loc("123^124", 1000, 0)
248 FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)
249
250 The expected sequence length is needed for a special case, a between
251 position at the start/end of a circular genome:
252
253 >>> _loc("1000^1", 1000, 1)
254 FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)
255
256 Apart from this special case, between positions P^Q must have P+1==Q,
257
258 >>> _loc("123^456", 1000, 1)
259 Traceback (most recent call last):
260 ...
261 ValueError: Invalid between location '123^456'
262 """
263 try:
264 s, e = loc_str.split("..")
265 except ValueError:
266 assert ".." not in loc_str
267 if "^" in loc_str:
268
269
270
271
272
273
274
275 s, e = loc_str.split("^")
276 if int(s)+1==int(e):
277 pos = _pos(s)
278 elif int(s)==expected_seq_length and e=="1":
279 pos = _pos(s)
280 else:
281 raise ValueError("Invalid between location %s" % repr(loc_str))
282 return SeqFeature.FeatureLocation(pos, pos, strand)
283 else:
284
285 s = loc_str
286 e = loc_str
287 return SeqFeature.FeatureLocation(_pos(s,-1), _pos(e), strand)
288
289
291 """Split a tricky compound location string (PRIVATE).
292
293 >>> list(_split_compound_loc("123..145"))
294 ['123..145']
295 >>> list(_split_compound_loc("123..145,200..209"))
296 ['123..145', '200..209']
297 >>> list(_split_compound_loc("one-of(200,203)..300"))
298 ['one-of(200,203)..300']
299 >>> list(_split_compound_loc("complement(123..145),200..209"))
300 ['complement(123..145)', '200..209']
301 >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
302 ['123..145', 'one-of(200,203)..209']
303 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
304 ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
305 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
306 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
307 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
308 ['123..145', '200..one-of(209,211)', '300']
309 >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
310 ['123..145', '200..one-of(209,211)']
311 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
312 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
313 """
314 if "one-of(" in compound_loc:
315
316 while "," in compound_loc:
317 assert compound_loc[0] != ","
318 assert compound_loc[0:2] != ".."
319 i = compound_loc.find(",")
320 part = compound_loc[:i]
321 compound_loc = compound_loc[i:]
322 while part.count("(") > part.count(")"):
323 assert "one-of(" in part, (part, compound_loc)
324 i = compound_loc.find(")")
325 part += compound_loc[:i+1]
326 compound_loc = compound_loc[i+1:]
327 if compound_loc.startswith(".."):
328 i = compound_loc.find(",")
329 if i==-1:
330 part += compound_loc
331 compound_loc = ""
332 else:
333 part += compound_loc[:i]
334 compound_loc = compound_loc[i:]
335 while part.count("(") > part.count(")"):
336 assert part.count("one-of(") == 2
337 i = compound_loc.find(")")
338 part += compound_loc[:i+1]
339 compound_loc = compound_loc[i+1:]
340 if compound_loc.startswith(","):
341 compound_loc = compound_loc[1:]
342 assert part
343 yield part
344 if compound_loc:
345 yield compound_loc
346 else:
347
348 for part in compound_loc.split(","):
349 yield part
350
351
353 """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).
354
355 This class is likely to be deprecated in a future release of Biopython.
356 Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
357 for SeqRecord and GenBank specific Record objects respectively instead.
358 """
359 - def __init__(self, handle, parser = None):
360 """Initialize the iterator.
361
362 Arguments:
363 o handle - A handle with GenBank entries to iterate through.
364 o parser - An optional parser to pass the entries through before
365 returning them. If None, then the raw entry will be returned.
366 """
367 self.handle = handle
368 self._parser = parser
369
371 """Return the next GenBank record from the handle.
372
373 Will return None if we ran out of records.
374 """
375 if self._parser is None:
376 lines = []
377 while True:
378 line = self.handle.readline()
379 if not line:
380 return None
381 lines.append(line)
382 if line.rstrip() == "//":
383 break
384 return "".join(lines)
385 try:
386 return self._parser.parse(self.handle)
387 except StopIteration:
388 return None
389
391 return iter(self.next, None)
392
393
395 """Failure caused by some kind of problem in the parser.
396 """
397 pass
398
399
401 """Could not Properly parse out a location from a GenBank file.
402 """
403 pass
404
405
407 """Parse GenBank files into Seq + Feature objects (OBSOLETE).
408
409 Direct use of this class is discouraged, and may be deprecated in
410 a future release of Biopython.
411
412 Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
413 """
416 """Initialize a GenBank parser and Feature consumer.
417
418 Arguments:
419 o debug_level - An optional argument that species the amount of
420 debugging information the parser should spit out. By default we have
421 no debugging info (the fastest way to do things), but if you want
422 you can set this as high as two and see exactly where a parse fails.
423 o use_fuzziness - Specify whether or not to use fuzzy representations.
424 The default is 1 (use fuzziness).
425 o feature_cleaner - A class which will be used to clean out the
426 values of features. This class must implement the function
427 clean_value. GenBank.utils has a "standard" cleaner class, which
428 is used by default.
429 """
430 self._scanner = GenBankScanner(debug_level)
431 self.use_fuzziness = use_fuzziness
432 self._cleaner = feature_cleaner
433
434 - def parse(self, handle):
435 """Parse the specified handle.
436 """
437 self._consumer = _FeatureConsumer(self.use_fuzziness,
438 self._cleaner)
439 self._scanner.feed(handle, self._consumer)
440 return self._consumer.data
441
442
444 """Parse GenBank files into Record objects (OBSOLETE).
445
446 Direct use of this class is discouraged, and may be deprecated in
447 a future release of Biopython.
448
449 Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
450 instead.
451 """
453 """Initialize the parser.
454
455 Arguments:
456 o debug_level - An optional argument that species the amount of
457 debugging information the parser should spit out. By default we have
458 no debugging info (the fastest way to do things), but if you want
459 you can set this as high as two and see exactly where a parse fails.
460 """
461 self._scanner = GenBankScanner(debug_level)
462
463 - def parse(self, handle):
464 """Parse the specified handle into a GenBank record.
465 """
466 self._consumer = _RecordConsumer()
467
468 self._scanner.feed(handle, self._consumer)
469 return self._consumer.data
470
471
473 """Abstract GenBank consumer providing useful general functions (PRIVATE).
474
475 This just helps to eliminate some duplication in things that most
476 GenBank consumers want to do.
477 """
478
479
480
481
482 remove_space_keys = ["translation"]
483
486
489
492
494 """Split a string of keywords into a nice clean list.
495 """
496
497 if keyword_string == "" or keyword_string == ".":
498 keywords = ""
499 elif keyword_string[-1] == '.':
500 keywords = keyword_string[:-1]
501 else:
502 keywords = keyword_string
503 keyword_list = keywords.split(';')
504 clean_keyword_list = [x.strip() for x in keyword_list]
505 return clean_keyword_list
506
508 """Split a string of accession numbers into a list.
509 """
510
511
512 accession = accession_string.replace("\n", " ").replace(";"," ")
513
514 return [x.strip() for x in accession.split() if x.strip()]
515
517 """Split a string with taxonomy info into a list.
518 """
519 if not taxonomy_string or taxonomy_string==".":
520
521 return []
522
523 if taxonomy_string[-1] == '.':
524 tax_info = taxonomy_string[:-1]
525 else:
526 tax_info = taxonomy_string
527 tax_list = tax_info.split(';')
528 new_tax_list = []
529 for tax_item in tax_list:
530 new_items = tax_item.split("\n")
531 new_tax_list.extend(new_items)
532 while '' in new_tax_list:
533 new_tax_list.remove('')
534 clean_tax_list = [x.strip() for x in new_tax_list]
535
536 return clean_tax_list
537
539 """Clean whitespace out of a location string.
540
541 The location parser isn't a fan of whitespace, so we clean it out
542 before feeding it into the parser.
543 """
544
545
546
547 return ''.join(location_string.split())
548
550 """Remove any newlines in the passed text, returning the new string.
551 """
552
553 newlines = ["\n", "\r"]
554 for ws in newlines:
555 text = text.replace(ws, "")
556
557 return text
558
560 """Replace multiple spaces in the passed text with single spaces.
561 """
562
563 text_parts = text.split(" ")
564 text_parts = filter(None, text_parts)
565 return ' '.join(text_parts)
566
568 """Remove all spaces from the passed text.
569 """
570 return text.replace(" ", "")
571
573 """Convert a start and end range to python notation.
574
575 In GenBank, starts and ends are defined in "biological" coordinates,
576 where 1 is the first base and [i, j] means to include both i and j.
577
578 In python, 0 is the first base and [i, j] means to include i, but
579 not j.
580
581 So, to convert "biological" to python coordinates, we need to
582 subtract 1 from the start, and leave the end and things should
583 be converted happily.
584 """
585 new_start = start - 1
586 new_end = end
587
588 return new_start, new_end
589
590
592 """Create a SeqRecord object with Features to return (PRIVATE).
593
594 Attributes:
595 o use_fuzziness - specify whether or not to parse with fuzziness in
596 feature locations.
597 o feature_cleaner - a class that will be used to provide specialized
598 cleaning-up of feature values.
599 """
600 - def __init__(self, use_fuzziness, feature_cleaner = None):
615
616 - def locus(self, locus_name):
617 """Set the locus name is set as the name of the Sequence.
618 """
619 self.data.name = locus_name
620
621 - def size(self, content):
622 """Record the sequence length."""
623 self._expected_size = int(content)
624
626 """Record the sequence type so we can choose an appropriate alphabet.
627 """
628 self._seq_type = type
629
632
633 - def date(self, submit_date):
635
645
647 """Set the accession number as the id of the sequence.
648
649 If we have multiple accession numbers, the first one passed is
650 used.
651 """
652 new_acc_nums = self._split_accessions(acc_num)
653
654
655 try:
656
657 for acc in new_acc_nums:
658
659 if acc not in self.data.annotations['accessions']:
660 self.data.annotations['accessions'].append(acc)
661 except KeyError:
662 self.data.annotations['accessions'] = new_acc_nums
663
664
665 if not self.data.id:
666 if len(new_acc_nums) > 0:
667
668
669 self.data.id = self.data.annotations['accessions'][0]
670
671 - def wgs(self, content):
673
676
677 - def nid(self, content):
679
680 - def pid(self, content):
682
695
697 """Handle the information from the PROJECT line as a list of projects.
698
699 e.g.
700 PROJECT GenomeProject:28471
701
702 or:
703 PROJECT GenomeProject:13543 GenomeProject:99999
704
705 This is stored as dbxrefs in the SeqRecord to be consistent with the
706 projected switch of this line to DBLINK in future GenBank versions.
707 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
708 "Project:28471" as part of this transition.
709 """
710 content = content.replace("GenomeProject:", "Project:")
711 self.data.dbxrefs.extend([p for p in content.split() if p])
712
714 """Store DBLINK cross references as dbxrefs in our record object.
715
716 This line type is expected to replace the PROJECT line in 2009. e.g.
717
718 During transition:
719
720 PROJECT GenomeProject:28471
721 DBLINK Project:28471
722 Trace Assembly Archive:123456
723
724 Once the project line is dropped:
725
726 DBLINK Project:28471
727 Trace Assembly Archive:123456
728
729 Note GenomeProject -> Project.
730
731 We'll have to see some real examples to be sure, but based on the
732 above example we can expect one reference per line.
733
734 Note that at some point the NCBI have included an extra space, e.g.
735
736 DBLINK Project: 28471
737 """
738
739
740 while ": " in content:
741 content = content.replace(": ", ":")
742 if content.strip() not in self.data.dbxrefs:
743 self.data.dbxrefs.append(content.strip())
744
746 """Set the version to overwrite the id.
747
748 Since the verison provides the same information as the accession
749 number, plus some extra info, we set this as the id if we have
750 a version.
751 """
752
753
754
755
756
757
758
759
760
761
762 assert version.isdigit()
763 self.data.annotations['sequence_version'] = int(version)
764
767
768 - def gi(self, content):
770
773
776
778
779
780 if content == "":
781 source_info = ""
782 elif content[-1] == '.':
783 source_info = content[:-1]
784 else:
785 source_info = content
786 self.data.annotations['source'] = source_info
787
790
799
801 """Signal the beginning of a new reference object.
802 """
803
804
805 if self._cur_reference is not None:
806 self.data.annotations['references'].append(self._cur_reference)
807 else:
808 self.data.annotations['references'] = []
809
810 self._cur_reference = SeqFeature.Reference()
811
813 """Attempt to determine the sequence region the reference entails.
814
815 Possible types of information we may have to deal with:
816
817 (bases 1 to 86436)
818 (sites)
819 (bases 1 to 105654; 110423 to 111122)
820 1 (residues 1 to 182)
821 """
822
823 ref_base_info = content[1:-1]
824
825 all_locations = []
826
827 if 'bases' in ref_base_info and 'to' in ref_base_info:
828
829 ref_base_info = ref_base_info[5:]
830 locations = self._split_reference_locations(ref_base_info)
831 all_locations.extend(locations)
832 elif 'residues' in ref_base_info and 'to' in ref_base_info:
833 residues_start = ref_base_info.find("residues")
834
835 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
836 locations = self._split_reference_locations(ref_base_info)
837 all_locations.extend(locations)
838
839
840
841 elif (ref_base_info == 'sites' or
842 ref_base_info.strip() == 'bases'):
843 pass
844
845 else:
846 raise ValueError("Could not parse base info %s in record %s" %
847 (ref_base_info, self.data.id))
848
849 self._cur_reference.location = all_locations
850
852 """Get reference locations out of a string of reference information
853
854 The passed string should be of the form:
855
856 1 to 20; 20 to 100
857
858 This splits the information out and returns a list of location objects
859 based on the reference locations.
860 """
861
862 all_base_info = location_string.split(';')
863
864 new_locations = []
865 for base_info in all_base_info:
866 start, end = base_info.split('to')
867 new_start, new_end = \
868 self._convert_to_python_numbers(int(start.strip()),
869 int(end.strip()))
870 this_location = SeqFeature.FeatureLocation(new_start, new_end)
871 new_locations.append(this_location)
872 return new_locations
873
875 if self._cur_reference.authors:
876 self._cur_reference.authors += ' ' + content
877 else:
878 self._cur_reference.authors = content
879
881 if self._cur_reference.consrtm:
882 self._cur_reference.consrtm += ' ' + content
883 else:
884 self._cur_reference.consrtm = content
885
886 - def title(self, content):
887 if self._cur_reference is None:
888 import warnings
889 from Bio import BiopythonParserWarning
890 warnings.warn("GenBank TITLE line without REFERENCE line.",
891 BiopythonParserWarning)
892 elif self._cur_reference.title:
893 self._cur_reference.title += ' ' + content
894 else:
895 self._cur_reference.title = content
896
898 if self._cur_reference.journal:
899 self._cur_reference.journal += ' ' + content
900 else:
901 self._cur_reference.journal = content
902
905
908
910 """Deal with a reference comment."""
911 if self._cur_reference.comment:
912 self._cur_reference.comment += ' ' + content
913 else:
914 self._cur_reference.comment = content
915
921
923 """Get ready for the feature table when we reach the FEATURE line.
924 """
925 self.start_feature_table()
926
928 """Indicate we've got to the start of the feature table.
929 """
930
931 if self._cur_reference is not None:
932 self.data.annotations['references'].append(self._cur_reference)
933 self._cur_reference = None
934
940
942 """Parse out location information from the location string.
943
944 This uses simple Python code with some regular expressions to do the
945 parsing, and then translates the results into appropriate objects.
946 """
947
948
949 location_line = self._clean_location(content)
950
951
952
953
954
955
956 if 'replace' in location_line:
957 comma_pos = location_line.find(',')
958 location_line = location_line[8:comma_pos]
959
960 cur_feature = self._cur_feature
961
962
963 if location_line.startswith("complement("):
964 assert location_line.endswith(")")
965 location_line = location_line[11:-1]
966 strand = -1
967 elif 'DNA' in self._seq_type.upper() or 'RNA' in self._seq_type.upper():
968
969 strand = 1
970 else:
971
972 strand = None
973
974
975 if _re_simple_location.match(location_line):
976
977 s, e = location_line.split("..")
978 cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
979 int(e),
980 strand)
981 return
982
983 if _re_simple_compound.match(location_line):
984
985 i = location_line.find("(")
986 cur_feature.location_operator = location_line[:i]
987
988 for part in location_line[i+1:-1].split(","):
989 s, e = part.split("..")
990 f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
991 int(e),
992 strand),
993 location_operator=cur_feature.location_operator,
994 type=cur_feature.type)
995 cur_feature.sub_features.append(f)
996 s = cur_feature.sub_features[0].location.start
997 e = cur_feature.sub_features[-1].location.end
998 cur_feature.location = SeqFeature.FeatureLocation(s,e, strand)
999 return
1000
1001
1002 if _re_complex_location.match(location_line):
1003
1004 if ":" in location_line:
1005 location_ref, location_line = location_line.split(":")
1006 cur_feature.location = _loc(location_line, self._expected_size, strand)
1007 cur_feature.location.ref = location_ref
1008 else:
1009 cur_feature.location = _loc(location_line, self._expected_size, strand)
1010 return
1011
1012 if _re_complex_compound.match(location_line):
1013 i = location_line.find("(")
1014 cur_feature.location_operator = location_line[:i]
1015
1016 for part in _split_compound_loc(location_line[i+1:-1]):
1017 if part.startswith("complement("):
1018 assert part[-1]==")"
1019 part = part[11:-1]
1020 assert strand != -1, "Double complement?"
1021 part_strand = -1
1022 else:
1023 part_strand = strand
1024 if ":" in part:
1025 ref, part = part.split(":")
1026 else:
1027 ref = None
1028 try:
1029 loc = _loc(part, self._expected_size, part_strand)
1030 except ValueError, err:
1031 print location_line
1032 print part
1033 raise err
1034 f = SeqFeature.SeqFeature(location=loc, ref=ref,
1035 location_operator=cur_feature.location_operator,
1036 type=cur_feature.type)
1037 cur_feature.sub_features.append(f)
1038
1039
1040
1041
1042
1043
1044 strands = set(sf.strand for sf in cur_feature.sub_features)
1045 if len(strands)==1:
1046 strand = cur_feature.sub_features[0].strand
1047 else:
1048 strand = None
1049 s = cur_feature.sub_features[0].location.start
1050 e = cur_feature.sub_features[-1].location.end
1051 cur_feature.location = SeqFeature.FeatureLocation(s, e, strand)
1052 return
1053
1054 if "order" in location_line and "join" in location_line:
1055
1056 msg = 'Combinations of "join" and "order" within the same ' + \
1057 'location (nested operators) are illegal:\n' + location_line
1058 raise LocationParserError(msg)
1059
1060 cur_feature.location = None
1061 import warnings
1062 from Bio import BiopythonParserWarning
1063 warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
1064 % (location_line)))
1065
1067 """When we get a qualifier key and its value.
1068
1069 Can receive None, since you can have valueless keys such as /pseudo
1070 """
1071
1072 if value is None:
1073 if key not in self._cur_feature.qualifiers:
1074 self._cur_feature.qualifiers[key] = [""]
1075 return
1076
1077 value = value.replace('"', '')
1078 if self._feature_cleaner is not None:
1079 value = self._feature_cleaner.clean_value(key, value)
1080
1081
1082 if key in self._cur_feature.qualifiers:
1083 self._cur_feature.qualifiers[key].append(value)
1084
1085 else:
1086 self._cur_feature.qualifiers[key] = [value]
1087
1089 """Use feature_qualifier instead (OBSOLETE)."""
1090 raise NotImplementedError("Use the feature_qualifier method instead.")
1091
1093 """Use feature_qualifier instead (OBSOLETE)."""
1094 raise NotImplementedError("Use the feature_qualifier method instead.")
1095
1097 """Deal with CONTIG information."""
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113 self.data.annotations["contig"] = content
1114
1117
1120
1123
1125 """Add up sequence information as we get it.
1126
1127 To try and make things speedier, this puts all of the strings
1128 into a list of strings, and then uses string.join later to put
1129 them together. Supposedly, this is a big time savings
1130 """
1131 assert ' ' not in content
1132 self._seq_data.append(content.upper())
1133
1135 """Clean up when we've finished the record.
1136 """
1137 from Bio import Alphabet
1138 from Bio.Alphabet import IUPAC
1139 from Bio.Seq import Seq, UnknownSeq
1140
1141
1142 if not self.data.id:
1143 assert 'accessions' not in self.data.annotations, \
1144 self.data.annotations['accessions']
1145 self.data.id = self.data.name
1146 elif self.data.id.count('.') == 0:
1147 try:
1148 self.data.id+='.%i' % self.data.annotations['sequence_version']
1149 except KeyError:
1150 pass
1151
1152
1153
1154
1155
1156 seq_alphabet = Alphabet.generic_alphabet
1157
1158
1159 sequence = "".join(self._seq_data)
1160
1161 if self._expected_size is not None \
1162 and len(sequence) != 0 \
1163 and self._expected_size != len(sequence):
1164 import warnings
1165 from Bio import BiopythonParserWarning
1166 warnings.warn("Expected sequence length %i, found %i (%s)."
1167 % (self._expected_size, len(sequence), self.data.id),
1168 BiopythonParserWarning)
1169
1170 if self._seq_type:
1171
1172 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper():
1173 seq_alphabet = IUPAC.ambiguous_dna
1174
1175 elif 'RNA' in self._seq_type.upper():
1176
1177
1178 if "T" in sequence and "U" not in sequence:
1179 seq_alphabet = IUPAC.ambiguous_dna
1180 else:
1181 seq_alphabet = IUPAC.ambiguous_rna
1182 elif 'PROTEIN' in self._seq_type.upper():
1183 seq_alphabet = IUPAC.protein
1184
1185
1186 elif self._seq_type in ["circular", "linear", "unspecified"]:
1187 pass
1188
1189 else:
1190 raise ValueError("Could not determine alphabet for seq_type %s"
1191 % self._seq_type)
1192
1193 if not sequence and self.__expected_size:
1194 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
1195 else:
1196 self.data.seq = Seq(sequence, seq_alphabet)
1197
1198
1200 """Create a GenBank Record object from scanner generated information (PRIVATE).
1201 """
1211
1212 - def wgs(self, content):
1214
1217
1218 - def locus(self, content):
1220
1221 - def size(self, content):
1223
1232
1235
1236 - def date(self, content):
1238
1241
1246
1247 - def nid(self, content):
1249
1250 - def pid(self, content):
1252
1255
1258
1259 - def gi(self, content):
1261
1264
1267
1270
1273
1276
1279
1282
1284 """Grab the reference number and signal the start of a new reference.
1285 """
1286
1287 if self._cur_reference is not None:
1288 self.data.references.append(self._cur_reference)
1289
1290 import Record
1291 self._cur_reference = Record.Reference()
1292 self._cur_reference.number = content
1293
1295 self._cur_reference.bases = content
1296
1298 self._cur_reference.authors = content
1299
1301 self._cur_reference.consrtm = content
1302
1303 - def title(self, content):
1311
1313 self._cur_reference.journal = content
1314
1317
1320
1322 self._cur_reference.remark = content
1323
1326
1330
1333
1335 """Get ready for the feature table when we reach the FEATURE line.
1336 """
1337 self.start_feature_table()
1338
1340 """Signal the start of the feature table.
1341 """
1342
1343 if self._cur_reference is not None:
1344 self.data.references.append(self._cur_reference)
1345
1347 """Grab the key of the feature and signal the start of a new feature.
1348 """
1349
1350 self._add_feature()
1351
1352 import Record
1353 self._cur_feature = Record.Feature()
1354 self._cur_feature.key = content
1355
1357 """Utility function to add a feature to the Record.
1358
1359 This does all of the appropriate checking to make sure we haven't
1360 left any info behind, and that we are only adding info if it
1361 exists.
1362 """
1363 if self._cur_feature is not None:
1364
1365
1366 if self._cur_qualifier is not None:
1367 self._cur_feature.qualifiers.append(self._cur_qualifier)
1368
1369 self._cur_qualifier = None
1370 self.data.features.append(self._cur_feature)
1371
1374
1379
1381 """Deal with qualifier names
1382
1383 We receive a list of keys, since you can have valueless keys such as
1384 /pseudo which would be passed in with the next key (since no other
1385 tags separate them in the file)
1386 """
1387 import Record
1388 for content in content_list:
1389
1390 if not content.startswith("/"):
1391 content = "/%s" % content
1392
1393 if self._cur_qualifier is not None:
1394 self._cur_feature.qualifiers.append(self._cur_qualifier)
1395
1396 self._cur_qualifier = Record.Qualifier()
1397 self._cur_qualifier.key = content
1398
1400
1401 if '=' not in self._cur_qualifier.key:
1402 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
1403 cur_content = self._remove_newlines(content)
1404
1405
1406 for remove_space_key in self.__class__.remove_space_keys:
1407 if remove_space_key in self._cur_qualifier.key:
1408 cur_content = self._remove_spaces(cur_content)
1409 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1410
1412 self.data.base_counts = content
1413
1415 self.data.origin = content
1416
1418 """Signal that we have contig information to add to the record.
1419 """
1420 self.data.contig = self._clean_location(content)
1421
1423 """Add sequence information to a list of sequence strings.
1424
1425 This removes spaces in the data and uppercases the sequence, and
1426 then adds it to a list of sequences. Later on we'll join this
1427 list together to make the final sequence. This is faster than
1428 adding on the new string every time.
1429 """
1430 assert ' ' not in content
1431 self._seq_data.append(content.upper())
1432
1434 """Signal the end of the record and do any necessary clean-up.
1435 """
1436
1437
1438 self.data.sequence = "".join(self._seq_data)
1439
1440 self._add_feature()
1441
1442
1444 """Iterate over GenBank formatted entries as Record objects.
1445
1446 >>> from Bio import GenBank
1447 >>> handle = open("GenBank/NC_000932.gb")
1448 >>> for record in GenBank.parse(handle):
1449 ... print record.accession
1450 ['NC_000932']
1451 >>> handle.close()
1452
1453 To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
1454 instead.
1455 """
1456 return iter(Iterator(handle, RecordParser()))
1457
1458
1460 """Read a handle containing a single GenBank entry as a Record object.
1461
1462 >>> from Bio import GenBank
1463 >>> handle = open("GenBank/NC_000932.gb")
1464 >>> record = GenBank.read(handle)
1465 >>> print record.accession
1466 ['NC_000932']
1467 >>> handle.close()
1468
1469 To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
1470 instead.
1471 """
1472 iterator = parse(handle)
1473 try:
1474 first = iterator.next()
1475 except StopIteration:
1476 first = None
1477 if first is None:
1478 raise ValueError("No records found in handle")
1479 try:
1480 second = iterator.next()
1481 except StopIteration:
1482 second = None
1483 if second is not None:
1484 raise ValueError("More than one record found in handle")
1485 return first
1486
1487
1489 """Run the Bio.GenBank module's doctests."""
1490 import doctest
1491 import os
1492 if os.path.isdir(os.path.join("..","..","Tests")):
1493 print "Running doctests..."
1494 cur_dir = os.path.abspath(os.curdir)
1495 os.chdir(os.path.join("..","..","Tests"))
1496 doctest.testmod()
1497 os.chdir(cur_dir)
1498 del cur_dir
1499 print "Done"
1500 elif os.path.isdir(os.path.join("Tests")):
1501 print "Running doctests..."
1502 cur_dir = os.path.abspath(os.curdir)
1503 os.chdir(os.path.join("Tests"))
1504 doctest.testmod()
1505 os.chdir(cur_dir)
1506 del cur_dir
1507 print "Done"
1508
1509 if __name__ == "__main__":
1510 _test()
1511