Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
  16  representation of the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21  read                  Parse a handle containing a single GenBank record 
  22                        as Bio.GenBank specific Record objects. 
  23  parse                 Iterate over a handle containing multiple GenBank 
  24                        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30  Iterator              Iterate through a file of GenBank entries 
  31  ErrorFeatureParser    Catch errors caused during parsing. 
  32  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  33  RecordParser          Parse GenBank data into a Record object. 
  34   
  35  Exceptions: 
  36  ParserFailureError    Exception indicating a failure in the parser (ie. 
  37                        scanner or consumer) 
  38  LocationParserError   Exception indicating a problem with the spark based 
  39                        location parser. 
  40   
  41  """ 
  42  from __future__ import print_function 
  43   
  44  import re 
  45  import sys # for checking if Python 2 
  46   
  47  # other Biopython stuff 
  48  from Bio import SeqFeature 
  49   
  50  # other Bio.GenBank stuff 
  51  from .utils import FeatureValueCleaner 
  52  from .Scanner import GenBankScanner 
  53   
  54  #Constants used to parse GenBank header lines 
  55  GENBANK_INDENT = 12 
  56  GENBANK_SPACER = " " * GENBANK_INDENT 
  57   
  58  #Constants for parsing GenBank feature lines 
  59  FEATURE_KEY_INDENT = 5 
  60  FEATURE_QUALIFIER_INDENT = 21 
  61  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  62  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  63   
  64  #Regular expressions for location parsing 
  65  _solo_location = r"[<>]?\d+" 
  66  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  67  _between_location = r"\d+\^\d+" 
  68   
  69  _within_position = r"\(\d+\.\d+\)" 
  70  _re_within_position = re.compile(_within_position) 
  71  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  72                     % (_within_position, _within_position) 
  73  assert _re_within_position.match("(3.9)") 
  74  assert re.compile(_within_location).match("(3.9)..10") 
  75  assert re.compile(_within_location).match("26..(30.33)") 
  76  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  77   
  78  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  79  _re_oneof_position = re.compile(_oneof_position) 
  80  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  81                     % (_oneof_position, _oneof_position) 
  82  assert _re_oneof_position.match("one-of(6,9)") 
  83  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  84  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  85  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  86   
  87  assert not _re_oneof_position.match("one-of(3)") 
  88  assert _re_oneof_position.match("one-of(3,6)") 
  89  assert _re_oneof_position.match("one-of(3,6,9)") 
  90   
  91   
  92  _simple_location = r"\d+\.\.\d+" 
  93  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  94  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  95                                   % (_simple_location, _simple_location)) 
  96  _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \ 
  97                      % (_pair_location, _solo_location, _between_location, 
  98                         _within_location, _oneof_location) 
  99  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 100  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 101                                            % (_complex_location, _complex_location) 
 102  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 103                                   % (_possibly_complemented_complex_location, 
 104                                      _possibly_complemented_complex_location)) 
 105   
 106  assert _re_simple_location.match("104..160") 
 107  assert not _re_simple_location.match("68451760..68452073^68452074") 
 108  assert not _re_simple_location.match("<104..>160") 
 109  assert not _re_simple_location.match("104") 
 110  assert not _re_simple_location.match("<1") 
 111  assert not _re_simple_location.match(">99999") 
 112  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 113  assert not _re_simple_compound.match("bond(12,63)") 
 114  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 115  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 116  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 117  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 118  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 119  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 120  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 121  assert not _re_simple_compound.match("complement(1..69)") 
 122  assert not _re_simple_compound.match("(1..69)") 
 123  assert _re_complex_location.match("(3.9)..10") 
 124  assert _re_complex_location.match("26..(30.33)") 
 125  assert _re_complex_location.match("(13.19)..(20.28)") 
 126  assert _re_complex_location.match("41^42")  # between 
 127  assert _re_complex_location.match("AL121804:41^42") 
 128  assert _re_complex_location.match("AL121804:41..610") 
 129  assert _re_complex_location.match("AL121804.2:41..610") 
 130  assert _re_complex_location.match("one-of(3,6)..101") 
 131  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 132  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 133  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 134   
 135  #Trans-spliced example from NC_016406, note underscore in reference name: 
 136  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 137  assert _re_complex_location.match("181647..181905") 
 138  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 139  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 140  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 141  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 142  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 143   
 144   
def _pos(pos_str, offset=0):
    """Build a Position object from a GenBank position string (PRIVATE).

    Leave offset as zero (the default) for an end position, and pass
    offset=-1 for a start position, which converts GenBank's one-based
    coordinates into Python counting.

    Handles exact positions ("5"), fuzzy positions ("<5", ">5"),
    within positions ("(8.10)") and one-of positions ("one-of(5,8,11)").
    For the fuzzy forms the integer value of the returned object also
    shifts with the offset (start positions take the lower bound,
    end positions the upper bound).
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        # e.g. "(8.10)" - somewhere between two bounds
        left_text, right_text = pos_str[1:-1].split(".")
        left = int(left_text) + offset
        right = int(right_text) + offset
        # Start positions behave like the lower bound, ends like the upper
        default = left if offset == -1 else right
        return SeqFeature.WithinPosition(default, left=left, right=right)
    if _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)" - a choice between exact positions
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(number) + offset)
                   for number in pos_str[7:-1].split(",")]
        picker = min if offset == -1 else max
        default = picker(int(choice) for choice in choices)
        return SeqFeature.OneOfPosition(default, choices=choices)
    # Plain number, e.g. "5"
    return SeqFeature.ExactPosition(int(pos_str) + offset)
230 231
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Handles "start..end" spans (including fuzzy positions), single base
    positions like "123" (treated as a one base span), and zero length
    between positions like "123^124".

    The expected sequence length is needed for one special case: a between
    position "N^1" marking the origin of a circular genome of length N.
    Apart from that, a between location P^Q must have P+1 == Q, otherwise
    ValueError is raised.
    """
    if ".." in loc_str:
        # Ordinary span, e.g. "123..456" or "<123..>456"
        start_text, end_text = loc_str.split("..")
        return SeqFeature.FeatureLocation(_pos(start_text, -1),
                                          _pos(end_text), strand)
    if "^" in loc_str:
        # A between location like "67^68" (one based counting) has zero
        # length - Python slice 67:67.  See Bug 2622.  On a circular
        # genome of length N, "N^1" marks the junction at the origin
        # (Bug 3098).  Note "2^4" would just be written "3".
        before, after = loc_str.split("^")
        if int(before) + 1 == int(after):
            point = _pos(before)
        elif int(before) == expected_seq_length and after == "1":
            point = _pos(before)
        else:
            raise ValueError("Invalid between location %s" % repr(loc_str))
        return SeqFeature.FeatureLocation(point, point, strand)
    # A lone base like "123" behaves as the span 123..123
    return SeqFeature.FeatureLocation(_pos(loc_str, -1), _pos(loc_str), strand)
292 293
294 -def _split_compound_loc(compound_loc):
295 """Split a tricky compound location string (PRIVATE). 296 297 >>> list(_split_compound_loc("123..145")) 298 ['123..145'] 299 >>> list(_split_compound_loc("123..145,200..209")) 300 ['123..145', '200..209'] 301 >>> list(_split_compound_loc("one-of(200,203)..300")) 302 ['one-of(200,203)..300'] 303 >>> list(_split_compound_loc("complement(123..145),200..209")) 304 ['complement(123..145)', '200..209'] 305 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 306 ['123..145', 'one-of(200,203)..209'] 307 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 308 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 309 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 310 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 311 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 312 ['123..145', '200..one-of(209,211)', '300'] 313 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 314 ['123..145', '200..one-of(209,211)'] 315 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 316 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 317 """ 318 if "one-of(" in compound_loc: 319 #Hard case 320 while "," in compound_loc: 321 assert compound_loc[0] != "," 322 assert compound_loc[0:2] != ".." 
323 i = compound_loc.find(",") 324 part = compound_loc[:i] 325 compound_loc = compound_loc[i:] # includes the comma 326 while part.count("(") > part.count(")"): 327 assert "one-of(" in part, (part, compound_loc) 328 i = compound_loc.find(")") 329 part += compound_loc[:i+1] 330 compound_loc = compound_loc[i+1:] 331 if compound_loc.startswith(".."): 332 i = compound_loc.find(",") 333 if i==-1: 334 part += compound_loc 335 compound_loc = "" 336 else: 337 part += compound_loc[:i] 338 compound_loc = compound_loc[i:] # includes the comma 339 while part.count("(") > part.count(")"): 340 assert part.count("one-of(") == 2 341 i = compound_loc.find(")") 342 part += compound_loc[:i+1] 343 compound_loc = compound_loc[i+1:] 344 if compound_loc.startswith(","): 345 compound_loc = compound_loc[1:] 346 assert part 347 yield part 348 if compound_loc: 349 yield compound_loc 350 else: 351 #Easy case 352 for part in compound_loc.split(","): 353 yield part
354 355
class Iterator(object):
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """
    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:
        o handle - A handle with GenBank entries to iterate through.
        o parser - An optional parser to pass the entries through before
        returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser - accumulate raw lines up to the "//" terminator
        raw_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            raw_lines.append(line)
            if line.rstrip() == "//":
                return "".join(raw_lines)

    if sys.version_info[0] < 3:
        def next(self):
            """Deprecated Python 2 style alias for Python 3 style __next__ method."""
            import warnings
            from Bio import BiopythonDeprecationWarning
            warnings.warn("Please use next(my_iterator) instead of my_iterator.next(), "
                          "the .next() method is deprecated and will be removed in a "
                          "future release of Biopython.", BiopythonDeprecationWarning)
            return self.__next__()

    def __iter__(self):
        # iter() with a sentinel stops cleanly when __next__ returns None
        return iter(self.__next__, None)
406 407
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser (scanner or consumer)."""
412 413
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
418 419
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """
    def __init__(self, debug_level=0, use_fuzziness=1,
                 feature_cleaner=None):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out. By default we have
        no debugging info (the fastest way to do things), but if you want
        you can set this as high as two and see exactly where a parse fails.
        o use_fuzziness - Specify whether or not to use fuzzy representations.
        The default is 1 (use fuzziness).
        o feature_cleaner - A class which will be used to clean out the
        values of features. This class must implement the function
        clean_value. By default a fresh GenBank.utils.FeatureValueCleaner
        is created for each parser.
        """
        if feature_cleaner is None:
            # Previously the default was FeatureValueCleaner() in the
            # signature itself - a mutable default evaluated once at class
            # creation and shared by every parser. Create one per instance.
            feature_cleaner = FeatureValueCleaner()
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning the populated SeqRecord."""
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
455 456
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """
    def __init__(self, debug_level=0):
        """Set up the scanner used for parsing.

        Arguments:
        o debug_level - An optional argument that specifies the amount of
        debugging information the parser should spit out (0 = none, which
        is the fastest; up to 2 to see exactly where a parse fails).
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
484 485
486 -class _BaseGenBankConsumer(object):
487 """Abstract GenBank consumer providing useful general functions (PRIVATE). 488 489 This just helps to eliminate some duplication in things that most 490 GenBank consumers want to do. 491 """ 492 # Special keys in GenBank records that we should remove spaces from 493 # For instance, \translation keys have values which are proteins and 494 # should have spaces and newlines removed from them. This class 495 # attribute gives us more control over specific formatting problems. 496 remove_space_keys = ["translation"] 497
498 - def __init__(self):
499 pass
500
501 - def _unhandled(self, data):
502 pass
503
504 - def __getattr__(self, attr):
505 return self._unhandled
506
507 - def _split_keywords(self, keyword_string):
508 """Split a string of keywords into a nice clean list. 509 """ 510 # process the keywords into a python list 511 if keyword_string == "" or keyword_string == ".": 512 keywords = "" 513 elif keyword_string[-1] == '.': 514 keywords = keyword_string[:-1] 515 else: 516 keywords = keyword_string 517 keyword_list = keywords.split(';') 518 clean_keyword_list = [x.strip() for x in keyword_list] 519 return clean_keyword_list
520
521 - def _split_accessions(self, accession_string):
522 """Split a string of accession numbers into a list. 523 """ 524 # first replace all line feeds with spaces 525 # Also, EMBL style accessions are split with ';' 526 accession = accession_string.replace("\n", " ").replace(";", " ") 527 528 return [x.strip() for x in accession.split() if x.strip()]
529
530 - def _split_taxonomy(self, taxonomy_string):
531 """Split a string with taxonomy info into a list. 532 """ 533 if not taxonomy_string or taxonomy_string==".": 534 #Missing data, no taxonomy 535 return [] 536 537 if taxonomy_string[-1] == '.': 538 tax_info = taxonomy_string[:-1] 539 else: 540 tax_info = taxonomy_string 541 tax_list = tax_info.split(';') 542 new_tax_list = [] 543 for tax_item in tax_list: 544 new_items = tax_item.split("\n") 545 new_tax_list.extend(new_items) 546 while '' in new_tax_list: 547 new_tax_list.remove('') 548 clean_tax_list = [x.strip() for x in new_tax_list] 549 550 return clean_tax_list
551
552 - def _clean_location(self, location_string):
553 """Clean whitespace out of a location string. 554 555 The location parser isn't a fan of whitespace, so we clean it out 556 before feeding it into the parser. 557 """ 558 #Originally this imported string.whitespace and did a replace 559 #via a loop. It's simpler to just split on whitespace and rejoin 560 #the string - and this avoids importing string too. See Bug 2684. 561 return ''.join(location_string.split())
562
563 - def _remove_newlines(self, text):
564 """Remove any newlines in the passed text, returning the new string. 565 """ 566 # get rid of newlines in the qualifier value 567 newlines = ["\n", "\r"] 568 for ws in newlines: 569 text = text.replace(ws, "") 570 571 return text
572
573 - def _normalize_spaces(self, text):
574 """Replace multiple spaces in the passed text with single spaces. 575 """ 576 # get rid of excessive spaces 577 return ' '.join(x for x in text.split(" ") if x)
578
579 - def _remove_spaces(self, text):
580 """Remove all spaces from the passed text. 581 """ 582 return text.replace(" ", "")
583
584 - def _convert_to_python_numbers(self, start, end):
585 """Convert a start and end range to python notation. 586 587 In GenBank, starts and ends are defined in "biological" coordinates, 588 where 1 is the first base and [i, j] means to include both i and j. 589 590 In python, 0 is the first base and [i, j] means to include i, but 591 not j. 592 593 So, to convert "biological" to python coordinates, we need to 594 subtract 1 from the start, and leave the end and things should 595 be converted happily. 596 """ 597 new_start = start - 1 598 new_end = end 599 600 return new_start, new_end
601 602
603 -class _FeatureConsumer(_BaseGenBankConsumer):
604 """Create a SeqRecord object with Features to return (PRIVATE). 605 606 Attributes: 607 o use_fuzziness - specify whether or not to parse with fuzziness in 608 feature locations. 609 o feature_cleaner - a class that will be used to provide specialized 610 cleaning-up of feature values. 611 """
    def __init__(self, use_fuzziness, feature_cleaner = None):
        """Set up an empty SeqRecord to be populated by scanner events (PRIVATE)."""
        # Deferred import (NOTE(review): presumably avoids a circular
        # import at module load time - confirm)
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        # The SeqRecord being built up as scanner events arrive
        self.data = SeqRecord(None, id = None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        # Per-record parsing state
        self._seq_type = ''
        self._seq_data = []
        self._cur_reference = None   # Reference currently being populated
        self._cur_feature = None     # SeqFeature currently being populated
        self._expected_size = None   # sequence length announced by the header
627
    def locus(self, locus_name):
        """Use the LOCUS name as the name of the record."""
        self.data.name = locus_name
632
    def size(self, content):
        """Record the declared sequence length (used later for validation)."""
        self._expected_size = int(content)
636
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet."""
        # NOTE: parameter name shadows the builtin 'type', kept for
        # backwards compatibility with existing callers
        self._seq_type = type.strip()
641
    def data_file_division(self, division):
        """Store the data file division in the record's annotations."""
        self.data.annotations['data_file_division'] = division
644
    def date(self, submit_date):
        """Store the record's date in the annotations."""
        self.data.annotations['date'] = submit_date
647
648 - def definition(self, definition):
649 """Set the definition as the description of the sequence. 650 """ 651 if self.data.description: 652 #Append to any existing description 653 #e.g. EMBL files with two DE lines. 654 self.data.description += " " + definition 655 else: 656 self.data.description = definition
657
658 - def accession(self, acc_num):
659 """Set the accession number as the id of the sequence. 660 661 If we have multiple accession numbers, the first one passed is 662 used. 663 """ 664 new_acc_nums = self._split_accessions(acc_num) 665 666 #Also record them ALL in the annotations 667 try: 668 #On the off chance there was more than one accession line: 669 for acc in new_acc_nums: 670 #Prevent repeat entries 671 if acc not in self.data.annotations['accessions']: 672 self.data.annotations['accessions'].append(acc) 673 except KeyError: 674 self.data.annotations['accessions'] = new_acc_nums 675 676 # if we haven't set the id information yet, add the first acc num 677 if not self.data.id: 678 if len(new_acc_nums) > 0: 679 #self.data.id = new_acc_nums[0] 680 #Use the FIRST accession as the ID, not the first on this line! 681 self.data.id = self.data.annotations['accessions'][0]
682
    def wgs(self, content):
        """Store the WGS accession range (split on '-') in the annotations."""
        self.data.annotations['wgs'] = content.split('-')
685
    def add_wgs_scafld(self, content):
        """Append a WGS scaffold accession range (split on '-') to the annotations."""
        self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
688
    def nid(self, content):
        """Store the nucleotide identifier (NID) in the annotations."""
        self.data.annotations['nid'] = content
691
    def pid(self, content):
        """Store the protein identifier (PID) in the annotations."""
        self.data.annotations['pid'] = content
694
695 - def version(self, version_id):
696 #Want to use the versioned accession as the record.id 697 #This comes from the VERSION line in GenBank files, or the 698 #obsolete SV line in EMBL. For the new EMBL files we need 699 #both the version suffix from the ID line and the accession 700 #from the AC line. 701 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 702 self.accession(version_id.split(".")[0]) 703 self.version_suffix(version_id.split(".")[1]) 704 elif version_id: 705 #For backwards compatibility... 706 self.data.id = version_id
707
708 - def project(self, content):
709 """Handle the information from the PROJECT line as a list of projects. 710 711 e.g. 712 PROJECT GenomeProject:28471 713 714 or: 715 PROJECT GenomeProject:13543 GenomeProject:99999 716 717 This is stored as dbxrefs in the SeqRecord to be consistent with the 718 projected switch of this line to DBLINK in future GenBank versions. 719 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 720 "Project:28471" as part of this transition. 721 """ 722 content = content.replace("GenomeProject:", "Project:") 723 self.data.dbxrefs.extend(p for p in content.split() if p)
724 756
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # e.g. GenBank line:
        # VERSION     U49845.1  GI:1293613
        # or the obsolete EMBL line:
        # SV   U49845.1
        # Scanner calls consumer.version("U49845.1")
        # which then calls consumer.version_suffix(1)
        #
        # e.g. EMBL new line:
        # ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        # Scanner calls consumer.version_suffix(1)
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
776
    def db_source(self, content):
        """Store the DBSOURCE line (trailing whitespace stripped) in the annotations."""
        self.data.annotations['db_source'] = content.rstrip()
779
    def gi(self, content):
        """Store the GI number in the annotations."""
        self.data.annotations['gi'] = content
782
783 - def keywords(self, content):
784 if 'keywords' in self.data.annotations: 785 #Multi-line keywords, append to list 786 #Note EMBL states "A keyword is never split between lines." 787 self.data.annotations['keywords'].extend(self._split_keywords(content)) 788 else: 789 self.data.annotations['keywords'] = self._split_keywords(content)
790
    def segment(self, content):
        """Store the SEGMENT line content in the annotations."""
        self.data.annotations['segment'] = content
793
794 - def source(self, content):
795 #Note that some software (e.g. VectorNTI) may produce an empty 796 #source (rather than using a dot/period as might be expected). 797 if content == "": 798 source_info = "" 799 elif content[-1] == '.': 800 source_info = content[:-1] 801 else: 802 source_info = content 803 self.data.annotations['source'] = source_info
804
    def organism(self, content):
        """Store the organism name in the annotations."""
        self.data.annotations['organism'] = content
807
808 - def taxonomy(self, content):
809 """Records (another line of) the taxonomy lineage. 810 """ 811 lineage = self._split_taxonomy(content) 812 try: 813 self.data.annotations['taxonomy'].extend(lineage) 814 except KeyError: 815 self.data.annotations['taxonomy'] = lineage
816
817 - def reference_num(self, content):
818 """Signal the beginning of a new reference object. 819 """ 820 # if we have a current reference that hasn't been added to 821 # the list of references, add it. 822 if self._cur_reference is not None: 823 self.data.annotations['references'].append(self._cur_reference) 824 else: 825 self.data.annotations['references'] = [] 826 827 self._cur_reference = SeqFeature.Reference()
828
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)

        Raises ValueError when the text matches none of these forms.
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if 'bases' in ref_base_info and 'to' in ref_base_info:
            # get rid of the beginning 'bases'
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif 'residues' in ref_base_info and 'to' in ref_base_info:
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations
867
def _split_reference_locations(self, location_string):
    """Turn a reference-location string into FeatureLocation objects.

    The input is one or more ``start to end`` spans separated by
    semicolons, e.g. ``1 to 20; 20 to 100``.
    """
    locations = []
    for span in location_string.split(';'):
        raw_start, raw_end = span.split('to')
        start, end = self._convert_to_python_numbers(int(raw_start.strip()),
                                                     int(raw_end.strip()))
        locations.append(SeqFeature.FeatureLocation(start, end))
    return locations
def authors(self, content):
    """Accumulate AUTHORS text on the current reference."""
    ref = self._cur_reference
    ref.authors = (ref.authors + ' ' + content) if ref.authors else content
def consrtm(self, content):
    """Accumulate CONSRTM (consortium) text on the current reference."""
    ref = self._cur_reference
    ref.consrtm = (ref.consrtm + ' ' + content) if ref.consrtm else content
def title(self, content):
    """Accumulate TITLE text, warning when no REFERENCE line preceded it."""
    if self._cur_reference is None:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
        return
    ref = self._cur_reference
    if ref.title:
        ref.title += ' ' + content
    else:
        ref.title = content
def journal(self, content):
    """Accumulate JOURNAL text on the current reference."""
    ref = self._cur_reference
    if ref.journal:
        ref.journal += ' ' + content
    else:
        ref.journal = content
def medline_id(self, content):
    """Store the MEDLINE identifier on the current reference."""
    self._cur_reference.medline_id = content
def pubmed_id(self, content):
    """Store the PUBMED identifier on the current reference."""
    self._cur_reference.pubmed_id = content
def remark(self, content):
    """Accumulate REMARK text as the reference comment."""
    ref = self._cur_reference
    if ref.comment:
        ref.comment += ' ' + content
    else:
        ref.comment = content
def comment(self, content):
    """Append COMMENT lines (a list of strings) to the record annotations."""
    joined = "\n".join(content)
    annotations = self.data.annotations
    if 'comment' in annotations:
        annotations['comment'] += "\n" + joined
    else:
        annotations['comment'] = joined
def features_line(self, content):
    """The FEATURES line signals the start of the feature table."""
    self.start_feature_table()
def start_feature_table(self):
    """Flush any pending reference object before the feature table starts."""
    if self._cur_reference is not None:
        self.data.annotations['references'].append(self._cur_reference)
        self._cur_reference = None
def feature_key(self, content):
    """Open a new SeqFeature of the given type and register it on the record."""
    feature = SeqFeature.SeqFeature()
    feature.type = content
    self._cur_feature = feature
    self.data.features.append(feature)
def location(self, content):
    """Parse out location information from the location string.

    This uses simple Python code with some regular expressions to do the
    parsing, and then translates the results into appropriate objects.

    Sets self._cur_feature.location (FeatureLocation or CompoundLocation),
    or leaves it as None with a BiopythonParserWarning when the string
    cannot be parsed.
    """
    # clean up newlines and other whitespace inside the location before
    # parsing - locations should have no whitespace whatsoever
    location_line = self._clean_location(content)

    # Older records have junk like replace(266,"c") in the
    # location line. Newer records just replace this with
    # the number 266 and have the information in a more reasonable
    # place. So we'll just grab out the number and feed this to the
    # parser. We shouldn't really be losing any info this way.
    if 'replace' in location_line:
        comma_pos = location_line.find(',')
        location_line = location_line[8:comma_pos]

    cur_feature = self._cur_feature

    # Handle top level complement here for speed
    if location_line.startswith("complement("):
        assert location_line.endswith(")")
        location_line = location_line[11:-1]
        strand = -1
    elif "PROTEIN" in self._seq_type.upper():
        strand = None
    else:
        # Assume nucleotide otherwise feature strand for
        # GenBank files with bad LOCUS lines set to None
        strand = 1

    # Special case handling of the most common cases for speed
    if _re_simple_location.match(location_line):
        # e.g. "123..456"
        s, e = location_line.split("..")
        cur_feature.location = SeqFeature.FeatureLocation(int(s)-1,
                                                          int(e),
                                                          strand)
        return

    if _re_simple_compound.match(location_line):
        # e.g. join(<123..456,480..>500)
        i = location_line.find("(")
        # cur_feature.location_operator = location_line[:i]
        # we can split on the comma because these are simple locations
        sub_features = cur_feature.sub_features
        for part in location_line[i+1:-1].split(","):
            s, e = part.split("..")
            f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1,
                                                                 int(e),
                                                                 strand),
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # TODO - Remove use of sub_features
        if strand == -1:
            # GenBank lists reverse-strand parts in reverse order; undo that
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return

    # Handle the general case with more complex regular expressions
    if _re_complex_location.match(location_line):
        # e.g. "AL121804.2:41..610"
        if ":" in location_line:
            location_ref, location_line = location_line.split(":")
            cur_feature.location = _loc(location_line, self._expected_size, strand)
            cur_feature.location.ref = location_ref
        else:
            cur_feature.location = _loc(location_line, self._expected_size, strand)
        return

    if _re_complex_compound.match(location_line):
        i = location_line.find("(")
        # cur_feature.location_operator = location_line[:i]
        # Can't split on the comma because of positions like one-of(1,2,3)
        sub_features = cur_feature.sub_features
        for part in _split_compound_loc(location_line[i+1:-1]):
            if part.startswith("complement("):
                assert part[-1]==")"
                part = part[11:-1]
                assert strand != -1, "Double complement?"
                part_strand = -1
            else:
                part_strand = strand
            if ":" in part:
                ref, part = part.split(":")
            else:
                ref = None
            try:
                loc = _loc(part, self._expected_size, part_strand)
            except ValueError as err:
                # NOTE(review): debug prints left in for hard parser failures
                print(location_line)
                print(part)
                raise err
            f = SeqFeature.SeqFeature(location=loc, ref=ref,
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        #
        # TODO - Remove use of sub_features
        strands = set(sf.strand for sf in sub_features)
        if len(strands)==1:
            strand = sub_features[0].strand
        else:
            strand = None  # i.e. mixed strands
        if strand == -1:
            # Reverse the backwards order used in GenBank files
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return
    # Not recognised
    if "order" in location_line and "join" in location_line:
        # See Bug 3197
        msg = 'Combinations of "join" and "order" within the same ' + \
              'location (nested operators) are illegal:\n' + location_line
        raise LocationParserError(msg)
    # This used to be an error....
    cur_feature.location = None
    import warnings
    from Bio import BiopythonParserWarning
    warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                         % (location_line)))
def feature_qualifier(self, key, value):
    """Record a qualifier key and value on the current feature.

    ``value`` may be None, since valueless keys such as /pseudo exist.
    """
    if value is None:
        # Preserve historical behaviour: flag keys get a single empty string,
        # and a repeat of the same flag key is a no-op.
        self._cur_feature.qualifiers.setdefault(key, [""])
        return

    cleaned = value.replace('"', '')
    if self._feature_cleaner is not None:
        cleaned = self._feature_cleaner.clean_value(key, cleaned)

    # append to an existing list for this key, or start a new one
    self._cur_feature.qualifiers.setdefault(key, []).append(cleaned)
def feature_qualifier_name(self, content_list):
    """Use feature_qualifier instead (OBSOLETE)."""
    raise NotImplementedError("Use the feature_qualifier method instead.")
def feature_qualifier_description(self, content):
    """Use feature_qualifier instead (OBSOLETE)."""
    raise NotImplementedError("Use the feature_qualifier method instead.")
def contig_location(self, content):
    """Store the CONTIG assembly instructions as a plain string.

    The scanner hands us all CONTIG lines combined into one string.
    It may contain tokens like Gap(), Gap(100) or Gap(unk100) which do
    not fit SeqFeature location objects, so the raw string is stored
    under annotations["contig"] (matching BioPerl/BioSQL practice).
    """
    self.data.annotations["contig"] = content
def origin_name(self, content):
    """ORIGIN line text is not used by the feature consumer."""
    pass
def base_count(self, content):
    """BASE COUNT information is not used by the feature consumer."""
    pass
def base_number(self, content):
    """Sequence line numbering is not used by the feature consumer."""
    pass
def sequence(self, content):
    """Collect one whitespace-free chunk of sequence, uppercased.

    Chunks are accumulated in a list and joined once at record_end,
    which is much faster than repeated string concatenation.
    """
    assert ' ' not in content
    self._seq_data.append(content.upper())
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalizes the SeqRecord id (appending the sequence version where
    known), picks an alphabet from the LOCUS sequence type, warns when
    the declared length disagrees with the sequence read, and builds
    the final Seq (or UnknownSeq for sequence-less records).
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert 'accessions' not in self.data.annotations, \
            self.data.annotations['accessions']
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            pass

    # add the sequence information
    # first, determine the alphabet
    # we default to a generic alphabet if we don't have a
    # seq type or have strange sequence information.
    seq_alphabet = Alphabet.generic_alphabet

    # now set the sequence
    sequence = "".join(self._seq_data)

    if self._expected_size is not None \
            and len(sequence) != 0 \
            and self._expected_size != len(sequence):
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Expected sequence length %i, found %i (%s)."
                      % (self._expected_size, len(sequence), self.data.id),
                      BiopythonParserWarning)

    if self._seq_type:
        # mRNA is really also DNA, since it is actually cDNA
        if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper():
            seq_alphabet = IUPAC.ambiguous_dna
        # are there ever really RNA sequences in GenBank?
        elif 'RNA' in self._seq_type.upper():
            # Even for data which was from RNA, the sequence string
            # is usually given as DNA (T not U).  Bug 2408
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif 'PROTEIN' in self._seq_type.upper() \
                or self._seq_type == "PRT":  # PRT is used in EMBL-bank for patents
            seq_alphabet = IUPAC.protein  # or extended protein?
        # work around ugly GenBank records which have circular or
        # linear but no indication of sequence type
        elif self._seq_type in ["circular", "linear", "unspecified"]:
            pass
        # we have a bug if we get here
        else:
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % self._seq_type)

    # BUG FIX: this previously read self.__expected_size, which Python
    # name-mangles to _<ClassName>__expected_size and so raised
    # AttributeError; the attribute is set as self._expected_size
    # (see its use in the length check above).
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)
1235 -class _RecordConsumer(_BaseGenBankConsumer):
1236 """Create a GenBank Record object from scanner generated information (PRIVATE). 1237 """
def __init__(self):
    """Set up a fresh Record plus the parser working state."""
    _BaseGenBankConsumer.__init__(self)
    from . import Record
    self.data = Record.Record()

    self._seq_data = []          # sequence chunks, joined at record_end
    self._cur_reference = None   # Record.Reference being filled in
    self._cur_feature = None     # Record.Feature being filled in
    self._cur_qualifier = None   # Record.Qualifier being filled in
def wgs(self, content):
    """Store the WGS accession range, split on the dash."""
    self.data.wgs = content.split('-')
def add_wgs_scafld(self, content):
    """Add a WGS_SCAFLD accession range, split on the dash."""
    self.data.wgs_scafld.append(content.split('-'))
def locus(self, content):
    """Store the LOCUS entry name."""
    self.data.locus = content
def size(self, content):
    """Store the declared sequence length (kept as a string)."""
    self.data.size = content
def residue_type(self, content):
    """Store the LOCUS residue type, warning on malformed lowercase dna/rna."""
    # Be lenient about parsing, but technically lowercase residue
    # types are malformed, so flag them for the user.
    if 'dna' in content or 'rna' in content:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
                      BiopythonParserWarning)
    self.data.residue_type = content
def data_file_division(self, content):
    """Store the GenBank division code (e.g. PLN, BCT)."""
    self.data.data_file_division = content
def date(self, content):
    """Store the LOCUS date string."""
    self.data.date = content
def definition(self, content):
    """Store the DEFINITION line text."""
    self.data.definition = content
def accession(self, content):
    """Add each accession from the line, skipping duplicates."""
    seen = self.data.accession
    for acc in self._split_accessions(content):
        if acc not in seen:
            seen.append(acc)
def nid(self, content):
    """Store the (obsolete) NID identifier."""
    self.data.nid = content
def pid(self, content):
    """Store the (obsolete) PID identifier."""
    self.data.pid = content
def version(self, content):
    """Store the VERSION line accession.version string."""
    self.data.version = content
def db_source(self, content):
    """Store the DBSOURCE line, stripped of trailing whitespace."""
    self.data.db_source = content.rstrip()
def gi(self, content):
    """Store the GI number."""
    self.data.gi = content
def keywords(self, content):
    """Split and store the KEYWORDS line."""
    self.data.keywords = self._split_keywords(content)
def project(self, content):
    """Add PROJECT identifiers (whitespace separated) to the record."""
    # str.split() with no argument never yields empty strings
    self.data.projects.extend(content.split())
1303 1306
def segment(self, content):
    """Store the SEGMENT line text."""
    self.data.segment = content
def source(self, content):
    """Store the SOURCE line text."""
    self.data.source = content
def organism(self, content):
    """Store the ORGANISM name."""
    self.data.organism = content
def taxonomy(self, content):
    """Split and store the taxonomy lineage."""
    self.data.taxonomy = self._split_taxonomy(content)
def reference_num(self, content):
    """Signal a new REFERENCE, flushing any reference in progress."""
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)

    from . import Record
    reference = Record.Reference()
    reference.number = content
    self._cur_reference = reference
def reference_bases(self, content):
    """Store the bases/sites span text for the current reference."""
    self._cur_reference.bases = content
def authors(self, content):
    """Store the AUTHORS text on the current reference."""
    self._cur_reference.authors = content
def consrtm(self, content):
    """Store the CONSRTM (consortium) text on the current reference."""
    self._cur_reference.consrtm = content
def title(self, content):
    """Store the reference TITLE, warning if no REFERENCE line preceded it."""
    if self._cur_reference is None:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
        return
    self._cur_reference.title = content
def journal(self, content):
    """Store the JOURNAL text on the current reference."""
    self._cur_reference.journal = content
def medline_id(self, content):
    """Store the MEDLINE identifier on the current reference."""
    self._cur_reference.medline_id = content
def pubmed_id(self, content):
    """Store the PUBMED identifier on the current reference."""
    self._cur_reference.pubmed_id = content
def remark(self, content):
    """Store the REMARK text on the current reference."""
    self._cur_reference.remark = content
def comment(self, content):
    """Append the COMMENT lines (a list of strings) to the record comment."""
    self.data.comment += "\n".join(content)
def primary_ref_line(self, content):
    """Collect one data line from the PRIMARY table."""
    self.data.primary.append(content)
def primary(self, content):
    """The PRIMARY header line itself carries no data."""
    pass
def features_line(self, content):
    """The FEATURES line signals the start of the feature table."""
    self.start_feature_table()
def start_feature_table(self):
    """Flush the final pending reference when the feature table begins."""
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)
def feature_key(self, content):
    """Grab the key of the feature and signal the start of a new feature."""
    # flush any feature already being built
    self._add_feature()

    from . import Record
    feature = Record.Feature()
    feature.key = content
    self._cur_feature = feature
1392 - def _add_feature(self):
1393 """Utility function to add a feature to the Record. 1394 1395 This does all of the appropriate checking to make sure we haven't 1396 left any info behind, and that we are only adding info if it 1397 exists. 1398 """ 1399 if self._cur_feature is not None: 1400 # if we have a left over qualifier, add it to the qualifiers 1401 # on the current feature 1402 if self._cur_qualifier is not None: 1403 self._cur_feature.qualifiers.append(self._cur_qualifier) 1404 1405 self._cur_qualifier = None 1406 self.data.features.append(self._cur_feature)
1407
def location(self, content):
    """Store the whitespace-cleaned location string on the current feature."""
    self._cur_feature.location = self._clean_location(content)
def feature_qualifier(self, key, value):
    """Record a qualifier; value may be None for flag-style keys like /pseudo."""
    self.feature_qualifier_name([key])
    if value is not None:
        self.feature_qualifier_description(value)
def feature_qualifier_name(self, content_list):
    """Start a new Qualifier for each key in content_list.

    A list is used because valueless keys such as /pseudo can be passed
    in together with the following key (nothing separates them in the
    file).  The record parser keeps the leading "/" on keys; one is
    added here if missing.
    """
    from . import Record
    for content in content_list:
        if not content.startswith("/"):
            content = "/%s" % content
        # flush any qualifier already in progress
        if self._cur_qualifier is not None:
            self._cur_feature.qualifiers.append(self._cur_qualifier)

        self._cur_qualifier = Record.Qualifier()
        self._cur_qualifier.key = content
def feature_qualifier_description(self, content):
    """Attach a value to the current qualifier, tidying its whitespace."""
    # with a value present the qualifier key should end in '='
    if '=' not in self._cur_qualifier.key:
        self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
    cleaned = self._remove_newlines(content)
    # drop all spaces for value types where spaces carry no meaning
    for space_free_key in self.__class__.remove_space_keys:
        if space_free_key in self._cur_qualifier.key:
            cleaned = self._remove_spaces(cleaned)
    self._cur_qualifier.value = self._normalize_spaces(cleaned)
def base_count(self, content):
    """Store the BASE COUNT line text."""
    self.data.base_counts = content
def origin_name(self, content):
    """Store any text found on the ORIGIN line."""
    self.data.origin = content
def contig_location(self, content):
    """Store the CONTIG assembly information, whitespace-cleaned."""
    self.data.contig = self._clean_location(content)
def sequence(self, content):
    """Collect one whitespace-free chunk of sequence, uppercased.

    Chunks are accumulated in a list and joined once at record_end,
    which is much faster than repeated string concatenation.
    """
    assert ' ' not in content
    self._seq_data.append(content.upper())
def record_end(self, content):
    """Finish the record: join the sequence chunks and flush the last feature."""
    self.data.sequence = "".join(self._seq_data)
    self._add_feature()
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    records = parse(handle)
    first = next(records, None)
    if first is None:
        raise ValueError("No records found in handle")
    if next(records, None) is not None:
        raise ValueError("More than one record found in handle")
    return first
def _test():
    """Run the Bio.GenBank module's doctests.

    Looks for the Tests directory (relative to either the source tree
    layout or the current directory) and runs the doctests from there,
    since the doctest examples open files under Tests/.
    """
    import doctest
    import os
    for tests_dir in (os.path.join("..", "..", "Tests"), os.path.join("Tests")):
        if os.path.isdir(tests_dir):
            print("Running doctests...")
            original_dir = os.path.abspath(os.curdir)
            os.chdir(tests_dir)
            doctest.testmod()
            os.chdir(original_dir)
            del original_dir
            print("Done")
            break

if __name__ == "__main__":
    _test()