Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
  16  representation to the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21      - read                  Parse a handle containing a single GenBank record 
  22        as Bio.GenBank specific Record objects. 
  23      - parse                 Iterate over a handle containing multiple GenBank 
  24        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30   
  31      - Iterator              Iterate through a file of GenBank entries 
  32      - ErrorFeatureParser    Catch errors caused during parsing. 
  33      - FeatureParser         Parse GenBank data into SeqRecord and SeqFeature objects. 
  34      - RecordParser          Parse GenBank data into a Record object. 
  35   
  36  Exceptions: 
  37   
  38      - ParserFailureError    Exception indicating a failure in the parser (i.e. 
  39        scanner or consumer) 
  40      - LocationParserError   Exception indicating a problem with the spark based 
  41        location parser. 
  42   
  43  """ 
  44  from __future__ import print_function 
  45   
  46  import re 
  47  import sys  # for checking if Python 2 
  48   
  49  # other Biopython stuff 
  50  from Bio import SeqFeature 
  51   
  52  # other Bio.GenBank stuff 
  53  from .utils import FeatureValueCleaner 
  54  from .Scanner import GenBankScanner 
  55   
  56   
  57  # Constants used to parse GenBank header lines 
  58  GENBANK_INDENT = 12 
  59  GENBANK_SPACER = " " * GENBANK_INDENT 
  60   
  61  # Constants for parsing GenBank feature lines 
  62  FEATURE_KEY_INDENT = 5 
  63  FEATURE_QUALIFIER_INDENT = 21 
  64  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  65  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  66   
  67  # Regular expressions for location parsing 
  68  _solo_location = r"[<>]?\d+" 
  69  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  70  _between_location = r"\d+\^\d+" 
  71   
  72  _within_position = r"\(\d+\.\d+\)" 
  73  _re_within_position = re.compile(_within_position) 
  74  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  75                     % (_within_position, _within_position) 
  76  assert _re_within_position.match("(3.9)") 
  77  assert re.compile(_within_location).match("(3.9)..10") 
  78  assert re.compile(_within_location).match("26..(30.33)") 
  79  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  80   
  81  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  82  _re_oneof_position = re.compile(_oneof_position) 
  83  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  84                     % (_oneof_position, _oneof_position) 
  85  assert _re_oneof_position.match("one-of(6,9)") 
  86  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  87  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  88  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  89   
  90  assert not _re_oneof_position.match("one-of(3)") 
  91  assert _re_oneof_position.match("one-of(3,6)") 
  92  assert _re_oneof_position.match("one-of(3,6,9)") 
  93   
  94   
  95  _simple_location = r"\d+\.\.\d+" 
  96  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  97  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  98                                   % (_simple_location, _simple_location)) 
  99  _complex_location = r"([a-zA-Z][a-zA-Z0-9_\.]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" \ 
 100                      % (_pair_location, _solo_location, _between_location, 
 101                         _within_location, _oneof_location) 
 102  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 103  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 104                                            % (_complex_location, _complex_location) 
 105  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 106                                   % (_possibly_complemented_complex_location, 
 107                                      _possibly_complemented_complex_location)) 
 108   
 109   
 110  assert _re_simple_location.match("104..160") 
 111  assert not _re_simple_location.match("68451760..68452073^68452074") 
 112  assert not _re_simple_location.match("<104..>160") 
 113  assert not _re_simple_location.match("104") 
 114  assert not _re_simple_location.match("<1") 
 115  assert not _re_simple_location.match(">99999") 
 116  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 117  assert not _re_simple_compound.match("bond(12,63)") 
 118  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 119  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 120  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 121  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 122  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 123  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 124  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 125  assert not _re_simple_compound.match("complement(1..69)") 
 126  assert not _re_simple_compound.match("(1..69)") 
 127  assert _re_complex_location.match("(3.9)..10") 
 128  assert _re_complex_location.match("26..(30.33)") 
 129  assert _re_complex_location.match("(13.19)..(20.28)") 
 130  assert _re_complex_location.match("41^42")  # between 
 131  assert _re_complex_location.match("AL121804:41^42") 
 132  assert _re_complex_location.match("AL121804:41..610") 
 133  assert _re_complex_location.match("AL121804.2:41..610") 
 134  assert _re_complex_location.match("AL358792.24.1.166931:3274..3461")  # lots of dots in external reference 
 135  assert _re_complex_location.match("one-of(3,6)..101") 
 136  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 137  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 138  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 139  assert _re_complex_compound.match("join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))") 
 140   
 141  # Trans-spliced example from NC_016406, note underscore in reference name: 
 142  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 143  assert _re_complex_location.match("181647..181905") 
 144  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 145  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 146  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 147  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 148  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 149   
 150  _solo_bond = re.compile("bond\(%s\)" % _solo_location) 
 151  assert _solo_bond.match("bond(196)") 
 152  assert _solo_bond.search("bond(196)") 
 153  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
 154   
 155   
def _pos(pos_str, offset=0):
    """Build a Position object from a GenBank position string (PRIVATE).

    Use the default offset of zero for an end position, and an offset of
    minus one for a start position (converting to Python counting), e.g.
    _pos("5") gives ExactPosition(5) while _pos("5", -1) gives
    ExactPosition(4).

    Handles exact positions ("5"), fuzzy before/after positions ("<5",
    ">5"), within positions ("(8.10)"), and one-of positions
    ("one-of(5,8,11)").  For the within and one-of forms, the returned
    object's integer value uses the smallest candidate for a start
    (offset -1) and the largest candidate for an end (offset 0).
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        # e.g. "(8.10)" - strip parentheses and split on the dot.
        left, right = (int(v) + offset for v in pos_str[1:-1].split("."))
        default = left if offset == -1 else right
        return SeqFeature.WithinPosition(default, left=left, right=right)
    if _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)" - strip the wrapper and split the choices.
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(v) + offset)
                   for v in pos_str[7:-1].split(",")]
        pick = min if offset == -1 else max
        default = pick(int(choice) for choice in choices)
        return SeqFeature.OneOfPosition(default, choices=choices)
    # Plain integer position.
    return SeqFeature.ExactPosition(int(pos_str) + offset)
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Handles simple "start..end" spans (including fuzzy end points), solo
    positions like "123" (a one base feature), and zero-length between
    locations like "123^124".

    The expected sequence length is needed for one special case: a between
    position "N^1" at the origin of a circular genome of length N.  Apart
    from that case, a between location P^Q must have P+1 == Q, otherwise
    ValueError is raised.
    """
    try:
        start_str, end_str = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" not in loc_str:
            # A solo position like "123" - a single base feature.
            return SeqFeature.FeatureLocation(_pos(loc_str, -1),
                                              _pos(loc_str), strand)
        # A between location like "67^68" (one based counting) is a
        # special case with zero length; in Python slice notation this
        # is 67:67.  See Bug 2622.  Furthermore, on a circular genome
        # of length N you can have N^1 meaning the junction at the
        # origin.  See Bug 3098.  (We can imagine "2^4", but that is
        # just "3"; similarly "2^5" is just "3..4".)
        start_str, end_str = loc_str.split("^")
        valid = (int(start_str) + 1 == int(end_str)
                 or (int(start_str) == expected_seq_length and end_str == "1"))
        if not valid:
            raise ValueError("Invalid between location %s" % repr(loc_str))
        point = _pos(start_str)
        return SeqFeature.FeatureLocation(point, point, strand)
    return SeqFeature.FeatureLocation(_pos(start_str, -1),
                                      _pos(end_str), strand)
303 304
305 -def _split_compound_loc(compound_loc):
306 """Split a tricky compound location string (PRIVATE). 307 308 >>> list(_split_compound_loc("123..145")) 309 ['123..145'] 310 >>> list(_split_compound_loc("123..145,200..209")) 311 ['123..145', '200..209'] 312 >>> list(_split_compound_loc("one-of(200,203)..300")) 313 ['one-of(200,203)..300'] 314 >>> list(_split_compound_loc("complement(123..145),200..209")) 315 ['complement(123..145)', '200..209'] 316 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 317 ['123..145', 'one-of(200,203)..209'] 318 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 319 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 320 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 321 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 322 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 323 ['123..145', '200..one-of(209,211)', '300'] 324 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 325 ['123..145', '200..one-of(209,211)'] 326 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 327 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 328 """ 329 if "one-of(" in compound_loc: 330 # Hard case 331 while "," in compound_loc: 332 assert compound_loc[0] != "," 333 assert compound_loc[0:2] != ".." 
334 i = compound_loc.find(",") 335 part = compound_loc[:i] 336 compound_loc = compound_loc[i:] # includes the comma 337 while part.count("(") > part.count(")"): 338 assert "one-of(" in part, (part, compound_loc) 339 i = compound_loc.find(")") 340 part += compound_loc[:i + 1] 341 compound_loc = compound_loc[i + 1:] 342 if compound_loc.startswith(".."): 343 i = compound_loc.find(",") 344 if i == -1: 345 part += compound_loc 346 compound_loc = "" 347 else: 348 part += compound_loc[:i] 349 compound_loc = compound_loc[i:] # includes the comma 350 while part.count("(") > part.count(")"): 351 assert part.count("one-of(") == 2 352 i = compound_loc.find(")") 353 part += compound_loc[:i + 1] 354 compound_loc = compound_loc[i + 1:] 355 if compound_loc.startswith(","): 356 compound_loc = compound_loc[1:] 357 assert part 358 yield part 359 if compound_loc: 360 yield compound_loc 361 else: 362 # Easy case 363 for part in compound_loc.split(","): 364 yield part
365 366
class Iterator(object):
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """

    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:
         - handle - A handle with GenBank entries to iterate through.
         - parser - An optional parser to pass the entries through before
           returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser - collect the raw lines up to the "//" terminator.
        collected = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            collected.append(line)
            if line.rstrip() == "//":
                return "".join(collected)

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def __iter__(self):
        # iter(callable, sentinel) stops when __next__ returns None.
        return iter(self.__next__, None)
413 414
class ParserFailureError(Exception):
    """Exception indicating a failure in the parser (i.e. scanner or consumer)."""
419 420
class LocationParserError(Exception):
    """Exception indicating a location string could not be properly parsed."""
425 426
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """

    def __init__(self, debug_level=0, use_fuzziness=1,
                 feature_cleaner=FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out.  By default we
           have no debugging info (the fastest way to do things), but if you
           want you can set this as high as two and see exactly where a
           parse fails.
         - use_fuzziness - Specify whether or not to use fuzzy
           representations.  The default is 1 (use fuzziness).
         - feature_cleaner - A class which will be used to clean out the
           values of features.  This class must implement the function
           clean_value.  GenBank.utils has a "standard" cleaner class, which
           is used by default.

        NOTE(review): the default feature_cleaner is a single shared
        instance created when the class is defined - presumably stateless;
        confirm before relying on it from multiple parsers.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle into a SeqRecord."""
        self._consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
463 464
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """

    def __init__(self, debug_level=0):
        """Initialize the parser.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out.  By default we
           have no debugging info (the fastest way to do things), but if you
           want you can set this as high as two and see exactly where a
           parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        self._consumer = _RecordConsumer()
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
493 494
495 -class _BaseGenBankConsumer(object):
496 """Abstract GenBank consumer providing useful general functions (PRIVATE). 497 498 This just helps to eliminate some duplication in things that most 499 GenBank consumers want to do. 500 """ 501 # Special keys in GenBank records that we should remove spaces from 502 # For instance, \translation keys have values which are proteins and 503 # should have spaces and newlines removed from them. This class 504 # attribute gives us more control over specific formatting problems. 505 remove_space_keys = ["translation"] 506
507 - def __init__(self):
508 pass
509
510 - def _unhandled(self, data):
511 pass
512
513 - def __getattr__(self, attr):
514 return self._unhandled
515
516 - def _split_keywords(self, keyword_string):
517 """Split a string of keywords into a nice clean list. 518 """ 519 # process the keywords into a python list 520 if keyword_string == "" or keyword_string == ".": 521 keywords = "" 522 elif keyword_string[-1] == '.': 523 keywords = keyword_string[:-1] 524 else: 525 keywords = keyword_string 526 keyword_list = keywords.split(';') 527 clean_keyword_list = [x.strip() for x in keyword_list] 528 return clean_keyword_list
529
530 - def _split_accessions(self, accession_string):
531 """Split a string of accession numbers into a list. 532 """ 533 # first replace all line feeds with spaces 534 # Also, EMBL style accessions are split with ';' 535 accession = accession_string.replace("\n", " ").replace(";", " ") 536 537 return [x.strip() for x in accession.split() if x.strip()]
538
539 - def _split_taxonomy(self, taxonomy_string):
540 """Split a string with taxonomy info into a list. 541 """ 542 if not taxonomy_string or taxonomy_string == ".": 543 # Missing data, no taxonomy 544 return [] 545 546 if taxonomy_string[-1] == '.': 547 tax_info = taxonomy_string[:-1] 548 else: 549 tax_info = taxonomy_string 550 tax_list = tax_info.split(';') 551 new_tax_list = [] 552 for tax_item in tax_list: 553 new_items = tax_item.split("\n") 554 new_tax_list.extend(new_items) 555 while '' in new_tax_list: 556 new_tax_list.remove('') 557 clean_tax_list = [x.strip() for x in new_tax_list] 558 559 return clean_tax_list
560
561 - def _clean_location(self, location_string):
562 """Clean whitespace out of a location string. 563 564 The location parser isn't a fan of whitespace, so we clean it out 565 before feeding it into the parser. 566 """ 567 # Originally this imported string.whitespace and did a replace 568 # via a loop. It's simpler to just split on whitespace and rejoin 569 # the string - and this avoids importing string too. See Bug 2684. 570 return ''.join(location_string.split())
571
572 - def _remove_newlines(self, text):
573 """Remove any newlines in the passed text, returning the new string. 574 """ 575 # get rid of newlines in the qualifier value 576 newlines = ["\n", "\r"] 577 for ws in newlines: 578 text = text.replace(ws, "") 579 580 return text
581
582 - def _normalize_spaces(self, text):
583 """Replace multiple spaces in the passed text with single spaces. 584 """ 585 # get rid of excessive spaces 586 return ' '.join(x for x in text.split(" ") if x)
587
588 - def _remove_spaces(self, text):
589 """Remove all spaces from the passed text. 590 """ 591 return text.replace(" ", "")
592
593 - def _convert_to_python_numbers(self, start, end):
594 """Convert a start and end range to python notation. 595 596 In GenBank, starts and ends are defined in "biological" coordinates, 597 where 1 is the first base and [i, j] means to include both i and j. 598 599 In python, 0 is the first base and [i, j] means to include i, but 600 not j. 601 602 So, to convert "biological" to python coordinates, we need to 603 subtract 1 from the start, and leave the end and things should 604 be converted happily. 605 """ 606 new_start = start - 1 607 new_end = end 608 609 return new_start, new_end
610 611
612 -class _FeatureConsumer(_BaseGenBankConsumer):
613 """Create a SeqRecord object with Features to return (PRIVATE). 614 615 Attributes: 616 617 - use_fuzziness - specify whether or not to parse with fuzziness in 618 feature locations. 619 - feature_cleaner - a class that will be used to provide specialized 620 cleaning-up of feature values. 621 """
622 - def __init__(self, use_fuzziness, feature_cleaner=None):
623 from Bio.SeqRecord import SeqRecord 624 _BaseGenBankConsumer.__init__(self) 625 self.data = SeqRecord(None, id=None) 626 self.data.id = None 627 self.data.description = "" 628 629 self._use_fuzziness = use_fuzziness 630 self._feature_cleaner = feature_cleaner 631 632 self._seq_type = '' 633 self._seq_data = [] 634 self._cur_reference = None 635 self._cur_feature = None 636 self._expected_size = None
637
638 - def locus(self, locus_name):
639 """Set the locus name is set as the name of the Sequence. 640 """ 641 self.data.name = locus_name
642
643 - def size(self, content):
644 """Record the sequence length.""" 645 self._expected_size = int(content)
646
647 - def residue_type(self, type):
648 """Record the sequence type so we can choose an appropriate alphabet. 649 """ 650 self._seq_type = type.strip()
651
652 - def data_file_division(self, division):
653 self.data.annotations['data_file_division'] = division
654
655 - def date(self, submit_date):
656 self.data.annotations['date'] = submit_date
657
658 - def definition(self, definition):
659 """Set the definition as the description of the sequence. 660 """ 661 if self.data.description: 662 # Append to any existing description 663 # e.g. EMBL files with two DE lines. 664 self.data.description += " " + definition 665 else: 666 self.data.description = definition
667
668 - def accession(self, acc_num):
669 """Set the accession number as the id of the sequence. 670 671 If we have multiple accession numbers, the first one passed is 672 used. 673 """ 674 new_acc_nums = self._split_accessions(acc_num) 675 676 # Also record them ALL in the annotations 677 try: 678 # On the off chance there was more than one accession line: 679 for acc in new_acc_nums: 680 # Prevent repeat entries 681 if acc not in self.data.annotations['accessions']: 682 self.data.annotations['accessions'].append(acc) 683 except KeyError: 684 self.data.annotations['accessions'] = new_acc_nums 685 686 # if we haven't set the id information yet, add the first acc num 687 if not self.data.id: 688 if len(new_acc_nums) > 0: 689 # self.data.id = new_acc_nums[0] 690 # Use the FIRST accession as the ID, not the first on this line! 691 self.data.id = self.data.annotations['accessions'][0]
692
693 - def wgs(self, content):
694 self.data.annotations['wgs'] = content.split('-')
695
696 - def add_wgs_scafld(self, content):
697 self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
698
699 - def nid(self, content):
700 self.data.annotations['nid'] = content
701
702 - def pid(self, content):
703 self.data.annotations['pid'] = content
704
705 - def version(self, version_id):
706 # Want to use the versioned accession as the record.id 707 # This comes from the VERSION line in GenBank files, or the 708 # obsolete SV line in EMBL. For the new EMBL files we need 709 # both the version suffix from the ID line and the accession 710 # from the AC line. 711 if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): 712 self.accession(version_id.split(".")[0]) 713 self.version_suffix(version_id.split(".")[1]) 714 elif version_id: 715 # For backwards compatibility... 716 self.data.id = version_id
717
718 - def project(self, content):
719 """Handle the information from the PROJECT line as a list of projects. 720 721 e.g.:: 722 723 PROJECT GenomeProject:28471 724 725 or:: 726 727 PROJECT GenomeProject:13543 GenomeProject:99999 728 729 This is stored as dbxrefs in the SeqRecord to be consistent with the 730 projected switch of this line to DBLINK in future GenBank versions. 731 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 732 "Project:28471" as part of this transition. 733 """ 734 content = content.replace("GenomeProject:", "Project:") 735 self.data.dbxrefs.extend(p for p in content.split() if p)
736 769
770 - def version_suffix(self, version):
771 """Set the version to overwrite the id. 772 773 Since the version provides the same information as the accession 774 number, plus some extra info, we set this as the id if we have 775 a version. 776 """ 777 # e.g. GenBank line: 778 # VERSION U49845.1 GI:1293613 779 # or the obsolete EMBL line: 780 # SV U49845.1 781 # Scanner calls consumer.version("U49845.1") 782 # which then calls consumer.version_suffix(1) 783 # 784 # e.g. EMBL new line: 785 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 786 # Scanner calls consumer.version_suffix(1) 787 assert version.isdigit() 788 self.data.annotations['sequence_version'] = int(version)
789
790 - def db_source(self, content):
791 self.data.annotations['db_source'] = content.rstrip()
792
793 - def gi(self, content):
794 self.data.annotations['gi'] = content
795
796 - def keywords(self, content):
797 if 'keywords' in self.data.annotations: 798 # Multi-line keywords, append to list 799 # Note EMBL states "A keyword is never split between lines." 800 self.data.annotations['keywords'].extend(self._split_keywords(content)) 801 else: 802 self.data.annotations['keywords'] = self._split_keywords(content)
803
804 - def segment(self, content):
805 self.data.annotations['segment'] = content
806
807 - def source(self, content):
808 # Note that some software (e.g. VectorNTI) may produce an empty 809 # source (rather than using a dot/period as might be expected). 810 if content == "": 811 source_info = "" 812 elif content[-1] == '.': 813 source_info = content[:-1] 814 else: 815 source_info = content 816 self.data.annotations['source'] = source_info
817
818 - def organism(self, content):
819 self.data.annotations['organism'] = content
820
821 - def taxonomy(self, content):
822 """Records (another line of) the taxonomy lineage. 823 """ 824 lineage = self._split_taxonomy(content) 825 try: 826 self.data.annotations['taxonomy'].extend(lineage) 827 except KeyError: 828 self.data.annotations['taxonomy'] = lineage
829
830 - def reference_num(self, content):
831 """Signal the beginning of a new reference object. 832 """ 833 # if we have a current reference that hasn't been added to 834 # the list of references, add it. 835 if self._cur_reference is not None: 836 self.data.annotations['references'].append(self._cur_reference) 837 else: 838 self.data.annotations['references'] = [] 839 840 self._cur_reference = SeqFeature.Reference()
841
842 - def reference_bases(self, content):
843 """Attempt to determine the sequence region the reference entails. 844 845 Possible types of information we may have to deal with: 846 847 (bases 1 to 86436) 848 (sites) 849 (bases 1 to 105654; 110423 to 111122) 850 1 (residues 1 to 182) 851 """ 852 # first remove the parentheses or other junk 853 ref_base_info = content[1:-1] 854 855 all_locations = [] 856 # parse if we've got 'bases' and 'to' 857 if 'bases' in ref_base_info and 'to' in ref_base_info: 858 # get rid of the beginning 'bases' 859 ref_base_info = ref_base_info[5:] 860 locations = self._split_reference_locations(ref_base_info) 861 all_locations.extend(locations) 862 elif 'residues' in ref_base_info and 'to' in ref_base_info: 863 residues_start = ref_base_info.find("residues") 864 # get only the information after "residues" 865 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 866 locations = self._split_reference_locations(ref_base_info) 867 all_locations.extend(locations) 868 869 # make sure if we are not finding information then we have 870 # the string 'sites' or the string 'bases' 871 elif (ref_base_info == 'sites' or 872 ref_base_info.strip() == 'bases'): 873 pass 874 # otherwise raise an error 875 else: 876 raise ValueError("Could not parse base info %s in record %s" % 877 (ref_base_info, self.data.id)) 878 879 self._cur_reference.location = all_locations
880
881 - def _split_reference_locations(self, location_string):
882 """Get reference locations out of a string of reference information 883 884 The passed string should be of the form:: 885 886 1 to 20; 20 to 100 887 888 This splits the information out and returns a list of location objects 889 based on the reference locations. 890 """ 891 # split possibly multiple locations using the ';' 892 all_base_info = location_string.split(';') 893 894 new_locations = [] 895 for base_info in all_base_info: 896 start, end = base_info.split('to') 897 new_start, new_end = \ 898 self._convert_to_python_numbers(int(start.strip()), 899 int(end.strip())) 900 this_location = SeqFeature.FeatureLocation(new_start, new_end) 901 new_locations.append(this_location) 902 return new_locations
903
def authors(self, content):
    """Record (or continue, space-separated) the reference AUTHORS line."""
    existing = self._cur_reference.authors
    self._cur_reference.authors = (existing + ' ' + content) if existing else content
909
def consrtm(self, content):
    """Record (or continue, space-separated) the reference CONSRTM line."""
    existing = self._cur_reference.consrtm
    self._cur_reference.consrtm = (existing + ' ' + content) if existing else content
915
def title(self, content):
    """Record (or continue) the reference TITLE line.

    A TITLE with no preceding REFERENCE line triggers a parser warning
    and the content is otherwise discarded.
    """
    ref = self._cur_reference
    if ref is None:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
    elif ref.title:
        ref.title += ' ' + content
    else:
        ref.title = content
926
def journal(self, content):
    """Record (or continue, space-separated) the reference JOURNAL line."""
    existing = self._cur_reference.journal
    self._cur_reference.journal = (existing + ' ' + content) if existing else content
932
def medline_id(self, content):
    """Store the MEDLINE identifier on the current reference."""
    self._cur_reference.medline_id = content
935
def pubmed_id(self, content):
    """Store the PubMed identifier on the current reference."""
    self._cur_reference.pubmed_id = content
938
def remark(self, content):
    """Record (or continue, space-separated) a reference REMARK comment."""
    existing = self._cur_reference.comment
    self._cur_reference.comment = (existing + ' ' + content) if existing else content
945
def comment(self, content):
    """Append COMMENT lines (a list of strings) to the 'comment' annotation."""
    joined = "\n".join(content)
    annotations = self.data.annotations
    if 'comment' in annotations:
        annotations['comment'] += "\n" + joined
    else:
        annotations['comment'] = joined
951
def structured_comment(self, content):
    """Store the parsed structured comment under its own annotation key."""
    self.data.annotations['structured_comment'] = content
954
def features_line(self, content):
    """React to the FEATURES header line by opening the feature table."""
    self.start_feature_table()
959
def start_feature_table(self):
    """Flush any reference still being built before the feature table starts."""
    pending = self._cur_reference
    if pending is not None:
        self.data.annotations['references'].append(pending)
        self._cur_reference = None
967
def feature_key(self, content):
    """Begin a new SeqFeature of type *content* and attach it to the record."""
    feature = SeqFeature.SeqFeature()
    feature.type = content
    self._cur_feature = feature
    self.data.features.append(feature)
973
def location(self, content):
    """Parse out location information from the location string.

    This uses simple Python code with some regular expressions to do the
    parsing, and then translates the results into appropriate objects.
    The common simple forms are special-cased for speed; anything the
    regular expressions cannot handle falls through to a parser warning
    with the feature location left as None.
    """
    # Locations should contain no whitespace at all, but the scanner can
    # hand us newlines/indentation from wrapped lines - strip them first.
    location_line = self._clean_location(content)

    # Older records have junk like replace(266,"c") in the location
    # line.  Newer records just replace this with the number 266 and
    # have the information in a more reasonable place, so we just grab
    # out the number and feed it to the parser.
    if 'replace' in location_line:
        comma_pos = location_line.find(',')
        location_line = location_line[8:comma_pos]

    cur_feature = self._cur_feature

    # Handle a top level complement(...) wrapper here for speed.
    if location_line.startswith("complement("):
        assert location_line.endswith(")")
        location_line = location_line[11:-1]
        strand = -1
    elif "PROTEIN" in self._seq_type.upper():
        strand = None
    else:
        # Assume nucleotide otherwise; feature strand for GenBank files
        # with bad LOCUS lines set to None.
        strand = 1

    # Special case the most common form, e.g. "123..456", for speed.
    if _re_simple_location.match(location_line):
        s, e = location_line.split("..")
        cur_feature.location = SeqFeature.FeatureLocation(int(s) - 1,
                                                          int(e),
                                                          strand)
        return

    if _solo_bond.search(location_line):
        # e.g. bond(196), or join(bond(284),bond(305),bond(309),bond(305))
        # We cannot represent bonds, so strip the qualifier and keep the
        # bare position(s).
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Dropping bond qualifier in feature location",
                      BiopythonParserWarning)
        for x in _solo_bond.finditer(location_line):
            x = x.group()
            location_line = location_line.replace(x, x[5:-1])

    if _re_simple_compound.match(location_line):
        # e.g. join(<123..456,480..>500) - parts are simple enough that
        # splitting on the comma is safe.
        i = location_line.find("(")
        sub_features = cur_feature.sub_features
        for part in location_line[i + 1:-1].split(","):
            s, e = part.split("..")
            f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s) - 1,
                                                                 int(e),
                                                                 strand),
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # TODO - Remove use of sub_features
        if strand == -1:
            # Reverse the backwards order used in GenBank files with
            # complement(join(...))
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return

    # Handle the general case with more complex regular expressions.
    if _re_complex_location.match(location_line):
        # e.g. "AL121804.2:41..610"
        if ":" in location_line:
            location_ref, location_line = location_line.split(":")
            cur_feature.location = _loc(location_line, self._expected_size, strand)
            cur_feature.location.ref = location_ref
        else:
            cur_feature.location = _loc(location_line, self._expected_size, strand)
        return

    if _re_complex_compound.match(location_line):
        i = location_line.find("(")
        # Can't split on the comma because of positions like one-of(1,2,3)
        sub_features = cur_feature.sub_features
        for part in _split_compound_loc(location_line[i + 1:-1]):
            if part.startswith("complement("):
                assert part[-1] == ")"
                part = part[11:-1]
                assert strand != -1, "Double complement?"
                part_strand = -1
            else:
                part_strand = strand
            if ":" in part:
                ref, part = part.split(":")
            else:
                ref = None
            try:
                loc = _loc(part, self._expected_size, part_strand)
            except ValueError as err:
                print(location_line)
                print(part)
                raise err
            f = SeqFeature.SeqFeature(location=loc, ref=ref,
                                      location_operator=cur_feature.location_operator,
                                      type=cur_feature.type)
            sub_features.append(f)
        # Historically a join on the reverse strand has been represented
        # in Biopython with both the parent SeqFeature and its children
        # (the exons for a CDS) all given a strand of -1.  Likewise, for
        # a join feature on the forward strand they all have strand +1.
        # However, we must also consider evil mixed strand examples like
        # this, join(complement(69611..69724),139856..140087,140625..140650)
        #
        # BUG FIX: removed the unused local
        #     strands = set(sf.strand for sf in sub_features)
        # which was computed on every call and never read.
        #
        # TODO - Remove use of sub_features
        if strand == -1:
            # Whole thing was wrapped in complement(...)
            for sf in sub_features:
                assert sf.strand == -1
            # Reverse the backwards order used in GenBank files
            # with complement(join(...))
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features[::-1]],
                                                               operator=location_line[:i])
        else:
            cur_feature.location = SeqFeature.CompoundLocation([f.location for f in sub_features],
                                                               operator=location_line[:i])
        return

    # Not recognised
    if "order" in location_line and "join" in location_line:
        # See Bug 3197
        msg = 'Combinations of "join" and "order" within the same ' + \
              'location (nested operators) are illegal:\n' + location_line
        raise LocationParserError(msg)
    # This used to be an error; now we warn and leave the location unset.
    cur_feature.location = None
    import warnings
    from Bio import BiopythonParserWarning
    warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                         % (location_line)))
1124
def feature_qualifier(self, key, value):
    """Store one qualifier key/value pair on the current feature.

    *value* may be None, since valueless keys such as /pseudo exist;
    historical behaviour is to record those with a single empty-string
    value the first time the key is seen, and ignore repeats.
    """
    qualifiers = self._cur_feature.qualifiers
    if value is None:
        # First sighting gets [""], later sightings are skipped.
        qualifiers.setdefault(key, [""])
        return

    value = value.replace('"', '')
    if self._feature_cleaner is not None:
        value = self._feature_cleaner.clean_value(key, value)

    # Append to an existing list for the key, or start a new one.
    qualifiers.setdefault(key, []).append(value)
1149
def feature_qualifier_name(self, content_list):
    """Use feature_qualifier instead (OBSOLETE).

    Kept only so legacy callers fail loudly rather than silently.
    """
    raise NotImplementedError("Use the feature_qualifier method instead.")
1153
def feature_qualifier_description(self, content):
    """Use feature_qualifier instead (OBSOLETE).

    Kept only so legacy callers fail loudly rather than silently.
    """
    raise NotImplementedError("Use the feature_qualifier method instead.")
1157
def contig_location(self, content):
    """Store the CONTIG assembly line as a plain annotation string.

    CONTIG lines may contain tokens such as Gap(), Gap(100) or
    Gap(unk100) which feature location lines never use, so representing
    them with SeqFeature location objects is difficult; keeping the raw
    string also matches how BioPerl records the CONTIG line in BioSQL.

    NOTE - this assumes the scanner has already combined all CONTIG
    lines into one long string.
    """
    self.data.annotations["contig"] = content
1176
def origin_name(self, content):
    """Intentionally a no-op: the ORIGIN line name is discarded here."""
    pass
1179
def base_count(self, content):
    """Intentionally a no-op: BASE COUNT data is discarded here."""
    pass
1182
def base_number(self, content):
    """Intentionally a no-op: per-line base numbers are discarded here."""
    pass
1185
def sequence(self, content):
    """Accumulate one chunk of sequence data, uppercased.

    Chunks are collected in a list and joined once at record_end,
    which is much faster than repeated string concatenation.
    """
    chunk = content.upper()
    assert ' ' not in chunk
    self._seq_data.append(chunk)
1195
def record_end(self, content):
    """Clean up when we've finished the record.

    Finalizes the record id (appending the version number when known),
    guesses an alphabet from the LOCUS sequence type, and builds the
    final Seq (or an UnknownSeq when no residues were present but a
    length was declared).
    """
    from Bio import Alphabet
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert 'accessions' not in self.data.annotations, \
            self.data.annotations['accessions']
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count('.') == 0:
        try:
            self.data.id += '.%i' % self.data.annotations['sequence_version']
        except KeyError:
            pass

    # Default to a generic alphabet if we don't have a seq type or have
    # strange sequence information.
    seq_alphabet = Alphabet.generic_alphabet

    # now set the sequence
    sequence = "".join(self._seq_data)

    if self._expected_size is not None \
            and len(sequence) != 0 \
            and self._expected_size != len(sequence):
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Expected sequence length %i, found %i (%s)."
                      % (self._expected_size, len(sequence), self.data.id),
                      BiopythonParserWarning)

    if self._seq_type:
        # mRNA is really also DNA, since it is actually cDNA
        if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper():
            seq_alphabet = IUPAC.ambiguous_dna
        # are there ever really RNA sequences in GenBank?
        elif 'RNA' in self._seq_type.upper():
            # Even for data which was from RNA, the sequence string
            # is usually given as DNA (T not U).  Bug 2408
            if "T" in sequence and "U" not in sequence:
                seq_alphabet = IUPAC.ambiguous_dna
            else:
                seq_alphabet = IUPAC.ambiguous_rna
        elif 'PROTEIN' in self._seq_type.upper() \
                or self._seq_type == "PRT":  # PRT is used in EMBL-bank for patents
            seq_alphabet = IUPAC.protein  # or extended protein?
        # work around ugly GenBank records which have circular or
        # linear but no indication of sequence type
        elif self._seq_type in ["circular", "linear", "unspecified"]:
            pass
        # we have a bug if we get here
        else:
            raise ValueError("Could not determine alphabet for seq_type %s"
                             % self._seq_type)

    # BUG FIX: this previously read self.__expected_size, which Python
    # name-mangles to self._FeatureConsumer__expected_size -- an attribute
    # that is never set (the value lives in _expected_size, used three
    # times above), so any record with no sequence data raised
    # AttributeError instead of producing an UnknownSeq.
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size, seq_alphabet)
    else:
        self.data.seq = Seq(sequence, seq_alphabet)
1260 1261
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information (PRIVATE).

    Unlike the SeqRecord-building consumer, this keeps the data much
    closer to the raw GenBank file contents (Record / Reference /
    Feature / Qualifier objects from Bio.GenBank.Record).
    """
def __init__(self):
    _BaseGenBankConsumer.__init__(self)
    from . import Record
    self.data = Record.Record()

    self._seq_data = []           # sequence chunks, joined at record_end
    self._cur_reference = None    # Reference currently being built, if any
    self._cur_feature = None      # Feature currently being built, if any
    self._cur_qualifier = None    # Qualifier currently being built, if any
1274
def wgs(self, content):
    """Store the WGS line as a list split on '-'."""
    self.data.wgs = content.split('-')
1277
def add_wgs_scafld(self, content):
    """Append one WGS_SCAFLD entry, split on '-', to the record."""
    self.data.wgs_scafld.append(content.split('-'))
1280
def locus(self, content):
    """Store the LOCUS name."""
    self.data.locus = content
1283
def size(self, content):
    """Store the declared sequence length (kept as a string)."""
    self.data.size = content
1286
def residue_type(self, content):
    """Store the raw residue type from the LOCUS line.

    Lowercase 'dna'/'rna' is technically malformed; we warn about it
    but still store the value leniently.
    """
    if 'dna' in content or 'rna' in content:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
                      BiopythonParserWarning)
    self.data.residue_type = content
1295
def data_file_division(self, content):
    """Store the GenBank division code (e.g. from the LOCUS line)."""
    self.data.data_file_division = content
1298
def date(self, content):
    """Store the record date string."""
    self.data.date = content
1301
def definition(self, content):
    """Store the DEFINITION line content."""
    self.data.definition = content
1304
def accession(self, content):
    """Append each accession from the ACCESSION line, skipping duplicates."""
    known = self.data.accession
    for acc in self._split_accessions(content):
        if acc not in known:
            known.append(acc)
1309
def nid(self, content):
    """Store the (legacy) NID identifier."""
    self.data.nid = content
1312
def pid(self, content):
    """Store the (legacy) PID identifier."""
    self.data.pid = content
1315
def version(self, content):
    """Store the VERSION line content."""
    self.data.version = content
1318
def db_source(self, content):
    """Store the DBSOURCE line, with trailing whitespace removed."""
    self.data.db_source = content.rstrip()
1321
def gi(self, content):
    """Store the GI number."""
    self.data.gi = content
1324
def keywords(self, content):
    """Store the KEYWORDS line after splitting it into a list."""
    self.data.keywords = self._split_keywords(content)
1327
def project(self, content):
    """Collect whitespace-separated PROJECT identifiers."""
    # str.split() never yields empty strings, so no filtering is needed.
    self.data.projects.extend(content.split())
1330 1333
def segment(self, content):
    """Store the SEGMENT line content."""
    self.data.segment = content
1336
def source(self, content):
    """Store the SOURCE line content."""
    self.data.source = content
1339
def organism(self, content):
    """Store the ORGANISM name."""
    self.data.organism = content
1342
def taxonomy(self, content):
    """Store the taxonomy lineage after splitting it into a list."""
    self.data.taxonomy = self._split_taxonomy(content)
1345
def reference_num(self, content):
    """Start a new Reference, flushing any previously collected one."""
    # A new REFERENCE line means the previous reference is complete.
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)

    from . import Record
    reference = Record.Reference()
    reference.number = content
    self._cur_reference = reference
1356
def reference_bases(self, content):
    """Store the raw '(bases ... to ...)' text on the current reference."""
    self._cur_reference.bases = content
1359
def authors(self, content):
    """Store the AUTHORS text on the current reference."""
    self._cur_reference.authors = content
1362
def consrtm(self, content):
    """Store the CONSRTM text on the current reference."""
    self._cur_reference.consrtm = content
1365
def title(self, content):
    """Store the reference TITLE, warning if no REFERENCE line came first."""
    if self._cur_reference is None:
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn("GenBank TITLE line without REFERENCE line.",
                      BiopythonParserWarning)
        return
    self._cur_reference.title = content
1374
def journal(self, content):
    """Store the JOURNAL text on the current reference."""
    self._cur_reference.journal = content
1377
def medline_id(self, content):
    """Store the MEDLINE identifier on the current reference."""
    self._cur_reference.medline_id = content
1380
def pubmed_id(self, content):
    """Store the PubMed identifier on the current reference."""
    self._cur_reference.pubmed_id = content
1383
def remark(self, content):
    """Store the REMARK text on the current reference."""
    self._cur_reference.remark = content
1386
def comment(self, content):
    """Append the joined COMMENT lines to the record's comment string."""
    self.data.comment = self.data.comment + "\n".join(content)
1389
def structured_comment(self, content):
    """Store the parsed structured comment on the record."""
    self.data.structured_comment = content
1392
def primary_ref_line(self, content):
    """Collect one data line from the PRIMARY section."""
    self.data.primary.append(content)
1396
def primary(self, content):
    """Intentionally a no-op: the PRIMARY header itself is not stored."""
    pass
1399
def features_line(self, content):
    """React to the FEATURES header line by opening the feature table."""
    self.start_feature_table()
1404
def start_feature_table(self):
    """Flush the trailing reference before the feature table starts."""
    if self._cur_reference is not None:
        self.data.references.append(self._cur_reference)
1411
def feature_key(self, content):
    """Flush the previous feature and begin a new one keyed by *content*."""
    # First add on feature information if we've got any pending.
    self._add_feature()

    from . import Record
    feature = Record.Feature()
    feature.key = content
    self._cur_feature = feature
1421
1422 - def _add_feature(self):
1423 """Utility function to add a feature to the Record. 1424 1425 This does all of the appropriate checking to make sure we haven't 1426 left any info behind, and that we are only adding info if it 1427 exists. 1428 """ 1429 if self._cur_feature is not None: 1430 # if we have a left over qualifier, add it to the qualifiers 1431 # on the current feature 1432 if self._cur_qualifier is not None: 1433 self._cur_feature.qualifiers.append(self._cur_qualifier) 1434 1435 self._cur_qualifier = None 1436 self.data.features.append(self._cur_feature)
1437
def location(self, content):
    """Store the whitespace-cleaned location string on the current feature."""
    self._cur_feature.location = self._clean_location(content)
1440
def feature_qualifier(self, key, value):
    """Forward a key/value pair to the name and description handlers.

    *value* may be None for valueless keys such as /pseudo, in which
    case only the key is recorded.
    """
    self.feature_qualifier_name([key])
    if value is not None:
        self.feature_qualifier_description(value)
1445
def feature_qualifier_name(self, content_list):
    """Start a new Qualifier for each key in *content_list*.

    We receive a list of keys, since valueless keys such as /pseudo can
    be passed in together with the following key (no other tags separate
    them in the file).  The record parser keeps the leading '/'; it is
    restored here when missing.  Any qualifier already being built is
    flushed onto the current feature first.
    """
    from . import Record
    for raw_key in content_list:
        key = raw_key if raw_key.startswith("/") else "/%s" % raw_key
        if self._cur_qualifier is not None:
            self._cur_feature.qualifiers.append(self._cur_qualifier)

        qualifier = Record.Qualifier()
        qualifier.key = key
        self._cur_qualifier = qualifier
1464
def feature_qualifier_description(self, content):
    """Attach a cleaned value to the qualifier currently being built.

    The stored key gets a trailing '=' when it has a value, newlines are
    stripped, all spaces are removed for keys listed in the class-level
    remove_space_keys, and runs of whitespace are collapsed.
    """
    qualifier = self._cur_qualifier
    # If we have info then the qualifier key should carry an '='.
    if '=' not in qualifier.key:
        qualifier.key = "%s=" % qualifier.key
    cleaned = self._remove_newlines(content)
    # Spaces are insignificant for some keys (remove_space_keys comes
    # from the base class); dropping them is idempotent, so a single
    # pass over matching keys gives the same result as the original
    # per-key loop.
    if any(k in qualifier.key for k in self.__class__.remove_space_keys):
        cleaned = self._remove_spaces(cleaned)
    qualifier.value = self._normalize_spaces(cleaned)
1476
def base_count(self, content):
    """Store the BASE COUNT line content."""
    self.data.base_counts = content
1479
def origin_name(self, content):
    """Store the ORIGIN line content."""
    self.data.origin = content
1482
def contig_location(self, content):
    """Store the whitespace-cleaned CONTIG assembly information."""
    self.data.contig = self._clean_location(content)
1487
def sequence(self, content):
    """Accumulate one chunk of sequence data, uppercased.

    Chunks are collected in a list and joined once at record_end; this
    is much faster than concatenating strings on every call.
    """
    chunk = content.upper()
    assert ' ' not in chunk
    self._seq_data.append(chunk)
1498
def record_end(self, content):
    """Finish the record: build the sequence string, flush the last feature."""
    self.data.sequence = "".join(self._seq_data)
    self._add_feature()
1507 1508
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.

    The returned value is an iterator of Bio.GenBank.Record.Record
    objects built by the RecordParser.
    """
    return iter(Iterator(handle, RecordParser()))
1522 1523
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    iterator = parse(handle)
    # next(it, None) gives None on an exhausted iterator, exactly like
    # the original try/except StopIteration dance.
    first = next(iterator, None)
    if first is None:
        raise ValueError("No records found in handle")
    if next(iterator, None) is not None:
        raise ValueError("More than one record found in handle")
    return first
1550 1551
1552 -def _test():
1553 """Run the Bio.GenBank module's doctests.""" 1554 import doctest 1555 import os 1556 if os.path.isdir(os.path.join("..", "..", "Tests")): 1557 print("Running doctests...") 1558 cur_dir = os.path.abspath(os.curdir) 1559 os.chdir(os.path.join("..", "..", "Tests")) 1560 doctest.testmod() 1561 os.chdir(cur_dir) 1562 del cur_dir 1563 print("Done") 1564 elif os.path.isdir(os.path.join("Tests")): 1565 print("Running doctests...") 1566 cur_dir = os.path.abspath(os.curdir) 1567 os.chdir(os.path.join("Tests")) 1568 doctest.testmod() 1569 os.chdir(cur_dir) 1570 del cur_dir 1571 print("Done")
1572 1573 if __name__ == "__main__": 1574 _test() 1575