Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
  16  representation to the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21      - read                  Parse a handle containing a single GenBank record 
  22        as Bio.GenBank specific Record objects. 
  23      - parse                 Iterate over a handle containing multiple GenBank 
  24        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30   
  31      - Iterator              Iterate through a file of GenBank entries 
  32      - ErrorFeatureParser    Catch errors caused during parsing. 
  33      - FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  34      - RecordParser          Parse GenBank data into a Record object. 
  35   
  36  Exceptions: 
  37   
   38      - ParserFailureError    Exception indicating a failure in the parser (i.e. 
   39        scanner or consumer) 
  40      - LocationParserError   Exception indicating a problem with the spark based 
  41        location parser. 
  42   
  43  """ 
  44  from __future__ import print_function 
  45   
  46  import re 
  47  import sys  # for checking if Python 2 
  48   
  49  # other Biopython stuff 
  50  from Bio import SeqFeature 
  51   
  52  # other Bio.GenBank stuff 
  53  from .utils import FeatureValueCleaner 
  54  from .Scanner import GenBankScanner 
  55   
  56   
  57  # Constants used to parse GenBank header lines 
  58  GENBANK_INDENT = 12 
  59  GENBANK_SPACER = " " * GENBANK_INDENT 
  60   
  61  # Constants for parsing GenBank feature lines 
  62  FEATURE_KEY_INDENT = 5 
  63  FEATURE_QUALIFIER_INDENT = 21 
  64  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  65  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  66   
  67  # Regular expressions for location parsing 
  68  _solo_location = r"[<>]?\d+" 
  69  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  70  _between_location = r"\d+\^\d+" 
  71   
  72  _within_position = r"\(\d+\.\d+\)" 
  73  _re_within_position = re.compile(_within_position) 
  74  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  75                     % (_within_position, _within_position) 
  76  assert _re_within_position.match("(3.9)") 
  77  assert re.compile(_within_location).match("(3.9)..10") 
  78  assert re.compile(_within_location).match("26..(30.33)") 
  79  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  80   
  81  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  82  _re_oneof_position = re.compile(_oneof_position) 
  83  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  84                     % (_oneof_position, _oneof_position) 
  85  assert _re_oneof_position.match("one-of(6,9)") 
  86  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  87  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  88  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  89   
  90  assert not _re_oneof_position.match("one-of(3)") 
  91  assert _re_oneof_position.match("one-of(3,6)") 
  92  assert _re_oneof_position.match("one-of(3,6,9)") 
  93   
  94   
  95  _simple_location = r"\d+\.\.\d+" 
  96  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  97  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  98                                   % (_simple_location, _simple_location)) 
  99  _complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" \ 
 100                      % (_pair_location, _solo_location, _between_location, 
 101                         _within_location, _oneof_location) 
 102  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 103  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 104                                            % (_complex_location, _complex_location) 
 105  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 106                                    % (_possibly_complemented_complex_location, 
 107                                       _possibly_complemented_complex_location)) 
 108   
 109   
 110  assert _re_simple_location.match("104..160") 
 111  assert not _re_simple_location.match("68451760..68452073^68452074") 
 112  assert not _re_simple_location.match("<104..>160") 
 113  assert not _re_simple_location.match("104") 
 114  assert not _re_simple_location.match("<1") 
 115  assert not _re_simple_location.match(">99999") 
 116  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 117  assert not _re_simple_compound.match("bond(12,63)") 
 118  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 119  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 120  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 121  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 122  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 123  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 124  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 125  assert not _re_simple_compound.match("complement(1..69)") 
 126  assert not _re_simple_compound.match("(1..69)") 
 127  assert _re_complex_location.match("(3.9)..10") 
 128  assert _re_complex_location.match("26..(30.33)") 
 129  assert _re_complex_location.match("(13.19)..(20.28)") 
 130  assert _re_complex_location.match("41^42")  # between 
 131  assert _re_complex_location.match("AL121804:41^42") 
 132  assert _re_complex_location.match("AL121804:41..610") 
 133  assert _re_complex_location.match("AL121804.2:41..610") 
 134  assert _re_complex_location.match("AL358792.24.1.166931:3274..3461")  # lots of dots in external reference 
 135  assert _re_complex_location.match("one-of(3,6)..101") 
 136  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 137  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 138  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 139  assert _re_complex_compound.match("join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))") 
 140   
 141  # Trans-spliced example from NC_016406, note underscore in reference name: 
 142  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 143  assert _re_complex_location.match("181647..181905") 
 144  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 145  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 146  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 147  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 148  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 149   
 150  _solo_bond = re.compile("bond\(%s\)" % _solo_location) 
 151  assert _solo_bond.match("bond(196)") 
 152  assert _solo_bond.search("bond(196)") 
 153  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
def _pos(pos_str, offset=0):
    """Build a Position object from a GenBank position string (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions:

    >>> _pos("<5")
    BeforePosition(5)
    >>> _pos(">5")
    AfterPosition(5)
    >>> _pos("(8.10)")
    WithinPosition(10, left=8, right=10)

    By default assumes an end position, so note the integer behaviour:

    >>> p = _pos("one-of(5,8,11)")
    >>> p
    OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> int(p)
    11

    With offset -1 (a start position) the integer behaviour changes too:

    >>> p = _pos("one-of(5,8,11)", -1)
    >>> p
    OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
    >>> int(p)
    4

    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        # e.g. "(8.10)" - a position somewhere between two bounds.
        left_text, right_text = pos_str[1:-1].split(".")
        left = int(left_text) + offset
        right = int(right_text) + offset
        # Start positions (offset -1) collapse to the left bound,
        # end positions to the right bound.
        default = left if offset == -1 else right
        return SeqFeature.WithinPosition(default, left=left, right=right)
    if _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)" - a choice of several exact positions.
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(text) + offset)
                   for text in pos_str[7:-1].split(",")]
        pick = min if offset == -1 else max
        default = pick(int(choice) for choice in choices)
        return SeqFeature.OneOfPosition(default, choices=choices)
    # Plain number, e.g. "5"
    return SeqFeature.ExactPosition(int(pos_str) + offset)
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000, +1)
    FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
    >>> _loc("<123..>456", 1000, strand = -1)
    FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000, 1)
    FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

    Zero length between feature,

    >>> _loc("123^124", 1000, 0)
    FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000, 1)
    FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000, 1)
    Traceback (most recent call last):
    ...
    ValueError: Invalid between location '123^456'

    You can optionally provide a reference name:

    >>> _loc("AL391218.9:105173..108462", 2000000, 1)
    FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')

    """
    # An optional "accession:" prefix names an external reference.
    if ":" in loc_str:
        ref, loc_str = loc_str.split(":")
    else:
        ref = None
    try:
        start_str, end_str = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" not in loc_str:
            # A single base, e.g. "123" - treat as a one base range.
            start_str = end_str = loc_str
        else:
            # A between location like "67^68" (one based counting) is a
            # special case (note it has zero length). In python slice
            # notation this is 67:67, a zero length slice. See Bug 2622.
            # Furthermore, on a circular genome of length N you can have
            # a location N^1 meaning the junction at the origin (Bug 3098).
            # NOTE - We can imagine between locations like "2^4", but this
            # is just "3". Similarly, "2^5" is just "3..4".
            start_str, end_str = loc_str.split("^")
            adjacent = int(start_str) + 1 == int(end_str)
            wraps_origin = (int(start_str) == expected_seq_length
                            and end_str == "1")
            if not (adjacent or wraps_origin):
                raise ValueError("Invalid between location %s" % repr(loc_str))
            point = _pos(start_str)
            return SeqFeature.FeatureLocation(point, point, strand, ref=ref)
    return SeqFeature.FeatureLocation(_pos(start_str, -1), _pos(end_str),
                                      strand, ref=ref)
def _split_compound_loc(compound_loc):
    """Split a tricky compound location string (PRIVATE).

    >>> list(_split_compound_loc("123..145"))
    ['123..145']
    >>> list(_split_compound_loc("123..145,200..209"))
    ['123..145', '200..209']
    >>> list(_split_compound_loc("one-of(200,203)..300"))
    ['one-of(200,203)..300']
    >>> list(_split_compound_loc("complement(123..145),200..209"))
    ['complement(123..145)', '200..209']
    >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
    ['123..145', 'one-of(200,203)..209']
    >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
    ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
    >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
    ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
    >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
    ['123..145', '200..one-of(209,211)', '300']
    >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
    ['123..145', '200..one-of(209,211)']
    >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
    ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
    """
    if "one-of(" in compound_loc:
        # Hard case - "one-of(...)" contains commas itself, so we cannot
        # simply split on commas.  Instead walk the string, extending each
        # candidate part until its parentheses balance.
        while "," in compound_loc:
            assert compound_loc[0] != ","
            assert compound_loc[0:2] != ".."
            i = compound_loc.find(",")
            part = compound_loc[:i]
            compound_loc = compound_loc[i:]  # includes the comma
            # If we cut inside a "one-of(...)", pull in text up to the
            # closing bracket so the parentheses balance again.
            while part.count("(") > part.count(")"):
                assert "one-of(" in part, (part, compound_loc)
                i = compound_loc.find(")")
                part += compound_loc[:i + 1]
                compound_loc = compound_loc[i + 1:]
            if compound_loc.startswith(".."):
                # The part so far was only the start position, e.g.
                # "one-of(200,203)" - now absorb "..end" as well.
                i = compound_loc.find(",")
                if i == -1:
                    part += compound_loc
                    compound_loc = ""
                else:
                    part += compound_loc[:i]
                    compound_loc = compound_loc[i:]  # includes the comma
                # Again rebalance if the end was itself a "one-of(...)".
                while part.count("(") > part.count(")"):
                    assert part.count("one-of(") == 2
                    i = compound_loc.find(")")
                    part += compound_loc[:i + 1]
                    compound_loc = compound_loc[i + 1:]
            if compound_loc.startswith(","):
                compound_loc = compound_loc[1:]
            assert part
            yield part
        # Any remaining text is the final part.
        if compound_loc:
            yield compound_loc
    else:
        # Easy case - no "one-of(...)", commas only separate parts.
        for part in compound_loc.split(","):
            yield part
class Iterator(object):
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """

    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:

            - handle - A handle with GenBank entries to iterate through.
            - parser - An optional parser to pass the entries through before
              returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser given - collect the raw record text, terminated by
        # the "//" end-of-record marker.
        record_lines = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            record_lines.append(line)
            if line.rstrip() == "//":
                return "".join(record_lines)

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def __iter__(self):
        # iter(callable, sentinel) stops once __next__ returns None.
        return iter(self.__next__, None)
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser."""
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """
    # NOTE(review): the default FeatureValueCleaner() below is created once
    # when the class is defined and shared by every FeatureParser instance
    # (classic mutable-default pitfall) - confirm FeatureValueCleaner is
    # stateless before relying on this.
    def __init__(self, debug_level=0, use_fuzziness=1,
                 feature_cleaner=FeatureValueCleaner()):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:

            - debug_level - An optional argument that specifies the amount of
              debugging information the parser should spit out. By default we have
              no debugging info (the fastest way to do things), but if you want
              you can set this as high as two and see exactly where a parse fails.
            - use_fuzziness - Specify whether or not to use fuzzy representations.
              The default is 1 (use fuzziness).
            - feature_cleaner - A class which will be used to clean out the
              values of features. This class must implement the function
              clean_value. GenBank.utils has a "standard" cleaner class, which
              is used by default.
        """
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle, returning a SeqRecord."""
        # The consumer accumulates scanner events into a SeqRecord.
        _consumer = _FeatureConsumer(self.use_fuzziness,
                                     self._cleaner)
        self._scanner.feed(handle, _consumer)
        return _consumer.data
473
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """
    def __init__(self, debug_level=0):
        """Initialize the parser.

        Arguments:

            - debug_level - An optional argument that specifies the amount of
              debugging information the parser should spit out. By default we have
              no debugging info (the fastest way to do things), but if you want
              you can set this as high as two and see exactly where a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        # The consumer accumulates scanner events into a GenBank Record.
        _consumer = _RecordConsumer()

        self._scanner.feed(handle, _consumer)
        return _consumer.data
503
504 505 -class _BaseGenBankConsumer(object):
506 """Abstract GenBank consumer providing useful general functions (PRIVATE). 507 508 This just helps to eliminate some duplication in things that most 509 GenBank consumers want to do. 510 """ 511 # Special keys in GenBank records that we should remove spaces from 512 # For instance, \translation keys have values which are proteins and 513 # should have spaces and newlines removed from them. This class 514 # attribute gives us more control over specific formatting problems. 515 remove_space_keys = ["translation"] 516
517 - def __init__(self):
518 pass
519
520 - def _unhandled(self, data):
521 pass
522
523 - def __getattr__(self, attr):
524 return self._unhandled
525 526 @staticmethod
527 - def _split_keywords(keyword_string):
528 """Split a string of keywords into a nice clean list. 529 """ 530 # process the keywords into a python list 531 if keyword_string == "" or keyword_string == ".": 532 keywords = "" 533 elif keyword_string[-1] == '.': 534 keywords = keyword_string[:-1] 535 else: 536 keywords = keyword_string 537 keyword_list = keywords.split(';') 538 clean_keyword_list = [x.strip() for x in keyword_list] 539 return clean_keyword_list
540 541 @staticmethod
542 - def _split_accessions(accession_string):
543 """Split a string of accession numbers into a list. 544 """ 545 # first replace all line feeds with spaces 546 # Also, EMBL style accessions are split with ';' 547 accession = accession_string.replace("\n", " ").replace(";", " ") 548 549 return [x.strip() for x in accession.split() if x.strip()]
550 551 @staticmethod
552 - def _split_taxonomy(taxonomy_string):
553 """Split a string with taxonomy info into a list. 554 """ 555 if not taxonomy_string or taxonomy_string == ".": 556 # Missing data, no taxonomy 557 return [] 558 559 if taxonomy_string[-1] == '.': 560 tax_info = taxonomy_string[:-1] 561 else: 562 tax_info = taxonomy_string 563 tax_list = tax_info.split(';') 564 new_tax_list = [] 565 for tax_item in tax_list: 566 new_items = tax_item.split("\n") 567 new_tax_list.extend(new_items) 568 while '' in new_tax_list: 569 new_tax_list.remove('') 570 clean_tax_list = [x.strip() for x in new_tax_list] 571 572 return clean_tax_list
573 574 @staticmethod
575 - def _clean_location(location_string):
576 """Clean whitespace out of a location string. 577 578 The location parser isn't a fan of whitespace, so we clean it out 579 before feeding it into the parser. 580 """ 581 # Originally this imported string.whitespace and did a replace 582 # via a loop. It's simpler to just split on whitespace and rejoin 583 # the string - and this avoids importing string too. See Bug 2684. 584 return ''.join(location_string.split())
585 586 @staticmethod
587 - def _remove_newlines(text):
588 """Remove any newlines in the passed text, returning the new string. 589 """ 590 # get rid of newlines in the qualifier value 591 newlines = ["\n", "\r"] 592 for ws in newlines: 593 text = text.replace(ws, "") 594 595 return text
596 597 @staticmethod
598 - def _normalize_spaces(text):
599 """Replace multiple spaces in the passed text with single spaces. 600 """ 601 # get rid of excessive spaces 602 return ' '.join(x for x in text.split(" ") if x)
603 604 @staticmethod
605 - def _remove_spaces(text):
606 """Remove all spaces from the passed text. 607 """ 608 return text.replace(" ", "")
609 610 @staticmethod
611 - def _convert_to_python_numbers(start, end):
612 """Convert a start and end range to python notation. 613 614 In GenBank, starts and ends are defined in "biological" coordinates, 615 where 1 is the first base and [i, j] means to include both i and j. 616 617 In python, 0 is the first base and [i, j] means to include i, but 618 not j. 619 620 So, to convert "biological" to python coordinates, we need to 621 subtract 1 from the start, and leave the end and things should 622 be converted happily. 623 """ 624 new_start = start - 1 625 new_end = end 626 627 return new_start, new_end
628
629 630 -class _FeatureConsumer(_BaseGenBankConsumer):
631 """Create a SeqRecord object with Features to return (PRIVATE). 632 633 Attributes: 634 635 - use_fuzziness - specify whether or not to parse with fuzziness in 636 feature locations. 637 - feature_cleaner - a class that will be used to provide specialized 638 cleaning-up of feature values. 639 """
640 - def __init__(self, use_fuzziness, feature_cleaner=None):
641 from Bio.SeqRecord import SeqRecord 642 _BaseGenBankConsumer.__init__(self) 643 self.data = SeqRecord(None, id=None) 644 self.data.id = None 645 self.data.description = "" 646 647 self._use_fuzziness = use_fuzziness 648 self._feature_cleaner = feature_cleaner 649 650 self._seq_type = '' 651 self._seq_data = [] 652 self._cur_reference = None 653 self._cur_feature = None 654 self._expected_size = None
655
    def locus(self, locus_name):
        """Set the locus name as the name of the sequence record."""
        self.data.name = locus_name
660
    def size(self, content):
        """Record the expected sequence length.

        Raises ValueError if content is not a plain integer string.
        """
        self._expected_size = int(content)
664
    def residue_type(self, type):
        """Record the sequence type (SEMI-OBSOLETE).

        This reflects the fact that the topology (linear/circular) and
        molecule type (e.g. DNA vs RNA) were a single field in early
        files. Current GenBank/EMBL files have two fields.
        """
        # NOTE: the parameter name shadows the builtin ``type``; kept
        # unchanged to preserve the public interface.
        self._seq_type = type.strip()
673
674 - def topology(self, topology):
675 """Record the topology (linear or circular as strings).""" 676 if topology: 677 if topology not in ['linear', 'circular']: 678 raise ParserFailureError("Unexpected topology %r should be linear or circular" % topology) 679 self.data.annotations['topology'] = topology
680
681 - def molecule_type(self, mol_type):
682 """Record the molecule type (for round-trip etc).""" 683 if mol_type: 684 if "circular" in mol_type or 'linear' in mol_type: 685 raise ParserFailureError("Molecule type %r should not include topology" % mol_type) 686 self.data.annotations['molecule_type'] = mol_type
687
688 - def data_file_division(self, division):
689 self.data.annotations['data_file_division'] = division
690
691 - def date(self, submit_date):
692 self.data.annotations['date'] = submit_date
693
694 - def definition(self, definition):
695 """Set the definition as the description of the sequence. 696 """ 697 if self.data.description: 698 # Append to any existing description 699 # e.g. EMBL files with two DE lines. 700 self.data.description += " " + definition 701 else: 702 self.data.description = definition
703
    def accession(self, acc_num):
        """Set the accession number as the id of the sequence.

        If we have multiple accession numbers, the first one passed is
        used.
        """
        new_acc_nums = self._split_accessions(acc_num)

        # Also record them ALL in the annotations
        try:
            # On the off chance there was more than one accession line:
            for acc in new_acc_nums:
                # Prevent repeat entries
                if acc not in self.data.annotations['accessions']:
                    self.data.annotations['accessions'].append(acc)
        except KeyError:
            # Note the KeyError can only trigger inside the loop, so an
            # empty accession list never creates the annotation key.
            self.data.annotations['accessions'] = new_acc_nums

        # if we haven't set the id information yet, add the first acc num
        if not self.data.id:
            if len(new_acc_nums) > 0:
                # self.data.id = new_acc_nums[0]
                # Use the FIRST accession as the ID, not the first on this line!
                self.data.id = self.data.annotations['accessions'][0]
728
729 - def wgs(self, content):
730 self.data.annotations['wgs'] = content.split('-')
731
732 - def add_wgs_scafld(self, content):
733 self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
734
735 - def nid(self, content):
736 self.data.annotations['nid'] = content
737
738 - def pid(self, content):
739 self.data.annotations['pid'] = content
740
741 - def version(self, version_id):
742 # Want to use the versioned accession as the record.id 743 # This comes from the VERSION line in GenBank files, or the 744 # obsolete SV line in EMBL. For the new EMBL files we need 745 # both the version suffix from the ID line and the accession 746 # from the AC line. 747 if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): 748 self.accession(version_id.split(".")[0]) 749 self.version_suffix(version_id.split(".")[1]) 750 elif version_id: 751 # For backwards compatibility... 752 self.data.id = version_id
753
754 - def project(self, content):
755 """Handle the information from the PROJECT line as a list of projects. 756 757 e.g.:: 758 759 PROJECT GenomeProject:28471 760 761 or:: 762 763 PROJECT GenomeProject:13543 GenomeProject:99999 764 765 This is stored as dbxrefs in the SeqRecord to be consistent with the 766 projected switch of this line to DBLINK in future GenBank versions. 767 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 768 "Project:28471" as part of this transition. 769 """ 770 content = content.replace("GenomeProject:", "Project:") 771 self.data.dbxrefs.extend(p for p in content.split() if p)
772 805
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # e.g. GenBank line:
        # VERSION     U49845.1  GI:1293613
        # or the obsolete EMBL line:
        # SV   U49845.1
        # Scanner calls consumer.version("U49845.1")
        # which then calls consumer.version_suffix(1)
        #
        # e.g. EMBL new line:
        # ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        # Scanner calls consumer.version_suffix(1)
        #
        # NOTE(review): assert statements are stripped when Python runs
        # with -O; consider raising ValueError for non-numeric input.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
825
826 - def db_source(self, content):
827 self.data.annotations['db_source'] = content.rstrip()
828
829 - def gi(self, content):
830 self.data.annotations['gi'] = content
831
832 - def keywords(self, content):
833 if 'keywords' in self.data.annotations: 834 # Multi-line keywords, append to list 835 # Note EMBL states "A keyword is never split between lines." 836 self.data.annotations['keywords'].extend(self._split_keywords(content)) 837 else: 838 self.data.annotations['keywords'] = self._split_keywords(content)
839
840 - def segment(self, content):
841 self.data.annotations['segment'] = content
842
843 - def source(self, content):
844 # Note that some software (e.g. VectorNTI) may produce an empty 845 # source (rather than using a dot/period as might be expected). 846 if content == "": 847 source_info = "" 848 elif content[-1] == '.': 849 source_info = content[:-1] 850 else: 851 source_info = content 852 self.data.annotations['source'] = source_info
853
854 - def organism(self, content):
855 self.data.annotations['organism'] = content
856
857 - def taxonomy(self, content):
858 """Records (another line of) the taxonomy lineage. 859 """ 860 lineage = self._split_taxonomy(content) 861 try: 862 self.data.annotations['taxonomy'].extend(lineage) 863 except KeyError: 864 self.data.annotations['taxonomy'] = lineage
865
866 - def reference_num(self, content):
867 """Signal the beginning of a new reference object. 868 """ 869 # if we have a current reference that hasn't been added to 870 # the list of references, add it. 871 if self._cur_reference is not None: 872 self.data.annotations['references'].append(self._cur_reference) 873 else: 874 self.data.annotations['references'] = [] 875 876 self._cur_reference = SeqFeature.Reference()
877
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)

        Raises ValueError if the base information cannot be parsed.
        """
        # first remove the parentheses
        assert content.endswith(")"), content
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if 'bases' in ref_base_info and 'to' in ref_base_info:
            # get rid of the beginning 'bases' (5 characters)
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif 'residues' in ref_base_info and 'to' in ref_base_info:
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues"
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations
917
918 - def _split_reference_locations(self, location_string):
919 """Get reference locations out of a string of reference information 920 921 The passed string should be of the form:: 922 923 1 to 20; 20 to 100 924 925 This splits the information out and returns a list of location objects 926 based on the reference locations. 927 """ 928 # split possibly multiple locations using the ';' 929 all_base_info = location_string.split(';') 930 931 new_locations = [] 932 for base_info in all_base_info: 933 start, end = base_info.split('to') 934 new_start, new_end = \ 935 self._convert_to_python_numbers(int(start.strip()), 936 int(end.strip())) 937 this_location = SeqFeature.FeatureLocation(new_start, new_end) 938 new_locations.append(this_location) 939 return new_locations
940
941 - def authors(self, content):
942 if self._cur_reference.authors: 943 self._cur_reference.authors += ' ' + content 944 else: 945 self._cur_reference.authors = content
946
947 - def consrtm(self, content):
948 if self._cur_reference.consrtm: 949 self._cur_reference.consrtm += ' ' + content 950 else: 951 self._cur_reference.consrtm = content
952
953 - def title(self, content):
954 if self._cur_reference is None: 955 import warnings 956 from Bio import BiopythonParserWarning 957 warnings.warn("GenBank TITLE line without REFERENCE line.", 958 BiopythonParserWarning) 959 elif self._cur_reference.title: 960 self._cur_reference.title += ' ' + content 961 else: 962 self._cur_reference.title = content
963
964 - def journal(self, content):
965 if self._cur_reference.journal: 966 self._cur_reference.journal += ' ' + content 967 else: 968 self._cur_reference.journal = content
969
    def medline_id(self, content):
        """Store the MEDLINE identifier for the current reference."""
        self._cur_reference.medline_id = content
972
    def pubmed_id(self, content):
        """Store the PUBMED identifier for the current reference."""
        self._cur_reference.pubmed_id = content
975
976 - def remark(self, content):
977 """Deal with a reference comment.""" 978 if self._cur_reference.comment: 979 self._cur_reference.comment += ' ' + content 980 else: 981 self._cur_reference.comment = content
982
983 - def comment(self, content):
984 try: 985 self.data.annotations['comment'] += "\n" + "\n".join(content) 986 except KeyError: 987 self.data.annotations['comment'] = "\n".join(content)
988
    def structured_comment(self, content):
        """Store the structured comment value in the annotations."""
        self.data.annotations['structured_comment'] = content
991
    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        # Pure delegation; the line's own text is not needed.
        self.start_feature_table()
996
997 - def start_feature_table(self):
998 """Indicate we've got to the start of the feature table. 999 """ 1000 # make sure we've added on our last reference object 1001 if self._cur_reference is not None: 1002 self.data.annotations['references'].append(self._cur_reference) 1003 self._cur_reference = None
1004
1005 - def feature_key(self, content):
1006 # start a new feature 1007 self._cur_feature = SeqFeature.SeqFeature() 1008 self._cur_feature.type = content 1009 self.data.features.append(self._cur_feature)
1010
    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.

        Fast paths handle the common simple and simple-compound forms;
        anything unparseable falls through to a warning with the feature's
        location left as None (except for illegal join/order nesting,
        which raises LocationParserError).
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if 'replace' in location_line:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        # Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            location_line = location_line[11:-1]
            strand = -1
        elif "PROTEIN" in self._seq_type.upper():
            strand = None
        else:
            # Assume nucleotide otherwise feature strand for
            # GenBank files with bad LOCUS lines set to None
            strand = 1

        # Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            # e.g. "123..456"
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s) - 1,
                                                              int(e),
                                                              strand)
            return

        if ",)" in location_line:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Dropping trailing comma in malformed feature location",
                          BiopythonParserWarning)
            location_line = location_line.replace(",)", ")")

        if _solo_bond.search(location_line):
            # e.g. bond(196)
            # e.g. join(bond(284),bond(305),bond(309),bond(305))
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Dropping bond qualifier in feature location",
                          BiopythonParserWarning)
            # There ought to be a better way to do this...
            # Replace each bond(N) token with just N before parsing.
            for x in _solo_bond.finditer(location_line):
                x = x.group()
                location_line = location_line.replace(x, x[5:-1])

        if _re_simple_compound.match(location_line):
            # e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # we can split on the comma because these are simple locations
            locs = []
            for part in location_line[i + 1:-1].split(","):
                s, e = part.split("..")
                locs.append(SeqFeature.FeatureLocation(int(s) - 1,
                                                       int(e),
                                                       strand))
            if len(locs) < 2:
                # The CompoundLocation will raise a ValueError here!
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Should have at least 2 parts for compound location",
                              BiopythonParserWarning)
                cur_feature.location = None
                return
            if strand == -1:
                # complement(join(...)) lists parts in reverse order.
                cur_feature.location = SeqFeature.CompoundLocation(locs[::-1],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation(locs,
                                                                   operator=location_line[:i])
            return

        # Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            # e.g. "AL121804.2:41..610"
            cur_feature.location = _loc(location_line, self._expected_size, strand)
            return

        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # Can't split on the comma because of positions like one-of(1,2,3)
            locs = []
            for part in _split_compound_loc(location_line[i + 1:-1]):
                if part.startswith("complement("):
                    assert part[-1] == ")"
                    part = part[11:-1]
                    assert strand != -1, "Double complement?"
                    part_strand = -1
                else:
                    part_strand = strand
                try:
                    loc = _loc(part, self._expected_size, part_strand)
                except ValueError as err:
                    # Dump the offending text before re-raising to aid debugging.
                    print(location_line)
                    print(part)
                    raise err
                locs.append(loc)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for l in locs:
                    assert l.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                cur_feature.location = SeqFeature.CompoundLocation(locs[::-1],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation(locs,
                                                                   operator=location_line[:i])
            return
        # Not recognised
        if "order" in location_line and "join" in location_line:
            # See Bug 3197
            msg = 'Combinations of "join" and "order" within the same ' + \
                  'location (nested operators) are illegal:\n' + location_line
            raise LocationParserError(msg)
        # This used to be an error....
        cur_feature.location = None
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                             % location_line))
1154
1155 - def feature_qualifier(self, key, value):
1156 """When we get a qualifier key and its value. 1157 1158 Can receive None, since you can have valueless keys such as /pseudo 1159 """ 1160 # Hack to try to preserve historical behaviour of /pseudo etc 1161 if value is None: 1162 # if the key doesn't exist yet, add an empty string 1163 if key not in self._cur_feature.qualifiers: 1164 self._cur_feature.qualifiers[key] = [""] 1165 return 1166 # otherwise just skip this key 1167 return 1168 1169 value = value.replace('"', '') 1170 if self._feature_cleaner is not None: 1171 value = self._feature_cleaner.clean_value(key, value) 1172 1173 # if the qualifier name exists, append the value 1174 if key in self._cur_feature.qualifiers: 1175 self._cur_feature.qualifiers[key].append(value) 1176 # otherwise start a new list of the key with its values 1177 else: 1178 self._cur_feature.qualifiers[key] = [value]
1179
    def feature_qualifier_name(self, content_list):
        """Use feature_qualifier instead (OBSOLETE)."""
        # Deliberate stub kept so old callers fail loudly with guidance.
        raise NotImplementedError("Use the feature_qualifier method instead.")
1183
    def feature_qualifier_description(self, content):
        """Use feature_qualifier instead (OBSOLETE)."""
        # Deliberate stub kept so old callers fail loudly with guidance.
        raise NotImplementedError("Use the feature_qualifier method instead.")
1187
    def contig_location(self, content):
        """Deal with CONTIG information."""
        # Historically this was stored as a SeqFeature object, but it was
        # stored under record.annotations["contig"] and not under
        # record.features with the other SeqFeature objects.
        #
        # The CONTIG location line can include additional tokens like
        # Gap(), Gap(100) or Gap(unk100) which are not used in the feature
        # location lines, so storing it using SeqFeature based location
        # objects is difficult.
        #
        # We now store this a string, which means for BioSQL we are now in
        # much better agreement with how BioPerl records the CONTIG line
        # in the database.
        #
        # NOTE - This code assumes the scanner will return all the CONTIG
        # lines already combined into one long string!
        self.data.annotations["contig"] = content
1206
    def origin_name(self, content):
        """Ignore any free text on the ORIGIN line (discarded here)."""
        pass

    def base_count(self, content):
        """Ignore BASE COUNT lines (discarded by this consumer)."""
        pass

    def base_number(self, content):
        """Ignore the leading position number on sequence lines."""
        pass
1215
    def sequence(self, content):
        """Add up sequence information as we get it.

        To try and make things speedier, this puts all of the strings
        into a list of strings, and then uses string.join later to put
        them together. Supposedly, this is a big time savings
        """
        # The scanner should already have stripped internal spaces.
        assert ' ' not in content
        self._seq_data.append(content.upper())
1225
1226 - def record_end(self, content):
1227 """Clean up when we've finished the record. 1228 """ 1229 from Bio import Alphabet 1230 from Bio.Alphabet import IUPAC 1231 from Bio.Seq import Seq, UnknownSeq 1232 1233 # Try and append the version number to the accession for the full id 1234 if not self.data.id: 1235 assert 'accessions' not in self.data.annotations, \ 1236 self.data.annotations['accessions'] 1237 self.data.id = self.data.name # Good fall back? 1238 elif self.data.id.count('.') == 0: 1239 try: 1240 self.data.id += '.%i' % self.data.annotations['sequence_version'] 1241 except KeyError: 1242 pass 1243 1244 # add the sequence information 1245 # first, determine the alphabet 1246 # we default to an generic alphabet if we don't have a 1247 # seq type or have strange sequence information. 1248 seq_alphabet = Alphabet.generic_alphabet 1249 1250 # now set the sequence 1251 sequence = "".join(self._seq_data) 1252 1253 if self._expected_size is not None \ 1254 and len(sequence) != 0 \ 1255 and self._expected_size != len(sequence): 1256 import warnings 1257 from Bio import BiopythonParserWarning 1258 warnings.warn("Expected sequence length %i, found %i (%s)." 1259 % (self._expected_size, len(sequence), self.data.id), 1260 BiopythonParserWarning) 1261 1262 if self._seq_type: 1263 # mRNA is really also DNA, since it is actually cDNA 1264 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper(): 1265 seq_alphabet = IUPAC.ambiguous_dna 1266 # are there ever really RNA sequences in GenBank? 1267 elif 'RNA' in self._seq_type.upper(): 1268 # Even for data which was from RNA, the sequence string 1269 # is usually given as DNA (T not U). Bug 2408 1270 if "T" in sequence and "U" not in sequence: 1271 seq_alphabet = IUPAC.ambiguous_dna 1272 else: 1273 seq_alphabet = IUPAC.ambiguous_rna 1274 elif 'PROTEIN' in self._seq_type.upper() \ 1275 or self._seq_type == "PRT": # PRT is used in EMBL-bank for patents 1276 seq_alphabet = IUPAC.protein # or extended protein? 
1277 # work around ugly GenBank records which have circular or 1278 # linear but no indication of sequence type 1279 elif self._seq_type in ["circular", "linear", "unspecified"]: 1280 pass 1281 # we have a bug if we get here 1282 else: 1283 raise ValueError("Could not determine alphabet for seq_type %s" 1284 % self._seq_type) 1285 1286 if not sequence and self.__expected_size: 1287 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1288 else: 1289 self.data.seq = Seq(sequence, seq_alphabet)
1290
1291 1292 -class _RecordConsumer(_BaseGenBankConsumer):
1293 """Create a GenBank Record object from scanner generated information (PRIVATE). 1294 """
    def __init__(self):
        """Initialize an empty Record and reset all parsing state."""
        _BaseGenBankConsumer.__init__(self)
        from . import Record
        self.data = Record.Record()

        # Sequence chunks, joined into one string at record_end.
        self._seq_data = []
        # Items currently being built up while their lines are consumed.
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1304
    def wgs(self, content):
        """Store the WGS line as a list, split on '-'."""
        self.data.wgs = content.split('-')

    def add_wgs_scafld(self, content):
        """Append one WGS_SCAFLD line (split on '-') to the record."""
        self.data.wgs_scafld.append(content.split('-'))

    def locus(self, content):
        """Store the locus name from the LOCUS line."""
        self.data.locus = content

    def size(self, content):
        """Store the declared sequence length (kept as a string)."""
        self.data.size = content

    def residue_type(self, content):
        """Store the residue type, warning on lowercase dna/rna."""
        # Be lenient about parsing, but technically lowercase residue
        # types are malformed.
        if 'dna' in content or 'rna' in content:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
                          BiopythonParserWarning)
        self.data.residue_type = content

    def data_file_division(self, content):
        """Store the data file division from the LOCUS line."""
        self.data.data_file_division = content

    def date(self, content):
        """Store the date from the LOCUS line."""
        self.data.date = content

    def definition(self, content):
        """Store the DEFINITION line text."""
        self.data.definition = content

    def accession(self, content):
        """Add any new accessions from an ACCESSION line, preserving order."""
        for acc in self._split_accessions(content):
            if acc not in self.data.accession:
                self.data.accession.append(acc)

    def nid(self, content):
        """Store the NID line value."""
        self.data.nid = content

    def pid(self, content):
        """Store the PID line value."""
        self.data.pid = content

    def version(self, content):
        """Store the VERSION line value."""
        self.data.version = content

    def db_source(self, content):
        """Store the DBSOURCE line, with trailing whitespace removed."""
        self.data.db_source = content.rstrip()

    def gi(self, content):
        """Store the GI number."""
        self.data.gi = content

    def keywords(self, content):
        """Store the KEYWORDS line as a list of keywords."""
        self.data.keywords = self._split_keywords(content)

    def project(self, content):
        """Add the whitespace-separated tokens of a PROJECT line."""
        self.data.projects.extend(p for p in content.split() if p)
1360 1363
    def segment(self, content):
        """Store the SEGMENT line value."""
        self.data.segment = content

    def source(self, content):
        """Store the SOURCE line value."""
        self.data.source = content

    def organism(self, content):
        """Store the ORGANISM line value."""
        self.data.organism = content

    def taxonomy(self, content):
        """Store the taxonomy lineage as a list of strings."""
        self.data.taxonomy = self._split_taxonomy(content)

    def reference_num(self, content):
        """Grab the reference number and signal the start of a new reference.
        """
        # check if we have a reference to add
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)

        from . import Record
        self._cur_reference = Record.Reference()
        self._cur_reference.number = content
1386
    def reference_bases(self, content):
        """Store the raw base-range text for the current reference."""
        self._cur_reference.bases = content

    def authors(self, content):
        """Store the AUTHORS text for the current reference."""
        self._cur_reference.authors = content

    def consrtm(self, content):
        """Store the CONSRTM (consortium) text for the current reference."""
        self._cur_reference.consrtm = content

    def title(self, content):
        """Store the TITLE for the current reference.

        Warns and discards the title if no REFERENCE line has been seen.
        """
        if self._cur_reference is None:
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("GenBank TITLE line without REFERENCE line.",
                          BiopythonParserWarning)
            return
        self._cur_reference.title = content

    def journal(self, content):
        """Store the JOURNAL text for the current reference."""
        self._cur_reference.journal = content

    def medline_id(self, content):
        """Store the MEDLINE identifier for the current reference."""
        self._cur_reference.medline_id = content

    def pubmed_id(self, content):
        """Store the PUBMED identifier for the current reference."""
        self._cur_reference.pubmed_id = content

    def remark(self, content):
        """Store the REMARK text for the current reference."""
        self._cur_reference.remark = content

    def comment(self, content):
        """Append the COMMENT lines (newline-joined) to the record."""
        self.data.comment += "\n".join(content)

    def structured_comment(self, content):
        """Store the structured comment value."""
        self.data.structured_comment = content

    def primary_ref_line(self, content):
        """Data for the PRIMARY line"""
        self.data.primary.append(content)

    def primary(self, content):
        """Ignore the PRIMARY header line itself."""
        pass

    def features_line(self, content):
        """Get ready for the feature table when we reach the FEATURE line.
        """
        self.start_feature_table()

    def start_feature_table(self):
        """Signal the start of the feature table.
        """
        # we need to add on the last reference
        if self._cur_reference is not None:
            self.data.references.append(self._cur_reference)
1441
    def feature_key(self, content):
        """Grab the key of the feature and signal the start of a new feature.
        """
        # first add on feature information if we've got any
        self._add_feature()

        from . import Record
        self._cur_feature = Record.Feature()
        self._cur_feature.key = content

    def _add_feature(self):
        """Utility function to add a feature to the Record.

        This does all of the appropriate checking to make sure we haven't
        left any info behind, and that we are only adding info if it
        exists.
        """
        if self._cur_feature is not None:
            # if we have a left over qualifier, add it to the qualifiers
            # on the current feature
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = None
            self.data.features.append(self._cur_feature)

    def location(self, content):
        """Store the feature location text with whitespace removed."""
        self._cur_feature.location = self._clean_location(content)

    def feature_qualifier(self, key, value):
        """Forward a qualifier key (and optional value) to the older API."""
        self.feature_qualifier_name([key])
        if value is not None:
            self.feature_qualifier_description(value)

    def feature_qualifier_name(self, content_list):
        """Deal with qualifier names

        We receive a list of keys, since you can have valueless keys such as
        /pseudo which would be passed in with the next key (since no other
        tags separate them in the file)
        """
        from . import Record
        for content in content_list:
            # the record parser keeps the /s -- add them if we don't have 'em
            if not content.startswith("/"):
                content = "/%s" % content
            # add on a qualifier if we've got one
            if self._cur_qualifier is not None:
                self._cur_feature.qualifiers.append(self._cur_qualifier)

            self._cur_qualifier = Record.Qualifier()
            self._cur_qualifier.key = content

    def feature_qualifier_description(self, content):
        """Attach the value text to the most recently seen qualifier key."""
        # if we have info then the qualifier key should have a ='s
        if '=' not in self._cur_qualifier.key:
            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
        cur_content = self._remove_newlines(content)
        # remove all spaces from the value if it is a type where spaces
        # are not important
        for remove_space_key in self.__class__.remove_space_keys:
            if remove_space_key in self._cur_qualifier.key:
                cur_content = self._remove_spaces(cur_content)
        self._cur_qualifier.value = self._normalize_spaces(cur_content)
1506
    def base_count(self, content):
        """Store the BASE COUNT line."""
        self.data.base_counts = content

    def origin_name(self, content):
        """Store any text following ORIGIN."""
        self.data.origin = content

    def contig_location(self, content):
        """Signal that we have contig information to add to the record.
        """
        self.data.contig = self._clean_location(content)

    def sequence(self, content):
        """Add sequence information to a list of sequence strings.

        This removes spaces in the data and uppercases the sequence, and
        then adds it to a list of sequences. Later on we'll join this
        list together to make the final sequence. This is faster than
        adding on the new string every time.
        """
        # The scanner should already have stripped internal spaces.
        assert ' ' not in content
        self._seq_data.append(content.upper())

    def record_end(self, content):
        """Signal the end of the record and do any necessary clean-up.
        """
        # add together all of the sequence parts to create the
        # final sequence string
        self.data.sequence = "".join(self._seq_data)
        # add on the last feature
        self._add_feature()
1537
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
1552
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    records = parse(handle)
    # Exactly one record is required: none or several is an error.
    first = next(records, None)
    if first is None:
        raise ValueError("No records found in handle")
    if next(records, None) is not None:
        raise ValueError("More than one record found in handle")
    return first
1580 1581 1582 if __name__ == "__main__": 1583 from Bio._utils import run_doctest 1584 run_doctest() 1585