
Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2017 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which are a much closer 
  16  representation of the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21      - read                  Parse a handle containing a single GenBank record 
  22        as Bio.GenBank specific Record objects. 
  23      - parse                 Iterate over a handle containing multiple GenBank 
  24        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30   - Iterator              Iterate through a file of GenBank entries 
  31   - ErrorFeatureParser    Catch errors caused during parsing. 
  32   - FeatureParser         Parse GenBank data into SeqRecord and SeqFeature objects. 
  33   - RecordParser          Parse GenBank data into a Record object. 
  34   
  35  Exceptions: 
  36   - ParserFailureError    Exception indicating a failure in the parser (i.e. 
  37     scanner or consumer). 
  38   - LocationParserError   Exception indicating a problem with the Spark-based 
  39     location parser. 
  40   
  41  """ 
  42  from __future__ import print_function 
  43   
  44  import re 
  45  import sys  # for checking if Python 2 
  46   
  47  # other Biopython stuff 
  48  from Bio import SeqFeature 
  49   
  50  # other Bio.GenBank stuff 
  51  from .utils import FeatureValueCleaner 
  52  from .Scanner import GenBankScanner 
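
# Editor's note (illustrative addition, not part of the original module): as the
# docstring above recommends, Bio.SeqIO is the preferred interface, while this
# module returns GenBank-specific Record objects.  A minimal sketch, assuming a
# GenBank file at the same relative path used by the doctests further down:
#
#     from Bio import SeqIO, GenBank
#     with open("GenBank/NC_000932.gb") as handle:
#         seq_record = SeqIO.read(handle, "genbank")  # Bio.SeqRecord.SeqRecord
#     with open("GenBank/NC_000932.gb") as handle:
#         gb_record = GenBank.read(handle)            # Bio.GenBank.Record.Record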
  53   
  54   
  55  # Constants used to parse GenBank header lines 
  56  GENBANK_INDENT = 12 
  57  GENBANK_SPACER = " " * GENBANK_INDENT 
  58   
  59  # Constants for parsing GenBank feature lines 
  60  FEATURE_KEY_INDENT = 5 
  61  FEATURE_QUALIFIER_INDENT = 21 
  62  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  63  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  64   
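
# Editor's illustration (not in the original source), in the same inline-assert
# style as the location checks below; the sample header and feature lines are
# made-up examples.  Header values start at column GENBANK_INDENT, and feature
# locations/qualifiers start at column FEATURE_QUALIFIER_INDENT.
_example_header = "KEYWORDS    plastid; chloroplast."
assert _example_header[GENBANK_INDENT:] == "plastid; chloroplast."
_example_feature = (FEATURE_KEY_SPACER
                    + "CDS".ljust(FEATURE_QUALIFIER_INDENT - FEATURE_KEY_INDENT)
                    + "1..206")
assert _example_feature[FEATURE_QUALIFIER_INDENT:] == "1..206"
del _example_header, _example_feature
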
  65  # Regular expressions for location parsing 
  66  _solo_location = r"[<>]?\d+" 
  67  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  68  _between_location = r"\d+\^\d+" 
  69   
  70  _within_position = r"\(\d+\.\d+\)" 
  71  _re_within_position = re.compile(_within_position) 
  72  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  73                     % (_within_position, _within_position) 
  74  assert _re_within_position.match("(3.9)") 
  75  assert re.compile(_within_location).match("(3.9)..10") 
  76  assert re.compile(_within_location).match("26..(30.33)") 
  77  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  78   
  79  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  80  _re_oneof_position = re.compile(_oneof_position) 
  81  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  82                     % (_oneof_position, _oneof_position) 
  83  assert _re_oneof_position.match("one-of(6,9)") 
  84  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  85  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  86  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  87   
  88  assert not _re_oneof_position.match("one-of(3)") 
  89  assert _re_oneof_position.match("one-of(3,6)") 
  90  assert _re_oneof_position.match("one-of(3,6,9)") 
  91   
  92   
  93  _simple_location = r"\d+\.\.\d+" 
  94  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  95  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  96                                   % (_simple_location, _simple_location)) 
  97  _complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" \ 
  98                      % (_pair_location, _solo_location, _between_location, 
  99                         _within_location, _oneof_location) 
 100  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 101  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 102                                            % (_complex_location, _complex_location) 
 103  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 104                                    % (_possibly_complemented_complex_location, 
 105                                       _possibly_complemented_complex_location)) 
 106   
 107   
 108  assert _re_simple_location.match("104..160") 
 109  assert not _re_simple_location.match("68451760..68452073^68452074") 
 110  assert not _re_simple_location.match("<104..>160") 
 111  assert not _re_simple_location.match("104") 
 112  assert not _re_simple_location.match("<1") 
 113  assert not _re_simple_location.match(">99999") 
 114  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 115  assert not _re_simple_compound.match("bond(12,63)") 
 116  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 117  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 118  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 119  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 120  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 121  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 122  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 123  assert not _re_simple_compound.match("complement(1..69)") 
 124  assert not _re_simple_compound.match("(1..69)") 
 125  assert _re_complex_location.match("(3.9)..10") 
 126  assert _re_complex_location.match("26..(30.33)") 
 127  assert _re_complex_location.match("(13.19)..(20.28)") 
 128  assert _re_complex_location.match("41^42")  # between 
 129  assert _re_complex_location.match("AL121804:41^42") 
 130  assert _re_complex_location.match("AL121804:41..610") 
 131  assert _re_complex_location.match("AL121804.2:41..610") 
 132  assert _re_complex_location.match("AL358792.24.1.166931:3274..3461")  # lots of dots in external reference 
 133  assert _re_complex_location.match("one-of(3,6)..101") 
 134  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 135  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 136  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 137  assert _re_complex_compound.match("join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))") 
 138   
 139  # Trans-spliced example from NC_016406, note underscore in reference name: 
 140  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 141  assert _re_complex_location.match("181647..181905") 
 142  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 143  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 144  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 145  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 146  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 147   
  148  _solo_bond = re.compile(r"bond\(%s\)" % _solo_location) 
 149  assert _solo_bond.match("bond(196)") 
 150  assert _solo_bond.search("bond(196)") 
 151  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
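
# Editor's sketch (illustrative, not part of the original module): taken together,
# the four compiled patterns above act as a rough classifier for GenBank location
# strings, mirroring the order in which _FeatureConsumer.location() tries them
# further down.  The helper name _classify_location is hypothetical.
def _classify_location(loc_str):
    """Classify a GenBank location string using the module regexes (illustrative only).

    >>> _classify_location("104..160")
    'simple'
    >>> _classify_location("join(104..160,320..390)")
    'simple compound'
    >>> _classify_location("AL121804.2:41..610")
    'complex'
    >>> _classify_location("join(complement(69611..69724),139856..140650)")
    'complex compound'
    """
    if _re_simple_location.match(loc_str):
        return "simple"
    if _re_simple_compound.match(loc_str):
        return "simple compound"
    if _re_complex_location.match(loc_str):
        return "complex"
    if _re_complex_compound.match(loc_str):
        return "complex compound"
    return "unrecognised"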
152 153 154 -def _pos(pos_str, offset=0):
155 """Build a Position object (PRIVATE). 156 157 For an end position, leave offset as zero (default): 158 159 >>> _pos("5") 160 ExactPosition(5) 161 162 For a start position, set offset to minus one (for Python counting): 163 164 >>> _pos("5", -1) 165 ExactPosition(4) 166 167 This also covers fuzzy positions: 168 169 >>> p = _pos("<5") 170 >>> p 171 BeforePosition(5) 172 >>> print(p) 173 <5 174 >>> int(p) 175 5 176 177 >>> _pos(">5") 178 AfterPosition(5) 179 180 By default assumes an end position, so note the integer behaviour: 181 182 >>> p = _pos("one-of(5,8,11)") 183 >>> p 184 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)]) 185 >>> print(p) 186 one-of(5,8,11) 187 >>> int(p) 188 11 189 190 >>> _pos("(8.10)") 191 WithinPosition(10, left=8, right=10) 192 193 Fuzzy start positions: 194 195 >>> p = _pos("<5", -1) 196 >>> p 197 BeforePosition(4) 198 >>> print(p) 199 <4 200 >>> int(p) 201 4 202 203 Notice how the integer behaviour changes too! 204 205 >>> p = _pos("one-of(5,8,11)", -1) 206 >>> p 207 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)]) 208 >>> print(p) 209 one-of(4,7,10) 210 >>> int(p) 211 4 212 213 """ 214 if pos_str.startswith("<"): 215 return SeqFeature.BeforePosition(int(pos_str[1:]) + offset) 216 elif pos_str.startswith(">"): 217 return SeqFeature.AfterPosition(int(pos_str[1:]) + offset) 218 elif _re_within_position.match(pos_str): 219 s, e = pos_str[1:-1].split(".") 220 s = int(s) + offset 221 e = int(e) + offset 222 if offset == -1: 223 default = s 224 else: 225 default = e 226 return SeqFeature.WithinPosition(default, left=s, right=e) 227 elif _re_oneof_position.match(pos_str): 228 assert pos_str.startswith("one-of(") 229 assert pos_str[-1] == ")" 230 parts = [SeqFeature.ExactPosition(int(pos) + offset) 231 for pos in pos_str[7:-1].split(",")] 232 if offset == -1: 233 default = min(int(pos) for pos in parts) 234 else: 235 default = max(int(pos) for pos in parts) 236 return SeqFeature.OneOfPosition(default, choices=parts) 237 else: 238 return SeqFeature.ExactPosition(int(pos_str) + offset)
239
240 241 -def _loc(loc_str, expected_seq_length, strand):
242 """Make FeatureLocation from non-compound non-complement location (PRIVATE). 243 244 Simple examples, 245 246 >>> _loc("123..456", 1000, +1) 247 FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1) 248 >>> _loc("<123..>456", 1000, strand = -1) 249 FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1) 250 251 A more complex location using within positions, 252 253 >>> _loc("(9.10)..(20.25)", 1000, 1) 254 FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1) 255 256 Notice how that will act as though it has overall start 8 and end 25. 257 258 Zero length between feature, 259 260 >>> _loc("123^124", 1000, 0) 261 FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0) 262 263 The expected sequence length is needed for a special case, a between 264 position at the start/end of a circular genome: 265 266 >>> _loc("1000^1", 1000, 1) 267 FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1) 268 269 Apart from this special case, between positions P^Q must have P+1==Q, 270 271 >>> _loc("123^456", 1000, 1) 272 Traceback (most recent call last): 273 ... 274 ValueError: Invalid between location '123^456' 275 276 You can optionally provide a reference name: 277 278 >>> _loc("AL391218.9:105173..108462", 2000000, 1) 279 FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9') 280 281 """ 282 if ":" in loc_str: 283 ref, loc_str = loc_str.split(":") 284 else: 285 ref = None 286 try: 287 s, e = loc_str.split("..") 288 except ValueError: 289 assert ".." not in loc_str 290 if "^" in loc_str: 291 # A between location like "67^68" (one based counting) is a 292 # special case (note it has zero length). In python slice 293 # notation this is 67:67, a zero length slice. See Bug 2622 294 # Further more, on a circular genome of length N you can have 295 # a location N^1 meaning the junction at the origin. See Bug 3098. 296 # NOTE - We can imagine between locations like "2^4", but this 297 # is just "3". Similarly, "2^5" is just "3..4" 298 s, e = loc_str.split("^") 299 if int(s) + 1 == int(e): 300 pos = _pos(s) 301 elif int(s) == expected_seq_length and e == "1": 302 pos = _pos(s) 303 else: 304 raise ValueError("Invalid between location %s" % repr(loc_str)) 305 return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref) 306 else: 307 # e.g. "123" 308 s = loc_str 309 e = loc_str 310 return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref)
311
312 313 -def _split_compound_loc(compound_loc):
314 """Split a tricky compound location string (PRIVATE). 315 316 >>> list(_split_compound_loc("123..145")) 317 ['123..145'] 318 >>> list(_split_compound_loc("123..145,200..209")) 319 ['123..145', '200..209'] 320 >>> list(_split_compound_loc("one-of(200,203)..300")) 321 ['one-of(200,203)..300'] 322 >>> list(_split_compound_loc("complement(123..145),200..209")) 323 ['complement(123..145)', '200..209'] 324 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 325 ['123..145', 'one-of(200,203)..209'] 326 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 327 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 328 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 329 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 330 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 331 ['123..145', '200..one-of(209,211)', '300'] 332 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 333 ['123..145', '200..one-of(209,211)'] 334 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 335 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 336 """ 337 if "one-of(" in compound_loc: 338 # Hard case 339 while "," in compound_loc: 340 assert compound_loc[0] != "," 341 assert compound_loc[0:2] != ".." 342 i = compound_loc.find(",") 343 part = compound_loc[:i] 344 compound_loc = compound_loc[i:] # includes the comma 345 while part.count("(") > part.count(")"): 346 assert "one-of(" in part, (part, compound_loc) 347 i = compound_loc.find(")") 348 part += compound_loc[:i + 1] 349 compound_loc = compound_loc[i + 1:] 350 if compound_loc.startswith(".."): 351 i = compound_loc.find(",") 352 if i == -1: 353 part += compound_loc 354 compound_loc = "" 355 else: 356 part += compound_loc[:i] 357 compound_loc = compound_loc[i:] # includes the comma 358 while part.count("(") > part.count(")"): 359 assert part.count("one-of(") == 2 360 i = compound_loc.find(")") 361 part += compound_loc[:i + 1] 362 compound_loc = compound_loc[i + 1:] 363 if compound_loc.startswith(","): 364 compound_loc = compound_loc[1:] 365 assert part 366 yield part 367 if compound_loc: 368 yield compound_loc 369 else: 370 # Easy case 371 for part in compound_loc.split(","): 372 yield part
373
 374   
 375  class Iterator(object): 
 376      """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). 
 377   
 378      This class is likely to be deprecated in a future release of Biopython. 
 379      Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) 
 380      for SeqRecord and GenBank specific Record objects respectively instead. 
 381      """ 
 382   
 383      def __init__(self, handle, parser=None): 
 384          """Initialize the iterator. 
 385   
 386          Arguments: 
 387           - handle - A handle with GenBank entries to iterate through. 
 388           - parser - An optional parser to pass the entries through before 
 389             returning them. If None, then the raw entry will be returned. 
 390   
 391          """ 
 392          self.handle = handle 
 393          self._parser = parser 
 394   
 395      def __next__(self): 
 396          """Return the next GenBank record from the handle. 
 397   
 398          Will return None if we ran out of records. 
 399          """ 
 400          if self._parser is None: 
 401              lines = [] 
 402              while True: 
 403                  line = self.handle.readline() 
 404                  if not line: 
 405                      return None  # Premature end of file? 
 406                  lines.append(line) 
 407                  if line.rstrip() == "//": 
 408                      break 
 409              return "".join(lines) 
 410          try: 
 411              return self._parser.parse(self.handle) 
 412          except StopIteration: 
 413              return None 
 414   
 415      if sys.version_info[0] < 3: 
 416          def next(self): 
 417              """Python 2 style alias for Python 3 style __next__ method.""" 
 418              return self.__next__() 
 419   
 420      def __iter__(self): 
 421          """Iterate over the records.""" 
 422          return iter(self.__next__, None) 
 423   
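# Editor's sketch (illustrative, not in the original source): with parser=None the
# Iterator returns each raw GenBank entry as a string, up to and including the
# terminating "//" line.  For example (path as used in the doctests below):
#
#     with open("GenBank/NC_000932.gb") as handle:
#         for raw in Iterator(handle):
#             assert raw.rstrip().endswith("//")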
 424   
 425  class ParserFailureError(Exception): 
 426      """Failure caused by some kind of problem in the parser.""" 
 427   
 428      pass 
 429   
 430   
 431  class LocationParserError(Exception): 
 432      """Could not properly parse out a location from a GenBank file.""" 
 433   
 434      pass 
 435   
 436   
 437  class FeatureParser(object): 
 438      """Parse GenBank files into Seq + Feature objects (OBSOLETE). 
 439   
 440      Direct use of this class is discouraged, and may be deprecated in 
 441      a future release of Biopython. 
 442   
 443      Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead. 
 444      """ 
 445   
 446      def __init__(self, debug_level=0, use_fuzziness=1, 
 447                   feature_cleaner=FeatureValueCleaner()): 
 448          """Initialize a GenBank parser and Feature consumer. 
 449   
 450          Arguments: 
 451           - debug_level - An optional argument that specifies the amount of 
 452             debugging information the parser should spit out. By default we have 
 453             no debugging info (the fastest way to do things), but if you want 
 454             you can set this as high as two and see exactly where a parse fails. 
 455           - use_fuzziness - Specify whether or not to use fuzzy representations. 
 456             The default is 1 (use fuzziness). 
 457           - feature_cleaner - A class which will be used to clean out the 
 458             values of features. This class must implement the function 
 459             clean_value. GenBank.utils has a "standard" cleaner class, which 
 460             is used by default. 
 461   
 462          """ 
 463          self._scanner = GenBankScanner(debug_level) 
 464          self.use_fuzziness = use_fuzziness 
 465          self._cleaner = feature_cleaner 
 466   
 467      def parse(self, handle): 
 468          """Parse the specified handle.""" 
 469          _consumer = _FeatureConsumer(self.use_fuzziness, 
 470                                       self._cleaner) 
 471          self._scanner.feed(handle, _consumer) 
 472          return _consumer.data 
 473   
 474   
 475  class RecordParser(object): 
 476      """Parse GenBank files into Record objects (OBSOLETE). 
 477   
 478      Direct use of this class is discouraged, and may be deprecated in 
 479      a future release of Biopython. 
 480   
 481      Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions 
 482      instead. 
 483      """ 
 484   
 485      def __init__(self, debug_level=0): 
 486          """Initialize the parser. 
 487   
 488          Arguments: 
 489           - debug_level - An optional argument that specifies the amount of 
 490             debugging information the parser should spit out. By default we have 
 491             no debugging info (the fastest way to do things), but if you want 
 492             you can set this as high as two and see exactly where a parse fails. 
 493   
 494          """ 
 495          self._scanner = GenBankScanner(debug_level) 
 496   
 497      def parse(self, handle): 
 498          """Parse the specified handle into a GenBank record.""" 
 499          _consumer = _RecordConsumer() 
 500   
 501          self._scanner.feed(handle, _consumer) 
 502          return _consumer.data 
 503   
504 505 -class _BaseGenBankConsumer(object):
506 """Abstract GenBank consumer providing useful general functions (PRIVATE). 507 508 This just helps to eliminate some duplication in things that most 509 GenBank consumers want to do. 510 """ 511 512 # Special keys in GenBank records that we should remove spaces from 513 # For instance, \translation keys have values which are proteins and 514 # should have spaces and newlines removed from them. This class 515 # attribute gives us more control over specific formatting problems. 516 remove_space_keys = ["translation"] 517
518 - def __init__(self):
519 pass
520
521 - def _unhandled(self, data):
522 pass
523
524 - def __getattr__(self, attr):
525 return self._unhandled
526 527 @staticmethod
528 - def _split_keywords(keyword_string):
529 """Split a string of keywords into a nice clean list.""" 530 # process the keywords into a python list 531 if keyword_string == "" or keyword_string == ".": 532 keywords = "" 533 elif keyword_string[-1] == '.': 534 keywords = keyword_string[:-1] 535 else: 536 keywords = keyword_string 537 keyword_list = keywords.split(';') 538 clean_keyword_list = [x.strip() for x in keyword_list] 539 return clean_keyword_list
540 541 @staticmethod
542 - def _split_accessions(accession_string):
543 """Split a string of accession numbers into a list.""" 544 # first replace all line feeds with spaces 545 # Also, EMBL style accessions are split with ';' 546 accession = accession_string.replace("\n", " ").replace(";", " ") 547 548 return [x.strip() for x in accession.split() if x.strip()]
549 550 @staticmethod
551 - def _split_taxonomy(taxonomy_string):
552 """Split a string with taxonomy info into a list.""" 553 if not taxonomy_string or taxonomy_string == ".": 554 # Missing data, no taxonomy 555 return [] 556 557 if taxonomy_string[-1] == '.': 558 tax_info = taxonomy_string[:-1] 559 else: 560 tax_info = taxonomy_string 561 tax_list = tax_info.split(';') 562 new_tax_list = [] 563 for tax_item in tax_list: 564 new_items = tax_item.split("\n") 565 new_tax_list.extend(new_items) 566 while '' in new_tax_list: 567 new_tax_list.remove('') 568 clean_tax_list = [x.strip() for x in new_tax_list] 569 570 return clean_tax_list
571 572 @staticmethod
573 - def _clean_location(location_string):
574 """Clean whitespace out of a location string. 575 576 The location parser isn't a fan of whitespace, so we clean it out 577 before feeding it into the parser. 578 """ 579 # Originally this imported string.whitespace and did a replace 580 # via a loop. It's simpler to just split on whitespace and rejoin 581 # the string - and this avoids importing string too. See Bug 2684. 582 return ''.join(location_string.split())
583 584 @staticmethod
585 - def _remove_newlines(text):
586 """Remove any newlines in the passed text, returning the new string.""" 587 # get rid of newlines in the qualifier value 588 newlines = ["\n", "\r"] 589 for ws in newlines: 590 text = text.replace(ws, "") 591 592 return text
593 594 @staticmethod
595 - def _normalize_spaces(text):
596 """Replace multiple spaces in the passed text with single spaces.""" 597 # get rid of excessive spaces 598 return ' '.join(x for x in text.split(" ") if x)
599 600 @staticmethod
601 - def _remove_spaces(text):
602 """Remove all spaces from the passed text.""" 603 return text.replace(" ", "")
604 605 @staticmethod
606 - def _convert_to_python_numbers(start, end):
607 """Convert a start and end range to python notation. 608 609 In GenBank, starts and ends are defined in "biological" coordinates, 610 where 1 is the first base and [i, j] means to include both i and j. 611 612 In python, 0 is the first base and [i, j] means to include i, but 613 not j. 614 615 So, to convert "biological" to python coordinates, we need to 616 subtract 1 from the start, and leave the end and things should 617 be converted happily. 618 """ 619 new_start = start - 1 620 new_end = end 621 622 return new_start, new_end
623
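# Editor's illustration (not in the original source), in the module's inline-assert
# style: the helper above maps a one-based inclusive GenBank range such as 10..20
# onto the zero-based half-open Python coordinates (9, 20).
assert _BaseGenBankConsumer._convert_to_python_numbers(10, 20) == (9, 20)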
624 625 -class _FeatureConsumer(_BaseGenBankConsumer):
626 """Create a SeqRecord object with Features to return (PRIVATE). 627 628 Attributes: 629 - use_fuzziness - specify whether or not to parse with fuzziness in 630 feature locations. 631 - feature_cleaner - a class that will be used to provide specialized 632 cleaning-up of feature values. 633 634 """ 635
636 - def __init__(self, use_fuzziness, feature_cleaner=None):
637 from Bio.SeqRecord import SeqRecord 638 _BaseGenBankConsumer.__init__(self) 639 self.data = SeqRecord(None, id=None) 640 self.data.id = None 641 self.data.description = "" 642 643 self._use_fuzziness = use_fuzziness 644 self._feature_cleaner = feature_cleaner 645 646 self._seq_type = '' 647 self._seq_data = [] 648 self._cur_reference = None 649 self._cur_feature = None 650 self._expected_size = None
651
652 - def locus(self, locus_name):
653 """Set the locus name is set as the name of the Sequence.""" 654 self.data.name = locus_name
655
656 - def size(self, content):
657 """Record the sequence length.""" 658 self._expected_size = int(content)
659
660 - def residue_type(self, type):
661 """Record the sequence type (SEMI-OBSOLETE). 662 663 This reflects the fact that the topology (linear/circular) and 664 molecule type (e.g. DNA vs RNA) were a single field in early 665 files. Current GenBank/EMBL files have two fields. 666 """ 667 self._seq_type = type.strip()
668
669 - def topology(self, topology): # noqa: D402
670 """Validate and record sequence topology (linear or circular as strings).""" 671 if topology: 672 if topology not in ['linear', 'circular']: 673 raise ParserFailureError("Unexpected topology %r should be linear or circular" % topology) 674 self.data.annotations['topology'] = topology
675
676 - def molecule_type(self, mol_type):
677 """Validate and record the molecule type (for round-trip etc).""" 678 if mol_type: 679 if "circular" in mol_type or 'linear' in mol_type: 680 raise ParserFailureError("Molecule type %r should not include topology" % mol_type) 681 self.data.annotations['molecule_type'] = mol_type
682
683 - def data_file_division(self, division):
684 self.data.annotations['data_file_division'] = division
685
686 - def date(self, submit_date):
687 self.data.annotations['date'] = submit_date
688
689 - def definition(self, definition):
690 """Set the definition as the description of the sequence.""" 691 if self.data.description: 692 # Append to any existing description 693 # e.g. EMBL files with two DE lines. 694 self.data.description += " " + definition 695 else: 696 self.data.description = definition
697
698 - def accession(self, acc_num):
699 """Set the accession number as the id of the sequence. 700 701 If we have multiple accession numbers, the first one passed is 702 used. 703 """ 704 new_acc_nums = self._split_accessions(acc_num) 705 706 # Also record them ALL in the annotations 707 try: 708 # On the off chance there was more than one accession line: 709 for acc in new_acc_nums: 710 # Prevent repeat entries 711 if acc not in self.data.annotations['accessions']: 712 self.data.annotations['accessions'].append(acc) 713 except KeyError: 714 self.data.annotations['accessions'] = new_acc_nums 715 716 # if we haven't set the id information yet, add the first acc num 717 if not self.data.id: 718 if len(new_acc_nums) > 0: 719 # self.data.id = new_acc_nums[0] 720 # Use the FIRST accession as the ID, not the first on this line! 721 self.data.id = self.data.annotations['accessions'][0]
722
723 - def wgs(self, content):
724 self.data.annotations['wgs'] = content.split('-')
725
726 - def add_wgs_scafld(self, content):
727 self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
728
729 - def nid(self, content):
730 self.data.annotations['nid'] = content
731
732 - def pid(self, content):
733 self.data.annotations['pid'] = content
734
735 - def version(self, version_id):
736 # Want to use the versioned accession as the record.id 737 # This comes from the VERSION line in GenBank files, or the 738 # obsolete SV line in EMBL. For the new EMBL files we need 739 # both the version suffix from the ID line and the accession 740 # from the AC line. 741 if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): 742 self.accession(version_id.split(".")[0]) 743 self.version_suffix(version_id.split(".")[1]) 744 elif version_id: 745 # For backwards compatibility... 746 self.data.id = version_id
747
748 - def project(self, content):
749 """Handle the information from the PROJECT line as a list of projects. 750 751 e.g.:: 752 753 PROJECT GenomeProject:28471 754 755 or:: 756 757 PROJECT GenomeProject:13543 GenomeProject:99999 758 759 This is stored as dbxrefs in the SeqRecord to be consistent with the 760 projected switch of this line to DBLINK in future GenBank versions. 761 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 762 "Project:28471" as part of this transition. 763 """ 764 content = content.replace("GenomeProject:", "Project:") 765 self.data.dbxrefs.extend(p for p in content.split() if p)
766 799
800 - def version_suffix(self, version):
801 """Set the version to overwrite the id. 802 803 Since the version provides the same information as the accession 804 number, plus some extra info, we set this as the id if we have 805 a version. 806 """ 807 # e.g. GenBank line: 808 # VERSION U49845.1 GI:1293613 809 # or the obsolete EMBL line: 810 # SV U49845.1 811 # Scanner calls consumer.version("U49845.1") 812 # which then calls consumer.version_suffix(1) 813 # 814 # e.g. EMBL new line: 815 # ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 816 # Scanner calls consumer.version_suffix(1) 817 assert version.isdigit() 818 self.data.annotations['sequence_version'] = int(version)
819
820 - def db_source(self, content):
821 self.data.annotations['db_source'] = content.rstrip()
822
823 - def gi(self, content):
824 self.data.annotations['gi'] = content
825
826 - def keywords(self, content):
827 if 'keywords' in self.data.annotations: 828 # Multi-line keywords, append to list 829 # Note EMBL states "A keyword is never split between lines." 830 self.data.annotations['keywords'].extend(self._split_keywords(content)) 831 else: 832 self.data.annotations['keywords'] = self._split_keywords(content)
833
834 - def segment(self, content):
835 self.data.annotations['segment'] = content
836
837 - def source(self, content):
838 # Note that some software (e.g. VectorNTI) may produce an empty 839 # source (rather than using a dot/period as might be expected). 840 if content == "": 841 source_info = "" 842 elif content[-1] == '.': 843 source_info = content[:-1] 844 else: 845 source_info = content 846 self.data.annotations['source'] = source_info
847
848 - def organism(self, content):
849 self.data.annotations['organism'] = content
850
851 - def taxonomy(self, content):
852 """Record (another line of) the taxonomy lineage.""" 853 lineage = self._split_taxonomy(content) 854 try: 855 self.data.annotations['taxonomy'].extend(lineage) 856 except KeyError: 857 self.data.annotations['taxonomy'] = lineage
858
859 - def reference_num(self, content):
860 """Signal the beginning of a new reference object.""" 861 # if we have a current reference that hasn't been added to 862 # the list of references, add it. 863 if self._cur_reference is not None: 864 self.data.annotations['references'].append(self._cur_reference) 865 else: 866 self.data.annotations['references'] = [] 867 868 self._cur_reference = SeqFeature.Reference()
869
870 - def reference_bases(self, content):
871 """Attempt to determine the sequence region the reference entails. 872 873 Possible types of information we may have to deal with: 874 875 (bases 1 to 86436) 876 (sites) 877 (bases 1 to 105654; 110423 to 111122) 878 1 (residues 1 to 182) 879 """ 880 # first remove the parentheses 881 assert content.endswith(")"), content 882 ref_base_info = content[1:-1] 883 884 all_locations = [] 885 # parse if we've got 'bases' and 'to' 886 if 'bases' in ref_base_info and 'to' in ref_base_info: 887 # get rid of the beginning 'bases' 888 ref_base_info = ref_base_info[5:] 889 locations = self._split_reference_locations(ref_base_info) 890 all_locations.extend(locations) 891 elif 'residues' in ref_base_info and 'to' in ref_base_info: 892 residues_start = ref_base_info.find("residues") 893 # get only the information after "residues" 894 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 895 locations = self._split_reference_locations(ref_base_info) 896 all_locations.extend(locations) 897 898 # make sure if we are not finding information then we have 899 # the string 'sites' or the string 'bases' 900 elif (ref_base_info == 'sites' or 901 ref_base_info.strip() == 'bases'): 902 pass 903 # otherwise raise an error 904 else: 905 raise ValueError("Could not parse base info %s in record %s" % 906 (ref_base_info, self.data.id)) 907 908 self._cur_reference.location = all_locations
909
910 - def _split_reference_locations(self, location_string):
911 """Get reference locations out of a string of reference information. 912 913 The passed string should be of the form:: 914 915 1 to 20; 20 to 100 916 917 This splits the information out and returns a list of location objects 918 based on the reference locations. 919 """ 920 # split possibly multiple locations using the ';' 921 all_base_info = location_string.split(';') 922 923 new_locations = [] 924 for base_info in all_base_info: 925 start, end = base_info.split('to') 926 new_start, new_end = \ 927 self._convert_to_python_numbers(int(start.strip()), 928 int(end.strip())) 929 this_location = SeqFeature.FeatureLocation(new_start, new_end) 930 new_locations.append(this_location) 931 return new_locations
932
933 - def authors(self, content):
934 if self._cur_reference.authors: 935 self._cur_reference.authors += ' ' + content 936 else: 937 self._cur_reference.authors = content
938
939 - def consrtm(self, content):
940 if self._cur_reference.consrtm: 941 self._cur_reference.consrtm += ' ' + content 942 else: 943 self._cur_reference.consrtm = content
944
945 - def title(self, content):
946 if self._cur_reference is None: 947 import warnings 948 from Bio import BiopythonParserWarning 949 warnings.warn("GenBank TITLE line without REFERENCE line.", 950 BiopythonParserWarning) 951 elif self._cur_reference.title: 952 self._cur_reference.title += ' ' + content 953 else: 954 self._cur_reference.title = content
955
956 - def journal(self, content):
957 if self._cur_reference.journal: 958 self._cur_reference.journal += ' ' + content 959 else: 960 self._cur_reference.journal = content
961
962 - def medline_id(self, content):
963 self._cur_reference.medline_id = content
964
965 - def pubmed_id(self, content):
966 self._cur_reference.pubmed_id = content
967
968 - def remark(self, content):
969 """Deal with a reference comment.""" 970 if self._cur_reference.comment: 971 self._cur_reference.comment += ' ' + content 972 else: 973 self._cur_reference.comment = content
974
975 - def comment(self, content):
976 try: 977 self.data.annotations['comment'] += "\n" + "\n".join(content) 978 except KeyError: 979 self.data.annotations['comment'] = "\n".join(content)
980
981 - def structured_comment(self, content):
982 self.data.annotations['structured_comment'] = content
983
984 - def features_line(self, content):
985 """Get ready for the feature table when we reach the FEATURE line.""" 986 self.start_feature_table()
987
988 - def start_feature_table(self):
989 """Indicate we've got to the start of the feature table.""" 990 # make sure we've added on our last reference object 991 if self._cur_reference is not None: 992 self.data.annotations['references'].append(self._cur_reference) 993 self._cur_reference = None
994
995 - def feature_key(self, content):
996 # start a new feature 997 self._cur_feature = SeqFeature.SeqFeature() 998 self._cur_feature.type = content 999 self.data.features.append(self._cur_feature)
1000
1001 - def location(self, content):
1002 """Parse out location information from the location string. 1003 1004 This uses simple Python code with some regular expressions to do the 1005 parsing, and then translates the results into appropriate objects. 1006 """ 1007 # clean up newlines and other whitespace inside the location before 1008 # parsing - locations should have no whitespace whatsoever 1009 location_line = self._clean_location(content) 1010 1011 # Older records have junk like replace(266,"c") in the 1012 # location line. Newer records just replace this with 1013 # the number 266 and have the information in a more reasonable 1014 # place. So we'll just grab out the number and feed this to the 1015 # parser. We shouldn't really be losing any info this way. 1016 if 'replace' in location_line: 1017 comma_pos = location_line.find(',') 1018 location_line = location_line[8:comma_pos] 1019 1020 cur_feature = self._cur_feature 1021 1022 # Handle top level complement here for speed 1023 if location_line.startswith("complement("): 1024 assert location_line.endswith(")") 1025 location_line = location_line[11:-1] 1026 strand = -1 1027 elif "PROTEIN" in self._seq_type.upper(): 1028 strand = None 1029 else: 1030 # Assume nucleotide otherwise feature strand for 1031 # GenBank files with bad LOCUS lines set to None 1032 strand = 1 1033 1034 # Special case handling of the most common cases for speed 1035 if _re_simple_location.match(location_line): 1036 # e.g. "123..456" 1037 s, e = location_line.split("..") 1038 cur_feature.location = SeqFeature.FeatureLocation(int(s) - 1, 1039 int(e), 1040 strand) 1041 return 1042 1043 if ",)" in location_line: 1044 import warnings 1045 from Bio import BiopythonParserWarning 1046 warnings.warn("Dropping trailing comma in malformed feature location", 1047 BiopythonParserWarning) 1048 location_line = location_line.replace(",)", ")") 1049 1050 if _solo_bond.search(location_line): 1051 # e.g. bond(196) 1052 # e.g. join(bond(284),bond(305),bond(309),bond(305)) 1053 import warnings 1054 from Bio import BiopythonParserWarning 1055 warnings.warn("Dropping bond qualifier in feature location", BiopythonParserWarning) 1056 # There ought to be a better way to do this... 1057 for x in _solo_bond.finditer(location_line): 1058 x = x.group() 1059 location_line = location_line.replace(x, x[5:-1]) 1060 1061 if _re_simple_compound.match(location_line): 1062 # e.g. join(<123..456,480..>500) 1063 i = location_line.find("(") 1064 # cur_feature.location_operator = location_line[:i] 1065 # we can split on the comma because these are simple locations 1066 locs = [] 1067 for part in location_line[i + 1:-1].split(","): 1068 s, e = part.split("..") 1069 locs.append(SeqFeature.FeatureLocation(int(s) - 1, 1070 int(e), 1071 strand)) 1072 if len(locs) < 2: 1073 # The CompoundLocation will raise a ValueError here! 1074 import warnings 1075 from Bio import BiopythonParserWarning 1076 warnings.warn("Should have at least 2 parts for compound location", 1077 BiopythonParserWarning) 1078 cur_feature.location = None 1079 return 1080 if strand == -1: 1081 cur_feature.location = SeqFeature.CompoundLocation(locs[::-1], 1082 operator=location_line[:i]) 1083 else: 1084 cur_feature.location = SeqFeature.CompoundLocation(locs, 1085 operator=location_line[:i]) 1086 return 1087 1088 # Handle the general case with more complex regular expressions 1089 if _re_complex_location.match(location_line): 1090 # e.g. 
"AL121804.2:41..610" 1091 cur_feature.location = _loc(location_line, self._expected_size, strand) 1092 return 1093 1094 if _re_complex_compound.match(location_line): 1095 i = location_line.find("(") 1096 # cur_feature.location_operator = location_line[:i] 1097 # Can't split on the comma because of positions like one-of(1,2,3) 1098 locs = [] 1099 for part in _split_compound_loc(location_line[i + 1:-1]): 1100 if part.startswith("complement("): 1101 assert part[-1] == ")" 1102 part = part[11:-1] 1103 assert strand != -1, "Double complement?" 1104 part_strand = -1 1105 else: 1106 part_strand = strand 1107 try: 1108 loc = _loc(part, self._expected_size, part_strand) 1109 except ValueError as err: 1110 print(location_line) 1111 print(part) 1112 raise err 1113 locs.append(loc) 1114 # Historically a join on the reverse strand has been represented 1115 # in Biopython with both the parent SeqFeature and its children 1116 # (the exons for a CDS) all given a strand of -1. Likewise, for 1117 # a join feature on the forward strand they all have strand +1. 1118 # However, we must also consider evil mixed strand examples like 1119 # this, join(complement(69611..69724),139856..140087,140625..140650) 1120 if strand == -1: 1121 # Whole thing was wrapped in complement(...) 1122 for l in locs: 1123 assert l.strand == -1 1124 # Reverse the backwards order used in GenBank files 1125 # with complement(join(...)) 1126 cur_feature.location = SeqFeature.CompoundLocation(locs[::-1], 1127 operator=location_line[:i]) 1128 else: 1129 cur_feature.location = SeqFeature.CompoundLocation(locs, 1130 operator=location_line[:i]) 1131 return 1132 # Not recognised 1133 if "order" in location_line and "join" in location_line: 1134 # See Bug 3197 1135 msg = 'Combinations of "join" and "order" within the same ' + \ 1136 'location (nested operators) are illegal:\n' + location_line 1137 raise LocationParserError(msg) 1138 # This used to be an error.... 1139 cur_feature.location = None 1140 import warnings 1141 from Bio import BiopythonParserWarning 1142 warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r" 1143 % location_line))
1144
1145 - def feature_qualifier(self, key, value):
1146 """When we get a qualifier key and its value. 1147 1148 Can receive None, since you can have valueless keys such as /pseudo 1149 """ 1150 # Hack to try to preserve historical behaviour of /pseudo etc 1151 if value is None: 1152 # if the key doesn't exist yet, add an empty string 1153 if key not in self._cur_feature.qualifiers: 1154 self._cur_feature.qualifiers[key] = [""] 1155 return 1156 # otherwise just skip this key 1157 return 1158 1159 value = value.replace('"', '') 1160 if self._feature_cleaner is not None: 1161 value = self._feature_cleaner.clean_value(key, value) 1162 1163 # if the qualifier name exists, append the value 1164 if key in self._cur_feature.qualifiers: 1165 self._cur_feature.qualifiers[key].append(value) 1166 # otherwise start a new list of the key with its values 1167 else: 1168 self._cur_feature.qualifiers[key] = [value]
1169
1170 - def feature_qualifier_name(self, content_list):
1171 """Use feature_qualifier instead (OBSOLETE).""" 1172 raise NotImplementedError("Use the feature_qualifier method instead.")
1173
1174 - def feature_qualifier_description(self, content):
1175 """Use feature_qualifier instead (OBSOLETE).""" 1176 raise NotImplementedError("Use the feature_qualifier method instead.")
1177
1178 - def contig_location(self, content):
1179 """Deal with CONTIG information.""" 1180 # Historically this was stored as a SeqFeature object, but it was 1181 # stored under record.annotations["contig"] and not under 1182 # record.features with the other SeqFeature objects. 1183 # 1184 # The CONTIG location line can include additional tokens like 1185 # Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1186 # location lines, so storing it using SeqFeature based location 1187 # objects is difficult. 1188 # 1189 # We now store this a string, which means for BioSQL we are now in 1190 # much better agreement with how BioPerl records the CONTIG line 1191 # in the database. 1192 # 1193 # NOTE - This code assumes the scanner will return all the CONTIG 1194 # lines already combined into one long string! 1195 self.data.annotations["contig"] = content
1196
1197 - def origin_name(self, content):
1198 pass
1199
1200 - def base_count(self, content):
1201 pass
1202
1203 - def base_number(self, content):
1204 pass
1205
1206 - def sequence(self, content):
1207 """Add up sequence information as we get it. 1208 1209 To try and make things speedier, this puts all of the strings 1210 into a list of strings, and then uses string.join later to put 1211 them together. Supposedly, this is a big time savings 1212 """ 1213 assert ' ' not in content 1214 self._seq_data.append(content.upper())
1215
1216 - def record_end(self, content):
1217 """Clean up when we've finished the record.""" 1218 from Bio import Alphabet 1219 from Bio.Alphabet import IUPAC 1220 from Bio.Seq import Seq, UnknownSeq 1221 1222 # Try and append the version number to the accession for the full id 1223 if not self.data.id: 1224 assert 'accessions' not in self.data.annotations, \ 1225 self.data.annotations['accessions'] 1226 self.data.id = self.data.name # Good fall back? 1227 elif self.data.id.count('.') == 0: 1228 try: 1229 self.data.id += '.%i' % self.data.annotations['sequence_version'] 1230 except KeyError: 1231 pass 1232 1233 # add the sequence information 1234 # first, determine the alphabet 1235 # we default to an generic alphabet if we don't have a 1236 # seq type or have strange sequence information. 1237 seq_alphabet = Alphabet.generic_alphabet 1238 1239 # now set the sequence 1240 sequence = "".join(self._seq_data) 1241 1242 if self._expected_size is not None \ 1243 and len(sequence) != 0 \ 1244 and self._expected_size != len(sequence): 1245 import warnings 1246 from Bio import BiopythonParserWarning 1247 warnings.warn("Expected sequence length %i, found %i (%s)." 1248 % (self._expected_size, len(sequence), self.data.id), 1249 BiopythonParserWarning) 1250 1251 if self._seq_type: 1252 # mRNA is really also DNA, since it is actually cDNA 1253 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper(): 1254 seq_alphabet = IUPAC.ambiguous_dna 1255 # are there ever really RNA sequences in GenBank? 1256 elif 'RNA' in self._seq_type.upper(): 1257 # Even for data which was from RNA, the sequence string 1258 # is usually given as DNA (T not U). Bug 2408 1259 if "T" in sequence and "U" not in sequence: 1260 seq_alphabet = IUPAC.ambiguous_dna 1261 else: 1262 seq_alphabet = IUPAC.ambiguous_rna 1263 elif 'PROTEIN' in self._seq_type.upper() \ 1264 or self._seq_type == "PRT": # PRT is used in EMBL-bank for patents 1265 seq_alphabet = IUPAC.protein # or extended protein? 1266 # work around ugly GenBank records which have circular or 1267 # linear but no indication of sequence type 1268 elif self._seq_type in ["circular", "linear", "unspecified"]: 1269 pass 1270 # we have a bug if we get here 1271 else: 1272 raise ValueError("Could not determine alphabet for seq_type %s" 1273 % self._seq_type) 1274 1275 if not sequence and self.__expected_size: 1276 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1277 else: 1278 self.data.seq = Seq(sequence, seq_alphabet)
1279
1280 1281 -class _RecordConsumer(_BaseGenBankConsumer):
1282 """Create a GenBank Record object from scanner generated information (PRIVATE).""" 1283
1284 - def __init__(self):
1285 _BaseGenBankConsumer.__init__(self) 1286 from . import Record 1287 self.data = Record.Record() 1288 1289 self._seq_data = [] 1290 self._cur_reference = None 1291 self._cur_feature = None 1292 self._cur_qualifier = None
1293
1294 - def wgs(self, content):
1295 self.data.wgs = content.split('-')
1296
1297 - def add_wgs_scafld(self, content):
1298 self.data.wgs_scafld.append(content.split('-'))
1299
1300 - def locus(self, content):
1301 self.data.locus = content
1302
1303 - def size(self, content):
1304 self.data.size = content
1305
1306 - def residue_type(self, content):
1307 # Be lenient about parsing, but technically lowercase residue types are malformed. 1308 if 'dna' in content or 'rna' in content: 1309 import warnings 1310 from Bio import BiopythonParserWarning 1311 warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content, 1312 BiopythonParserWarning) 1313 self.data.residue_type = content
1314
1315 - def data_file_division(self, content):
1316 self.data.data_file_division = content
1317
1318 - def date(self, content):
1319 self.data.date = content
1320
1321 - def definition(self, content):
1322 self.data.definition = content
1323
1324 - def accession(self, content):
1325 for acc in self._split_accessions(content): 1326 if acc not in self.data.accession: 1327 self.data.accession.append(acc)
1328
1329 - def nid(self, content):
1330 self.data.nid = content
1331
1332 - def pid(self, content):
1333 self.data.pid = content
1334
1335 - def version(self, content):
1336 self.data.version = content
1337
1338 - def db_source(self, content):
1339 self.data.db_source = content.rstrip()
1340
1341 - def gi(self, content):
1342 self.data.gi = content
1343
1344 - def keywords(self, content):
1345 self.data.keywords = self._split_keywords(content)
1346
1347 - def project(self, content):
1348 self.data.projects.extend(p for p in content.split() if p)
1349 1352
1353 - def segment(self, content):
1354 self.data.segment = content
1355
1356 - def source(self, content):
1357 self.data.source = content
1358
1359 - def organism(self, content):
1360 self.data.organism = content
1361
1362 - def taxonomy(self, content):
1363 self.data.taxonomy = self._split_taxonomy(content)
1364
1365 - def reference_num(self, content):
1366 """Grab the reference number and signal the start of a new reference.""" 1367 # check if we have a reference to add 1368 if self._cur_reference is not None: 1369 self.data.references.append(self._cur_reference) 1370 1371 from . import Record 1372 self._cur_reference = Record.Reference() 1373 self._cur_reference.number = content
1374
1375 - def reference_bases(self, content):
1376 self._cur_reference.bases = content
1377
1378 - def authors(self, content):
1379 self._cur_reference.authors = content
1380
1381 - def consrtm(self, content):
1382 self._cur_reference.consrtm = content
1383
1384 - def title(self, content):
1385 if self._cur_reference is None: 1386 import warnings 1387 from Bio import BiopythonParserWarning 1388 warnings.warn("GenBank TITLE line without REFERENCE line.", 1389 BiopythonParserWarning) 1390 return 1391 self._cur_reference.title = content
1392
1393 - def journal(self, content):
1394 self._cur_reference.journal = content
1395
1396 - def medline_id(self, content):
1397 self._cur_reference.medline_id = content
1398
1399 - def pubmed_id(self, content):
1400 self._cur_reference.pubmed_id = content
1401
1402 - def remark(self, content):
1403 self._cur_reference.remark = content
1404
1405 - def comment(self, content):
1406 self.data.comment += "\n".join(content)
1407
1408 - def structured_comment(self, content):
1409 self.data.structured_comment = content
1410
1411 - def primary_ref_line(self, content):
1412 """Save reference data for the PRIMARY line.""" 1413 self.data.primary.append(content)
1414
1415 - def primary(self, content):
1416 pass
1417
1418 - def features_line(self, content):
1419 """Get ready for the feature table when we reach the FEATURE line.""" 1420 self.start_feature_table()
1421
1422 - def start_feature_table(self):
1423 """Signal the start of the feature table.""" 1424 # we need to add on the last reference 1425 if self._cur_reference is not None: 1426 self.data.references.append(self._cur_reference)
1427
1428 - def feature_key(self, content):
1429 """Grab the key of the feature and signal the start of a new feature.""" 1430 # first add on feature information if we've got any 1431 self._add_feature() 1432 1433 from . import Record 1434 self._cur_feature = Record.Feature() 1435 self._cur_feature.key = content
1436
1437 - def _add_feature(self):
1438 """Add a feature to the record, with relevant checks (PRIVATE). 1439 1440 This does all of the appropriate checking to make sure we haven't 1441 left any info behind, and that we are only adding info if it 1442 exists. 1443 """ 1444 if self._cur_feature is not None: 1445 # if we have a left over qualifier, add it to the qualifiers 1446 # on the current feature 1447 if self._cur_qualifier is not None: 1448 self._cur_feature.qualifiers.append(self._cur_qualifier) 1449 1450 self._cur_qualifier = None 1451 self.data.features.append(self._cur_feature)
1452
1453 - def location(self, content):
1454 self._cur_feature.location = self._clean_location(content)
1455
1456 - def feature_qualifier(self, key, value):
1457 self.feature_qualifier_name([key]) 1458 if value is not None: 1459 self.feature_qualifier_description(value)
1460
1461 - def feature_qualifier_name(self, content_list):
1462 """Deal with qualifier names. 1463 1464 We receive a list of keys, since you can have valueless keys such as 1465 /pseudo which would be passed in with the next key (since no other 1466 tags separate them in the file) 1467 """ 1468 from . import Record 1469 for content in content_list: 1470 # the record parser keeps the /s -- add them if we don't have 'em 1471 if not content.startswith("/"): 1472 content = "/%s" % content 1473 # add on a qualifier if we've got one 1474 if self._cur_qualifier is not None: 1475 self._cur_feature.qualifiers.append(self._cur_qualifier) 1476 1477 self._cur_qualifier = Record.Qualifier() 1478 self._cur_qualifier.key = content
1479
1480 - def feature_qualifier_description(self, content):
1481 # if we have info then the qualifier key should have a ='s 1482 if '=' not in self._cur_qualifier.key: 1483 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1484 cur_content = self._remove_newlines(content) 1485 # remove all spaces from the value if it is a type where spaces 1486 # are not important 1487 for remove_space_key in self.__class__.remove_space_keys: 1488 if remove_space_key in self._cur_qualifier.key: 1489 cur_content = self._remove_spaces(cur_content) 1490 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1491
1492 - def base_count(self, content):
1493 self.data.base_counts = content
1494
1495 - def origin_name(self, content):
1496 self.data.origin = content
1497
1498 - def contig_location(self, content):
1499 """Signal that we have contig information to add to the record.""" 1500 self.data.contig = self._clean_location(content)
1501
1502 - def sequence(self, content):
1503 """Add sequence information to a list of sequence strings. 1504 1505 This removes spaces in the data and uppercases the sequence, and 1506 then adds it to a list of sequences. Later on we'll join this 1507 list together to make the final sequence. This is faster than 1508 adding on the new string every time. 1509 """ 1510 assert ' ' not in content 1511 self._seq_data.append(content.upper())
1512
1513 - def record_end(self, content):
1514 """Signal the end of the record and do any necessary clean-up.""" 1515 # add together all of the sequence parts to create the 1516 # final sequence string 1517 self.data.sequence = "".join(self._seq_data) 1518 # add on the last feature 1519 self._add_feature()
1520
1521   
1522  def parse(handle): 
1523      """Iterate over GenBank formatted entries as Record objects. 
1524   
1525      >>> from Bio import GenBank 
1526      >>> with open("GenBank/NC_000932.gb") as handle: 
1527      ...     for record in GenBank.parse(handle): 
1528      ...         print(record.accession) 
1529      ['NC_000932'] 
1530   
1531      To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb") 
1532      instead. 
1533      """ 
1534      return iter(Iterator(handle, RecordParser())) 
1535   
1536   
1537  def read(handle): 
1538      """Read a handle containing a single GenBank entry as a Record object. 
1539   
1540      >>> from Bio import GenBank 
1541      >>> with open("GenBank/NC_000932.gb") as handle: 
1542      ...     record = GenBank.read(handle) 
1543      ...     print(record.accession) 
1544      ['NC_000932'] 
1545   
1546      To get a SeqRecord object use Bio.SeqIO.read(..., format="gb") 
1547      instead. 
1548      """ 
1549      iterator = parse(handle) 
1550      try: 
1551          first = next(iterator) 
1552      except StopIteration: 
1553          first = None 
1554      if first is None: 
1555          raise ValueError("No records found in handle") 
1556      try: 
1557          second = next(iterator) 
1558      except StopIteration: 
1559          second = None 
1560      if second is not None: 
1561          raise ValueError("More than one record found in handle") 
1562      return first 
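# Editor's sketch (illustrative, not part of the original source): read() insists
# on exactly one entry per handle, so a multi-record file raises ValueError, e.g.
# (assuming "GenBank/cor6_6.gb" is a file containing several records):
#
#     with open("GenBank/cor6_6.gb") as handle:
#         GenBank.read(handle)  # raises ValueError: More than one record found in handle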
1563   
1564   
1565  if __name__ == "__main__": 
1566      from Bio._utils import run_doctest 
1567      run_doctest() 
1568   