Package Bio :: Package GenBank
[hide private]
[frames] | [no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
   3  # 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7   
   8  """Code to work with GenBank formatted files. 
   9   
  10  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  11  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  12  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  13   
  14  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  15  to obtain GenBank-specific Record objects, which is a much closer 
  16  representation to the raw file contents than the SeqRecord alternative from 
  17  the FeatureParser (used in Bio.SeqIO). 
  18   
  19  To use the Bio.GenBank parser, there are two helper functions: 
  20   
  21      - read                  Parse a handle containing a single GenBank record 
  22        as Bio.GenBank specific Record objects. 
  23      - parse                 Iterate over a handle containing multiple GenBank 
  24        records as Bio.GenBank specific Record objects. 
  25   
  26  The following internal classes are not intended for direct use and may 
  27  be deprecated in a future release. 
  28   
  29  Classes: 
  30   
  31      - Iterator              Iterate through a file of GenBank entries 
  32      - ErrorFeatureParser    Catch errors caused during parsing. 
  33      - FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  34      - RecordParser          Parse GenBank data into a Record object. 
  35   
  36  Exceptions: 
  37   
   38      - ParserFailureError    Exception indicating a failure in the parser (i.e. 
  39        scanner or consumer) 
  40      - LocationParserError   Exception indicating a problem with the spark based 
  41        location parser. 
  42   
  43  """ 
  44  from __future__ import print_function 
  45   
  46  import re 
  47  import sys  # for checking if Python 2 
  48   
  49  # other Biopython stuff 
  50  from Bio import SeqFeature 
  51   
  52  # other Bio.GenBank stuff 
  53  from .utils import FeatureValueCleaner 
  54  from .Scanner import GenBankScanner 
  55   
  56   
  57  # Constants used to parse GenBank header lines 
  58  GENBANK_INDENT = 12 
  59  GENBANK_SPACER = " " * GENBANK_INDENT 
  60   
  61  # Constants for parsing GenBank feature lines 
  62  FEATURE_KEY_INDENT = 5 
  63  FEATURE_QUALIFIER_INDENT = 21 
  64  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  65  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  66   
  67  # Regular expressions for location parsing 
  68  _solo_location = r"[<>]?\d+" 
  69  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  70  _between_location = r"\d+\^\d+" 
  71   
  72  _within_position = r"\(\d+\.\d+\)" 
  73  _re_within_position = re.compile(_within_position) 
  74  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  75                     % (_within_position, _within_position) 
  76  assert _re_within_position.match("(3.9)") 
  77  assert re.compile(_within_location).match("(3.9)..10") 
  78  assert re.compile(_within_location).match("26..(30.33)") 
  79  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  80   
  81  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  82  _re_oneof_position = re.compile(_oneof_position) 
  83  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  84                     % (_oneof_position, _oneof_position) 
  85  assert _re_oneof_position.match("one-of(6,9)") 
  86  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  87  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  88  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  89   
  90  assert not _re_oneof_position.match("one-of(3)") 
  91  assert _re_oneof_position.match("one-of(3,6)") 
  92  assert _re_oneof_position.match("one-of(3,6,9)") 
  93   
  94   
  95  _simple_location = r"\d+\.\.\d+" 
  96  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  97  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  98                                   % (_simple_location, _simple_location)) 
  99  _complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" \ 
 100                      % (_pair_location, _solo_location, _between_location, 
 101                         _within_location, _oneof_location) 
 102  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
 103  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
 104                                            % (_complex_location, _complex_location) 
 105  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
 106                                   % (_possibly_complemented_complex_location, 
 107                                      _possibly_complemented_complex_location)) 
 108   
 109   
 110  assert _re_simple_location.match("104..160") 
 111  assert not _re_simple_location.match("68451760..68452073^68452074") 
 112  assert not _re_simple_location.match("<104..>160") 
 113  assert not _re_simple_location.match("104") 
 114  assert not _re_simple_location.match("<1") 
 115  assert not _re_simple_location.match(">99999") 
 116  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 117  assert not _re_simple_compound.match("bond(12,63)") 
 118  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 119  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 120  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 121  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 122  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 123  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 124  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 125  assert not _re_simple_compound.match("complement(1..69)") 
 126  assert not _re_simple_compound.match("(1..69)") 
 127  assert _re_complex_location.match("(3.9)..10") 
 128  assert _re_complex_location.match("26..(30.33)") 
 129  assert _re_complex_location.match("(13.19)..(20.28)") 
 130  assert _re_complex_location.match("41^42")  # between 
 131  assert _re_complex_location.match("AL121804:41^42") 
 132  assert _re_complex_location.match("AL121804:41..610") 
 133  assert _re_complex_location.match("AL121804.2:41..610") 
 134  assert _re_complex_location.match("AL358792.24.1.166931:3274..3461")  # lots of dots in external reference 
 135  assert _re_complex_location.match("one-of(3,6)..101") 
 136  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 137  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 138  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 139  assert _re_complex_compound.match("join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))") 
 140   
 141  # Trans-spliced example from NC_016406, note underscore in reference name: 
 142  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 143  assert _re_complex_location.match("181647..181905") 
 144  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 145  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 146  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 147  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 148  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 149   
 150  _solo_bond = re.compile("bond\(%s\)" % _solo_location) 
 151  assert _solo_bond.match("bond(196)") 
 152  assert _solo_bond.search("bond(196)") 
 153  assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))") 
 154   
 155   
def _pos(pos_str, offset=0):
    """Build a Position object (PRIVATE).

    For an end position, leave offset as zero (default):

    >>> _pos("5")
    ExactPosition(5)

    For a start position, set offset to minus one (for Python counting):

    >>> _pos("5", -1)
    ExactPosition(4)

    This also covers fuzzy positions such as "<5", ">5", "(8.10)" and
    "one-of(5,8,11)".  Note that for the within and one-of styles the
    object's integer value depends on the offset: an end position (offset
    zero) uses the maximum candidate, a start position (offset -1) the
    minimum:

    >>> _pos("one-of(5,8,11)")
    OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
    >>> _pos("one-of(5,8,11)", -1)
    OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
    """
    if pos_str.startswith("<"):
        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
    if pos_str.startswith(">"):
        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
    if _re_within_position.match(pos_str):
        # e.g. "(8.10)" - somewhere between two bounds.
        left, right = [int(value) + offset
                       for value in pos_str[1:-1].split(".")]
        default = left if offset == -1 else right
        return SeqFeature.WithinPosition(default, left=left, right=right)
    if _re_oneof_position.match(pos_str):
        # e.g. "one-of(5,8,11)" - one of several exact positions.
        assert pos_str.startswith("one-of(")
        assert pos_str[-1] == ")"
        choices = [SeqFeature.ExactPosition(int(value) + offset)
                   for value in pos_str[7:-1].split(",")]
        pick = min if offset == -1 else max
        default = pick(int(choice) for choice in choices)
        return SeqFeature.OneOfPosition(default, choices=choices)
    # Plain number, e.g. "5"
    return SeqFeature.ExactPosition(int(pos_str) + offset)
241 242
def _loc(loc_str, expected_seq_length, strand):
    """FeatureLocation from non-compound non-complement location (PRIVATE).

    Simple examples,

    >>> _loc("123..456", 1000, +1)
    FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
    >>> _loc("<123..>456", 1000, strand = -1)
    FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)

    A more complex location using within positions,

    >>> _loc("(9.10)..(20.25)", 1000, 1)
    FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)

    Zero length between feature,

    >>> _loc("123^124", 1000, 0)
    FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)

    The expected sequence length is needed for a special case, a between
    position at the start/end of a circular genome:

    >>> _loc("1000^1", 1000, 1)
    FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)

    Apart from this special case, between positions P^Q must have P+1==Q,

    >>> _loc("123^456", 1000, 1)
    Traceback (most recent call last):
    ...
    ValueError: Invalid between location '123^456'

    You can optionally provide a reference name:

    >>> _loc("AL391218.9:105173..108462", 2000000, 1)
    FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')
    """
    if ":" in loc_str:
        ref, loc_str = loc_str.split(":")
    else:
        ref = None
    try:
        start_str, end_str = loc_str.split("..")
    except ValueError:
        assert ".." not in loc_str
        if "^" in loc_str:
            # A between location like "67^68" (one based counting) is a
            # special case with zero length - in Python slice notation
            # that is 67:67 (see Bug 2622).  On a circular genome of
            # length N you can also have "N^1" for the junction at the
            # origin (see Bug 3098).  We could imagine "2^4", but that
            # is just "3" (and "2^5" just "3..4"), so anything else is
            # rejected.
            start_str, end_str = loc_str.split("^")
            if (int(start_str) + 1 == int(end_str)
                    or (int(start_str) == expected_seq_length
                        and end_str == "1")):
                pos = _pos(start_str)
            else:
                raise ValueError("Invalid between location %s" % repr(loc_str))
            return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref)
        # A single base position, e.g. "123"
        start_str = end_str = loc_str
    return SeqFeature.FeatureLocation(_pos(start_str, -1), _pos(end_str),
                                      strand, ref=ref)
313 314
315 -def _split_compound_loc(compound_loc):
316 """Split a tricky compound location string (PRIVATE). 317 318 >>> list(_split_compound_loc("123..145")) 319 ['123..145'] 320 >>> list(_split_compound_loc("123..145,200..209")) 321 ['123..145', '200..209'] 322 >>> list(_split_compound_loc("one-of(200,203)..300")) 323 ['one-of(200,203)..300'] 324 >>> list(_split_compound_loc("complement(123..145),200..209")) 325 ['complement(123..145)', '200..209'] 326 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 327 ['123..145', 'one-of(200,203)..209'] 328 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 329 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 330 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 331 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 332 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 333 ['123..145', '200..one-of(209,211)', '300'] 334 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 335 ['123..145', '200..one-of(209,211)'] 336 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 337 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 338 """ 339 if "one-of(" in compound_loc: 340 # Hard case 341 while "," in compound_loc: 342 assert compound_loc[0] != "," 343 assert compound_loc[0:2] != ".." 
344 i = compound_loc.find(",") 345 part = compound_loc[:i] 346 compound_loc = compound_loc[i:] # includes the comma 347 while part.count("(") > part.count(")"): 348 assert "one-of(" in part, (part, compound_loc) 349 i = compound_loc.find(")") 350 part += compound_loc[:i + 1] 351 compound_loc = compound_loc[i + 1:] 352 if compound_loc.startswith(".."): 353 i = compound_loc.find(",") 354 if i == -1: 355 part += compound_loc 356 compound_loc = "" 357 else: 358 part += compound_loc[:i] 359 compound_loc = compound_loc[i:] # includes the comma 360 while part.count("(") > part.count(")"): 361 assert part.count("one-of(") == 2 362 i = compound_loc.find(")") 363 part += compound_loc[:i + 1] 364 compound_loc = compound_loc[i + 1:] 365 if compound_loc.startswith(","): 366 compound_loc = compound_loc[1:] 367 assert part 368 yield part 369 if compound_loc: 370 yield compound_loc 371 else: 372 # Easy case 373 for part in compound_loc.split(","): 374 yield part
375 376
class Iterator(object):
    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).

    This class is likely to be deprecated in a future release of Biopython.
    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
    for SeqRecord and GenBank specific Record objects respectively instead.
    """

    def __init__(self, handle, parser=None):
        """Initialize the iterator.

        Arguments:
         - handle - A handle with GenBank entries to iterate through.
         - parser - An optional parser to pass the entries through before
           returning them. If None, then the raw entry will be returned.
        """
        self.handle = handle
        self._parser = parser

    def __next__(self):
        """Return the next GenBank record from the handle.

        Will return None if we ran out of records.
        """
        if self._parser is not None:
            try:
                return self._parser.parse(self.handle)
            except StopIteration:
                return None
        # No parser given - return the raw record text, accumulating
        # lines until the GenBank end-of-record marker "//".
        chunk = []
        while True:
            line = self.handle.readline()
            if not line:
                return None  # Premature end of file?
            chunk.append(line)
            if line.rstrip() == "//":
                return "".join(chunk)

    if sys.version_info[0] < 3:

        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def __iter__(self):
        # Keep calling __next__ until it signals the end by returning None.
        return iter(self.__next__, None)
423 424
class ParserFailureError(Exception):
    """Failure caused by some kind of problem in the parser."""
429 430
class LocationParserError(Exception):
    """Could not properly parse out a location from a GenBank file."""
435 436
class FeatureParser(object):
    """Parse GenBank files into Seq + Feature objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
    """

    # Sentinel default: lets us tell "argument omitted" apart from an
    # explicit feature_cleaner=None (which means no cleaning at all).
    # Previously the default was a single FeatureValueCleaner() instance
    # shared by every FeatureParser (the mutable default argument trap).
    _DEFAULT_CLEANER = object()

    def __init__(self, debug_level=0, use_fuzziness=1,
                 feature_cleaner=_DEFAULT_CLEANER):
        """Initialize a GenBank parser and Feature consumer.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out. By default we
           have no debugging info (the fastest way to do things), but if you
           want you can set this as high as two and see exactly where a
           parse fails.
         - use_fuzziness - Specify whether or not to use fuzzy
           representations.  The default is 1 (use fuzziness).
         - feature_cleaner - A class which will be used to clean out the
           values of features. This class must implement the function
           clean_value. GenBank.utils has a "standard" cleaner class, which
           is used by default.  Pass None explicitly to disable cleaning.
        """
        if feature_cleaner is FeatureParser._DEFAULT_CLEANER:
            # Create a fresh cleaner per parser instance instead of
            # sharing one default object between all parsers.
            feature_cleaner = FeatureValueCleaner()
        self._scanner = GenBankScanner(debug_level)
        self.use_fuzziness = use_fuzziness
        self._cleaner = feature_cleaner

    def parse(self, handle):
        """Parse the specified handle into a SeqRecord."""
        self._consumer = _FeatureConsumer(self.use_fuzziness,
                                          self._cleaner)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
473 474
class RecordParser(object):
    """Parse GenBank files into Record objects (OBSOLETE).

    Direct use of this class is discouraged, and may be deprecated in
    a future release of Biopython.

    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
    instead.
    """

    def __init__(self, debug_level=0):
        """Initialize the parser.

        Arguments:
         - debug_level - An optional argument that specifies the amount of
           debugging information the parser should spit out. By default we
           have no debugging info (the fastest way to do things), but if
           you want you can set this as high as two and see exactly where
           a parse fails.
        """
        self._scanner = GenBankScanner(debug_level)

    def parse(self, handle):
        """Parse the specified handle into a GenBank record."""
        consumer = _RecordConsumer()
        self._consumer = consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
503 504
505 -class _BaseGenBankConsumer(object):
506 """Abstract GenBank consumer providing useful general functions (PRIVATE). 507 508 This just helps to eliminate some duplication in things that most 509 GenBank consumers want to do. 510 """ 511 # Special keys in GenBank records that we should remove spaces from 512 # For instance, \translation keys have values which are proteins and 513 # should have spaces and newlines removed from them. This class 514 # attribute gives us more control over specific formatting problems. 515 remove_space_keys = ["translation"] 516
517 - def __init__(self):
518 pass
519
520 - def _unhandled(self, data):
521 pass
522
523 - def __getattr__(self, attr):
524 return self._unhandled
525
526 - def _split_keywords(self, keyword_string):
527 """Split a string of keywords into a nice clean list. 528 """ 529 # process the keywords into a python list 530 if keyword_string == "" or keyword_string == ".": 531 keywords = "" 532 elif keyword_string[-1] == '.': 533 keywords = keyword_string[:-1] 534 else: 535 keywords = keyword_string 536 keyword_list = keywords.split(';') 537 clean_keyword_list = [x.strip() for x in keyword_list] 538 return clean_keyword_list
539
540 - def _split_accessions(self, accession_string):
541 """Split a string of accession numbers into a list. 542 """ 543 # first replace all line feeds with spaces 544 # Also, EMBL style accessions are split with ';' 545 accession = accession_string.replace("\n", " ").replace(";", " ") 546 547 return [x.strip() for x in accession.split() if x.strip()]
548
549 - def _split_taxonomy(self, taxonomy_string):
550 """Split a string with taxonomy info into a list. 551 """ 552 if not taxonomy_string or taxonomy_string == ".": 553 # Missing data, no taxonomy 554 return [] 555 556 if taxonomy_string[-1] == '.': 557 tax_info = taxonomy_string[:-1] 558 else: 559 tax_info = taxonomy_string 560 tax_list = tax_info.split(';') 561 new_tax_list = [] 562 for tax_item in tax_list: 563 new_items = tax_item.split("\n") 564 new_tax_list.extend(new_items) 565 while '' in new_tax_list: 566 new_tax_list.remove('') 567 clean_tax_list = [x.strip() for x in new_tax_list] 568 569 return clean_tax_list
570
571 - def _clean_location(self, location_string):
572 """Clean whitespace out of a location string. 573 574 The location parser isn't a fan of whitespace, so we clean it out 575 before feeding it into the parser. 576 """ 577 # Originally this imported string.whitespace and did a replace 578 # via a loop. It's simpler to just split on whitespace and rejoin 579 # the string - and this avoids importing string too. See Bug 2684. 580 return ''.join(location_string.split())
581
582 - def _remove_newlines(self, text):
583 """Remove any newlines in the passed text, returning the new string. 584 """ 585 # get rid of newlines in the qualifier value 586 newlines = ["\n", "\r"] 587 for ws in newlines: 588 text = text.replace(ws, "") 589 590 return text
591
592 - def _normalize_spaces(self, text):
593 """Replace multiple spaces in the passed text with single spaces. 594 """ 595 # get rid of excessive spaces 596 return ' '.join(x for x in text.split(" ") if x)
597
598 - def _remove_spaces(self, text):
599 """Remove all spaces from the passed text. 600 """ 601 return text.replace(" ", "")
602
603 - def _convert_to_python_numbers(self, start, end):
604 """Convert a start and end range to python notation. 605 606 In GenBank, starts and ends are defined in "biological" coordinates, 607 where 1 is the first base and [i, j] means to include both i and j. 608 609 In python, 0 is the first base and [i, j] means to include i, but 610 not j. 611 612 So, to convert "biological" to python coordinates, we need to 613 subtract 1 from the start, and leave the end and things should 614 be converted happily. 615 """ 616 new_start = start - 1 617 new_end = end 618 619 return new_start, new_end
620 621
622 -class _FeatureConsumer(_BaseGenBankConsumer):
623 """Create a SeqRecord object with Features to return (PRIVATE). 624 625 Attributes: 626 627 - use_fuzziness - specify whether or not to parse with fuzziness in 628 feature locations. 629 - feature_cleaner - a class that will be used to provide specialized 630 cleaning-up of feature values. 631 """
    def __init__(self, use_fuzziness, feature_cleaner=None):
        """Initialize the consumer with an empty SeqRecord to fill in.

        Arguments:
         - use_fuzziness - specify whether or not to parse with fuzziness
           in feature locations.
         - feature_cleaner - optional object used to provide specialized
           cleaning-up of feature qualifier values (None means no cleaning).
        """
        # Function-level import; NOTE(review): presumably kept local to
        # avoid a circular import at module load time - confirm.
        from Bio.SeqRecord import SeqRecord
        _BaseGenBankConsumer.__init__(self)
        self.data = SeqRecord(None, id=None)
        self.data.id = None
        self.data.description = ""

        self._use_fuzziness = use_fuzziness
        self._feature_cleaner = feature_cleaner

        self._seq_type = ''          # set by residue_type()
        self._seq_data = []          # accumulated sequence fragments
        self._cur_reference = None   # reference currently being populated
        self._cur_feature = None     # feature currently being populated
        self._expected_size = None   # declared length, set by size()
647
    def locus(self, locus_name):
        """Use the locus name as the name of the record."""
        self.data.name = locus_name
652
    def size(self, content):
        """Record the declared sequence length as an integer."""
        self._expected_size = int(content)
656
    def residue_type(self, type):
        """Record the sequence type so we can choose an appropriate alphabet.

        Note the parameter name shadows the ``type`` builtin; it is kept
        for backwards compatibility with existing callers.
        """
        self._seq_type = type.strip()
661
    def data_file_division(self, division):
        """Record the data file division as an annotation."""
        self.data.annotations['data_file_division'] = division
664
    def date(self, submit_date):
        """Record the date string as an annotation."""
        self.data.annotations['date'] = submit_date
667
668 - def definition(self, definition):
669 """Set the definition as the description of the sequence. 670 """ 671 if self.data.description: 672 # Append to any existing description 673 # e.g. EMBL files with two DE lines. 674 self.data.description += " " + definition 675 else: 676 self.data.description = definition
677
    def accession(self, acc_num):
        """Record the accession number(s), using the first as the record id.

        If we have multiple accession numbers, the first one passed
        (across all accession lines seen so far) is used as the id.
        """
        new_acc_nums = self._split_accessions(acc_num)

        # Also record them ALL in the annotations
        try:
            # On the off chance there was more than one accession line:
            for acc in new_acc_nums:
                # Prevent repeat entries
                if acc not in self.data.annotations['accessions']:
                    self.data.annotations['accessions'].append(acc)
        except KeyError:
            # First accession line seen - note any duplicates within this
            # single line are deliberately kept as-is here.
            self.data.annotations['accessions'] = new_acc_nums

        # if we haven't set the id information yet, add the first acc num
        if not self.data.id:
            if len(new_acc_nums) > 0:
                # Use the FIRST accession as the ID, not the first on this line!
                self.data.id = self.data.annotations['accessions'][0]
702
    def wgs(self, content):
        """Record the WGS accession range, split on the dash."""
        self.data.annotations['wgs'] = content.split('-')
705
    def add_wgs_scafld(self, content):
        """Append a WGS_SCAFLD accession range (split on the dash)."""
        self.data.annotations.setdefault('wgs_scafld', []).append(content.split('-'))
708
    def nid(self, content):
        """Record the NID as an annotation."""
        self.data.annotations['nid'] = content
711
    def pid(self, content):
        """Record the PID as an annotation."""
        self.data.annotations['pid'] = content
714
715 - def version(self, version_id):
716 # Want to use the versioned accession as the record.id 717 # This comes from the VERSION line in GenBank files, or the 718 # obsolete SV line in EMBL. For the new EMBL files we need 719 # both the version suffix from the ID line and the accession 720 # from the AC line. 721 if version_id.count(".") == 1 and version_id.split(".")[1].isdigit(): 722 self.accession(version_id.split(".")[0]) 723 self.version_suffix(version_id.split(".")[1]) 724 elif version_id: 725 # For backwards compatibility... 726 self.data.id = version_id
727
    def project(self, content):
        """Handle the information from the PROJECT line as a list of projects.

        e.g.::

            PROJECT     GenomeProject:28471

        or::

            PROJECT     GenomeProject:13543  GenomeProject:99999

        This is stored as dbxrefs in the SeqRecord to be consistent with the
        projected switch of this line to DBLINK in future GenBank versions.
        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
        "Project:28471" as part of this transition.
        """
        # Normalize to the planned shorter prefix before storing.
        content = content.replace("GenomeProject:", "Project:")
        self.data.dbxrefs.extend(p for p in content.split() if p)
746 779
    def version_suffix(self, version):
        """Set the version to overwrite the id.

        Since the version provides the same information as the accession
        number, plus some extra info, we set this as the id if we have
        a version.
        """
        # e.g. GenBank line:
        # VERSION     U49845.1  GI:1293613
        # or the obsolete EMBL line:
        # SV   U49845.1
        # Scanner calls consumer.version("U49845.1")
        # which then calls consumer.version_suffix(1)
        #
        # e.g. EMBL new line:
        # ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
        # Scanner calls consumer.version_suffix(1)
        #
        # NOTE(review): this assert is a debugging aid rather than input
        # validation - it is stripped when Python runs with -O.
        assert version.isdigit()
        self.data.annotations['sequence_version'] = int(version)
799
    def db_source(self, content):
        """Record the DBSOURCE line, minus trailing whitespace."""
        self.data.annotations['db_source'] = content.rstrip()
802
    def gi(self, content):
        """Record the GI number as an annotation."""
        self.data.annotations['gi'] = content
805
806 - def keywords(self, content):
807 if 'keywords' in self.data.annotations: 808 # Multi-line keywords, append to list 809 # Note EMBL states "A keyword is never split between lines." 810 self.data.annotations['keywords'].extend(self._split_keywords(content)) 811 else: 812 self.data.annotations['keywords'] = self._split_keywords(content)
813
    def segment(self, content):
        """Record the SEGMENT line content as an annotation."""
        self.data.annotations['segment'] = content
816
817 - def source(self, content):
818 # Note that some software (e.g. VectorNTI) may produce an empty 819 # source (rather than using a dot/period as might be expected). 820 if content == "": 821 source_info = "" 822 elif content[-1] == '.': 823 source_info = content[:-1] 824 else: 825 source_info = content 826 self.data.annotations['source'] = source_info
827
    def organism(self, content):
        """Record the organism name as an annotation."""
        self.data.annotations['organism'] = content
830
831 - def taxonomy(self, content):
832 """Records (another line of) the taxonomy lineage. 833 """ 834 lineage = self._split_taxonomy(content) 835 try: 836 self.data.annotations['taxonomy'].extend(lineage) 837 except KeyError: 838 self.data.annotations['taxonomy'] = lineage
839
    def reference_num(self, content):
        """Signal the beginning of a new reference object."""
        # If we have a current reference that hasn't been added to the
        # list of references, add it now; otherwise this is the first
        # REFERENCE line, so create the (empty) list of references.
        if self._cur_reference is not None:
            self.data.annotations['references'].append(self._cur_reference)
        else:
            self.data.annotations['references'] = []

        self._cur_reference = SeqFeature.Reference()
851
    def reference_bases(self, content):
        """Attempt to determine the sequence region the reference entails.

        Possible types of information we may have to deal with:

        (bases 1 to 86436)
        (sites)
        (bases 1 to 105654; 110423 to 111122)
        1  (residues 1 to 182)
        """
        # first remove the parentheses or other junk
        ref_base_info = content[1:-1]

        all_locations = []
        # parse if we've got 'bases' and 'to'
        if 'bases' in ref_base_info and 'to' in ref_base_info:
            # get rid of the beginning 'bases' (5 characters)
            ref_base_info = ref_base_info[5:]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)
        elif 'residues' in ref_base_info and 'to' in ref_base_info:
            residues_start = ref_base_info.find("residues")
            # get only the information after "residues "
            ref_base_info = ref_base_info[(residues_start + len("residues ")):]
            locations = self._split_reference_locations(ref_base_info)
            all_locations.extend(locations)

        # make sure if we are not finding information then we have
        # the string 'sites' or the string 'bases'
        elif (ref_base_info == 'sites' or
              ref_base_info.strip() == 'bases'):
            pass
        # otherwise raise an error
        else:
            raise ValueError("Could not parse base info %s in record %s" %
                             (ref_base_info, self.data.id))

        self._cur_reference.location = all_locations
890
891 - def _split_reference_locations(self, location_string):
892 """Get reference locations out of a string of reference information 893 894 The passed string should be of the form:: 895 896 1 to 20; 20 to 100 897 898 This splits the information out and returns a list of location objects 899 based on the reference locations. 900 """ 901 # split possibly multiple locations using the ';' 902 all_base_info = location_string.split(';') 903 904 new_locations = [] 905 for base_info in all_base_info: 906 start, end = base_info.split('to') 907 new_start, new_end = \ 908 self._convert_to_python_numbers(int(start.strip()), 909 int(end.strip())) 910 this_location = SeqFeature.FeatureLocation(new_start, new_end) 911 new_locations.append(this_location) 912 return new_locations
913
914 - def authors(self, content):
915 if self._cur_reference.authors: 916 self._cur_reference.authors += ' ' + content 917 else: 918 self._cur_reference.authors = content
919
920 - def consrtm(self, content):
921 if self._cur_reference.consrtm: 922 self._cur_reference.consrtm += ' ' + content 923 else: 924 self._cur_reference.consrtm = content
925
926 - def title(self, content):
927 if self._cur_reference is None: 928 import warnings 929 from Bio import BiopythonParserWarning 930 warnings.warn("GenBank TITLE line without REFERENCE line.", 931 BiopythonParserWarning) 932 elif self._cur_reference.title: 933 self._cur_reference.title += ' ' + content 934 else: 935 self._cur_reference.title = content
936
937 - def journal(self, content):
938 if self._cur_reference.journal: 939 self._cur_reference.journal += ' ' + content 940 else: 941 self._cur_reference.journal = content
942
943 - def medline_id(self, content):
944 self._cur_reference.medline_id = content
945
946 - def pubmed_id(self, content):
947 self._cur_reference.pubmed_id = content
948
949 - def remark(self, content):
950 """Deal with a reference comment.""" 951 if self._cur_reference.comment: 952 self._cur_reference.comment += ' ' + content 953 else: 954 self._cur_reference.comment = content
955
956 - def comment(self, content):
957 try: 958 self.data.annotations['comment'] += "\n" + "\n".join(content) 959 except KeyError: 960 self.data.annotations['comment'] = "\n".join(content)
961
962 - def structured_comment(self, content):
963 self.data.annotations['structured_comment'] = content
964
965 - def features_line(self, content):
966 """Get ready for the feature table when we reach the FEATURE line. 967 """ 968 self.start_feature_table()
969
970 - def start_feature_table(self):
971 """Indicate we've got to the start of the feature table. 972 """ 973 # make sure we've added on our last reference object 974 if self._cur_reference is not None: 975 self.data.annotations['references'].append(self._cur_reference) 976 self._cur_reference = None
977
978 - def feature_key(self, content):
979 # start a new feature 980 self._cur_feature = SeqFeature.SeqFeature() 981 self._cur_feature.type = content 982 self.data.features.append(self._cur_feature)
983
    def location(self, content):
        """Parse out location information from the location string.

        This uses simple Python code with some regular expressions to do the
        parsing, and then translates the results into appropriate objects.

        Branches are ordered fast-path first: simple "123..456" locations,
        then simple join/order compounds, then the general complex forms.
        Unparseable locations result in a warning and location of None.
        """
        # clean up newlines and other whitespace inside the location before
        # parsing - locations should have no whitespace whatsoever
        location_line = self._clean_location(content)

        # Older records have junk like replace(266,"c") in the
        # location line. Newer records just replace this with
        # the number 266 and have the information in a more reasonable
        # place. So we'll just grab out the number and feed this to the
        # parser. We shouldn't really be losing any info this way.
        if 'replace' in location_line:
            comma_pos = location_line.find(',')
            location_line = location_line[8:comma_pos]

        cur_feature = self._cur_feature

        # Handle top level complement here for speed
        if location_line.startswith("complement("):
            assert location_line.endswith(")")
            # strip the leading "complement(" and trailing ")"
            location_line = location_line[11:-1]
            strand = -1
        elif "PROTEIN" in self._seq_type.upper():
            strand = None
        else:
            # Assume nucleotide otherwise feature strand for
            # GenBank files with bad LOCUS lines set to None
            strand = 1

        # Special case handling of the most common cases for speed
        if _re_simple_location.match(location_line):
            # e.g. "123..456" -- convert to zero-based half-open coordinates
            s, e = location_line.split("..")
            cur_feature.location = SeqFeature.FeatureLocation(int(s) - 1,
                                                              int(e),
                                                              strand)
            return

        if _solo_bond.search(location_line):
            # e.g. bond(196)
            # e.g. join(bond(284),bond(305),bond(309),bond(305))
            import warnings
            from Bio import BiopythonParserWarning
            warnings.warn("Dropping bond qualifier in feature location", BiopythonParserWarning)
            # There ought to be a better way to do this...
            # rewrite each "bond(N)" as just "N" so normal parsing applies
            for x in _solo_bond.finditer(location_line):
                x = x.group()
                location_line = location_line.replace(x, x[5:-1])

        if _re_simple_compound.match(location_line):
            # e.g. join(<123..456,480..>500)
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # we can split on the comma because these are simple locations
            locs = []
            for part in location_line[i + 1:-1].split(","):
                s, e = part.split("..")
                locs.append(SeqFeature.FeatureLocation(int(s) - 1,
                                                       int(e),
                                                       strand))
            if strand == -1:
                # complement(join(...)) lists parts in reverse order
                cur_feature.location = SeqFeature.CompoundLocation(locs[::-1],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation(locs,
                                                                   operator=location_line[:i])
            return

        # Handle the general case with more complex regular expressions
        if _re_complex_location.match(location_line):
            # e.g. "AL121804.2:41..610"
            cur_feature.location = _loc(location_line, self._expected_size, strand)
            return

        if _re_complex_compound.match(location_line):
            i = location_line.find("(")
            # cur_feature.location_operator = location_line[:i]
            # Can't split on the comma because of positions like one-of(1,2,3)
            locs = []
            for part in _split_compound_loc(location_line[i + 1:-1]):
                if part.startswith("complement("):
                    assert part[-1] == ")"
                    part = part[11:-1]
                    # nested complement inside a top-level complement
                    # would be a malformed location
                    assert strand != -1, "Double complement?"
                    part_strand = -1
                else:
                    part_strand = strand
                try:
                    loc = _loc(part, self._expected_size, part_strand)
                except ValueError as err:
                    # NOTE(review): debug prints retained deliberately --
                    # they surface the offending location before re-raising
                    print(location_line)
                    print(part)
                    raise err
                locs.append(loc)
            # Historically a join on the reverse strand has been represented
            # in Biopython with both the parent SeqFeature and its children
            # (the exons for a CDS) all given a strand of -1.  Likewise, for
            # a join feature on the forward strand they all have strand +1.
            # However, we must also consider evil mixed strand examples like
            # this, join(complement(69611..69724),139856..140087,140625..140650)
            if strand == -1:
                # Whole thing was wrapped in complement(...)
                for l in locs:
                    assert l.strand == -1
                # Reverse the backwards order used in GenBank files
                # with complement(join(...))
                cur_feature.location = SeqFeature.CompoundLocation(locs[::-1],
                                                                   operator=location_line[:i])
            else:
                cur_feature.location = SeqFeature.CompoundLocation(locs,
                                                                   operator=location_line[:i])
            return
        # Not recognised
        if "order" in location_line and "join" in location_line:
            # See Bug 3197
            msg = 'Combinations of "join" and "order" within the same ' + \
                  'location (nested operators) are illegal:\n' + location_line
            raise LocationParserError(msg)
        # This used to be an error....
        cur_feature.location = None
        import warnings
        from Bio import BiopythonParserWarning
        warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r"
                                             % (location_line)))
1112
1113 - def feature_qualifier(self, key, value):
1114 """When we get a qualifier key and its value. 1115 1116 Can receive None, since you can have valueless keys such as /pseudo 1117 """ 1118 # Hack to try to preserve historical behaviour of /pseudo etc 1119 if value is None: 1120 # if the key doesn't exist yet, add an empty string 1121 if key not in self._cur_feature.qualifiers: 1122 self._cur_feature.qualifiers[key] = [""] 1123 return 1124 # otherwise just skip this key 1125 return 1126 1127 value = value.replace('"', '') 1128 if self._feature_cleaner is not None: 1129 value = self._feature_cleaner.clean_value(key, value) 1130 1131 # if the qualifier name exists, append the value 1132 if key in self._cur_feature.qualifiers: 1133 self._cur_feature.qualifiers[key].append(value) 1134 # otherwise start a new list of the key with its values 1135 else: 1136 self._cur_feature.qualifiers[key] = [value]
1137
1138 - def feature_qualifier_name(self, content_list):
1139 """Use feature_qualifier instead (OBSOLETE).""" 1140 raise NotImplementedError("Use the feature_qualifier method instead.")
1141
1142 - def feature_qualifier_description(self, content):
1143 """Use feature_qualifier instead (OBSOLETE).""" 1144 raise NotImplementedError("Use the feature_qualifier method instead.")
1145
1146 - def contig_location(self, content):
1147 """Deal with CONTIG information.""" 1148 # Historically this was stored as a SeqFeature object, but it was 1149 # stored under record.annotations["contig"] and not under 1150 # record.features with the other SeqFeature objects. 1151 # 1152 # The CONTIG location line can include additional tokens like 1153 # Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1154 # location lines, so storing it using SeqFeature based location 1155 # objects is difficult. 1156 # 1157 # We now store this a string, which means for BioSQL we are now in 1158 # much better agreement with how BioPerl records the CONTIG line 1159 # in the database. 1160 # 1161 # NOTE - This code assumes the scanner will return all the CONTIG 1162 # lines already combined into one long string! 1163 self.data.annotations["contig"] = content
1164
1165 - def origin_name(self, content):
1166 pass
1167
1168 - def base_count(self, content):
1169 pass
1170
1171 - def base_number(self, content):
1172 pass
1173
1174 - def sequence(self, content):
1175 """Add up sequence information as we get it. 1176 1177 To try and make things speedier, this puts all of the strings 1178 into a list of strings, and then uses string.join later to put 1179 them together. Supposedly, this is a big time savings 1180 """ 1181 assert ' ' not in content 1182 self._seq_data.append(content.upper())
1183
1184 - def record_end(self, content):
1185 """Clean up when we've finished the record. 1186 """ 1187 from Bio import Alphabet 1188 from Bio.Alphabet import IUPAC 1189 from Bio.Seq import Seq, UnknownSeq 1190 1191 # Try and append the version number to the accession for the full id 1192 if not self.data.id: 1193 assert 'accessions' not in self.data.annotations, \ 1194 self.data.annotations['accessions'] 1195 self.data.id = self.data.name # Good fall back? 1196 elif self.data.id.count('.') == 0: 1197 try: 1198 self.data.id += '.%i' % self.data.annotations['sequence_version'] 1199 except KeyError: 1200 pass 1201 1202 # add the sequence information 1203 # first, determine the alphabet 1204 # we default to an generic alphabet if we don't have a 1205 # seq type or have strange sequence information. 1206 seq_alphabet = Alphabet.generic_alphabet 1207 1208 # now set the sequence 1209 sequence = "".join(self._seq_data) 1210 1211 if self._expected_size is not None \ 1212 and len(sequence) != 0 \ 1213 and self._expected_size != len(sequence): 1214 import warnings 1215 from Bio import BiopythonParserWarning 1216 warnings.warn("Expected sequence length %i, found %i (%s)." 1217 % (self._expected_size, len(sequence), self.data.id), 1218 BiopythonParserWarning) 1219 1220 if self._seq_type: 1221 # mRNA is really also DNA, since it is actually cDNA 1222 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper(): 1223 seq_alphabet = IUPAC.ambiguous_dna 1224 # are there ever really RNA sequences in GenBank? 1225 elif 'RNA' in self._seq_type.upper(): 1226 # Even for data which was from RNA, the sequence string 1227 # is usually given as DNA (T not U). Bug 2408 1228 if "T" in sequence and "U" not in sequence: 1229 seq_alphabet = IUPAC.ambiguous_dna 1230 else: 1231 seq_alphabet = IUPAC.ambiguous_rna 1232 elif 'PROTEIN' in self._seq_type.upper() \ 1233 or self._seq_type == "PRT": # PRT is used in EMBL-bank for patents 1234 seq_alphabet = IUPAC.protein # or extended protein? 
1235 # work around ugly GenBank records which have circular or 1236 # linear but no indication of sequence type 1237 elif self._seq_type in ["circular", "linear", "unspecified"]: 1238 pass 1239 # we have a bug if we get here 1240 else: 1241 raise ValueError("Could not determine alphabet for seq_type %s" 1242 % self._seq_type) 1243 1244 # Also save the chomosome layout 1245 if 'circular' in self._seq_type.lower(): 1246 self.data.annotations['topology'] = 'circular' 1247 elif 'linear' in self._seq_type.lower(): 1248 self.data.annotations['topology'] = 'linear' 1249 1250 if not sequence and self.__expected_size: 1251 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1252 else: 1253 self.data.seq = Seq(sequence, seq_alphabet)
1254 1255
class _RecordConsumer(_BaseGenBankConsumer):
    """Create a GenBank Record object from scanner generated information (PRIVATE)."""

    def __init__(self):
        """Initialise an empty Record plus the parser's working state."""
        _BaseGenBankConsumer.__init__(self)
        from . import Record
        self.data = Record.Record()

        # accumulated sequence chunks, joined at record_end
        self._seq_data = []
        # the reference / feature / qualifier currently being built
        self._cur_reference = None
        self._cur_feature = None
        self._cur_qualifier = None
1268
1269 - def wgs(self, content):
1270 self.data.wgs = content.split('-')
1271
1272 - def add_wgs_scafld(self, content):
1273 self.data.wgs_scafld.append(content.split('-'))
1274
1275 - def locus(self, content):
1276 self.data.locus = content
1277
1278 - def size(self, content):
1279 self.data.size = content
1280
1281 - def residue_type(self, content):
1282 # Be lenient about parsing, but technically lowercase residue types are malformed. 1283 if 'dna' in content or 'rna' in content: 1284 import warnings 1285 from Bio import BiopythonParserWarning 1286 warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content, 1287 BiopythonParserWarning) 1288 self.data.residue_type = content
1289
1290 - def data_file_division(self, content):
1291 self.data.data_file_division = content
1292
1293 - def date(self, content):
1294 self.data.date = content
1295
1296 - def definition(self, content):
1297 self.data.definition = content
1298
1299 - def accession(self, content):
1300 for acc in self._split_accessions(content): 1301 if acc not in self.data.accession: 1302 self.data.accession.append(acc)
1303
1304 - def nid(self, content):
1305 self.data.nid = content
1306
1307 - def pid(self, content):
1308 self.data.pid = content
1309
1310 - def version(self, content):
1311 self.data.version = content
1312
1313 - def db_source(self, content):
1314 self.data.db_source = content.rstrip()
1315
1316 - def gi(self, content):
1317 self.data.gi = content
1318
1319 - def keywords(self, content):
1320 self.data.keywords = self._split_keywords(content)
1321
1322 - def project(self, content):
1323 self.data.projects.extend(p for p in content.split() if p)
1324 1327
1328 - def segment(self, content):
1329 self.data.segment = content
1330
1331 - def source(self, content):
1332 self.data.source = content
1333
1334 - def organism(self, content):
1335 self.data.organism = content
1336
1337 - def taxonomy(self, content):
1338 self.data.taxonomy = self._split_taxonomy(content)
1339
1340 - def reference_num(self, content):
1341 """Grab the reference number and signal the start of a new reference. 1342 """ 1343 # check if we have a reference to add 1344 if self._cur_reference is not None: 1345 self.data.references.append(self._cur_reference) 1346 1347 from . import Record 1348 self._cur_reference = Record.Reference() 1349 self._cur_reference.number = content
1350
1351 - def reference_bases(self, content):
1352 self._cur_reference.bases = content
1353
1354 - def authors(self, content):
1355 self._cur_reference.authors = content
1356
1357 - def consrtm(self, content):
1358 self._cur_reference.consrtm = content
1359
1360 - def title(self, content):
1361 if self._cur_reference is None: 1362 import warnings 1363 from Bio import BiopythonParserWarning 1364 warnings.warn("GenBank TITLE line without REFERENCE line.", 1365 BiopythonParserWarning) 1366 return 1367 self._cur_reference.title = content
1368
1369 - def journal(self, content):
1370 self._cur_reference.journal = content
1371
1372 - def medline_id(self, content):
1373 self._cur_reference.medline_id = content
1374
1375 - def pubmed_id(self, content):
1376 self._cur_reference.pubmed_id = content
1377
1378 - def remark(self, content):
1379 self._cur_reference.remark = content
1380
1381 - def comment(self, content):
1382 self.data.comment += "\n".join(content)
1383
1384 - def structured_comment(self, content):
1385 self.data.structured_comment = content
1386
1387 - def primary_ref_line(self, content):
1388 """Data for the PRIMARY line""" 1389 self.data.primary.append(content)
1390
1391 - def primary(self, content):
1392 pass
1393
1394 - def features_line(self, content):
1395 """Get ready for the feature table when we reach the FEATURE line. 1396 """ 1397 self.start_feature_table()
1398
1399 - def start_feature_table(self):
1400 """Signal the start of the feature table. 1401 """ 1402 # we need to add on the last reference 1403 if self._cur_reference is not None: 1404 self.data.references.append(self._cur_reference)
1405
1406 - def feature_key(self, content):
1407 """Grab the key of the feature and signal the start of a new feature. 1408 """ 1409 # first add on feature information if we've got any 1410 self._add_feature() 1411 1412 from . import Record 1413 self._cur_feature = Record.Feature() 1414 self._cur_feature.key = content
1415
1416 - def _add_feature(self):
1417 """Utility function to add a feature to the Record. 1418 1419 This does all of the appropriate checking to make sure we haven't 1420 left any info behind, and that we are only adding info if it 1421 exists. 1422 """ 1423 if self._cur_feature is not None: 1424 # if we have a left over qualifier, add it to the qualifiers 1425 # on the current feature 1426 if self._cur_qualifier is not None: 1427 self._cur_feature.qualifiers.append(self._cur_qualifier) 1428 1429 self._cur_qualifier = None 1430 self.data.features.append(self._cur_feature)
1431
1432 - def location(self, content):
1433 self._cur_feature.location = self._clean_location(content)
1434
1435 - def feature_qualifier(self, key, value):
1436 self.feature_qualifier_name([key]) 1437 if value is not None: 1438 self.feature_qualifier_description(value)
1439
1440 - def feature_qualifier_name(self, content_list):
1441 """Deal with qualifier names 1442 1443 We receive a list of keys, since you can have valueless keys such as 1444 /pseudo which would be passed in with the next key (since no other 1445 tags separate them in the file) 1446 """ 1447 from . import Record 1448 for content in content_list: 1449 # the record parser keeps the /s -- add them if we don't have 'em 1450 if not content.startswith("/"): 1451 content = "/%s" % content 1452 # add on a qualifier if we've got one 1453 if self._cur_qualifier is not None: 1454 self._cur_feature.qualifiers.append(self._cur_qualifier) 1455 1456 self._cur_qualifier = Record.Qualifier() 1457 self._cur_qualifier.key = content
1458
1459 - def feature_qualifier_description(self, content):
1460 # if we have info then the qualifier key should have a ='s 1461 if '=' not in self._cur_qualifier.key: 1462 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1463 cur_content = self._remove_newlines(content) 1464 # remove all spaces from the value if it is a type where spaces 1465 # are not important 1466 for remove_space_key in self.__class__.remove_space_keys: 1467 if remove_space_key in self._cur_qualifier.key: 1468 cur_content = self._remove_spaces(cur_content) 1469 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1470
1471 - def base_count(self, content):
1472 self.data.base_counts = content
1473
1474 - def origin_name(self, content):
1475 self.data.origin = content
1476
1477 - def contig_location(self, content):
1478 """Signal that we have contig information to add to the record. 1479 """ 1480 self.data.contig = self._clean_location(content)
1481
1482 - def sequence(self, content):
1483 """Add sequence information to a list of sequence strings. 1484 1485 This removes spaces in the data and uppercases the sequence, and 1486 then adds it to a list of sequences. Later on we'll join this 1487 list together to make the final sequence. This is faster than 1488 adding on the new string every time. 1489 """ 1490 assert ' ' not in content 1491 self._seq_data.append(content.upper())
1492
1493 - def record_end(self, content):
1494 """Signal the end of the record and do any necessary clean-up. 1495 """ 1496 # add together all of the sequence parts to create the 1497 # final sequence string 1498 self.data.sequence = "".join(self._seq_data) 1499 # add on the last feature 1500 self._add_feature()
1501 1502
def parse(handle):
    """Iterate over GenBank formatted entries as Record objects.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     for record in GenBank.parse(handle):
    ...         print(record.accession)
    ['NC_000932']

    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
    instead.
    """
    record_parser = RecordParser()
    return iter(Iterator(handle, record_parser))
1516 1517
def read(handle):
    """Read a handle containing a single GenBank entry as a Record object.

    >>> from Bio import GenBank
    >>> with open("GenBank/NC_000932.gb") as handle:
    ...     record = GenBank.read(handle)
    ...     print(record.accession)
    ['NC_000932']

    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
    instead.
    """
    iterator = parse(handle)
    try:
        record = next(iterator)
    except StopIteration:
        record = None
    if record is None:
        raise ValueError("No records found in handle")
    try:
        extra = next(iterator)
    except StopIteration:
        extra = None
    if extra is not None:
        raise ValueError("More than one record found in handle")
    return record
1544 1545
def _test():
    """Run the Bio.GenBank module's doctests.

    Looks for the Biopython Tests directory either two levels up (when run
    from inside Bio/GenBank) or in the current directory, changes into it so
    the doctests' relative file paths resolve, runs the doctests, and then
    restores the original working directory.
    """
    import doctest
    import os
    # Previously the two branches below were duplicated verbatim; they are
    # now a single loop over the candidate Tests directories.
    for tests_dir in (os.path.join("..", "..", "Tests"), "Tests"):
        if os.path.isdir(tests_dir):
            print("Running doctests...")
            cur_dir = os.path.abspath(os.curdir)
            try:
                os.chdir(tests_dir)
                doctest.testmod()
            finally:
                # restore the working directory even if a doctest raises
                os.chdir(cur_dir)
            print("Done")
            break


if __name__ == "__main__":
    _test()
1566 1567 if __name__ == "__main__": 1568 _test() 1569