Package Bio :: Package GenBank
[hide private]
[frames] | no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2011 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Using Bio.GenBank directly to parse GenBank files is only useful if you want 
  14  to obtain GenBank-specific Record objects, which is a much closer 
  15  representation to the raw file contents that the SeqRecord alternative from 
  16  the FeatureParser (used in Bio.SeqIO). 
  17   
  18  To use the Bio.GenBank parser, there are two helper functions: 
  19   
  20  read                  Parse a handle containing a single GenBank record 
  21                        as Bio.GenBank specific Record objects. 
  22  parse                 Iterate over a handle containing multiple GenBank 
  23                        records as Bio.GenBank specific Record objects. 
  24   
  25  The following internal classes are not intended for direct use and may 
  26  be deprecated in a future release. 
  27   
  28  Classes: 
  29  Iterator              Iterate through a file of GenBank entries 
  30  ErrorFeatureParser    Catch errors caused during parsing. 
  31  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  32  RecordParser          Parse GenBank data into a Record object. 
  33   
  34  Exceptions: 
  35  ParserFailureError    Exception indicating a failure in the parser (ie. 
  36                        scanner or consumer) 
  37  LocationParserError   Exception indiciating a problem with the spark based 
  38                        location parser. 
  39   
  40  """ 
  41  import re 
  42   
  43  # other Biopython stuff 
  44  from Bio import SeqFeature 
  45   
  46  # other Bio.GenBank stuff 
  47  from utils import FeatureValueCleaner 
  48  from Scanner import GenBankScanner 
  49   
  50  #Constants used to parse GenBank header lines 
  51  GENBANK_INDENT = 12 
  52  GENBANK_SPACER = " " * GENBANK_INDENT 
  53   
  54  #Constants for parsing GenBank feature lines 
  55  FEATURE_KEY_INDENT = 5 
  56  FEATURE_QUALIFIER_INDENT = 21 
  57  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  58  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  59   
  60  #Regular expressions for location parsing 
  61  _solo_location = r"[<>]?\d+" 
  62  _pair_location = r"[<>]?\d+\.\.[<>]?\d+" 
  63  _between_location = r"\d+\^\d+" 
  64   
  65  _within_position = r"\(\d+\.\d+\)" 
  66  _re_within_position = re.compile(_within_position) 
  67  _within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  68                     % (_within_position,_within_position) 
  69  assert _re_within_position.match("(3.9)") 
  70  assert re.compile(_within_location).match("(3.9)..10") 
  71  assert re.compile(_within_location).match("26..(30.33)") 
  72  assert re.compile(_within_location).match("(13.19)..(20.28)") 
  73   
  74  _oneof_position = r"one\-of\(\d+(,\d+)+\)" 
  75  _re_oneof_position = re.compile(_oneof_position) 
  76  _oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" \ 
  77                     % (_oneof_position,_oneof_position) 
  78  assert _re_oneof_position.match("one-of(6,9)") 
  79  assert re.compile(_oneof_location).match("one-of(6,9)..101") 
  80  assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)") 
  81  assert re.compile(_oneof_location).match("6..one-of(101,104)") 
  82   
  83  assert not _re_oneof_position.match("one-of(3)") 
  84  assert _re_oneof_position.match("one-of(3,6)") 
  85  assert _re_oneof_position.match("one-of(3,6,9)") 
  86   
  87   
  88  _simple_location = r"\d+\.\.\d+" 
  89  _re_simple_location = re.compile(r"^%s$" % _simple_location) 
  90  _re_simple_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  91                                   % (_simple_location, _simple_location)) 
  92  _complex_location = r"([a-zA-z][a-zA-Z0-9_]*(\.[a-zA-Z0-9]+)?\:)?(%s|%s|%s|%s|%s)" \ 
  93                      % (_pair_location, _solo_location, _between_location, 
  94                         _within_location, _oneof_location) 
  95  _re_complex_location = re.compile(r"^%s$" % _complex_location) 
  96  _possibly_complemented_complex_location = r"(%s|complement\(%s\))" \ 
  97                                            % (_complex_location, _complex_location) 
  98  _re_complex_compound = re.compile(r"^(join|order|bond)\(%s(,%s)*\)$" 
  99                                   % (_possibly_complemented_complex_location, 
 100                                      _possibly_complemented_complex_location)) 
 101   
 102  assert _re_simple_location.match("104..160") 
 103  assert not _re_simple_location.match("68451760..68452073^68452074") 
 104  assert not _re_simple_location.match("<104..>160") 
 105  assert not _re_simple_location.match("104") 
 106  assert not _re_simple_location.match("<1") 
 107  assert not _re_simple_location.match(">99999") 
 108  assert not _re_simple_location.match("join(104..160,320..390,504..579)") 
 109  assert not _re_simple_compound.match("bond(12,63)") 
 110  assert _re_simple_compound.match("join(104..160,320..390,504..579)") 
 111  assert _re_simple_compound.match("order(1..69,1308..1465)") 
 112  assert not _re_simple_compound.match("order(1..69,1308..1465,1524)") 
 113  assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)") 
 114  assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)") 
 115  assert not _re_simple_compound.match("join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)") 
 116  assert not _re_simple_compound.match("test(1..69,1308..1465)") 
 117  assert not _re_simple_compound.match("complement(1..69)") 
 118  assert not _re_simple_compound.match("(1..69)") 
 119  assert _re_complex_location.match("(3.9)..10") 
 120  assert _re_complex_location.match("26..(30.33)") 
 121  assert _re_complex_location.match("(13.19)..(20.28)") 
 122  assert _re_complex_location.match("41^42")  # between 
 123  assert _re_complex_location.match("AL121804:41^42") 
 124  assert _re_complex_location.match("AL121804:41..610") 
 125  assert _re_complex_location.match("AL121804.2:41..610") 
 126  assert _re_complex_location.match("one-of(3,6)..101") 
 127  assert _re_complex_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 128  assert not _re_simple_compound.match("join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)") 
 129  assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)") 
 130   
 131  #Trans-spliced example from NC_016406, note underscore in reference name: 
 132  assert _re_complex_location.match("NC_016402.1:6618..6676") 
 133  assert _re_complex_location.match("181647..181905") 
 134  assert _re_complex_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 135  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 136  assert not _re_simple_compound.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 137  assert not _re_complex_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 138  assert not _re_simple_location.match("join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)") 
 139   
 140   
141 -def _pos(pos_str, offset=0):
142 """Build a Position object (PRIVATE). 143 144 For an end position, leave offset as zero (default): 145 146 >>> _pos("5") 147 ExactPosition(5) 148 149 For a start position, set offset to minus one (for Python counting): 150 151 >>> _pos("5", -1) 152 ExactPosition(4) 153 154 This also covers fuzzy positions: 155 156 >>> p = _pos("<5") 157 >>> p 158 BeforePosition(5) 159 >>> print p 160 <5 161 >>> int(p) 162 5 163 164 >>> _pos(">5") 165 AfterPosition(5) 166 167 By default assumes an end position, so note the integer behaviour: 168 169 >>> p = _pos("one-of(5,8,11)") 170 >>> p 171 OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)]) 172 >>> print p 173 one-of(5,8,11) 174 >>> int(p) 175 11 176 177 >>> _pos("(8.10)") 178 WithinPosition(10, left=8, right=10) 179 180 Fuzzy start positions: 181 182 >>> p = _pos("<5", -1) 183 >>> p 184 BeforePosition(4) 185 >>> print p 186 <4 187 >>> int(p) 188 4 189 190 Notice how the integer behaviour changes too! 191 192 >>> p = _pos("one-of(5,8,11)", -1) 193 >>> p 194 OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)]) 195 >>> print(p) 196 one-of(4,7,10) 197 >>> int(p) 198 4 199 200 """ 201 if pos_str.startswith("<"): 202 return SeqFeature.BeforePosition(int(pos_str[1:])+offset) 203 elif pos_str.startswith(">"): 204 return SeqFeature.AfterPosition(int(pos_str[1:])+offset) 205 elif _re_within_position.match(pos_str): 206 s,e = pos_str[1:-1].split(".") 207 s = int(s) + offset 208 e = int(e) + offset 209 if offset == -1: 210 default = s 211 else: 212 default = e 213 return SeqFeature.WithinPosition(default, left=s, right=e) 214 elif _re_oneof_position.match(pos_str): 215 assert pos_str.startswith("one-of(") 216 assert pos_str[-1]==")" 217 parts = [SeqFeature.ExactPosition(int(pos)+offset) 218 for pos in pos_str[7:-1].split(",")] 219 if offset == -1: 220 default = min(int(pos) for pos in parts) 221 else: 222 default = max(int(pos) for pos in parts) 223 return SeqFeature.OneOfPosition(default, choices=parts) 224 else: 225 return SeqFeature.ExactPosition(int(pos_str)+offset)
226 227
228 -def _loc(loc_str, expected_seq_length, strand):
229 """FeatureLocation from non-compound non-complement location (PRIVATE). 230 231 Simple examples, 232 233 >>> _loc("123..456", 1000, +1) 234 FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1) 235 >>> _loc("<123..>456", 1000, strand = -1) 236 FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1) 237 238 A more complex location using within positions, 239 240 >>> _loc("(9.10)..(20.25)", 1000, 1) 241 FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1) 242 243 Notice how that will act as though it has overall start 8 and end 25. 244 245 Zero length between feature, 246 247 >>> _loc("123^124", 1000, 0) 248 FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0) 249 250 The expected sequence length is needed for a special case, a between 251 position at the start/end of a circular genome: 252 253 >>> _loc("1000^1", 1000, 1) 254 FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1) 255 256 Apart from this special case, between positions P^Q must have P+1==Q, 257 258 >>> _loc("123^456", 1000, 1) 259 Traceback (most recent call last): 260 ... 261 ValueError: Invalid between location '123^456' 262 """ 263 try: 264 s, e = loc_str.split("..") 265 except ValueError: 266 assert ".." not in loc_str 267 if "^" in loc_str: 268 #A between location like "67^68" (one based counting) is a 269 #special case (note it has zero length). In python slice 270 #notation this is 67:67, a zero length slice. See Bug 2622 271 #Further more, on a circular genome of length N you can have 272 #a location N^1 meaning the junction at the origin. See Bug 3098. 273 #NOTE - We can imagine between locations like "2^4", but this 274 #is just "3". Similarly, "2^5" is just "3..4" 275 s, e = loc_str.split("^") 276 if int(s)+1==int(e): 277 pos = _pos(s) 278 elif int(s)==expected_seq_length and e=="1": 279 pos = _pos(s) 280 else: 281 raise ValueError("Invalid between location %s" % repr(loc_str)) 282 return SeqFeature.FeatureLocation(pos, pos, strand) 283 else: 284 #e.g. "123" 285 s = loc_str 286 e = loc_str 287 return SeqFeature.FeatureLocation(_pos(s,-1), _pos(e), strand)
288 289
290 -def _split_compound_loc(compound_loc):
291 """Split a tricky compound location string (PRIVATE). 292 293 >>> list(_split_compound_loc("123..145")) 294 ['123..145'] 295 >>> list(_split_compound_loc("123..145,200..209")) 296 ['123..145', '200..209'] 297 >>> list(_split_compound_loc("one-of(200,203)..300")) 298 ['one-of(200,203)..300'] 299 >>> list(_split_compound_loc("complement(123..145),200..209")) 300 ['complement(123..145)', '200..209'] 301 >>> list(_split_compound_loc("123..145,one-of(200,203)..209")) 302 ['123..145', 'one-of(200,203)..209'] 303 >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300")) 304 ['123..145', 'one-of(200,203)..one-of(209,211)', '300'] 305 >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300")) 306 ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300'] 307 >>> list(_split_compound_loc("123..145,200..one-of(209,211),300")) 308 ['123..145', '200..one-of(209,211)', '300'] 309 >>> list(_split_compound_loc("123..145,200..one-of(209,211)")) 310 ['123..145', '200..one-of(209,211)'] 311 >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905")) 312 ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905'] 313 """ 314 if "one-of(" in compound_loc: 315 #Hard case 316 while "," in compound_loc: 317 assert compound_loc[0] != "," 318 assert compound_loc[0:2] != ".." 319 i = compound_loc.find(",") 320 part = compound_loc[:i] 321 compound_loc = compound_loc[i:] # includes the comma 322 while part.count("(") > part.count(")"): 323 assert "one-of(" in part, (part, compound_loc) 324 i = compound_loc.find(")") 325 part += compound_loc[:i+1] 326 compound_loc = compound_loc[i+1:] 327 if compound_loc.startswith(".."): 328 i = compound_loc.find(",") 329 if i==-1: 330 part += compound_loc 331 compound_loc = "" 332 else: 333 part += compound_loc[:i] 334 compound_loc = compound_loc[i:] # includes the comma 335 while part.count("(") > part.count(")"): 336 assert part.count("one-of(") == 2 337 i = compound_loc.find(")") 338 part += compound_loc[:i+1] 339 compound_loc = compound_loc[i+1:] 340 if compound_loc.startswith(","): 341 compound_loc = compound_loc[1:] 342 assert part 343 yield part 344 if compound_loc: 345 yield compound_loc 346 else: 347 #Easy case 348 for part in compound_loc.split(","): 349 yield part
350 351
352 -class Iterator(object):
353 """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE). 354 355 This class is likely to be deprecated in a future release of Biopython. 356 Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...) 357 for SeqRecord and GenBank specific Record objects respectively instead. 358 """
359 - def __init__(self, handle, parser = None):
360 """Initialize the iterator. 361 362 Arguments: 363 o handle - A handle with GenBank entries to iterate through. 364 o parser - An optional parser to pass the entries through before 365 returning them. If None, then the raw entry will be returned. 366 """ 367 self.handle = handle 368 self._parser = parser
369
370 - def next(self):
371 """Return the next GenBank record from the handle. 372 373 Will return None if we ran out of records. 374 """ 375 if self._parser is None: 376 lines = [] 377 while True: 378 line = self.handle.readline() 379 if not line: 380 return None # Premature end of file? 381 lines.append(line) 382 if line.rstrip() == "//": 383 break 384 return "".join(lines) 385 try: 386 return self._parser.parse(self.handle) 387 except StopIteration: 388 return None
389
390 - def __iter__(self):
391 return iter(self.next, None)
392 393
394 -class ParserFailureError(Exception):
395 """Failure caused by some kind of problem in the parser. 396 """ 397 pass
398 399
400 -class LocationParserError(Exception):
401 """Could not Properly parse out a location from a GenBank file. 402 """ 403 pass
404 405
406 -class FeatureParser(object):
407 """Parse GenBank files into Seq + Feature objects (OBSOLETE). 408 409 Direct use of this class is discouraged, and may be deprecated in 410 a future release of Biopython. 411 412 Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead. 413 """
414 - def __init__(self, debug_level = 0, use_fuzziness = 1, 415 feature_cleaner = FeatureValueCleaner()):
416 """Initialize a GenBank parser and Feature consumer. 417 418 Arguments: 419 o debug_level - An optional argument that species the amount of 420 debugging information the parser should spit out. By default we have 421 no debugging info (the fastest way to do things), but if you want 422 you can set this as high as two and see exactly where a parse fails. 423 o use_fuzziness - Specify whether or not to use fuzzy representations. 424 The default is 1 (use fuzziness). 425 o feature_cleaner - A class which will be used to clean out the 426 values of features. This class must implement the function 427 clean_value. GenBank.utils has a "standard" cleaner class, which 428 is used by default. 429 """ 430 self._scanner = GenBankScanner(debug_level) 431 self.use_fuzziness = use_fuzziness 432 self._cleaner = feature_cleaner
433
434 - def parse(self, handle):
435 """Parse the specified handle. 436 """ 437 self._consumer = _FeatureConsumer(self.use_fuzziness, 438 self._cleaner) 439 self._scanner.feed(handle, self._consumer) 440 return self._consumer.data
441 442
443 -class RecordParser(object):
444 """Parse GenBank files into Record objects (OBSOLETE). 445 446 Direct use of this class is discouraged, and may be deprecated in 447 a future release of Biopython. 448 449 Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions 450 instead. 451 """
452 - def __init__(self, debug_level = 0):
453 """Initialize the parser. 454 455 Arguments: 456 o debug_level - An optional argument that species the amount of 457 debugging information the parser should spit out. By default we have 458 no debugging info (the fastest way to do things), but if you want 459 you can set this as high as two and see exactly where a parse fails. 460 """ 461 self._scanner = GenBankScanner(debug_level)
462
463 - def parse(self, handle):
464 """Parse the specified handle into a GenBank record. 465 """ 466 self._consumer = _RecordConsumer() 467 468 self._scanner.feed(handle, self._consumer) 469 return self._consumer.data
470 471
472 -class _BaseGenBankConsumer(object):
473 """Abstract GenBank consumer providing useful general functions (PRIVATE). 474 475 This just helps to eliminate some duplication in things that most 476 GenBank consumers want to do. 477 """ 478 # Special keys in GenBank records that we should remove spaces from 479 # For instance, \translation keys have values which are proteins and 480 # should have spaces and newlines removed from them. This class 481 # attribute gives us more control over specific formatting problems. 482 remove_space_keys = ["translation"] 483
484 - def __init__(self):
485 pass
486
487 - def _unhandled(self, data):
488 pass
489
490 - def __getattr__(self, attr):
491 return self._unhandled
492
493 - def _split_keywords(self, keyword_string):
494 """Split a string of keywords into a nice clean list. 495 """ 496 # process the keywords into a python list 497 if keyword_string == "" or keyword_string == ".": 498 keywords = "" 499 elif keyword_string[-1] == '.': 500 keywords = keyword_string[:-1] 501 else: 502 keywords = keyword_string 503 keyword_list = keywords.split(';') 504 clean_keyword_list = [x.strip() for x in keyword_list] 505 return clean_keyword_list
506
507 - def _split_accessions(self, accession_string):
508 """Split a string of accession numbers into a list. 509 """ 510 # first replace all line feeds with spaces 511 # Also, EMBL style accessions are split with ';' 512 accession = accession_string.replace("\n", " ").replace(";"," ") 513 514 return [x.strip() for x in accession.split() if x.strip()]
515
516 - def _split_taxonomy(self, taxonomy_string):
517 """Split a string with taxonomy info into a list. 518 """ 519 if not taxonomy_string or taxonomy_string==".": 520 #Missing data, no taxonomy 521 return [] 522 523 if taxonomy_string[-1] == '.': 524 tax_info = taxonomy_string[:-1] 525 else: 526 tax_info = taxonomy_string 527 tax_list = tax_info.split(';') 528 new_tax_list = [] 529 for tax_item in tax_list: 530 new_items = tax_item.split("\n") 531 new_tax_list.extend(new_items) 532 while '' in new_tax_list: 533 new_tax_list.remove('') 534 clean_tax_list = [x.strip() for x in new_tax_list] 535 536 return clean_tax_list
537
538 - def _clean_location(self, location_string):
539 """Clean whitespace out of a location string. 540 541 The location parser isn't a fan of whitespace, so we clean it out 542 before feeding it into the parser. 543 """ 544 #Originally this imported string.whitespace and did a replace 545 #via a loop. It's simpler to just split on whitespace and rejoin 546 #the string - and this avoids importing string too. See Bug 2684. 547 return ''.join(location_string.split())
548
549 - def _remove_newlines(self, text):
550 """Remove any newlines in the passed text, returning the new string. 551 """ 552 # get rid of newlines in the qualifier value 553 newlines = ["\n", "\r"] 554 for ws in newlines: 555 text = text.replace(ws, "") 556 557 return text
558
559 - def _normalize_spaces(self, text):
560 """Replace multiple spaces in the passed text with single spaces. 561 """ 562 # get rid of excessive spaces 563 text_parts = text.split(" ") 564 text_parts = filter(None, text_parts) 565 return ' '.join(text_parts)
566
567 - def _remove_spaces(self, text):
568 """Remove all spaces from the passed text. 569 """ 570 return text.replace(" ", "")
571
572 - def _convert_to_python_numbers(self, start, end):
573 """Convert a start and end range to python notation. 574 575 In GenBank, starts and ends are defined in "biological" coordinates, 576 where 1 is the first base and [i, j] means to include both i and j. 577 578 In python, 0 is the first base and [i, j] means to include i, but 579 not j. 580 581 So, to convert "biological" to python coordinates, we need to 582 subtract 1 from the start, and leave the end and things should 583 be converted happily. 584 """ 585 new_start = start - 1 586 new_end = end 587 588 return new_start, new_end
589 590
591 -class _FeatureConsumer(_BaseGenBankConsumer):
592 """Create a SeqRecord object with Features to return (PRIVATE). 593 594 Attributes: 595 o use_fuzziness - specify whether or not to parse with fuzziness in 596 feature locations. 597 o feature_cleaner - a class that will be used to provide specialized 598 cleaning-up of feature values. 599 """
600 - def __init__(self, use_fuzziness, feature_cleaner = None):
601 from Bio.SeqRecord import SeqRecord 602 _BaseGenBankConsumer.__init__(self) 603 self.data = SeqRecord(None, id = None) 604 self.data.id = None 605 self.data.description = "" 606 607 self._use_fuzziness = use_fuzziness 608 self._feature_cleaner = feature_cleaner 609 610 self._seq_type = '' 611 self._seq_data = [] 612 self._cur_reference = None 613 self._cur_feature = None 614 self._expected_size = None
615
616 - def locus(self, locus_name):
617 """Set the locus name is set as the name of the Sequence. 618 """ 619 self.data.name = locus_name
620
621 - def size(self, content):
622 """Record the sequence length.""" 623 self._expected_size = int(content)
624
625 - def residue_type(self, type):
626 """Record the sequence type so we can choose an appropriate alphabet. 627 """ 628 self._seq_type = type
629
630 - def data_file_division(self, division):
631 self.data.annotations['data_file_division'] = division
632
633 - def date(self, submit_date):
634 self.data.annotations['date'] = submit_date
635
636 - def definition(self, definition):
637 """Set the definition as the description of the sequence. 638 """ 639 if self.data.description: 640 #Append to any existing description 641 #e.g. EMBL files with two DE lines. 642 self.data.description += " " + definition 643 else: 644 self.data.description = definition
645
646 - def accession(self, acc_num):
647 """Set the accession number as the id of the sequence. 648 649 If we have multiple accession numbers, the first one passed is 650 used. 651 """ 652 new_acc_nums = self._split_accessions(acc_num) 653 654 #Also record them ALL in the annotations 655 try: 656 #On the off chance there was more than one accession line: 657 for acc in new_acc_nums: 658 #Prevent repeat entries 659 if acc not in self.data.annotations['accessions']: 660 self.data.annotations['accessions'].append(acc) 661 except KeyError: 662 self.data.annotations['accessions'] = new_acc_nums 663 664 # if we haven't set the id information yet, add the first acc num 665 if not self.data.id: 666 if len(new_acc_nums) > 0: 667 #self.data.id = new_acc_nums[0] 668 #Use the FIRST accession as the ID, not the first on this line! 669 self.data.id = self.data.annotations['accessions'][0]
670
671 - def wgs(self, content):
672 self.data.annotations['wgs'] = content.split('-')
673
674 - def add_wgs_scafld(self, content):
675 self.data.annotations.setdefault('wgs_scafld',[]).append(content.split('-'))
676
677 - def nid(self, content):
678 self.data.annotations['nid'] = content
679
680 - def pid(self, content):
681 self.data.annotations['pid'] = content
682
683 - def version(self, version_id):
684 #Want to use the versioned accession as the record.id 685 #This comes from the VERSION line in GenBank files, or the 686 #obsolete SV line in EMBL. For the new EMBL files we need 687 #both the version suffix from the ID line and the accession 688 #from the AC line. 689 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 690 self.accession(version_id.split(".")[0]) 691 self.version_suffix(version_id.split(".")[1]) 692 elif version_id: 693 #For backwards compatibility... 694 self.data.id = version_id
695
696 - def project(self, content):
697 """Handle the information from the PROJECT line as a list of projects. 698 699 e.g. 700 PROJECT GenomeProject:28471 701 702 or: 703 PROJECT GenomeProject:13543 GenomeProject:99999 704 705 This is stored as dbxrefs in the SeqRecord to be consistent with the 706 projected switch of this line to DBLINK in future GenBank versions. 707 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 708 "Project:28471" as part of this transition. 709 """ 710 content = content.replace("GenomeProject:", "Project:") 711 self.data.dbxrefs.extend([p for p in content.split() if p])
712 744
745 - def version_suffix(self, version):
746 """Set the version to overwrite the id. 747 748 Since the verison provides the same information as the accession 749 number, plus some extra info, we set this as the id if we have 750 a version. 751 """ 752 #e.g. GenBank line: 753 #VERSION U49845.1 GI:1293613 754 #or the obsolete EMBL line: 755 #SV U49845.1 756 #Scanner calls consumer.version("U49845.1") 757 #which then calls consumer.version_suffix(1) 758 # 759 #e.g. EMBL new line: 760 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 761 #Scanner calls consumer.version_suffix(1) 762 assert version.isdigit() 763 self.data.annotations['sequence_version'] = int(version)
764
765 - def db_source(self, content):
766 self.data.annotations['db_source'] = content.rstrip()
767
768 - def gi(self, content):
769 self.data.annotations['gi'] = content
770
771 - def keywords(self, content):
772 self.data.annotations['keywords'] = self._split_keywords(content)
773
774 - def segment(self, content):
775 self.data.annotations['segment'] = content
776
777 - def source(self, content):
778 #Note that some software (e.g. VectorNTI) may produce an empty 779 #source (rather than using a dot/period as might be expected). 780 if content == "": 781 source_info = "" 782 elif content[-1] == '.': 783 source_info = content[:-1] 784 else: 785 source_info = content 786 self.data.annotations['source'] = source_info
787
788 - def organism(self, content):
789 self.data.annotations['organism'] = content
790
791 - def taxonomy(self, content):
792 """Records (another line of) the taxonomy lineage. 793 """ 794 lineage = self._split_taxonomy(content) 795 try: 796 self.data.annotations['taxonomy'].extend(lineage) 797 except KeyError: 798 self.data.annotations['taxonomy'] = lineage
799
800 - def reference_num(self, content):
801 """Signal the beginning of a new reference object. 802 """ 803 # if we have a current reference that hasn't been added to 804 # the list of references, add it. 805 if self._cur_reference is not None: 806 self.data.annotations['references'].append(self._cur_reference) 807 else: 808 self.data.annotations['references'] = [] 809 810 self._cur_reference = SeqFeature.Reference()
811
812 - def reference_bases(self, content):
813 """Attempt to determine the sequence region the reference entails. 814 815 Possible types of information we may have to deal with: 816 817 (bases 1 to 86436) 818 (sites) 819 (bases 1 to 105654; 110423 to 111122) 820 1 (residues 1 to 182) 821 """ 822 # first remove the parentheses or other junk 823 ref_base_info = content[1:-1] 824 825 all_locations = [] 826 # parse if we've got 'bases' and 'to' 827 if 'bases' in ref_base_info and 'to' in ref_base_info: 828 # get rid of the beginning 'bases' 829 ref_base_info = ref_base_info[5:] 830 locations = self._split_reference_locations(ref_base_info) 831 all_locations.extend(locations) 832 elif 'residues' in ref_base_info and 'to' in ref_base_info: 833 residues_start = ref_base_info.find("residues") 834 # get only the information after "residues" 835 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 836 locations = self._split_reference_locations(ref_base_info) 837 all_locations.extend(locations) 838 839 # make sure if we are not finding information then we have 840 # the string 'sites' or the string 'bases' 841 elif (ref_base_info == 'sites' or 842 ref_base_info.strip() == 'bases'): 843 pass 844 # otherwise raise an error 845 else: 846 raise ValueError("Could not parse base info %s in record %s" % 847 (ref_base_info, self.data.id)) 848 849 self._cur_reference.location = all_locations
850
851 - def _split_reference_locations(self, location_string):
852 """Get reference locations out of a string of reference information 853 854 The passed string should be of the form: 855 856 1 to 20; 20 to 100 857 858 This splits the information out and returns a list of location objects 859 based on the reference locations. 860 """ 861 # split possibly multiple locations using the ';' 862 all_base_info = location_string.split(';') 863 864 new_locations = [] 865 for base_info in all_base_info: 866 start, end = base_info.split('to') 867 new_start, new_end = \ 868 self._convert_to_python_numbers(int(start.strip()), 869 int(end.strip())) 870 this_location = SeqFeature.FeatureLocation(new_start, new_end) 871 new_locations.append(this_location) 872 return new_locations
873
874 - def authors(self, content):
875 if self._cur_reference.authors: 876 self._cur_reference.authors += ' ' + content 877 else: 878 self._cur_reference.authors = content
879
880 - def consrtm(self, content):
881 if self._cur_reference.consrtm: 882 self._cur_reference.consrtm += ' ' + content 883 else: 884 self._cur_reference.consrtm = content
885
886 - def title(self, content):
887 if self._cur_reference is None: 888 import warnings 889 from Bio import BiopythonParserWarning 890 warnings.warn("GenBank TITLE line without REFERENCE line.", 891 BiopythonParserWarning) 892 elif self._cur_reference.title: 893 self._cur_reference.title += ' ' + content 894 else: 895 self._cur_reference.title = content
896
897 - def journal(self, content):
898 if self._cur_reference.journal: 899 self._cur_reference.journal += ' ' + content 900 else: 901 self._cur_reference.journal = content
902
903 - def medline_id(self, content):
904 self._cur_reference.medline_id = content
905
906 - def pubmed_id(self, content):
907 self._cur_reference.pubmed_id = content
908
909 - def remark(self, content):
910 """Deal with a reference comment.""" 911 if self._cur_reference.comment: 912 self._cur_reference.comment += ' ' + content 913 else: 914 self._cur_reference.comment = content
915
916 - def comment(self, content):
917 try: 918 self.data.annotations['comment'] += "\n" + "\n".join(content) 919 except KeyError: 920 self.data.annotations['comment'] = "\n".join(content)
921
922 - def features_line(self, content):
923 """Get ready for the feature table when we reach the FEATURE line. 924 """ 925 self.start_feature_table()
926
927 - def start_feature_table(self):
928 """Indicate we've got to the start of the feature table. 929 """ 930 # make sure we've added on our last reference object 931 if self._cur_reference is not None: 932 self.data.annotations['references'].append(self._cur_reference) 933 self._cur_reference = None
934
935 - def feature_key(self, content):
936 # start a new feature 937 self._cur_feature = SeqFeature.SeqFeature() 938 self._cur_feature.type = content 939 self.data.features.append(self._cur_feature)
940
941 - def location(self, content):
942 """Parse out location information from the location string. 943 944 This uses simple Python code with some regular expressions to do the 945 parsing, and then translates the results into appropriate objects. 946 """ 947 # clean up newlines and other whitespace inside the location before 948 # parsing - locations should have no whitespace whatsoever 949 location_line = self._clean_location(content) 950 951 # Older records have junk like replace(266,"c") in the 952 # location line. Newer records just replace this with 953 # the number 266 and have the information in a more reasonable 954 # place. So we'll just grab out the number and feed this to the 955 # parser. We shouldn't really be losing any info this way. 956 if 'replace' in location_line: 957 comma_pos = location_line.find(',') 958 location_line = location_line[8:comma_pos] 959 960 cur_feature = self._cur_feature 961 962 #Handle top level complement here for speed 963 if location_line.startswith("complement("): 964 assert location_line.endswith(")") 965 location_line = location_line[11:-1] 966 strand = -1 967 elif 'DNA' in self._seq_type.upper() or 'RNA' in self._seq_type.upper(): 968 #Nucleotide 969 strand = 1 970 else: 971 #Protein 972 strand = None 973 974 #Special case handling of the most common cases for speed 975 if _re_simple_location.match(location_line): 976 #e.g. "123..456" 977 s, e = location_line.split("..") 978 cur_feature.location = SeqFeature.FeatureLocation(int(s)-1, 979 int(e), 980 strand) 981 return 982 983 if _re_simple_compound.match(location_line): 984 #e.g. join(<123..456,480..>500) 985 i = location_line.find("(") 986 cur_feature.location_operator = location_line[:i] 987 #we can split on the comma because these are simple locations 988 for part in location_line[i+1:-1].split(","): 989 s, e = part.split("..") 990 f = SeqFeature.SeqFeature(SeqFeature.FeatureLocation(int(s)-1, 991 int(e), 992 strand), 993 location_operator=cur_feature.location_operator, 994 type=cur_feature.type) 995 cur_feature.sub_features.append(f) 996 s = cur_feature.sub_features[0].location.start 997 e = cur_feature.sub_features[-1].location.end 998 cur_feature.location = SeqFeature.FeatureLocation(s,e, strand) 999 return 1000 1001 #Handle the general case with more complex regular expressions 1002 if _re_complex_location.match(location_line): 1003 #e.g. "AL121804.2:41..610" 1004 if ":" in location_line: 1005 location_ref, location_line = location_line.split(":") 1006 cur_feature.location = _loc(location_line, self._expected_size, strand) 1007 cur_feature.location.ref = location_ref 1008 else: 1009 cur_feature.location = _loc(location_line, self._expected_size, strand) 1010 return 1011 1012 if _re_complex_compound.match(location_line): 1013 i = location_line.find("(") 1014 cur_feature.location_operator = location_line[:i] 1015 #Can't split on the comma because of positions like one-of(1,2,3) 1016 for part in _split_compound_loc(location_line[i+1:-1]): 1017 if part.startswith("complement("): 1018 assert part[-1]==")" 1019 part = part[11:-1] 1020 assert strand != -1, "Double complement?" 1021 part_strand = -1 1022 else: 1023 part_strand = strand 1024 if ":" in part: 1025 ref, part = part.split(":") 1026 else: 1027 ref = None 1028 try: 1029 loc = _loc(part, self._expected_size, part_strand) 1030 except ValueError, err: 1031 print location_line 1032 print part 1033 raise err 1034 f = SeqFeature.SeqFeature(location=loc, ref=ref, 1035 location_operator=cur_feature.location_operator, 1036 type=cur_feature.type) 1037 cur_feature.sub_features.append(f) 1038 # Historically a join on the reverse strand has been represented 1039 # in Biopython with both the parent SeqFeature and its children 1040 # (the exons for a CDS) all given a strand of -1. Likewise, for 1041 # a join feature on the forward strand they all have strand +1. 1042 # However, we must also consider evil mixed strand examples like 1043 # this, join(complement(69611..69724),139856..140087,140625..140650) 1044 strands = set(sf.strand for sf in cur_feature.sub_features) 1045 if len(strands)==1: 1046 strand = cur_feature.sub_features[0].strand 1047 else: 1048 strand = None # i.e. mixed strands 1049 s = cur_feature.sub_features[0].location.start 1050 e = cur_feature.sub_features[-1].location.end 1051 cur_feature.location = SeqFeature.FeatureLocation(s, e, strand) 1052 return 1053 #Not recognised 1054 if "order" in location_line and "join" in location_line: 1055 #See Bug 3197 1056 msg = 'Combinations of "join" and "order" within the same ' + \ 1057 'location (nested operators) are illegal:\n' + location_line 1058 raise LocationParserError(msg) 1059 #This used to be an error.... 1060 cur_feature.location = None 1061 import warnings 1062 from Bio import BiopythonParserWarning 1063 warnings.warn(BiopythonParserWarning("Couldn't parse feature location: %r" 1064 % (location_line)))
1065
1066 - def feature_qualifier(self, key, value):
1067 """When we get a qualifier key and its value. 1068 1069 Can receive None, since you can have valueless keys such as /pseudo 1070 """ 1071 # Hack to try to preserve historical behaviour of /pseudo etc 1072 if value is None: 1073 if key not in self._cur_feature.qualifiers: 1074 self._cur_feature.qualifiers[key] = [""] 1075 return 1076 1077 value = value.replace('"', '') 1078 if self._feature_cleaner is not None: 1079 value = self._feature_cleaner.clean_value(key, value) 1080 1081 # if the qualifier name exists, append the value 1082 if key in self._cur_feature.qualifiers: 1083 self._cur_feature.qualifiers[key].append(value) 1084 # otherwise start a new list of the key with its values 1085 else: 1086 self._cur_feature.qualifiers[key] = [value]
1087
1088 - def feature_qualifier_name(self, content_list):
1089 """Use feature_qualifier instead (OBSOLETE).""" 1090 raise NotImplementedError("Use the feature_qualifier method instead.")
1091
1092 - def feature_qualifier_description(self, content):
1093 """Use feature_qualifier instead (OBSOLETE).""" 1094 raise NotImplementedError("Use the feature_qualifier method instead.")
1095
1096 - def contig_location(self, content):
1097 """Deal with CONTIG information.""" 1098 #Historically this was stored as a SeqFeature object, but it was 1099 #stored under record.annotations["contig"] and not under 1100 #record.features with the other SeqFeature objects. 1101 # 1102 #The CONTIG location line can include additional tokens like 1103 #Gap(), Gap(100) or Gap(unk100) which are not used in the feature 1104 #location lines, so storing it using SeqFeature based location 1105 #objects is difficult. 1106 # 1107 #We now store this a string, which means for BioSQL we are now in 1108 #much better agreement with how BioPerl records the CONTIG line 1109 #in the database. 1110 # 1111 #NOTE - This code assumes the scanner will return all the CONTIG 1112 #lines already combined into one long string! 1113 self.data.annotations["contig"] = content
1114
1115 - def origin_name(self, content):
1116 pass
1117
1118 - def base_count(self, content):
1119 pass
1120
1121 - def base_number(self, content):
1122 pass
1123
1124 - def sequence(self, content):
1125 """Add up sequence information as we get it. 1126 1127 To try and make things speedier, this puts all of the strings 1128 into a list of strings, and then uses string.join later to put 1129 them together. Supposedly, this is a big time savings 1130 """ 1131 assert ' ' not in content 1132 self._seq_data.append(content.upper())
1133
1134 - def record_end(self, content):
1135 """Clean up when we've finished the record. 1136 """ 1137 from Bio import Alphabet 1138 from Bio.Alphabet import IUPAC 1139 from Bio.Seq import Seq, UnknownSeq 1140 1141 #Try and append the version number to the accession for the full id 1142 if not self.data.id: 1143 assert 'accessions' not in self.data.annotations, \ 1144 self.data.annotations['accessions'] 1145 self.data.id = self.data.name # Good fall back? 1146 elif self.data.id.count('.') == 0: 1147 try: 1148 self.data.id+='.%i' % self.data.annotations['sequence_version'] 1149 except KeyError: 1150 pass 1151 1152 # add the sequence information 1153 # first, determine the alphabet 1154 # we default to an generic alphabet if we don't have a 1155 # seq type or have strange sequence information. 1156 seq_alphabet = Alphabet.generic_alphabet 1157 1158 # now set the sequence 1159 sequence = "".join(self._seq_data) 1160 1161 if self._expected_size is not None \ 1162 and len(sequence) != 0 \ 1163 and self._expected_size != len(sequence): 1164 import warnings 1165 from Bio import BiopythonParserWarning 1166 warnings.warn("Expected sequence length %i, found %i (%s)." 1167 % (self._expected_size, len(sequence), self.data.id), 1168 BiopythonParserWarning) 1169 1170 if self._seq_type: 1171 # mRNA is really also DNA, since it is actually cDNA 1172 if 'DNA' in self._seq_type.upper() or 'MRNA' in self._seq_type.upper(): 1173 seq_alphabet = IUPAC.ambiguous_dna 1174 # are there ever really RNA sequences in GenBank? 1175 elif 'RNA' in self._seq_type.upper(): 1176 #Even for data which was from RNA, the sequence string 1177 #is usually given as DNA (T not U). Bug 2408 1178 if "T" in sequence and "U" not in sequence: 1179 seq_alphabet = IUPAC.ambiguous_dna 1180 else: 1181 seq_alphabet = IUPAC.ambiguous_rna 1182 elif 'PROTEIN' in self._seq_type.upper(): 1183 seq_alphabet = IUPAC.protein # or extended protein? 1184 # work around ugly GenBank records which have circular or 1185 # linear but no indication of sequence type 1186 elif self._seq_type in ["circular", "linear", "unspecified"]: 1187 pass 1188 # we have a bug if we get here 1189 else: 1190 raise ValueError("Could not determine alphabet for seq_type %s" 1191 % self._seq_type) 1192 1193 if not sequence and self.__expected_size: 1194 self.data.seq = UnknownSeq(self._expected_size, seq_alphabet) 1195 else: 1196 self.data.seq = Seq(sequence, seq_alphabet)
1197 1198
1199 -class _RecordConsumer(_BaseGenBankConsumer):
1200 """Create a GenBank Record object from scanner generated information (PRIVATE). 1201 """
1202 - def __init__(self):
1203 _BaseGenBankConsumer.__init__(self) 1204 import Record 1205 self.data = Record.Record() 1206 1207 self._seq_data = [] 1208 self._cur_reference = None 1209 self._cur_feature = None 1210 self._cur_qualifier = None
1211
1212 - def wgs(self, content):
1213 self.data.wgs = content.split('-')
1214
1215 - def add_wgs_scafld(self, content):
1216 self.data.wgs_scafld.append(content.split('-'))
1217
1218 - def locus(self, content):
1219 self.data.locus = content
1220
1221 - def size(self, content):
1222 self.data.size = content
1223
1224 - def residue_type(self, content):
1225 # Be lenient about parsing, but technically lowercase residue types are malformed. 1226 if 'dna' in content or 'rna' in content: 1227 import warnings 1228 from Bio import BiopythonParserWarning 1229 warnings.warn("Invalid seq_type (%s): DNA/RNA should be uppercase." % content, 1230 BiopythonParserWarning) 1231 self.data.residue_type = content
1232
1233 - def data_file_division(self, content):
1234 self.data.data_file_division = content
1235
1236 - def date(self, content):
1237 self.data.date = content
1238
1239 - def definition(self, content):
1240 self.data.definition = content
1241
1242 - def accession(self, content):
1243 for acc in self._split_accessions(content): 1244 if acc not in self.data.accession: 1245 self.data.accession.append(acc)
1246
1247 - def nid(self, content):
1248 self.data.nid = content
1249
1250 - def pid(self, content):
1251 self.data.pid = content
1252
1253 - def version(self, content):
1254 self.data.version = content
1255
1256 - def db_source(self, content):
1257 self.data.db_source = content.rstrip()
1258
1259 - def gi(self, content):
1260 self.data.gi = content
1261
1262 - def keywords(self, content):
1263 self.data.keywords = self._split_keywords(content)
1264
1265 - def project(self, content):
1266 self.data.projects.extend([p for p in content.split() if p])
1267 1270
1271 - def segment(self, content):
1272 self.data.segment = content
1273
1274 - def source(self, content):
1275 self.data.source = content
1276
1277 - def organism(self, content):
1278 self.data.organism = content
1279
1280 - def taxonomy(self, content):
1281 self.data.taxonomy = self._split_taxonomy(content)
1282
1283 - def reference_num(self, content):
1284 """Grab the reference number and signal the start of a new reference. 1285 """ 1286 # check if we have a reference to add 1287 if self._cur_reference is not None: 1288 self.data.references.append(self._cur_reference) 1289 1290 import Record 1291 self._cur_reference = Record.Reference() 1292 self._cur_reference.number = content
1293
1294 - def reference_bases(self, content):
1295 self._cur_reference.bases = content
1296
1297 - def authors(self, content):
1298 self._cur_reference.authors = content
1299
1300 - def consrtm(self, content):
1301 self._cur_reference.consrtm = content
1302
1303 - def title(self, content):
1304 if self._cur_reference is None: 1305 import warnings 1306 from Bio import BiopythonParserWarning 1307 warnings.warn("GenBank TITLE line without REFERENCE line.", 1308 BiopythonParserWarning) 1309 return 1310 self._cur_reference.title = content
1311
1312 - def journal(self, content):
1313 self._cur_reference.journal = content
1314
1315 - def medline_id(self, content):
1316 self._cur_reference.medline_id = content
1317
1318 - def pubmed_id(self, content):
1319 self._cur_reference.pubmed_id = content
1320
1321 - def remark(self, content):
1322 self._cur_reference.remark = content
1323
1324 - def comment(self, content):
1325 self.data.comment += "\n".join(content)
1326
1327 - def primary_ref_line(self,content):
1328 """Data for the PRIMARY line""" 1329 self.data.primary.append(content)
1330
1331 - def primary(self,content):
1332 pass
1333
1334 - def features_line(self, content):
1335 """Get ready for the feature table when we reach the FEATURE line. 1336 """ 1337 self.start_feature_table()
1338
1339 - def start_feature_table(self):
1340 """Signal the start of the feature table. 1341 """ 1342 # we need to add on the last reference 1343 if self._cur_reference is not None: 1344 self.data.references.append(self._cur_reference)
1345
1346 - def feature_key(self, content):
1347 """Grab the key of the feature and signal the start of a new feature. 1348 """ 1349 # first add on feature information if we've got any 1350 self._add_feature() 1351 1352 import Record 1353 self._cur_feature = Record.Feature() 1354 self._cur_feature.key = content
1355
1356 - def _add_feature(self):
1357 """Utility function to add a feature to the Record. 1358 1359 This does all of the appropriate checking to make sure we haven't 1360 left any info behind, and that we are only adding info if it 1361 exists. 1362 """ 1363 if self._cur_feature is not None: 1364 # if we have a left over qualifier, add it to the qualifiers 1365 # on the current feature 1366 if self._cur_qualifier is not None: 1367 self._cur_feature.qualifiers.append(self._cur_qualifier) 1368 1369 self._cur_qualifier = None 1370 self.data.features.append(self._cur_feature)
1371
1372 - def location(self, content):
1373 self._cur_feature.location = self._clean_location(content)
1374
1375 - def feature_qualifier(self, key, value):
1376 self.feature_qualifier_name([key]) 1377 if value is not None: 1378 self.feature_qualifier_description(value)
1379
1380 - def feature_qualifier_name(self, content_list):
1381 """Deal with qualifier names 1382 1383 We receive a list of keys, since you can have valueless keys such as 1384 /pseudo which would be passed in with the next key (since no other 1385 tags separate them in the file) 1386 """ 1387 import Record 1388 for content in content_list: 1389 # the record parser keeps the /s -- add them if we don't have 'em 1390 if not content.startswith("/"): 1391 content = "/%s" % content 1392 # add on a qualifier if we've got one 1393 if self._cur_qualifier is not None: 1394 self._cur_feature.qualifiers.append(self._cur_qualifier) 1395 1396 self._cur_qualifier = Record.Qualifier() 1397 self._cur_qualifier.key = content
1398
1399 - def feature_qualifier_description(self, content):
1400 # if we have info then the qualifier key should have a ='s 1401 if '=' not in self._cur_qualifier.key: 1402 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1403 cur_content = self._remove_newlines(content) 1404 # remove all spaces from the value if it is a type where spaces 1405 # are not important 1406 for remove_space_key in self.__class__.remove_space_keys: 1407 if remove_space_key in self._cur_qualifier.key: 1408 cur_content = self._remove_spaces(cur_content) 1409 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1410
1411 - def base_count(self, content):
1412 self.data.base_counts = content
1413
1414 - def origin_name(self, content):
1415 self.data.origin = content
1416
1417 - def contig_location(self, content):
1418 """Signal that we have contig information to add to the record. 1419 """ 1420 self.data.contig = self._clean_location(content)
1421
1422 - def sequence(self, content):
1423 """Add sequence information to a list of sequence strings. 1424 1425 This removes spaces in the data and uppercases the sequence, and 1426 then adds it to a list of sequences. Later on we'll join this 1427 list together to make the final sequence. This is faster than 1428 adding on the new string every time. 1429 """ 1430 assert ' ' not in content 1431 self._seq_data.append(content.upper())
1432
1433 - def record_end(self, content):
1434 """Signal the end of the record and do any necessary clean-up. 1435 """ 1436 # add together all of the sequence parts to create the 1437 # final sequence string 1438 self.data.sequence = "".join(self._seq_data) 1439 # add on the last feature 1440 self._add_feature()
1441 1442
1443 -def parse(handle):
1444 """Iterate over GenBank formatted entries as Record objects. 1445 1446 >>> from Bio import GenBank 1447 >>> handle = open("GenBank/NC_000932.gb") 1448 >>> for record in GenBank.parse(handle): 1449 ... print record.accession 1450 ['NC_000932'] 1451 >>> handle.close() 1452 1453 To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb") 1454 instead. 1455 """ 1456 return iter(Iterator(handle, RecordParser()))
1457 1458
1459 -def read(handle):
1460 """Read a handle containing a single GenBank entry as a Record object. 1461 1462 >>> from Bio import GenBank 1463 >>> handle = open("GenBank/NC_000932.gb") 1464 >>> record = GenBank.read(handle) 1465 >>> print record.accession 1466 ['NC_000932'] 1467 >>> handle.close() 1468 1469 To get a SeqRecord object use Bio.SeqIO.read(..., format="gb") 1470 instead. 1471 """ 1472 iterator = parse(handle) 1473 try: 1474 first = iterator.next() 1475 except StopIteration: 1476 first = None 1477 if first is None: 1478 raise ValueError("No records found in handle") 1479 try: 1480 second = iterator.next() 1481 except StopIteration: 1482 second = None 1483 if second is not None: 1484 raise ValueError("More than one record found in handle") 1485 return first
1486 1487
1488 -def _test():
1489 """Run the Bio.GenBank module's doctests.""" 1490 import doctest 1491 import os 1492 if os.path.isdir(os.path.join("..","..","Tests")): 1493 print "Running doctests..." 1494 cur_dir = os.path.abspath(os.curdir) 1495 os.chdir(os.path.join("..","..","Tests")) 1496 doctest.testmod() 1497 os.chdir(cur_dir) 1498 del cur_dir 1499 print "Done" 1500 elif os.path.isdir(os.path.join("Tests")): 1501 print "Running doctests..." 1502 cur_dir = os.path.abspath(os.curdir) 1503 os.chdir(os.path.join("Tests")) 1504 doctest.testmod() 1505 os.chdir(cur_dir) 1506 del cur_dir 1507 print "Done"
1508 1509 if __name__ == "__main__": 1510 _test() 1511