Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  from __future__ import print_function 
 27   
 28  import re 
 29  from Bio._py3k import StringIO 
 30  from Bio._py3k import _bytes_to_string, _as_bytes 
 31   
 32  from Bio import SeqIO 
 33  from Bio import Alphabet 
 34  from Bio.File import _IndexedSeqFileProxy, _open_for_random_access 
 35   
 36   
37 -class SeqFileRandomAccess(_IndexedSeqFileProxy):
38 - def __init__(self, filename, format, alphabet):
39 self._handle = _open_for_random_access(filename) 40 self._alphabet = alphabet 41 self._format = format 42 # Load the parser class/function once an avoid the dict lookup in each 43 # __getitem__ call: 44 i = SeqIO._FormatToIterator[format] 45 # The following alphabet code is a bit nasty... duplicates logic in 46 # Bio.SeqIO.parse() 47 if alphabet is None: 48 def _parse(handle): 49 """Dynamically generated parser function (PRIVATE).""" 50 return next(i(handle))
51 else: 52 # TODO - Detect alphabet support ONCE at __init__ 53 def _parse(handle): 54 """Dynamically generated parser function (PRIVATE).""" 55 try: 56 return next(i(handle, alphabet=alphabet)) 57 except TypeError: 58 return next(SeqIO._force_alphabet(i(handle), alphabet))
59 self._parse = _parse 60
61 - def get(self, offset):
62 """Returns SeqRecord.""" 63 # Should be overridden for binary file formats etc: 64 return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
65 66 67 #################### 68 # Special indexers # 69 #################### 70 # Anything where the records cannot be read simply by parsing from 71 # the record start. For example, anything requiring information from 72 # a file header - e.g. SFF files where we would need to know the 73 # number of flows.
74 -class SffRandomAccess(SeqFileRandomAccess):
75 """Random access to a Standard Flowgram Format (SFF) file."""
76 - def __init__(self, filename, format, alphabet):
77 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 78 header_length, index_offset, index_length, number_of_reads, \ 79 self._flows_per_read, self._flow_chars, self._key_sequence \ 80 = SeqIO.SffIO._sff_file_header(self._handle)
81
82 - def __iter__(self):
83 """Load any index block in the file, or build it the slow way (PRIVATE).""" 84 if self._alphabet is None: 85 self._alphabet = Alphabet.generic_dna 86 handle = self._handle 87 handle.seek(0) 88 # Alread did this in __init__ but need handle in right place 89 header_length, index_offset, index_length, number_of_reads, \ 90 self._flows_per_read, self._flow_chars, self._key_sequence \ 91 = SeqIO.SffIO._sff_file_header(handle) 92 if index_offset and index_length: 93 # There is an index provided, try this the fast way: 94 count = 0 95 max_offset = 0 96 try: 97 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle): 98 max_offset = max(max_offset, offset) 99 yield name, offset, 0 100 count += 1 101 assert count == number_of_reads, \ 102 "Indexed %i records, expected %i" \ 103 % (count, number_of_reads) 104 # If that worked, call _check_eof ... 105 except ValueError as err: 106 import warnings 107 from Bio import BiopythonParserWarning 108 warnings.warn("Could not parse the SFF index: %s" % err, 109 BiopythonParserWarning) 110 assert count == 0, "Partially populated index" 111 handle.seek(0) 112 # Drop out to the slow way... 113 else: 114 # Fast way worked, check EOF 115 if index_offset + index_length <= max_offset: 116 # Can have an index at start (or mid-file) 117 handle.seek(max_offset) 118 # Parse the final read, 119 SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read) 120 # Should now be at the end of the file! 121 SeqIO.SffIO._check_eof(handle, index_offset, index_length) 122 return 123 # We used to give a warning in this case, but Ion Torrent's 124 # SFF files don't have an index so that would be annoying. 125 # Fall back on the slow way! 126 count = 0 127 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle): 128 yield name, offset, 0 129 count += 1 130 assert count == number_of_reads, \ 131 "Indexed %i records, expected %i" % (count, number_of_reads) 132 SeqIO.SffIO._check_eof(handle, index_offset, index_length)
133
134 - def get(self, offset):
135 handle = self._handle 136 handle.seek(offset) 137 return SeqIO.SffIO._sff_read_seq_record(handle, 138 self._flows_per_read, 139 self._flow_chars, 140 self._key_sequence, 141 self._alphabet)
142
143 - def get_raw(self, offset):
144 """Return the raw record from the file as a bytes string.""" 145 handle = self._handle 146 handle.seek(offset) 147 return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
148 149
150 -class SffTrimedRandomAccess(SffRandomAccess):
151 - def get(self, offset):
152 handle = self._handle 153 handle.seek(offset) 154 return SeqIO.SffIO._sff_read_seq_record(handle, 155 self._flows_per_read, 156 self._flow_chars, 157 self._key_sequence, 158 self._alphabet, 159 trim=True)
160 161 162 ################### 163 # Simple indexers # 164 ################### 165
166 -class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
167 - def __init__(self, filename, format, alphabet):
168 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 169 marker = {"ace": "CO ", 170 "embl": "ID ", 171 "fasta": ">", 172 "genbank": "LOCUS ", 173 "gb": "LOCUS ", 174 "imgt": "ID ", 175 "phd": "BEGIN_SEQUENCE", 176 "pir": ">..;", 177 "qual": ">", 178 "swiss": "ID ", 179 "uniprot-xml": "<entry ", 180 }[format] 181 self._marker = marker 182 self._marker_re = re.compile(_as_bytes("^%s" % marker))
183
184 - def __iter__(self):
185 """Returns (id,offset) tuples.""" 186 marker_offset = len(self._marker) 187 marker_re = self._marker_re 188 handle = self._handle 189 handle.seek(0) 190 # Skip and header before first record 191 while True: 192 start_offset = handle.tell() 193 line = handle.readline() 194 if marker_re.match(line) or not line: 195 break 196 # Should now be at the start of a record, or end of the file 197 while marker_re.match(line): 198 # Here we can assume the record.id is the first word after the 199 # marker. This is generally fine... but not for GenBank, EMBL, Swiss 200 id = line[marker_offset:].strip().split(None, 1)[0] 201 length = len(line) 202 while True: 203 end_offset = handle.tell() 204 line = handle.readline() 205 if marker_re.match(line) or not line: 206 yield _bytes_to_string(id), start_offset, length 207 start_offset = end_offset 208 break 209 else: 210 # Track this explicitly as can't do file offset difference on BGZF 211 length += len(line) 212 assert not line, repr(line)
213
214 - def get_raw(self, offset):
215 """Return the raw record from the file as a bytes string.""" 216 # For non-trivial file formats this must be over-ridden in the subclass 217 handle = self._handle 218 marker_re = self._marker_re 219 handle.seek(offset) 220 lines = [handle.readline()] 221 while True: 222 line = handle.readline() 223 if marker_re.match(line) or not line: 224 # End of file, or start of next record => end of this record 225 break 226 lines.append(line) 227 return _as_bytes("").join(lines)
228 229 230 ####################################### 231 # Fiddly indexers: GenBank, EMBL, ... # 232 ####################################### 233
234 -class GenBankRandomAccess(SequentialSeqFileRandomAccess):
235 """Indexed dictionary like access to a GenBank file."""
236 - def __iter__(self):
237 handle = self._handle 238 handle.seek(0) 239 marker_re = self._marker_re 240 dot_char = _as_bytes(".") 241 accession_marker = _as_bytes("ACCESSION ") 242 version_marker = _as_bytes("VERSION ") 243 # Skip and header before first record 244 while True: 245 start_offset = handle.tell() 246 line = handle.readline() 247 if marker_re.match(line) or not line: 248 break 249 # Should now be at the start of a record, or end of the file 250 while marker_re.match(line): 251 # We cannot assume the record.id is the first word after LOCUS, 252 # normally the first entry on the VERSION or ACCESSION line is used. 253 key = None 254 length = len(line) 255 while True: 256 end_offset = handle.tell() 257 line = handle.readline() 258 if marker_re.match(line) or not line: 259 if not key: 260 raise ValueError( 261 "Did not find ACCESSION/VERSION lines") 262 yield _bytes_to_string(key), start_offset, length 263 start_offset = end_offset 264 break 265 elif line.startswith(accession_marker): 266 key = line.rstrip().split()[1] 267 elif line.startswith(version_marker): 268 version_id = line.rstrip().split()[1] 269 if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit(): 270 # This should mimic the GenBank parser... 271 key = version_id 272 length += len(line) 273 assert not line, repr(line)
274 275
276 -class EmblRandomAccess(SequentialSeqFileRandomAccess):
277 """Indexed dictionary like access to an EMBL file."""
278 - def __iter__(self):
279 handle = self._handle 280 handle.seek(0) 281 marker_re = self._marker_re 282 semi_char = _as_bytes(";") 283 dot_char = _as_bytes(".") 284 sv_marker = _as_bytes("SV ") 285 ac_marker = _as_bytes("AC ") 286 # Skip any header before first record 287 while True: 288 start_offset = handle.tell() 289 line = handle.readline() 290 if marker_re.match(line) or not line: 291 break 292 # Should now be at the start of a record, or end of the file 293 while marker_re.match(line): 294 # We cannot assume the record.id is the first word after ID, 295 # normally the SV line is used. 296 setbysv = False # resets sv as false 297 length = len(line) 298 if line[2:].count(semi_char) == 6: 299 # Looks like the semi colon separated style introduced in 2006 300 parts = line[3:].rstrip().split(semi_char) 301 if parts[1].strip().startswith(sv_marker): 302 # The SV bit gives the version 303 key = parts[0].strip() + dot_char + \ 304 parts[1].strip().split()[1] 305 setbysv = True 306 else: 307 key = parts[0].strip() 308 elif line[2:].count(semi_char) == 3: 309 # Looks like the pre 2006 style, take first word only 310 key = line[3:].strip().split(None, 1)[0] 311 if key.endswith(semi_char): 312 key = key[:-1] 313 else: 314 raise ValueError( 315 'Did not recognise the ID line layout:\n' + line) 316 while True: 317 end_offset = handle.tell() 318 line = handle.readline() 319 if marker_re.match(line) or not line: 320 end_offset = handle.tell() - len(line) 321 yield _bytes_to_string(key), start_offset, length 322 start_offset = end_offset 323 break 324 elif line.startswith(ac_marker) and not setbysv: 325 key = line.rstrip().split()[1] 326 if key.endswith(semi_char): 327 key = key[:-1] 328 elif line.startswith(sv_marker): 329 key = line.rstrip().split()[1] 330 setbysv = True 331 length += len(line) 332 assert not line, repr(line)
333 334
335 -class SwissRandomAccess(SequentialSeqFileRandomAccess):
336 """Random access to a SwissProt file."""
337 - def __iter__(self):
338 handle = self._handle 339 handle.seek(0) 340 marker_re = self._marker_re 341 semi_char = _as_bytes(";") 342 # Skip any header before first record 343 while True: 344 start_offset = handle.tell() 345 line = handle.readline() 346 if marker_re.match(line) or not line: 347 break 348 # Should now be at the start of a record, or end of the file 349 while marker_re.match(line): 350 length = len(line) 351 # We cannot assume the record.id is the first word after ID, 352 # normally the following AC line is used. 353 line = handle.readline() 354 length += len(line) 355 assert line.startswith(_as_bytes("AC ")) 356 key = line[3:].strip().split(semi_char)[0].strip() 357 while True: 358 end_offset = handle.tell() 359 line = handle.readline() 360 if marker_re.match(line) or not line: 361 yield _bytes_to_string(key), start_offset, length 362 start_offset = end_offset 363 break 364 length += len(line) 365 assert not line, repr(line)
366 367
368 -class UniprotRandomAccess(SequentialSeqFileRandomAccess):
369 """Random access to a UniProt XML file."""
370 - def __iter__(self):
371 handle = self._handle 372 handle.seek(0) 373 marker_re = self._marker_re 374 start_acc_marker = _as_bytes("<accession>") 375 end_acc_marker = _as_bytes("</accession>") 376 end_entry_marker = _as_bytes("</entry>") 377 less_than = _as_bytes("<") 378 # Skip any header before first record 379 while True: 380 start_offset = handle.tell() 381 line = handle.readline() 382 if marker_re.match(line) or not line: 383 break 384 # Should now be at the start of a record, or end of the file 385 while marker_re.match(line): 386 length = len(line) 387 # We expect the next line to be <accession>xxx</accession> 388 # (possibly with leading spaces) 389 # but allow it to be later on within the <entry> 390 key = None 391 while True: 392 line = handle.readline() 393 if key is None and start_acc_marker in line: 394 assert end_acc_marker in line, line 395 key = line[line.find( 396 start_acc_marker) + 11:].split(less_than, 1)[0] 397 length += len(line) 398 elif end_entry_marker in line: 399 end_offset = handle.tell() - len(line) \ 400 + line.find(end_entry_marker) + 8 401 break 402 elif marker_re.match(line) or not line: 403 # Start of next record or end of file 404 raise ValueError("Didn't find end of record") 405 else: 406 length += len(line) 407 if not key: 408 raise ValueError("Did not find <accession> line in bytes %i to %i" 409 % (start_offset, end_offset)) 410 yield _bytes_to_string(key), start_offset, length 411 # Find start of next record 412 while not marker_re.match(line) and line: 413 start_offset = handle.tell() 414 line = handle.readline() 415 assert not line, repr(line)
416
417 - def get_raw(self, offset):
418 """Return the raw record from the file as a bytes string.""" 419 handle = self._handle 420 marker_re = self._marker_re 421 end_entry_marker = _as_bytes("</entry>") 422 handle.seek(offset) 423 data = [handle.readline()] 424 while True: 425 line = handle.readline() 426 i = line.find(end_entry_marker) 427 if i != -1: 428 data.append(line[:i + 8]) 429 break 430 if marker_re.match(line) or not line: 431 # End of file, or start of next record 432 raise ValueError("Didn't find end of record") 433 data.append(line) 434 return _as_bytes("").join(data)
435
436 - def get(self, offset):
437 # TODO - Can we handle this directly in the parser? 438 # This is a hack - use get_raw for <entry>...</entry> and wrap it with 439 # the apparently required XML header and footer. 440 data = """<?xml version='1.0' encoding='UTF-8'?> 441 <uniprot xmlns="http://uniprot.org/uniprot" 442 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 443 xsi:schemaLocation="http://uniprot.org/uniprot 444 http://www.uniprot.org/support/docs/uniprot.xsd"> 445 %s 446 </uniprot> 447 """ % _bytes_to_string(self.get_raw(offset)) 448 # TODO - For consistency, this function should not accept a string: 449 return next(SeqIO.UniprotIO.UniprotIterator(data))
450 451
452 -class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
453 """Random access to a IntelliGenetics file."""
454 - def __init__(self, filename, format, alphabet):
455 SeqFileRandomAccess.__init__(self, filename, format, alphabet) 456 self._marker_re = re.compile(_as_bytes("^;"))
457
458 - def __iter__(self):
459 handle = self._handle 460 handle.seek(0) 461 marker_re = self._marker_re 462 semi_char = _as_bytes(";") 463 while True: 464 offset = handle.tell() 465 line = handle.readline() 466 length = len(line) 467 if marker_re.match(line): 468 # Now look for the first line which doesn't start ";" 469 while True: 470 line = handle.readline() 471 if line[0:1] != semi_char and line.strip(): 472 key = line.split()[0] 473 yield _bytes_to_string(key), offset, length 474 break 475 if not line: 476 raise ValueError("Premature end of file?") 477 length += len(line) 478 elif not line: 479 # End of file 480 break
481
482 - def get_raw(self, offset):
483 """Return the raw record from the file as a bytes string.""" 484 handle = self._handle 485 handle.seek(offset) 486 marker_re = self._marker_re 487 lines = [] 488 line = handle.readline() 489 semi_char = _as_bytes(";") 490 while line.startswith(semi_char): 491 lines.append(line) 492 line = handle.readline() 493 while line and not line.startswith(semi_char): 494 lines.append(line) 495 line = handle.readline() 496 return _as_bytes("").join(lines)
497 498
499 -class TabRandomAccess(SeqFileRandomAccess):
500 """Random access to a simple tabbed file."""
501 - def __iter__(self):
502 handle = self._handle 503 handle.seek(0) 504 tab_char = _as_bytes("\t") 505 while True: 506 start_offset = handle.tell() 507 line = handle.readline() 508 if not line: 509 break # End of file 510 try: 511 key = line.split(tab_char)[0] 512 except ValueError as err: 513 if not line.strip(): 514 # Ignore blank lines 515 continue 516 else: 517 raise err 518 else: 519 yield _bytes_to_string(key), start_offset, len(line)
520
521 - def get_raw(self, offset):
522 """Return the raw record from the file as a bytes string.""" 523 handle = self._handle 524 handle.seek(offset) 525 return handle.readline()
526 527 528 ########################## 529 # Now the FASTQ indexers # 530 ########################## 531
532 -class FastqRandomAccess(SeqFileRandomAccess):
533 """Random access to a FASTQ file (any supported variant). 534 535 With FASTQ the records all start with a "@" line, but so can quality lines. 536 Note this will cope with line-wrapped FASTQ files. 537 """
538 - def __iter__(self):
539 handle = self._handle 540 handle.seek(0) 541 id = None 542 start_offset = handle.tell() 543 line = handle.readline() 544 if not line: 545 # Empty file! 546 return 547 at_char = _as_bytes("@") 548 plus_char = _as_bytes("+") 549 if line[0:1] != at_char: 550 raise ValueError("Problem with FASTQ @ line:\n%r" % line) 551 while line: 552 # assert line[0]=="@" 553 # This record seems OK (so far) 554 id = line[1:].rstrip().split(None, 1)[0] 555 # Find the seq line(s) 556 seq_len = 0 557 length = len(line) 558 while line: 559 line = handle.readline() 560 length += len(line) 561 if line.startswith(plus_char): 562 break 563 seq_len += len(line.strip()) 564 if not line: 565 raise ValueError("Premature end of file in seq section") 566 # assert line[0]=="+" 567 # Find the qual line(s) 568 qual_len = 0 569 while line: 570 if seq_len == qual_len: 571 if seq_len == 0: 572 # Special case, quality line should be just "\n" 573 line = handle.readline() 574 if line.strip(): 575 raise ValueError("Expected blank quality line, not %r" % line) 576 # Should be end of record... 577 end_offset = handle.tell() 578 line = handle.readline() 579 if line and line[0:1] != at_char: 580 raise ValueError("Problem with line %r" % line) 581 break 582 else: 583 line = handle.readline() 584 qual_len += len(line.strip()) 585 length += len(line) 586 if seq_len != qual_len: 587 raise ValueError("Problem with quality section") 588 yield _bytes_to_string(id), start_offset, length 589 start_offset = end_offset
590 # print("EOF") 591
592 - def get_raw(self, offset):
593 """Return the raw record from the file as a bytes string.""" 594 # TODO - Refactor this and the __init__ method to reduce code duplication? 595 handle = self._handle 596 handle.seek(offset) 597 line = handle.readline() 598 data = line 599 at_char = _as_bytes("@") 600 plus_char = _as_bytes("+") 601 if line[0:1] != at_char: 602 raise ValueError("Problem with FASTQ @ line:\n%r" % line) 603 # Find the seq line(s) 604 seq_len = 0 605 while line: 606 line = handle.readline() 607 data += line 608 if line.startswith(plus_char): 609 break 610 seq_len += len(line.strip()) 611 if not line: 612 raise ValueError("Premature end of file in seq section") 613 assert line[0:1] == plus_char 614 # Find the qual line(s) 615 qual_len = 0 616 while line: 617 if seq_len == qual_len: 618 if seq_len == 0: 619 # Special case, quality line should be just "\n" 620 line = handle.readline() 621 if line.strip(): 622 raise ValueError("Expected blank quality line, not %r" % line) 623 data += line 624 # Should be end of record... 625 line = handle.readline() 626 if line and line[0:1] != at_char: 627 raise ValueError("Problem with line %r" % line) 628 break 629 else: 630 line = handle.readline() 631 data += line 632 qual_len += len(line.strip()) 633 if seq_len != qual_len: 634 raise ValueError("Problem with quality section") 635 return data
636 637 638 ############################################################################### 639 640 _FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess, 641 "embl": EmblRandomAccess, 642 "fasta": SequentialSeqFileRandomAccess, 643 "fastq": FastqRandomAccess, # Class handles all three variants 644 "fastq-sanger": FastqRandomAccess, # alias of the above 645 "fastq-solexa": FastqRandomAccess, 646 "fastq-illumina": FastqRandomAccess, 647 "genbank": GenBankRandomAccess, 648 "gb": GenBankRandomAccess, # alias of the above 649 "ig": IntelliGeneticsRandomAccess, 650 "imgt": EmblRandomAccess, 651 "phd": SequentialSeqFileRandomAccess, 652 "pir": SequentialSeqFileRandomAccess, 653 "sff": SffRandomAccess, 654 "sff-trim": SffTrimedRandomAccess, 655 "swiss": SwissRandomAccess, 656 "tab": TabRandomAccess, 657 "qual": SequentialSeqFileRandomAccess, 658 "uniprot-xml": UniprotRandomAccess, 659 } 660