Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Dictionary like indexing of sequence files (PRIVATE). 
  8   
  9  You are not expected to access this module, or any of its code, directly. This 
 10  is all handled internally by the Bio.SeqIO.index(...) and index_db(...) 
 11  functions which are the public interface for this functionality. 
 12   
 13  The basic idea is that we scan over a sequence file, looking for new record 
 14  markers. We then try to extract the string that Bio.SeqIO.parse/read would 
 15  use as the record id, ideally without actually parsing the full record. We 
 16  then use a subclassed Python dictionary to record the file offset for the 
 17  record start against the record id. 
 18   
 19  Note that this means full parsing is on demand, so any invalid or problem 
 20  record may not trigger an exception until it is accessed. This is by design. 
 21   
 22  This means our dictionary like objects have in memory ALL the keys (all the 
 23  record identifiers), which shouldn't be a problem even with second generation 
 24  sequencing. If memory is an issue, the index_db(...) interface stores the 
 25  keys and offsets in an SQLite database - which can be re-used to avoid 
 26  re-indexing the file for use another time. 
 27  """ 
 28   
 29  from __future__ import print_function 
 30   
 31  import re 
 32  from io import BytesIO 
 33  from Bio._py3k import StringIO 
 34  from Bio._py3k import _bytes_to_string 
 35   
 36  from Bio import SeqIO 
 37  from Bio import Alphabet 
 38  from Bio.File import _IndexedSeqFileProxy, _open_for_random_access 
 39   
 40   
class SeqFileRandomAccess(_IndexedSeqFileProxy):
    """Random access to records in a sequence file (PRIVATE).

    Base proxy class used directly for formats where each record can be
    parsed in isolation from its raw text; subclasses override ``get``
    (and often ``__iter__``/``get_raw``) for anything more involved.
    """

    def __init__(self, filename, format, alphabet):
        """Initialize the class."""
        # Handle opened in binary mode, possibly BGZF-aware:
        self._handle = _open_for_random_access(filename)
        self._alphabet = alphabet
        self._format = format
        # Load the parser class/function once and avoid the dict lookup in
        # each __getitem__ call:
        i = SeqIO._FormatToIterator[format]
        # The following alphabet code is a bit nasty... duplicates logic in
        # Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return next(i(handle))
        else:
            # TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    # Most iterators accept an alphabet keyword...
                    return next(i(handle, alphabet=alphabet))
                except TypeError:
                    # ...otherwise parse plainly and force the alphabet after:
                    return next(SeqIO._force_alphabet(i(handle), alphabet))
        self._parse = _parse

    def get(self, offset):
        """Return SeqRecord."""
        # Should be overridden for binary file formats etc:
        return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
70 71 72 #################### 73 # Special indexers # 74 #################### 75 # Anything where the records cannot be read simply by parsing from 76 # the record start. For example, anything requiring information from 77 # a file header - e.g. SFF files where we would need to know the 78 # number of flows.
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""

    def __init__(self, filename, format, alphabet):
        """Initialize the class."""
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        # Cache the per-file flow information needed to parse each read:
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        # Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # There is an index provided, try this the fast way:
            count = 0
            max_offset = 0
            try:
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    max_offset = max(max_offset, offset)
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                    "Indexed %i records, expected %i" \
                    % (count, number_of_reads)
                # If that worked, call _check_eof ...
            except ValueError as err:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Could not parse the SFF index: %s" % err,
                              BiopythonParserWarning)
                # Any yield would have incremented count, so this confirms
                # we failed before handing out any (possibly wrong) offsets:
                assert count == 0, "Partially populated index"
                handle.seek(0)
                # Drop out to the slow way...
            else:
                # Fast way worked, check EOF
                if index_offset + index_length <= max_offset:
                    # Can have an index at start (or mid-file)
                    handle.seek(max_offset)
                    # Parse the final read,
                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
                    # Should now be at the end of the file!
                    SeqIO.SffIO._check_eof(handle, index_offset, index_length)
                return
            # We used to give a warning in this case, but Ion Torrent's
            # SFF files don't have an index so that would be annoying.
            # Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
            "Indexed %i records, expected %i" % (count, number_of_reads)
        SeqIO.SffIO._check_eof(handle, index_offset, index_length)

    def get(self, offset):
        """Return the SeqRecord starting at the given offset."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
155 156
class SffTrimedRandomAccess(SffRandomAccess):
    """Random access to an SFF file, giving trimmed reads.

    Identical to SffRandomAccess except records are parsed with
    trimming applied (clipping the adapter/key regions).
    """

    def get(self, offset):
        """Return the trimmed SeqRecord starting at the given offset."""
        stream = self._handle
        stream.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(
            stream,
            self._flows_per_read,
            self._flow_chars,
            self._key_sequence,
            self._alphabet,
            trim=True,
        )
167 168 169 ################### 170 # Simple indexers # 171 ################### 172
class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    """Random access to simple sequential record files (PRIVATE).

    Handles formats where every record begins with a fixed byte marker
    and (by default) the record id is the first word after that marker.
    """

    def __init__(self, filename, format, alphabet):
        """Initialize the class."""
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        # Byte string found at the start of each record's first line:
        marker = {"ace": b"CO ",
                  "embl": b"ID ",
                  "fasta": b">",
                  "genbank": b"LOCUS ",
                  "gb": b"LOCUS ",
                  "imgt": b"ID ",
                  "phd": b"BEGIN_SEQUENCE",
                  "pir": b">..;",
                  "qual": b">",
                  "swiss": b"ID ",
                  "uniprot-xml": b"<entry ",
                  }[format]
        self._marker = marker
        self._marker_re = re.compile(b"^" + marker)

    def __iter__(self):
        """Return (id, offset, length) tuples."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # Here we can assume the record.id is the first word after the
            # marker. This is generally fine... but not for GenBank, EMBL, Swiss
            id = line[marker_offset:].strip().split(None, 1)[0]
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    # Next record (or EOF) reached - emit the finished one:
                    yield _bytes_to_string(id), start_offset, length
                    start_offset = end_offset
                    break
                else:
                    # Track this explicitly as can't do file offset difference on BGZF
                    length += len(line)
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # End of file, or start of next record => end of this record
                break
            lines.append(line)
        return b"".join(lines)
236 237 238 ####################################### 239 # Fiddly indexers: GenBank, EMBL, ... # 240 ####################################### 241
class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to a GenBank file."""

    def __iter__(self):
        """Return (id, offset, length) tuples for each GenBank record.

        The key preference order mimics the GenBank parser: VERSION
        (when it looks like accession.version), then ACCESSION, then the
        LOCUS name. Raises ValueError if none of these yield a key.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        accession_marker = b"ACCESSION "
        version_marker = b"VERSION "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            # However if both missing, GenBank parser falls back on LOCUS entry.
            try:
                key = line[5:].split(None, 1)[0]
            except IndexError:
                # Bug fix: bytes.split never raises ValueError here; an
                # empty/whitespace-only LOCUS remainder gives [] and thus
                # IndexError, so that is what must be caught.
                # No content in LOCUS line
                key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError(
                            "Did not find usable ACCESSION/VERSION/LOCUS lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    try:
                        key = line.rstrip().split()[1]
                    except IndexError:
                        # No content in ACCESSION line
                        pass
                elif line.startswith(version_marker):
                    try:
                        version_id = line.rstrip().split()[1]
                        if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
                            # This should mimic the GenBank parser...
                            key = version_id
                    except IndexError:
                        # No content in VERSION line
                        pass
                length += len(line)
        assert not line, repr(line)
296 297
class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to an EMBL file."""

    def __iter__(self):
        """Return (id, offset, length) tuples for each EMBL record.

        The record key is normally accession.version from the SV data
        (either on the ID line or a separate SV line), falling back on
        the AC line's first accession.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        # NOTE(review): semi_char is unused below; b";" literals are used
        # directly instead.
        semi_char = b";"
        sv_marker = b"SV "
        ac_marker = b"AC "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after ID,
            # normally the SV line is used.
            setbysv = False  # resets sv as false
            length = len(line)
            if line[2:].count(b";") in [5, 6]:
                # Looks like the semi colon separated style introduced in 2006
                # Or style from IPD-IMGT/HLA after their v3.16.0 release
                parts = line[3:].rstrip().split(b";")
                if parts[1].strip().startswith(sv_marker):
                    # The SV bit gives the version
                    key = parts[0].strip() + b"." + \
                        parts[1].strip().split()[1]
                    setbysv = True
                else:
                    key = parts[0].strip()
            elif line[2:].count(b";") in [2, 3]:
                # Looks like the pre 2006 style, take first word only
                # Or, with two colons, the KIPO patent variation
                key = line[3:].strip().split(None, 1)[0]
                if key.endswith(b";"):
                    key = key[:-1]
            else:
                raise ValueError(
                    'Did not recognise the ID line layout:\n%r' % line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    # Exclude the peeked-at next marker line from the offset:
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(ac_marker) and not setbysv:
                    # AC line only overrides when no SV-based key was set
                    key = line.rstrip().split()[1]
                    if key.endswith(b";"):
                        key = key[:-1]
                elif line.startswith(sv_marker):
                    key = line.rstrip().split()[1]
                    setbysv = True
                length += len(line)
        assert not line, repr(line)
357 358
class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""

    def __iter__(self):
        """Return (id, offset, length) tuples for each SwissProt record.

        The key is the first accession from the AC line, which is
        assumed to immediately follow the ID line.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We cannot assume the record.id is the first word after ID,
            # normally the following AC line is used.
            line = handle.readline()
            length += len(line)
            assert line.startswith(b"AC ")
            key = line[3:].strip().split(b";")[0].strip()
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                length += len(line)
        assert not line, repr(line)
390 391
class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""

    def __iter__(self):
        """Return (id, offset, length) tuples for each XML <entry>.

        The key is the text of the first <accession> element inside the
        entry. Raises ValueError if an entry lacks an accession or is
        not terminated by </entry>.
        """
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        start_acc_marker = b"<accession>"
        end_acc_marker = b"</accession>"
        end_entry_marker = b"</entry>"
        # Derive these once instead of hard-coding the magic numbers 11
        # and 8 (the marker lengths) throughout the offset arithmetic:
        start_acc_len = len(start_acc_marker)
        end_entry_len = len(end_entry_marker)
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We expect the next line to be <accession>xxx</accession>
            # (possibly with leading spaces)
            # but allow it to be later on within the <entry>
            key = None
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    key = line[line.find(start_acc_marker)
                               + start_acc_len:].split(b"<", 1)[0]
                    length += len(line)
                elif end_entry_marker in line:
                    # Only count up to and including the closing tag, as the
                    # next <entry> could begin on this same line:
                    length += line.find(end_entry_marker) + end_entry_len
                    end_offset = handle.tell() - len(line) \
                        + line.find(end_entry_marker) + end_entry_len
                    assert start_offset + length == end_offset
                    break
                elif marker_re.match(line) or not line:
                    # Start of next record or end of file
                    raise ValueError("Didn't find end of record")
                else:
                    length += len(line)
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i"
                                 % (start_offset, start_offset + length))
            yield _bytes_to_string(key), start_offset, length
            # Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = b"</entry>"
        # As in __iter__, avoid hard-coding len(b"</entry>") == 8:
        end_entry_len = len(end_entry_marker)
        handle.seek(offset)
        data = [handle.readline()]
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                # Truncate after the closing tag; the next entry could
                # start on the same line:
                data.append(line[:i + end_entry_len])
                break
            if marker_re.match(line) or not line:
                # End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data.append(line)
        return b"".join(data)

    def get(self, offset):
        """Return the SeqRecord starting at the given offset."""
        # TODO - Can we handle this directly in the parser?
        # This is a hack - use get_raw for <entry>...</entry> and wrap it with
        # the apparently required XML header and footer.
        data = b"""<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        """ + self.get_raw(offset) + b"</uniprot>"
        return next(SeqIO.UniprotIO.UniprotIterator(BytesIO(data)))
473 474
class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to a IntelliGenetics file."""

    def __init__(self, filename, format, alphabet):
        """Initialize the class."""
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        # Records start with one or more ";" comment lines:
        self._marker_re = re.compile(b"^;")

    def __iter__(self):
        """Return (id, offset, length) tuples for each record.

        The key is the title line that immediately follows a record's
        leading ";" comment block.
        """
        handle = self._handle
        handle.seek(0)
        # Skip any header
        offset = 0
        # str sentinel so len(line) == 0 on the first pass; bytes thereafter
        line = ""
        while True:
            offset += len(line)
            line = handle.readline()
            if not line:
                break  # Premature end of file, or just empty?
            if not line.startswith(b";;"):
                break
        while line:
            length = 0
            assert offset + len(line) == handle.tell()
            if not line.startswith(b";"):
                raise ValueError("Records should start with ';' and not:\n%r" % line)
            # Consume the ";" comment block preceding the title line:
            while line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            key = line.rstrip()
            # Now look for the first line which starts ";"
            while line and not line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            yield _bytes_to_string(key), offset, length
            offset += length
            assert offset + len(line) == handle.tell()

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        # Leading ";" comment block...
        while line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        # ...then title and sequence lines up to the next record:
        while line and not line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        return b"".join(lines)
527 528
class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""

    def __iter__(self):
        """Return (id, offset, length) tuples, one per non-blank line.

        The key is everything before the first tab on the line.
        """
        handle = self._handle
        handle.seek(0)
        tab_char = b"\t"
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break  # End of file
            # Bug fix: the old try/except ValueError around split() was
            # dead code (bytes.split with a non-empty separator never
            # raises ValueError), so blank lines were silently indexed
            # under bogus keys. Test for them explicitly instead:
            if not line.strip():
                # Ignore blank lines
                continue
            key = line.split(tab_char)[0]
            yield _bytes_to_string(key), start_offset, len(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()
557 558 559 ########################## 560 # Now the FASTQ indexers # 561 ########################## 562
class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """

    def __iter__(self):
        """Return (id, offset, length) tuples for each FASTQ record.

        Quality lines may start with "@", so the end of the quality
        section is found by counting characters until it balances the
        sequence length, rather than by looking for the next "@".
        """
        handle = self._handle
        handle.seek(0)
        id = None
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # Empty file!
            return
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        while line:
            # assert line[0]=="@"
            # This record seems OK (so far)
            id = line[1:].rstrip().split(None, 1)[0]
            # Find the seq line(s)
            seq_len = 0
            length = len(line)
            while line:
                line = handle.readline()
                length += len(line)
                if line.startswith(b"+"):
                    break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            # assert line[0]=="+"
            # Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    if seq_len == 0:
                        # Special case, quality line should be just "\n"
                        line = handle.readline()
                        if line.strip():
                            raise ValueError("Expected blank quality line, not %r" % line)
                        length += len(line)  # Need to include the blank line
                    # Should be end of record...
                    end_offset = handle.tell()
                    line = handle.readline()
                    if line and line[0:1] != b"@":
                        raise ValueError("Problem with line %r" % line)
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
                    length += len(line)
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            yield _bytes_to_string(id), start_offset, length
            start_offset = end_offset

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        # Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(b"+"):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == b"+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    data += line
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != b"@":
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data
###############################################################################

# Mapping of SeqIO format name to the proxy class used to index it,
# consulted by Bio.SeqIO.index(...) / index_db(...):
_FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess,
                         "embl": EmblRandomAccess,
                         "fasta": SequentialSeqFileRandomAccess,
                         "fastq": FastqRandomAccess,  # Class handles all three variants
                         "fastq-sanger": FastqRandomAccess,  # alias of the above
                         "fastq-solexa": FastqRandomAccess,
                         "fastq-illumina": FastqRandomAccess,
                         "genbank": GenBankRandomAccess,
                         "gb": GenBankRandomAccess,  # alias of the above
                         "ig": IntelliGeneticsRandomAccess,
                         "imgt": EmblRandomAccess,
                         "phd": SequentialSeqFileRandomAccess,
                         "pir": SequentialSeqFileRandomAccess,
                         "sff": SffRandomAccess,
                         "sff-trim": SffTrimedRandomAccess,
                         "swiss": SwissRandomAccess,
                         "tab": TabRandomAccess,
                         "qual": SequentialSeqFileRandomAccess,
                         "uniprot-xml": UniprotRandomAccess,
                         }