
Source Code for Module Bio.SeqIO._index

# Copyright 2009-2011 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Dictionary-like indexing of sequence files (PRIVATE).

You are not expected to access this module, or any of its code, directly. This
is all handled internally by the Bio.SeqIO.index(...) function, which is the
public interface for this functionality.

The basic idea is that we scan over a sequence file, looking for new record
markers. We then try to extract the string that Bio.SeqIO.parse/read would
use as the record id, ideally without actually parsing the full record. We
then use a subclassed Python dictionary to record the file offset for the
record start against the record id.

Note that this means full parsing is on demand, so any invalid or problem
record may not trigger an exception until it is accessed. This is by design.

This means our dictionary-like objects hold in memory ALL the keys (all the
record identifiers), which shouldn't be a problem even with second generation
sequencing. If this becomes an issue later on, storing the keys and offsets in
a temporary lookup file might be one idea (e.g. using SQLite or an OBDA style
index).
"""
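
# Illustrative use of the public entry point mentioned above (not part of the
# original module; the filename and record id are hypothetical):
#
#     >>> from Bio import SeqIO
#     >>> records = SeqIO.index("example.fasta", "fasta")
#     >>> record = records["some_id"]       # parsed on demand from its offset
#     >>> raw = records.get_raw("some_id")  # raw bytes of just that record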

from __future__ import print_function

import re
from io import BytesIO
from Bio._py3k import StringIO
from Bio._py3k import _bytes_to_string

from Bio import SeqIO
from Bio import Alphabet
from Bio.File import _IndexedSeqFileProxy, _open_for_random_access


class SeqFileRandomAccess(_IndexedSeqFileProxy):
    def __init__(self, filename, format, alphabet):
        self._handle = _open_for_random_access(filename)
        self._alphabet = alphabet
        self._format = format
        # Load the parser class/function once to avoid the dict lookup in each
        # __getitem__ call:
        i = SeqIO._FormatToIterator[format]
        # The following alphabet code is a bit nasty... duplicates logic in
        # Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return next(i(handle))
        else:
            # TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return next(i(handle, alphabet=alphabet))
                except TypeError:
                    return next(SeqIO._force_alphabet(i(handle), alphabet))
        self._parse = _parse

    def get(self, offset):
        """Returns SeqRecord."""
        # Should be overridden for binary file formats etc:
        return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))
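
# Every proxy follows the same contract from _IndexedSeqFileProxy: __iter__
# yields (record id, file offset, record length in bytes) tuples, get_raw(offset)
# returns the record's raw bytes, and get(offset) returns a parsed SeqRecord.
# A minimal sketch of a hypothetical proxy for a made-up one-record-per-line
# format (illustration only, not part of the original module):
#
#     class OneLineRandomAccess(SeqFileRandomAccess):
#         def __iter__(self):
#             handle = self._handle
#             handle.seek(0)
#             while True:
#                 start = handle.tell()
#                 line = handle.readline()
#                 if not line:
#                     break  # end of file
#                 # Key is the first word; each line is one complete record
#                 yield _bytes_to_string(line.split(None, 1)[0]), start, len(line)
#
#         def get_raw(self, offset):
#             self._handle.seek(offset)
#             return self._handle.readline()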


####################
# Special indexers #
####################
# Anything where the records cannot be read simply by parsing from
# the record start. For example, anything requiring information from
# a file header - e.g. SFF files where we would need to know the
# number of flows.
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        # Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # There is an index provided, try this the fast way:
            count = 0
            max_offset = 0
            try:
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    max_offset = max(max_offset, offset)
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                    "Indexed %i records, expected %i" \
                    % (count, number_of_reads)
                # If that worked, call _check_eof ...
            except ValueError as err:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Could not parse the SFF index: %s" % err,
                              BiopythonParserWarning)
                assert count == 0, "Partially populated index"
                handle.seek(0)
                # Drop out to the slow way...
            else:
                # Fast way worked, check EOF
                if index_offset + index_length <= max_offset:
                    # Can have an index at start (or mid-file)
                    handle.seek(max_offset)
                    # Parse the final read,
                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
                    # Should now be at the end of the file!
                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
                return
            # We used to give a warning in this case, but Ion Torrent's
            # SFF files don't have an index so that would be annoying.
            # Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
            "Indexed %i records, expected %i" % (count, number_of_reads)
        SeqIO.SffIO._check_eof(handle, index_offset, index_length)

    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
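

# SFF is a binary format, so unlike the text formats below these proxies must
# first read per-file header information (flows per read, flow characters and
# the key sequence) before any record can be decoded from an offset. Through
# the public API this corresponds to (filename hypothetical):
#
#     >>> from Bio import SeqIO
#     >>> reads = SeqIO.index("example.sff", "sff")         # full reads
#     >>> trimmed = SeqIO.index("example.sff", "sff-trim")  # trimmed (subclass below)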
class SffTrimedRandomAccess(SffRandomAccess):
    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)


###################
# Simple indexers #
###################

class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        marker = {"ace": b"CO ",
                  "embl": b"ID ",
                  "fasta": b">",
                  "genbank": b"LOCUS ",
                  "gb": b"LOCUS ",
                  "imgt": b"ID ",
                  "phd": b"BEGIN_SEQUENCE",
                  "pir": b">..;",
                  "qual": b">",
                  "swiss": b"ID ",
                  "uniprot-xml": b"<entry ",
                  }[format]
        self._marker = marker
        self._marker_re = re.compile(b"^" + marker)

    def __iter__(self):
        """Returns (id, offset, length) tuples."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # Here we can assume the record.id is the first word after the
            # marker. This is generally fine... but not for GenBank, EMBL, Swiss
            id = line[marker_offset:].strip().split(None, 1)[0]
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(id), start_offset, length
                    start_offset = end_offset
                    break
                else:
                    # Track this explicitly as can't do file offset difference on BGZF
                    length += len(line)
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # End of file, or start of next record => end of this record
                break
            lines.append(line)
        return b"".join(lines)
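
# For example (illustration only, byte offsets assume Unix newlines), a FASTA
# file containing:
#
#     >alpha some description
#     ACGT
#     >beta
#     TTTT
#
# would yield ("alpha", 0, 29) and then ("beta", 29, 11) from __iter__: the id
# is the first word after the ">" marker and the length runs up to, but not
# including, the next marker line.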


#######################################
# Fiddly indexers: GenBank, EMBL, ... #
#######################################

class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to a GenBank file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        accession_marker = b"ACCESSION "
        version_marker = b"VERSION "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError(
                            "Did not find ACCESSION/VERSION lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    key = line.rstrip().split()[1]
                elif line.startswith(version_marker):
                    version_id = line.rstrip().split()[1]
                    if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
                        # This should mimic the GenBank parser...
                        key = version_id
                length += len(line)
        assert not line, repr(line)
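
# For example (illustration only), a GenBank record starting:
#
#     LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999
#     DEFINITION  Saccharomyces cerevisiae TCP1-beta gene ...
#     ACCESSION   U49845
#     VERSION     U49845.1  GI:1293613
#
# is keyed as "U49845.1": the VERSION value wins when it looks like
# "accession.version", otherwise the ACCESSION value is used, mimicking how
# the GenBank parser picks record.id.

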
class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary like access to an EMBL file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = b";"
        sv_marker = b"SV "
        ac_marker = b"AC "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after ID,
            # normally the SV line is used.
            setbysv = False  # resets sv as false
            length = len(line)
            if line[2:].count(b";") in [5, 6]:
                # Looks like the semi colon separated style introduced in 2006
                # Or style from IPD-IMGT/HLA after their v3.16.0 release
                parts = line[3:].rstrip().split(b";")
                if parts[1].strip().startswith(sv_marker):
                    # The SV bit gives the version
                    key = parts[0].strip() + b"." + \
                        parts[1].strip().split()[1]
                    setbysv = True
                else:
                    key = parts[0].strip()
            elif line[2:].count(b";") in [2, 3]:
                # Looks like the pre 2006 style, take first word only
                # Or, with two semi-colons, the KIPO patent variation
                key = line[3:].strip().split(None, 1)[0]
                if key.endswith(b";"):
                    key = key[:-1]
            else:
                raise ValueError(
                    'Did not recognise the ID line layout:\n%r' % line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(ac_marker) and not setbysv:
                    key = line.rstrip().split()[1]
                    if key.endswith(b";"):
                        key = key[:-1]
                elif line.startswith(sv_marker):
                    key = line.rstrip().split()[1]
                    setbysv = True
                length += len(line)
        assert not line, repr(line)
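
# For example (illustration only), a 2006-style ID line such as:
#
#     ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
#
# is keyed as "X56734.1" (accession plus SV version), while an older style
# line like "ID   X56734  standard; RNA; PLN; 1859 BP." falls back to the
# first word, which a later AC or SV line may then refine.

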
class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We cannot assume the record.id is the first word after ID,
            # normally the following AC line is used.
            line = handle.readline()
            length += len(line)
            assert line.startswith(b"AC ")
            key = line[3:].strip().split(b";")[0].strip()
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                length += len(line)
        assert not line, repr(line)
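
# For example (illustration only), a SwissProt entry beginning:
#
#     ID   001R_FRG3G              Reviewed;         256 AA.
#     AC   Q6GZX4;
#
# is keyed as "Q6GZX4", the first accession on the AC line, not the entry
# name on the ID line.

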
class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        start_acc_marker = b"<accession>"
        end_acc_marker = b"</accession>"
        end_entry_marker = b"</entry>"
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We expect the next line to be <accession>xxx</accession>
            # (possibly with leading spaces)
            # but allow it to be later on within the <entry>
            key = None
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    key = line[line.find(
                        start_acc_marker) + 11:].split(b"<", 1)[0]
                    length += len(line)
                elif end_entry_marker in line:
                    length += line.find(end_entry_marker) + 8
                    end_offset = handle.tell() - len(line) \
                        + line.find(end_entry_marker) + 8
                    assert start_offset + length == end_offset
                    break
                elif marker_re.match(line) or not line:
                    # Start of next record or end of file
                    raise ValueError("Didn't find end of record")
                else:
                    length += len(line)
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i"
                                 % (start_offset, start_offset + length))
            yield _bytes_to_string(key), start_offset, length
            # Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = b"</entry>"
        handle.seek(offset)
        data = [handle.readline()]
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                data.append(line[:i + 8])
                break
            if marker_re.match(line) or not line:
                # End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data.append(line)
        return b"".join(data)

    def get(self, offset):
        # TODO - Can we handle this directly in the parser?
        # This is a hack - use get_raw for <entry>...</entry> and wrap it with
        # the apparently required XML header and footer.
        data = b"""<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        """ + self.get_raw(offset) + b"</uniprot>"
        return next(SeqIO.UniprotIO.UniprotIterator(BytesIO(data)))
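
# For example (illustration only), an entry of the form:
#
#     <entry dataset="Swiss-Prot" ...>
#       <accession>Q6GZX4</accession>
#       ...
#     </entry>
#
# is keyed as "Q6GZX4" (the first <accession> value found inside the entry),
# and get_raw() returns the bytes from "<entry" up to and including the
# closing "</entry>" tag.

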
class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to an IntelliGenetics file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        self._marker_re = re.compile(b"^;")

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        # Skip any header
        offset = 0
        line = ""
        while True:
            offset += len(line)
            line = handle.readline()
            if not line:
                break  # Premature end of file, or just empty?
            if not line.startswith(b";;"):
                break
        while line:
            length = 0
            assert offset + len(line) == handle.tell()
            if not line.startswith(b";"):
                raise ValueError("Records should start with ';' and not:\n%r" % line)
            while line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            key = line.rstrip()
            # Now look for the first line which starts ";"
            while line and not line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            yield _bytes_to_string(key), offset, length
            offset += length
            assert offset + len(line) == handle.tell()

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        while line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        while line and not line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        return b"".join(lines)
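
# For example (illustration only), records in this format look like:
#
#     ;comment line(s) introducing the record
#     alpha
#     ACGTACGT
#
# Each record is keyed on the title line that follows its leading ";" comment
# block, with the sequence lines running up to the next ";" block.

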
class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        tab_char = b"\t"
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break  # End of file
            try:
                key = line.split(tab_char)[0]
            except ValueError as err:
                if not line.strip():
                    # Ignore blank lines
                    continue
                else:
                    raise err
            else:
                yield _bytes_to_string(key), start_offset, len(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()
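
# For example (illustration only), a two-column tab separated file such as:
#
#     alpha<TAB>ACGTACGT
#     beta<TAB>TTTTAAAA
#
# yields one (key, offset, length) tuple per line, keyed on the first column,
# and get_raw() simply returns that single line.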


##########################
# Now the FASTQ indexers #
##########################

class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        id = None
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # Empty file!
            return
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        while line:
            # assert line[0]=="@"
            # This record seems OK (so far)
            id = line[1:].rstrip().split(None, 1)[0]
            # Find the seq line(s)
            seq_len = 0
            length = len(line)
            while line:
                line = handle.readline()
                length += len(line)
                if line.startswith(b"+"):
                    break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            # assert line[0]=="+"
            # Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    if seq_len == 0:
                        # Special case, quality line should be just "\n"
                        line = handle.readline()
                        if line.strip():
                            raise ValueError("Expected blank quality line, not %r" % line)
                        length += len(line)  # Need to include the blank line
                    # Should be end of record...
                    end_offset = handle.tell()
                    line = handle.readline()
                    if line and line[0:1] != b"@":
                        raise ValueError("Problem with line %r" % line)
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
                    length += len(line)
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            yield _bytes_to_string(id), start_offset, length
            start_offset = end_offset
        # print("EOF")

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        # Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(b"+"):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == b"+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    data += line
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != b"@":
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data
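
# For example (illustration only), the record:
#
#     @read1
#     ACGT
#     +
#     !!!!
#
# is keyed as "read1". Because a quality line can also begin with "@", the
# indexer counts sequence and quality lengths and only treats an "@" line as
# the next record once they match.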


###############################################################################

_FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess,
                         "embl": EmblRandomAccess,
                         "fasta": SequentialSeqFileRandomAccess,
                         "fastq": FastqRandomAccess,  # Class handles all three variants
                         "fastq-sanger": FastqRandomAccess,  # alias of the above
                         "fastq-solexa": FastqRandomAccess,
                         "fastq-illumina": FastqRandomAccess,
                         "genbank": GenBankRandomAccess,
                         "gb": GenBankRandomAccess,  # alias of the above
                         "ig": IntelliGeneticsRandomAccess,
                         "imgt": EmblRandomAccess,
                         "phd": SequentialSeqFileRandomAccess,
                         "pir": SequentialSeqFileRandomAccess,
                         "sff": SffRandomAccess,
                         "sff-trim": SffTrimedRandomAccess,
                         "swiss": SwissRandomAccess,
                         "tab": TabRandomAccess,
                         "qual": SequentialSeqFileRandomAccess,
                         "uniprot-xml": UniprotRandomAccess,
                         }
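
# Bio.SeqIO.index() and index_db() use this dictionary to pick the proxy class
# for the requested format, roughly as follows (simplified sketch, not the
# exact internal call; the filename is hypothetical):
#
#     >>> proxy = _FormatToRandomAccess["fasta"]("example.fasta", "fasta", None)
#     >>> for key, offset, length in proxy:
#     ...     print(key, offset, length)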