
Source Code for Module Bio.SeqIO._index

# Copyright 2009-2011 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 

from __future__ import print_function

import re
from io import BytesIO
from Bio._py3k import StringIO
from Bio._py3k import _bytes_to_string

from Bio import SeqIO
from Bio import Alphabet
from Bio.File import _IndexedSeqFileProxy, _open_for_random_access


class SeqFileRandomAccess(_IndexedSeqFileProxy):
    def __init__(self, filename, format, alphabet):
        self._handle = _open_for_random_access(filename)
        self._alphabet = alphabet
        self._format = format
        # Load the parser class/function once and avoid the dict lookup in
        # each __getitem__ call:
        i = SeqIO._FormatToIterator[format]
        # The following alphabet code is a bit nasty... duplicates logic in
        # Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return next(i(handle))
        else:
            # TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return next(i(handle, alphabet=alphabet))
                except TypeError:
                    return next(SeqIO._force_alphabet(i(handle), alphabet))
        self._parse = _parse

    def get(self, offset):
        """Returns SeqRecord."""
        # Should be overridden for binary file formats etc:
        return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))


####################
# Special indexers #
####################
# Anything where the records cannot be read simply by parsing from
# the record start. For example, anything requiring information from
# a file header - e.g. SFF files where we would need to know the
# number of flows.
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""

    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        # Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # There is an index provided, try this the fast way:
            count = 0
            max_offset = 0
            try:
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    max_offset = max(max_offset, offset)
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                    "Indexed %i records, expected %i" \
                    % (count, number_of_reads)
                # If that worked, call _check_eof ...
            except ValueError as err:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Could not parse the SFF index: %s" % err,
                              BiopythonParserWarning)
                assert count == 0, "Partially populated index"
                handle.seek(0)
                # Drop out to the slow way...
            else:
                # Fast way worked, check EOF
                if index_offset + index_length <= max_offset:
                    # Can have an index at start (or mid-file)
                    handle.seek(max_offset)
                    # Parse the final read,
                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
                    # Should now be at the end of the file!
                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
                return
            # We used to give a warning in this case, but Ion Torrent's
            # SFF files don't have an index so that would be annoying.
        # Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
            "Indexed %i records, expected %i" % (count, number_of_reads)
        SeqIO.SffIO._check_eof(handle, index_offset, index_length)

    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)


class SffTrimedRandomAccess(SffRandomAccess):
    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)


###################
# Simple indexers #
###################

class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        marker = {"ace": b"CO ",
                  "embl": b"ID ",
                  "fasta": b">",
                  "genbank": b"LOCUS ",
                  "gb": b"LOCUS ",
                  "imgt": b"ID ",
                  "phd": b"BEGIN_SEQUENCE",
                  "pir": b">..;",
                  "qual": b">",
                  "swiss": b"ID ",
                  "uniprot-xml": b"<entry ",
                  }[format]
        self._marker = marker
        self._marker_re = re.compile(b"^" + marker)

    def __iter__(self):
        """Returns (id, offset, length) tuples."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # Here we can assume the record.id is the first word after the
            # marker. This is generally fine... but not for GenBank, EMBL, Swiss
            id = line[marker_offset:].strip().split(None, 1)[0]
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(id), start_offset, length
                    start_offset = end_offset
                    break
                else:
                    # Track this explicitly as can't do file offset difference on BGZF
                    length += len(line)
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # For non-trivial file formats this must be over-ridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # End of file, or start of next record => end of this record
                break
            lines.append(line)
        return b"".join(lines)


#######################################
# Fiddly indexers: GenBank, EMBL, ... #
#######################################

class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary-like access to a GenBank file."""

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        accession_marker = b"ACCESSION "
        version_marker = b"VERSION "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError(
                            "Did not find ACCESSION/VERSION lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    key = line.rstrip().split()[1]
                elif line.startswith(version_marker):
                    version_id = line.rstrip().split()[1]
                    if version_id.count(b".") == 1 and version_id.split(b".")[1].isdigit():
                        # This should mimic the GenBank parser...
                        key = version_id
                length += len(line)
        assert not line, repr(line)


class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary-like access to an EMBL file."""

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = b";"
        sv_marker = b"SV "
        ac_marker = b"AC "
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after ID,
            # normally the SV line is used.
            setbysv = False  # resets sv as false
            length = len(line)
            if line[2:].count(b";") in [5, 6]:
                # Looks like the semi-colon separated style introduced in 2006
                # Or style from IPD-IMGT/HLA after their v3.16.0 release
                parts = line[3:].rstrip().split(b";")
                if parts[1].strip().startswith(sv_marker):
                    # The SV bit gives the version
                    key = parts[0].strip() + b"." + \
                        parts[1].strip().split()[1]
                    setbysv = True
                else:
                    key = parts[0].strip()
            elif line[2:].count(b";") in [2, 3]:
                # Looks like the pre 2006 style, take first word only
                # Or, with two semi-colons, the KIPO patent variation
                key = line[3:].strip().split(None, 1)[0]
                if key.endswith(b";"):
                    key = key[:-1]
            else:
                raise ValueError(
                    'Did not recognise the ID line layout:\n%r' % line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(ac_marker) and not setbysv:
                    key = line.rstrip().split()[1]
                    if key.endswith(b";"):
                        key = key[:-1]
                elif line.startswith(sv_marker):
                    key = line.rstrip().split()[1]
                    setbysv = True
                length += len(line)
        assert not line, repr(line)


class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We cannot assume the record.id is the first word after ID,
            # normally the following AC line is used.
            line = handle.readline()
            length += len(line)
            assert line.startswith(b"AC ")
            key = line[3:].strip().split(b";")[0].strip()
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                length += len(line)
        assert not line, repr(line)


class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        start_acc_marker = b"<accession>"
        end_acc_marker = b"</accession>"
        end_entry_marker = b"</entry>"
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We expect the next line to be <accession>xxx</accession>
            # (possibly with leading spaces)
            # but allow it to be later on within the <entry>
            key = None
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    key = line[line.find(
                        start_acc_marker) + 11:].split(b"<", 1)[0]
                    length += len(line)
                elif end_entry_marker in line:
                    length += line.find(end_entry_marker) + 8
                    end_offset = handle.tell() - len(line) \
                        + line.find(end_entry_marker) + 8
                    assert start_offset + length == end_offset
                    break
                elif marker_re.match(line) or not line:
                    # Start of next record or end of file
                    raise ValueError("Didn't find end of record")
                else:
                    length += len(line)
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i"
                                 % (start_offset, start_offset + length))
            yield _bytes_to_string(key), start_offset, length
            # Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        assert not line, repr(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = b"</entry>"
        handle.seek(offset)
        data = [handle.readline()]
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                data.append(line[:i + 8])
                break
            if marker_re.match(line) or not line:
                # End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data.append(line)
        return b"".join(data)

    def get(self, offset):
        # TODO - Can we handle this directly in the parser?
        # This is a hack - use get_raw for <entry>...</entry> and wrap it with
        # the apparently required XML header and footer.
        data = b"""<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        """ + self.get_raw(offset) + b"</uniprot>"
        return next(SeqIO.UniprotIO.UniprotIterator(BytesIO(data)))


class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to an IntelliGenetics file."""

    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        self._marker_re = re.compile(b"^;")

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        # Skip any header
        offset = 0
        line = ""
        while True:
            offset += len(line)
            line = handle.readline()
            if not line:
                break  # Premature end of file, or just empty?
            if not line.startswith(b";;"):
                break
        while line:
            length = 0
            assert offset + len(line) == handle.tell()
            if not line.startswith(b";"):
                raise ValueError("Records should start with ';' and not:\n%r" % line)
            while line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            key = line.rstrip()
            # Now look for the first line which starts ";"
            while line and not line.startswith(b";"):
                length += len(line)
                line = handle.readline()
            yield _bytes_to_string(key), offset, length
            offset += length
            assert offset + len(line) == handle.tell()

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        while line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        while line and not line.startswith(b";"):
            lines.append(line)
            line = handle.readline()
        return b"".join(lines)


class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        tab_char = b"\t"
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break  # End of file
            try:
                key = line.split(tab_char)[0]
            except ValueError as err:
                if not line.strip():
                    # Ignore blank lines
                    continue
                else:
                    raise err
            else:
                yield _bytes_to_string(key), start_offset, len(line)

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()


##########################
# Now the FASTQ indexers #
##########################

class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    """

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        id = None
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # Empty file!
            return
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        while line:
            # assert line[0]=="@"
            # This record seems OK (so far)
            id = line[1:].rstrip().split(None, 1)[0]
            # Find the seq line(s)
            seq_len = 0
            length = len(line)
            while line:
                line = handle.readline()
                length += len(line)
                if line.startswith(b"+"):
                    break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            # assert line[0]=="+"
            # Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    if seq_len == 0:
                        # Special case, quality line should be just "\n"
                        line = handle.readline()
                        if line.strip():
                            raise ValueError("Expected blank quality line, not %r" % line)
                        length += len(line)  # Need to include the blank line
                    # Should be end of record...
                    end_offset = handle.tell()
                    line = handle.readline()
                    if line and line[0:1] != b"@":
                        raise ValueError("Problem with line %r" % line)
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
                    length += len(line)
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            yield _bytes_to_string(id), start_offset, length
            start_offset = end_offset
        # print("EOF")

    def get_raw(self, offset):
        """Return the raw record from the file as a bytes string."""
        # TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        if line[0:1] != b"@":
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        # Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(b"+"):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == b"+"
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    data += line
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != b"@":
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data


###############################################################################

_FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess,
                         "embl": EmblRandomAccess,
                         "fasta": SequentialSeqFileRandomAccess,
                         "fastq": FastqRandomAccess,  # Class handles all three variants
                         "fastq-sanger": FastqRandomAccess,  # alias of the above
                         "fastq-solexa": FastqRandomAccess,
                         "fastq-illumina": FastqRandomAccess,
                         "genbank": GenBankRandomAccess,
                         "gb": GenBankRandomAccess,  # alias of the above
                         "ig": IntelliGeneticsRandomAccess,
                         "imgt": EmblRandomAccess,
                         "phd": SequentialSeqFileRandomAccess,
                         "pir": SequentialSeqFileRandomAccess,
                         "sff": SffRandomAccess,
                         "sff-trim": SffTrimedRandomAccess,
                         "swiss": SwissRandomAccess,
                         "tab": TabRandomAccess,
                         "qual": SequentialSeqFileRandomAccess,
                         "uniprot-xml": UniprotRandomAccess,
                         }