
Source Code for Module Bio.SeqIO._index

# Copyright 2009-2011 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Dictionary-like indexing of sequence files (PRIVATE).

You are not expected to access this module, or any of its code, directly. This
is all handled internally by the Bio.SeqIO.index(...) function, which is the
public interface for this functionality.

The basic idea is that we scan over a sequence file, looking for new record
markers. We then try to extract the string that Bio.SeqIO.parse/read would
use as the record id, ideally without actually parsing the full record. We
then use a subclassed Python dictionary to record the file offset for the
record start against the record id.

Note that this means full parsing is on demand, so any invalid or problem
record may not trigger an exception until it is accessed. This is by design.

This means our dictionary-like objects hold in memory ALL the keys (all the
record identifiers), which shouldn't be a problem even with second generation
sequencing. If this is an issue later on, storing the keys and offsets in a
temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
"""
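
# A minimal usage sketch of the public interface this module supports
# (illustrative only - the filename and record id below are made up):
#
#     from Bio import SeqIO
#     records = SeqIO.index("example.fasta", "fasta")
#     record = records["some_record_id"]  # full parsing happens on access
#     records.close()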

from __future__ import print_function

import re
from Bio._py3k import StringIO
from Bio._py3k import _bytes_to_string, _as_bytes

from Bio import SeqIO
from Bio import Alphabet
from Bio import bgzf
from Bio.File import _IndexedSeqFileProxy, _open_for_random_access


class SeqFileRandomAccess(_IndexedSeqFileProxy):
    def __init__(self, filename, format, alphabet):
        self._handle = _open_for_random_access(filename)
        self._alphabet = alphabet
        self._format = format
        # Load the parser class/function once and avoid the dict lookup in
        # each __getitem__ call:
        i = SeqIO._FormatToIterator[format]
        # The following alphabet code is a bit nasty... duplicates logic in
        # Bio.SeqIO.parse()
        if alphabet is None:
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                return next(i(handle))
        else:
            # TODO - Detect alphabet support ONCE at __init__
            def _parse(handle):
                """Dynamically generated parser function (PRIVATE)."""
                try:
                    return next(i(handle, alphabet=alphabet))
                except TypeError:
                    return next(SeqIO._force_alphabet(i(handle), alphabet))
        self._parse = _parse

    def get(self, offset):
        """Returns a SeqRecord."""
        # Should be overridden for binary file formats etc:
        return self._parse(StringIO(_bytes_to_string(self.get_raw(offset))))


####################
# Special indexers #
####################
# Anything where the records cannot be read simply by parsing from
# the record start. For example, anything requiring information from
# a file header - e.g. SFF files where we would need to know the
# number of flows.
class SffRandomAccess(SeqFileRandomAccess):
    """Random access to a Standard Flowgram Format (SFF) file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(self._handle)

    def __iter__(self):
        """Load any index block in the file, or build it the slow way (PRIVATE)."""
        if self._alphabet is None:
            self._alphabet = Alphabet.generic_dna
        handle = self._handle
        handle.seek(0)
        # Already did this in __init__ but need handle in right place
        header_length, index_offset, index_length, number_of_reads, \
            self._flows_per_read, self._flow_chars, self._key_sequence \
            = SeqIO.SffIO._sff_file_header(handle)
        if index_offset and index_length:
            # There is an index provided, try this the fast way:
            count = 0
            max_offset = 0
            try:
                for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
                    max_offset = max(max_offset, offset)
                    yield name, offset, 0
                    count += 1
                assert count == number_of_reads, \
                    "Indexed %i records, expected %i" \
                    % (count, number_of_reads)
                # If that worked, call _check_eof ...
            except ValueError as err:
                import warnings
                from Bio import BiopythonParserWarning
                warnings.warn("Could not parse the SFF index: %s" % err,
                              BiopythonParserWarning)
                assert count == 0, "Partially populated index"
                handle.seek(0)
                # Drop out to the slow way...
            else:
                # Fast way worked, check EOF
                if index_offset + index_length <= max_offset:
                    # Can have an index at start (or mid-file)
                    handle.seek(max_offset)
                    # Parse the final read,
                    SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)
                    # Should now be at the end of the file!
                SeqIO.SffIO._check_eof(handle, index_offset, index_length)
                return
        # We used to give a warning in this case, but Ion Torrent's
        # SFF files don't have an index so that would be annoying.
        # Fall back on the slow way!
        count = 0
        for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
            yield name, offset, 0
            count += 1
        assert count == number_of_reads, \
            "Indexed %i records, expected %i" % (count, number_of_reads)
        SeqIO.SffIO._check_eof(handle, index_offset, index_length)

    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet)

    def get_raw(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_raw_record(handle, self._flows_per_read)


class SffTrimedRandomAccess(SffRandomAccess):
    """Random access to an SFF file, giving trimmed reads."""
    def get(self, offset):
        handle = self._handle
        handle.seek(offset)
        return SeqIO.SffIO._sff_read_seq_record(handle,
                                                self._flows_per_read,
                                                self._flow_chars,
                                                self._key_sequence,
                                                self._alphabet,
                                                trim=True)


###################
# Simple indexers #
###################

class SequentialSeqFileRandomAccess(SeqFileRandomAccess):
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        marker = {"ace": "CO ",
                  "embl": "ID ",
                  "fasta": ">",
                  "genbank": "LOCUS ",
                  "gb": "LOCUS ",
                  "imgt": "ID ",
                  "phd": "BEGIN_SEQUENCE",
                  "pir": ">..;",
                  "qual": ">",
                  "swiss": "ID ",
                  "uniprot-xml": "<entry ",
                  }[format]
        self._marker = marker
        self._marker_re = re.compile(_as_bytes("^%s" % marker))

    def __iter__(self):
        """Returns (id, offset, length) tuples."""
        marker_offset = len(self._marker)
        marker_re = self._marker_re
        handle = self._handle
        handle.seek(0)
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # Here we can assume the record.id is the first word after the
            # marker. This is generally fine... but not for GenBank, EMBL, Swiss
            id = line[marker_offset:].strip().split(None, 1)[0]
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(id), start_offset, length
                    start_offset = end_offset
                    break
                else:
                    # Track this explicitly as can't do file offset difference on BGZF
                    length += len(line)
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        # For non-trivial file formats this must be overridden in the subclass
        handle = self._handle
        marker_re = self._marker_re
        handle.seek(offset)
        lines = [handle.readline()]
        while True:
            line = handle.readline()
            if marker_re.match(line) or not line:
                # End of file, or start of next record => end of this record
                break
            lines.append(line)
        return _as_bytes("").join(lines)


#######################################
# Fiddly indexers: GenBank, EMBL, ... #
#######################################

class GenBankRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary-like access to a GenBank file."""
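    # Illustrative (made-up) header lines showing where the key comes from -
    # the VERSION value is preferred when it looks like "accession.version",
    # otherwise the first word on the ACCESSION line is used, never LOCUS:
    #
    #     LOCUS       EXAMPLE01   1000 bp    DNA   linear   PLN 01-JAN-2000
    #     ACCESSION   EXAMPLE01
    #     VERSION     EXAMPLE01.1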
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        dot_char = _as_bytes(".")
        accession_marker = _as_bytes("ACCESSION ")
        version_marker = _as_bytes("VERSION ")
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after LOCUS,
            # normally the first entry on the VERSION or ACCESSION line is used.
            key = None
            length = len(line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    if not key:
                        raise ValueError(
                            "Did not find ACCESSION/VERSION lines")
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(accession_marker):
                    key = line.rstrip().split()[1]
                elif line.startswith(version_marker):
                    version_id = line.rstrip().split()[1]
                    if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit():
                        # This should mimic the GenBank parser...
                        key = version_id
                length += len(line)
        assert not line, repr(line)


class EmblRandomAccess(SequentialSeqFileRandomAccess):
    """Indexed dictionary-like access to an EMBL file."""
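    # Illustrative ID lines (not taken from any real entry) - the newer style
    # has six semicolon-separated fields after the accession and carries the
    # version in the "SV" field, while the pre-2006 style has three semicolons:
    #
    #     ID   AB000001; SV 1; linear; mRNA; STD; HUM; 1234 BP.
    #     ID   AB000001   standard; RNA; HUM; 1234 BP.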
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        dot_char = _as_bytes(".")
        sv_marker = _as_bytes("SV ")
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            # We cannot assume the record.id is the first word after ID,
            # normally the SV line is used.
            length = len(line)
            if line[2:].count(semi_char) == 6:
                # Looks like the semicolon separated style introduced in 2006
                parts = line[3:].rstrip().split(semi_char)
                if parts[1].strip().startswith(sv_marker):
                    # The SV bit gives the version
                    key = parts[0].strip() + dot_char + \
                        parts[1].strip().split()[1]
                else:
                    key = parts[0].strip()
            elif line[2:].count(semi_char) == 3:
                # Looks like the pre 2006 style, take first word only
                key = line[3:].strip().split(None, 1)[0]
            else:
                raise ValueError(
                    'Did not recognise the ID line layout:\n' + line)
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    end_offset = handle.tell() - len(line)
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                elif line.startswith(sv_marker):
                    key = line.rstrip().split()[1]
                length += len(line)
        assert not line, repr(line)


class SwissRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a SwissProt file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We cannot assume the record.id is the first word after ID,
            # normally the following AC line is used.
            line = handle.readline()
            length += len(line)
            assert line.startswith(_as_bytes("AC "))
            key = line[3:].strip().split(semi_char)[0].strip()
            while True:
                end_offset = handle.tell()
                line = handle.readline()
                if marker_re.match(line) or not line:
                    yield _bytes_to_string(key), start_offset, length
                    start_offset = end_offset
                    break
                length += len(line)
        assert not line, repr(line)


class UniprotRandomAccess(SequentialSeqFileRandomAccess):
    """Random access to a UniProt XML file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        start_acc_marker = _as_bytes("<accession>")
        end_acc_marker = _as_bytes("</accession>")
        end_entry_marker = _as_bytes("</entry>")
        less_than = _as_bytes("<")
        # Skip any header before first record
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if marker_re.match(line) or not line:
                break
        # Should now be at the start of a record, or end of the file
        while marker_re.match(line):
            length = len(line)
            # We expect the next line to be <accession>xxx</accession>
            # (possibly with leading spaces)
            # but allow it to be later on within the <entry>
            key = None
            while True:
                line = handle.readline()
                if key is None and start_acc_marker in line:
                    assert end_acc_marker in line, line
                    key = line[line.find(
                        start_acc_marker) + 11:].split(less_than, 1)[0]
                    length += len(line)
                elif end_entry_marker in line:
                    end_offset = handle.tell() - len(line) \
                        + line.find(end_entry_marker) + 8
                    break
                elif marker_re.match(line) or not line:
                    # Start of next record or end of file
                    raise ValueError("Didn't find end of record")
                else:
                    length += len(line)
            if not key:
                raise ValueError("Did not find <accession> line in bytes %i to %i"
                                 % (start_offset, end_offset))
            yield _bytes_to_string(key), start_offset, length
            # Find start of next record
            while not marker_re.match(line) and line:
                start_offset = handle.tell()
                line = handle.readline()
        assert not line, repr(line)

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        handle = self._handle
        marker_re = self._marker_re
        end_entry_marker = _as_bytes("</entry>")
        handle.seek(offset)
        data = [handle.readline()]
        while True:
            line = handle.readline()
            i = line.find(end_entry_marker)
            if i != -1:
                data.append(line[:i + 8])
                break
            if marker_re.match(line) or not line:
                # End of file, or start of next record
                raise ValueError("Didn't find end of record")
            data.append(line)
        return _as_bytes("").join(data)

    def get(self, offset):
        # TODO - Can we handle this directly in the parser?
        # This is a hack - use get_raw for <entry>...</entry> and wrap it with
        # the apparently required XML header and footer.
        data = """<?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        %s
        </uniprot>
        """ % _bytes_to_string(self.get_raw(offset))
        # TODO - For consistency, this function should not accept a string:
        return next(SeqIO.UniprotIO.UniprotIterator(data))


class IntelliGeneticsRandomAccess(SeqFileRandomAccess):
    """Random access to an IntelliGenetics file."""
    def __init__(self, filename, format, alphabet):
        SeqFileRandomAccess.__init__(self, filename, format, alphabet)
        self._marker_re = re.compile(_as_bytes("^;"))

    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        marker_re = self._marker_re
        semi_char = _as_bytes(";")
        while True:
            offset = handle.tell()
            line = handle.readline()
            length = len(line)
            if marker_re.match(line):
                # Now look for the first line which doesn't start ";"
                while True:
                    line = handle.readline()
                    if line[0:1] != semi_char and line.strip():
                        key = line.split()[0]
                        yield _bytes_to_string(key), offset, length
                        break
                    if not line:
                        raise ValueError("Premature end of file?")
                    length += len(line)
            elif not line:
                # End of file
                break

    def get_raw(self, offset):
        handle = self._handle
        handle.seek(offset)
        marker_re = self._marker_re
        lines = []
        line = handle.readline()
        semi_char = _as_bytes(";")
        while line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        while line and not line.startswith(semi_char):
            lines.append(line)
            line = handle.readline()
        return _as_bytes("").join(lines)


class TabRandomAccess(SeqFileRandomAccess):
    """Random access to a simple tabbed file."""
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        tab_char = _as_bytes("\t")
        while True:
            start_offset = handle.tell()
            line = handle.readline()
            if not line:
                break  # End of file
            try:
                key = line.split(tab_char)[0]
            except ValueError as err:
                if not line.strip():
                    # Ignore blank lines
                    continue
                else:
                    raise err
            else:
                yield _bytes_to_string(key), start_offset, len(line)

    def get_raw(self, offset):
        """Like the get method, but returns the record as a raw string."""
        handle = self._handle
        handle.seek(offset)
        return handle.readline()


##########################
# Now the FASTQ indexers #
##########################

class FastqRandomAccess(SeqFileRandomAccess):
    """Random access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but quality lines can
    also start with "@", so that alone is not enough to spot a new record.
    Note this will cope with line-wrapped FASTQ files.
    """
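    # An illustrative (made-up) record showing why "@" at the start of a line
    # is ambiguous - here the quality string itself begins with "@":
    #
    #     @read_1 some description
    #     ACGTACGTAC
    #     +
    #     @IIIIIIIII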
    def __iter__(self):
        handle = self._handle
        handle.seek(0)
        id = None
        start_offset = handle.tell()
        line = handle.readline()
        if not line:
            # Empty file!
            return
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        while line:
            # assert line[0]=="@"
            # This record seems OK (so far)
            id = line[1:].rstrip().split(None, 1)[0]
            # Find the seq line(s)
            seq_len = 0
            length = len(line)
            while line:
                line = handle.readline()
                length += len(line)
                if line.startswith(plus_char):
                    break
                seq_len += len(line.strip())
            if not line:
                raise ValueError("Premature end of file in seq section")
            # assert line[0]=="+"
            # Find the qual line(s)
            qual_len = 0
            while line:
                if seq_len == qual_len:
                    if seq_len == 0:
                        # Special case, quality line should be just "\n"
                        line = handle.readline()
                        if line.strip():
                            raise ValueError("Expected blank quality line, not %r" % line)
                    # Should be end of record...
                    end_offset = handle.tell()
                    line = handle.readline()
                    if line and line[0:1] != at_char:
                        raise ValueError("Problem with line %r" % line)
                    break
                else:
                    line = handle.readline()
                    qual_len += len(line.strip())
                    length += len(line)
            if seq_len != qual_len:
                raise ValueError("Problem with quality section")
            yield _bytes_to_string(id), start_offset, length
            start_offset = end_offset
        # print("EOF")

    def get_raw(self, offset):
        """Similar to the get method, but returns the record as a raw string."""
        # TODO - Refactor this and the __iter__ method to reduce code duplication?
        handle = self._handle
        handle.seek(offset)
        line = handle.readline()
        data = line
        at_char = _as_bytes("@")
        plus_char = _as_bytes("+")
        if line[0:1] != at_char:
            raise ValueError("Problem with FASTQ @ line:\n%r" % line)
        # Find the seq line(s)
        seq_len = 0
        while line:
            line = handle.readline()
            data += line
            if line.startswith(plus_char):
                break
            seq_len += len(line.strip())
        if not line:
            raise ValueError("Premature end of file in seq section")
        assert line[0:1] == plus_char
        # Find the qual line(s)
        qual_len = 0
        while line:
            if seq_len == qual_len:
                if seq_len == 0:
                    # Special case, quality line should be just "\n"
                    line = handle.readline()
                    if line.strip():
                        raise ValueError("Expected blank quality line, not %r" % line)
                    data += line
                # Should be end of record...
                line = handle.readline()
                if line and line[0:1] != at_char:
                    raise ValueError("Problem with line %r" % line)
                break
            else:
                line = handle.readline()
                data += line
                qual_len += len(line.strip())
        if seq_len != qual_len:
            raise ValueError("Problem with quality section")
        return data


###############################################################################

_FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess,
                         "embl": EmblRandomAccess,
                         "fasta": SequentialSeqFileRandomAccess,
                         "fastq": FastqRandomAccess,  # Class handles all three variants
                         "fastq-sanger": FastqRandomAccess,  # alias of the above
                         "fastq-solexa": FastqRandomAccess,
                         "fastq-illumina": FastqRandomAccess,
                         "genbank": GenBankRandomAccess,
                         "gb": GenBankRandomAccess,  # alias of the above
                         "ig": IntelliGeneticsRandomAccess,
                         "imgt": EmblRandomAccess,
                         "phd": SequentialSeqFileRandomAccess,
                         "pir": SequentialSeqFileRandomAccess,
                         "sff": SffRandomAccess,
                         "sff-trim": SffTrimedRandomAccess,
                         "swiss": SwissRandomAccess,
                         "tab": TabRandomAccess,
                         "qual": SequentialSeqFileRandomAccess,
                         "uniprot-xml": UniprotRandomAccess,
                         }
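
# A minimal sketch of how this lookup table is used (simplified - the real
# dispatch lives in Bio.SeqIO.index and Bio.File, and handles more options):
#
#     proxy_class = _FormatToRandomAccess[format]
#     random_access_proxy = proxy_class(filename, format, alphabet)
#     for key, offset, length in random_access_proxy:
#         ...  # the dictionary-like index stores key -> offset
#     record = random_access_proxy.get(offset)  # parse one record on demand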