1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import re
27 from StringIO import StringIO
28
29 from Bio._py3k import _bytes_to_string, _as_bytes
30
31 from Bio import SeqIO
32 from Bio import Alphabet
33 from Bio import bgzf
34 from Bio.File import _IndexedSeqFileProxy, _open_for_random_access
35
36
60 self._parse = _parse
61
62 - def get(self, offset):
66
67
68
69
70
71
72
73
74
76 """Random access to a Standard Flowgram Format (SFF) file."""
77 - def __init__(self, filename, format, alphabet):
82
84 """Load any index block in the file, or build it the slow way (PRIVATE)."""
85 if self._alphabet is None:
86 self._alphabet = Alphabet.generic_dna
87 handle = self._handle
88 handle.seek(0)
89
90 header_length, index_offset, index_length, number_of_reads, \
91 self._flows_per_read, self._flow_chars, self._key_sequence \
92 = SeqIO.SffIO._sff_file_header(handle)
93 if index_offset and index_length:
94
95 count = 0
96 try:
97 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle):
98 yield name, offset, 0
99 count += 1
100 assert count == number_of_reads, \
101 "Indexed %i records, expected %i" \
102 % (count, number_of_reads)
103 return
104 except ValueError, err:
105 import warnings
106 warnings.warn("Could not parse the SFF index: %s" % err)
107 assert count == 0, "Partially populated index"
108 handle.seek(0)
109
110
111
112 count = 0
113 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle):
114 yield name, offset, 0
115 count += 1
116 assert count == number_of_reads, \
117 "Indexed %i records, expected %i" % (count, number_of_reads)
118
119 - def get(self, offset):
127
132
133
135 - def get(self, offset):
144
145
146
147
148
149
151 - def __init__(self, filename, format, alphabet):
152 SeqFileRandomAccess.__init__(self, filename, format, alphabet)
153 marker = {"ace": "CO ",
154 "embl": "ID ",
155 "fasta": ">",
156 "genbank": "LOCUS ",
157 "gb": "LOCUS ",
158 "imgt": "ID ",
159 "phd": "BEGIN_SEQUENCE",
160 "pir": ">..;",
161 "qual": ">",
162 "qual": ">",
163 "swiss": "ID ",
164 "uniprot-xml": "<entry ",
165 }[format]
166 self._marker = marker
167 self._marker_re = re.compile(_as_bytes("^%s" % marker))
168
170 """Returns (id,offset) tuples."""
171 marker_offset = len(self._marker)
172 marker_re = self._marker_re
173 handle = self._handle
174 handle.seek(0)
175
176 while True:
177 start_offset = handle.tell()
178 line = handle.readline()
179 if marker_re.match(line) or not line:
180 break
181
182 while marker_re.match(line):
183
184
185 id = line[marker_offset:].strip().split(None, 1)[0]
186 length = len(line)
187 while True:
188 end_offset = handle.tell()
189 line = handle.readline()
190 if marker_re.match(line) or not line:
191 yield _bytes_to_string(id), start_offset, length
192 start_offset = end_offset
193 break
194 else:
195
196 length += len(line)
197 assert not line, repr(line)
198
200 """Similar to the get method, but returns the record as a raw string."""
201
202 handle = self._handle
203 marker_re = self._marker_re
204 handle.seek(offset)
205 lines = [handle.readline()]
206 while True:
207 line = handle.readline()
208 if marker_re.match(line) or not line:
209
210 break
211 lines.append(line)
212 return _as_bytes("").join(lines)
213
214
215
216
217
218
220 """Indexed dictionary like access to a GenBank file."""
222 handle = self._handle
223 handle.seek(0)
224 marker_re = self._marker_re
225 dot_char = _as_bytes(".")
226 accession_marker = _as_bytes("ACCESSION ")
227 version_marker = _as_bytes("VERSION ")
228
229 while True:
230 start_offset = handle.tell()
231 line = handle.readline()
232 if marker_re.match(line) or not line:
233 break
234
235 while marker_re.match(line):
236
237
238 key = None
239 length = len(line)
240 while True:
241 end_offset = handle.tell()
242 line = handle.readline()
243 if marker_re.match(line) or not line:
244 if not key:
245 raise ValueError(
246 "Did not find ACCESSION/VERSION lines")
247 yield _bytes_to_string(key), start_offset, length
248 start_offset = end_offset
249 break
250 elif line.startswith(accession_marker):
251 key = line.rstrip().split()[1]
252 elif line.startswith(version_marker):
253 version_id = line.rstrip().split()[1]
254 if version_id.count(dot_char) == 1 and version_id.split(dot_char)[1].isdigit():
255
256 key = version_id
257 length += len(line)
258 assert not line, repr(line)
259
260
262 """Indexed dictionary like access to an EMBL file."""
308
309
311 """Random access to a SwissProt file."""
341
342
344 """Random access to a UniProt XML file."""
346 handle = self._handle
347 handle.seek(0)
348 marker_re = self._marker_re
349 start_acc_marker = _as_bytes("<accession>")
350 end_acc_marker = _as_bytes("</accession>")
351 end_entry_marker = _as_bytes("</entry>")
352 less_than = _as_bytes("<")
353
354 while True:
355 start_offset = handle.tell()
356 line = handle.readline()
357 if marker_re.match(line) or not line:
358 break
359
360 while marker_re.match(line):
361 length = len(line)
362
363
364
365 key = None
366 while True:
367 line = handle.readline()
368 if key is None and start_acc_marker in line:
369 assert end_acc_marker in line, line
370 key = line[line.find(
371 start_acc_marker) + 11:].split(less_than, 1)[0]
372 length += len(line)
373 elif end_entry_marker in line:
374 end_offset = handle.tell() - len(line) \
375 + line.find(end_entry_marker) + 8
376 break
377 elif marker_re.match(line) or not line:
378
379 raise ValueError("Didn't find end of record")
380 else:
381 length += len(line)
382 if not key:
383 raise ValueError("Did not find <accession> line in bytes %i to %i"
384 % (start_offset, end_offset))
385 yield _bytes_to_string(key), start_offset, length
386
387 while not marker_re.match(line) and line:
388 start_offset = handle.tell()
389 line = handle.readline()
390 assert not line, repr(line)
391
393 """Similar to the get method, but returns the record as a raw string."""
394 handle = self._handle
395 marker_re = self._marker_re
396 end_entry_marker = _as_bytes("</entry>")
397 handle.seek(offset)
398 data = [handle.readline()]
399 while True:
400 line = handle.readline()
401 i = line.find(end_entry_marker)
402 if i != -1:
403 data.append(line[:i + 8])
404 break
405 if marker_re.match(line) or not line:
406
407 raise ValueError("Didn't find end of record")
408 data.append(line)
409 return _as_bytes("").join(data)
410
411 - def get(self, offset):
412
413
414
415 data = """<?xml version='1.0' encoding='UTF-8'?>
416 <uniprot xmlns="http://uniprot.org/uniprot"
417 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
418 xsi:schemaLocation="http://uniprot.org/uniprot
419 http://www.uniprot.org/support/docs/uniprot.xsd">
420 %s
421 </uniprot>
422 """ % _bytes_to_string(self.get_raw(offset))
423
424 return SeqIO.UniprotIO.UniprotIterator(data).next()
425
426
428 """Random access to a IntelliGenetics file."""
429 - def __init__(self, filename, format, alphabet):
432
456
471
472
474 """Random access to a simple tabbed file."""
476 handle = self._handle
477 handle.seek(0)
478 tab_char = _as_bytes("\t")
479 while True:
480 start_offset = handle.tell()
481 line = handle.readline()
482 if not line:
483 break
484 try:
485 key = line.split(tab_char)[0]
486 except ValueError, err:
487 if not line.strip():
488
489 continue
490 else:
491 raise err
492 else:
493 yield _bytes_to_string(key), start_offset, len(line)
494
500
501
502
503
504
505
507 """Random access to a FASTQ file (any supported variant).
508
509 With FASTQ the records all start with a "@" line, but so can quality lines.
510 Note this will cope with line-wrapped FASTQ files.
511 """
513 handle = self._handle
514 handle.seek(0)
515 id = None
516 start_offset = handle.tell()
517 line = handle.readline()
518 if not line:
519
520 return
521 at_char = _as_bytes("@")
522 plus_char = _as_bytes("+")
523 if line[0:1] != at_char:
524 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
525 while line:
526
527
528 id = line[1:].rstrip().split(None, 1)[0]
529
530 seq_len = 0
531 length = len(line)
532 while line:
533 line = handle.readline()
534 length += len(line)
535 if line.startswith(plus_char):
536 break
537 seq_len += len(line.strip())
538 if not line:
539 raise ValueError("Premature end of file in seq section")
540
541
542 qual_len = 0
543 while line:
544 if seq_len == qual_len:
545
546 end_offset = handle.tell()
547 line = handle.readline()
548 if line and line[0:1] != at_char:
549 ValueError("Problem with line %s" % repr(line))
550 break
551 else:
552 line = handle.readline()
553 qual_len += len(line.strip())
554 length += len(line)
555 if seq_len != qual_len:
556 raise ValueError("Problem with quality section")
557 yield _bytes_to_string(id), start_offset, length
558 start_offset = end_offset
559
560
562 """Similar to the get method, but returns the record as a raw string."""
563
564 handle = self._handle
565 handle.seek(offset)
566 line = handle.readline()
567 data = line
568 at_char = _as_bytes("@")
569 plus_char = _as_bytes("+")
570 if line[0:1] != at_char:
571 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
572
573 seq_len = 0
574 while line:
575 line = handle.readline()
576 data += line
577 if line.startswith(plus_char):
578 break
579 seq_len += len(line.strip())
580 if not line:
581 raise ValueError("Premature end of file in seq section")
582 assert line[0:1] == plus_char
583
584 qual_len = 0
585 while line:
586 if seq_len == qual_len:
587
588 line = handle.readline()
589 if line and line[0:1] != at_char:
590 ValueError("Problem with line %s" % repr(line))
591 break
592 else:
593 line = handle.readline()
594 data += line
595 qual_len += len(line.strip())
596 if seq_len != qual_len:
597 raise ValueError("Problem with quality section")
598 return data
599
600
601
602
603 _FormatToRandomAccess = {"ace": SequentialSeqFileRandomAccess,
604 "embl": EmblRandomAccess,
605 "fasta": SequentialSeqFileRandomAccess,
606 "fastq": FastqRandomAccess,
607 "fastq-sanger": FastqRandomAccess,
608 "fastq-solexa": FastqRandomAccess,
609 "fastq-illumina": FastqRandomAccess,
610 "genbank": GenBankRandomAccess,
611 "gb": GenBankRandomAccess,
612 "ig": IntelliGeneticsRandomAccess,
613 "imgt": EmblRandomAccess,
614 "phd": SequentialSeqFileRandomAccess,
615 "pir": SequentialSeqFileRandomAccess,
616 "sff": SffRandomAccess,
617 "sff-trim": SffTrimedRandomAccess,
618 "swiss": SwissRandomAccess,
619 "tab": TabRandomAccess,
620 "qual": SequentialSeqFileRandomAccess,
621 "uniprot-xml": UniprotRandomAccess,
622 }
623