1
2
3
4
5
6
7 """Code for more fancy file handles.
8
9
10 Classes:
11
12 UndoHandle File object decorator with support for undo-like operations.
13
14 Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing
15 files are also defined under Bio.File but these are not intended for direct
16 use.
17 """
18
19 from __future__ import with_statement
20 import codecs
21 import os
22 import contextlib
23 import StringIO
24 import itertools
25
26 try:
27 from collections import UserDict as _dict_base
28 except ImportError:
29 from UserDict import DictMixin as _dict_base
30
31 try:
32 from sqlite3 import dbapi2 as _sqlite
33 from sqlite3 import IntegrityError as _IntegrityError
34 from sqlite3 import OperationalError as _OperationalError
35 except ImportError:
36
37
38
39 _sqlite = None
40 pass
41
42
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
    r"""Context manager giving a file handle for a filename or a handle.

    This is for arguments that can be passed to the SeqIO and AlignIO
    read, write, and parse methods: either file objects or strings.

    When given a string, yields a file handle open to handleish with the
    provided mode, which will be closed when the manager exits.

    All other inputs are yielded unchanged, and are *not* closed.

    - handleish  - Either a string or file handle
    - mode       - Mode to open handleish (used only if handleish is a string)
    - kwargs     - Further arguments to pass to open(...). If an 'encoding'
                   keyword is present the file is opened with codecs.open
                   instead of the builtin open.

    Example:

    >>> with as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    False
    >>> fp.close()
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3: basestring no longer exists

    if not isinstance(handleish, string_types):
        # Not a filename: hand it straight back. The caller opened it, so
        # the caller remains responsible for closing it.
        yield handleish
        return

    # Only the opener differs between the two cases; both return objects
    # usable in a with statement, so the file is closed on exit.
    opener = codecs.open if 'encoding' in kwargs else open
    with opener(handleish, mode, **kwargs) as fp:
        yield fp
81
83 """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).
84
85 This functionality is used by the Bio.SeqIO and Bio.SearchIO index
86 and index_db functions.
87 """
88 handle = open(filename, "rb")
89 import bgzf
90 try:
91 return bgzf.BgzfReader(mode="rb", fileobj=handle)
92 except ValueError, e:
93 assert "BGZF" in str(e)
94
95 handle.seek(0)
96 return handle
97
100 """A Python handle that adds functionality for saving lines.
101
102 Saves lines in a LIFO fashion.
103
104 Added methods:
105 saveline Save a line to be returned next time.
106 peekline Peek at the next line without consuming it.
107
108 """
110 self._handle = handle
111 self._saved = []
112
115
121
123 lines = self._saved + self._handle.readlines(*args, **keywds)
124 self._saved = []
125 return lines
126
128 if self._saved:
129 line = self._saved.pop(0)
130 else:
131 line = self._handle.readline(*args, **keywds)
132 return line
133
def read(self, size=-1):
    """Read up to size characters, serving any saved lines first.

    Text held in self._saved is returned (and removed from the saved
    list) before anything is read from the wrapped handle.

    - size - Maximum number of characters to return. A negative value
             or None means "everything": all saved lines followed by
             the rest of the wrapped handle.

    Returns the saved text followed by whatever the wrapped handle's
    read method returned.
    """
    if size is None or size < 0:
        # Unbounded read. Any negative size (not just -1) and None must
        # drain the saved lines too, otherwise they would be silently
        # skipped while the handle itself is read to the end.
        saved = "".join(self._saved)
        self._saved[:] = []
        return saved + self._handle.read(-1)

    pieces = []
    while size > 0 and self._saved:
        head = self._saved[0]
        if len(head) <= size:
            # The whole saved line fits in the remaining budget.
            size -= len(head)
            pieces.append(self._saved.pop(0))
        else:
            # Only part of the line fits: return the front and keep the
            # remainder saved for the next read.
            pieces.append(head[:size])
            self._saved[0] = head[size:]
            size = 0
    # Whatever budget is left (possibly zero) comes from the real handle.
    return "".join(pieces) + self._handle.read(size)
149
151 if line:
152 self._saved = [line] + self._saved
153
155 if self._saved:
156 line = self._saved[0]
157 else:
158 line = self._handle.readline()
159 self.saveline(line)
160 return line
161
163 lengths = map(len, self._saved)
164 sum = reduce(lambda x, y: x+y, lengths, 0)
165 return self._handle.tell() - sum
166
def seek(self, *args):
    """Move the wrapped handle, forgetting any saved lines.

    The saved lines are dropped before the wrapped handle's seek is
    called with the arguments given here. Nothing is returned.
    """
    wrapped = self._handle
    self._saved = list()
    wrapped.seek(*args)
170
172 return getattr(self._handle, attr)
173
176
177 - def __exit__(self, type, value, traceback):
179
185 """Base class for file format specific random access (PRIVATE).
186
187 This is subclassed in both Bio.SeqIO for indexing as SeqRecord
188 objects, and in Bio.SearchIO for indexing QueryResult objects.
189
190 Subclasses for each file format should define '__iter__', 'get'
191 and optionally 'get_raw' methods.
192 """
193
195 """Returns (identifier, offset, length in bytes) tuples.
196
197 The length can be zero where it is not implemented or not
198 possible for a particular file format.
199 """
200 raise NotImplementedError("Subclass should implement this")
201
def get(self, offset):
    """Return the parsed object for the entry at the given offset.

    This version is only a placeholder and always raises
    NotImplementedError; format specific subclasses are expected to
    provide the real implementation.
    """
    message = "Subclass should implement this"
    raise NotImplementedError(message)
207
209 """Returns bytes string (if implemented for this file format)."""
210
211 raise NotImplementedError("Not available for this file format.")
212
215 """Read only dictionary interface to a sequential record file.
216
217 This code is used in both Bio.SeqIO for indexing as SeqRecord
218 objects, and in Bio.SearchIO for indexing QueryResult objects.
219
220 Keeps the keys and associated file offsets in memory, reads the file
221 to access entries as objects parsing them on demand. This approach
222 is memory limited, but will work even with millions of records.
223
224 Note duplicate keys are not allowed. If this happens, a ValueError
225 exception is raised.
226
227 As used in Bio.SeqIO, by default the SeqRecord's id string is used
228 as the dictionary key. In Bio.SearchIO, the query's id string is
229 used. This can be changed by supplying an optional key_function,
230 a callback function which will be given the record id and must
231 return the desired key. For example, this allows you to parse
232 NCBI style FASTA identifiers, and extract the GI number to use
233 as the dictionary key.
234
235 Note that this dictionary is essentially read only. You cannot
236 add or change values, pop values, nor clear the dictionary.
237 """
238 - def __init__(self, random_access_proxy, key_function,
239 repr, obj_repr):
240
241 self._proxy = random_access_proxy
242 self._key_function = key_function
243 self._repr = repr
244 self._obj_repr = obj_repr
245 if key_function:
246 offset_iter = (
247 (key_function(k), o, l) for (k, o, l) in random_access_proxy)
248 else:
249 offset_iter = random_access_proxy
250 offsets = {}
251 for key, offset, length in offset_iter:
252
253
254
255
256
257
258
259
260
261 if key in offsets:
262 self._proxy._handle.close()
263 raise ValueError("Duplicate key '%s'" % key)
264 else:
265 offsets[key] = offset
266 self._offsets = offsets
267
270
272
273 if self:
274 return "{%r : %s(...), ...}" % (self.keys()[0], self._obj_repr)
275 else:
276 return "{}"
277
279 return key in self._offsets
280
282 """How many records are there?"""
283 return len(self._offsets)
284
285 if hasattr(dict, "iteritems"):
286
288 """Would be a list of the SeqRecord objects, but not implemented.
289
290 In general you can be indexing very very large files, with millions
291 of sequences. Loading all these into memory at once as SeqRecord
292 objects would (probably) use up all the RAM. Therefore we simply
293 don't support this dictionary method.
294 """
295 raise NotImplementedError("Due to memory concerns, when indexing a "
296 "sequence file you cannot access all the "
297 "records at once.")
298
300 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
301
302 In general you can be indexing very very large files, with millions
303 of sequences. Loading all these into memory at once as SeqRecord
304 objects would (probably) use up all the RAM. Therefore we simply
305 don't support this dictionary method.
306 """
307 raise NotImplementedError("Due to memory concerns, when indexing a "
308 "sequence file you cannot access all the "
309 "records at once.")
310
312 """Return a list of all the keys (SeqRecord identifiers)."""
313
314 return self._offsets.keys()
315
317 """Iterate over the SeqRecord items."""
318 for key in self.__iter__():
319 yield self.__getitem__(key)
320
322 """Iterate over the (key, SeqRecord) items."""
323 for key in self.__iter__():
324 yield key, self.__getitem__(key)
325
327 """Iterate over the keys."""
328 return self.__iter__()
329
330 else:
331
333 """Iterate over the (key, SeqRecord) items."""
334 for key in self.__iter__():
335 yield key, self.__getitem__(key)
336
338 """Iterate over the SeqRecord items."""
339 for key in self.__iter__():
340 yield self.__getitem__(key)
341
343 """Iterate over the keys."""
344 return self.__iter__()
345
347 """Iterate over the keys."""
348 return iter(self._offsets)
349
351 """x.__getitem__(y) <==> x[y]"""
352
353 record = self._proxy.get(self._offsets[key])
354 if self._key_function:
355 key2 = self._key_function(record.id)
356 else:
357 key2 = record.id
358 if key != key2:
359 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
360 return record
361
def get(self, k, d=None):
    """Return the record stored under key k, or d if k is missing.

    Mirrors dict.get: a KeyError from the item lookup is turned into
    the fallback value d, which defaults to None.
    """
    try:
        record = self.__getitem__(k)
    except KeyError:
        return d
    return record
368
370 """Similar to the get method, but returns the record as a raw string.
371
372 If the key is not found, a KeyError exception is raised.
373
374 Note that on Python 3 a bytes string is returned, not a typical
375 unicode string.
376
377 NOTE - This functionality is not supported for every file format.
378 """
379
380 return self._proxy.get_raw(self._offsets[key])
381
383 """Would allow setting or replacing records, but not implemented."""
384 raise NotImplementedError("An indexed a sequence file is read only.")
385
def update(self, *args, **kwargs):
    """Refuse to add or change values; always raises NotImplementedError."""
    message = "An indexed a sequence file is read only."
    raise NotImplementedError(message)
389
def pop(self, key, default=None):
    """Refuse to remove a record; always raises NotImplementedError."""
    message = "An indexed a sequence file is read only."
    raise NotImplementedError(message)
393
395 """Would remove and return a SeqRecord, but not implemented."""
396 raise NotImplementedError("An indexed a sequence file is read only.")
397
399 """Would clear dictionary, but not implemented."""
400 raise NotImplementedError("An indexed a sequence file is read only.")
401
403 """A dictionary method which we don't implement."""
404 raise NotImplementedError("An indexed a sequence file doesn't "
405 "support this.")
406
408 """A dictionary method which we don't implement."""
409 raise NotImplementedError("An indexed a sequence file doesn't "
410 "support this.")
411
414 """Read only dictionary interface to many sequential record files.
415
416 This code is used in both Bio.SeqIO for indexing as SeqRecord
417 objects, and in Bio.SearchIO for indexing QueryResult objects.
418
419 Keeps the keys, file-numbers and offsets in an SQLite database. To access
420 a record by key, reads from the offset in the appropriate file and then
421 parses the record into an object.
422
423 There are OS limits on the number of files that can be open at once,
424 so a pool is kept. If a record is required from a closed file, then
425 one of the open handles is closed first.
426 """
427 - def __init__(self, index_filename, filenames,
428 proxy_factory, format,
429 key_function, repr, max_open=10):
430 self._proxy_factory = proxy_factory
431 self._repr = repr
432 random_access_proxies = {}
433
434
435
436
437 if not _sqlite:
438
439 from Bio import MissingPythonDependencyError
440 raise MissingPythonDependencyError("Requires sqlite3, which is "
441 "included Python 2.5+")
442 if filenames is not None:
443 filenames = list(filenames)
444 if os.path.isfile(index_filename):
445
446 con = _sqlite.connect(index_filename)
447 self._con = con
448
449 try:
450 count, = con.execute(
451 "SELECT value FROM meta_data WHERE key=?;",
452 ("count",)).fetchone()
453 self._length = int(count)
454 if self._length == -1:
455 con.close()
456 raise ValueError("Unfinished/partial database")
457 count, = con.execute(
458 "SELECT COUNT(key) FROM offset_data;").fetchone()
459 if self._length != int(count):
460 con.close()
461 raise ValueError("Corrupt database? %i entries not %i"
462 % (int(count), self._length))
463 self._format, = con.execute(
464 "SELECT value FROM meta_data WHERE key=?;",
465 ("format",)).fetchone()
466 if format and format != self._format:
467 con.close()
468 raise ValueError("Index file says format %s, not %s"
469 % (self._format, format))
470 self._filenames = [row[0] for row in
471 con.execute("SELECT name FROM file_data "
472 "ORDER BY file_number;").fetchall()]
473 if filenames and len(filenames) != len(self._filenames):
474 con.close()
475 raise ValueError("Index file says %i files, not %i"
476 % (len(self._filenames), len(filenames)))
477 if filenames and filenames != self._filenames:
478 con.close()
479 raise ValueError("Index file has different filenames")
480 except _OperationalError, err:
481 con.close()
482 raise ValueError("Not a Biopython index database? %s" % err)
483
484 if not proxy_factory(self._format):
485 con.close()
486 raise ValueError("Unsupported format '%s'" % self._format)
487 else:
488 self._filenames = filenames
489 self._format = format
490 if not format or not filenames:
491 raise ValueError("Filenames to index and format required")
492 if not proxy_factory(format):
493 raise ValueError("Unsupported format '%s'" % format)
494
495 con = _sqlite.connect(index_filename)
496 self._con = con
497
498
499 con.execute("PRAGMA synchronous=OFF")
500 con.execute("PRAGMA locking_mode=EXCLUSIVE")
501
502
503
504 con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
505 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
506 ("count", -1))
507 con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
508 ("format", format))
509
510
511 con.execute(
512 "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
513 con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
514 count = 0
515 for i, filename in enumerate(filenames):
516 con.execute(
517 "INSERT INTO file_data (file_number, name) VALUES (?,?);",
518 (i, filename))
519 random_access_proxy = proxy_factory(format, filename)
520 if key_function:
521 offset_iter = ((key_function(
522 k), i, o, l) for (k, o, l) in random_access_proxy)
523 else:
524 offset_iter = (
525 (k, i, o, l) for (k, o, l) in random_access_proxy)
526 while True:
527 batch = list(itertools.islice(offset_iter, 100))
528 if not batch:
529 break
530
531
532 con.executemany(
533 "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
534 batch)
535 con.commit()
536 count += len(batch)
537 if len(random_access_proxies) < max_open:
538 random_access_proxies[i] = random_access_proxy
539 else:
540 random_access_proxy._handle.close()
541 self._length = count
542
543 try:
544 con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
545 "key_index ON offset_data(key);")
546 except _IntegrityError, err:
547 self._proxies = random_access_proxies
548 self.close()
549 con.close()
550 raise ValueError("Duplicate key? %s" % err)
551 con.execute("PRAGMA locking_mode=NORMAL")
552 con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
553 (count, "count"))
554 con.commit()
555
556 self._proxies = random_access_proxies
557 self._max_open = max_open
558 self._index_filename = index_filename
559 self._key_function = key_function
560
563
565 return bool(
566 self._con.execute("SELECT key FROM offset_data WHERE key=?;",
567 (key,)).fetchone())
568
570 """How many records are there?"""
571 return self._length
572
573
575 """Iterate over the keys."""
576 for row in self._con.execute("SELECT key FROM offset_data;"):
577 yield str(row[0])
578
579 if hasattr(dict, "iteritems"):
580
581
583 """Return a list of all the keys (SeqRecord identifiers)."""
584 return [str(row[0]) for row in
585 self._con.execute("SELECT key FROM offset_data;").fetchall()]
586
588 """x.__getitem__(y) <==> x[y]"""
589
590 row = self._con.execute(
591 "SELECT file_number, offset FROM offset_data WHERE key=?;",
592 (key,)).fetchone()
593 if not row:
594 raise KeyError
595 file_number, offset = row
596 proxies = self._proxies
597 if file_number in proxies:
598 record = proxies[file_number].get(offset)
599 else:
600 if len(proxies) >= self._max_open:
601
602 proxies.popitem()[1]._handle.close()
603
604 proxy = self._proxy_factory(self._format, self._filenames[file_number])
605 record = proxy.get(offset)
606 proxies[file_number] = proxy
607 if self._key_function:
608 key2 = self._key_function(record.id)
609 else:
610 key2 = record.id
611 if key != key2:
612 raise ValueError("Key did not match (%s vs %s)" % (key, key2))
613 return record
614
def get(self, k, d=None):
    """Return the record stored under key k, or d if k is missing.

    Mirrors dict.get: a KeyError from the item lookup is turned into
    the fallback value d, which defaults to None.
    """
    try:
        record = self.__getitem__(k)
    except KeyError:
        record = d
    return record
621
623 """Similar to the get method, but returns the record as a raw string.
624
625 If the key is not found, a KeyError exception is raised.
626
627 Note that on Python 3 a bytes string is returned, not a typical
628 unicode string.
629
630 NOTE - This functionality is not supported for every file format.
631 """
632
633 row = self._con.execute(
634 "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
635 (key,)).fetchone()
636 if not row:
637 raise KeyError
638 file_number, offset, length = row
639 proxies = self._proxies
640 if file_number in proxies:
641 if length:
642
643 h = proxies[file_number]._handle
644 h.seek(offset)
645 return h.read(length)
646 else:
647 return proxies[file_number].get_raw(offset)
648 else:
649
650 if len(proxies) >= self._max_open:
651
652 proxies.popitem()[1]._handle.close()
653
654 proxy = self._proxy_factory(self._format, self._filenames[file_number])
655 proxies[file_number] = proxy
656 if length:
657
658 h = proxy._handle
659 h.seek(offset)
660 return h.read(length)
661 else:
662 return proxy.get_raw(offset)
663
665 """Close any open file handles."""
666 proxies = self._proxies
667 while proxies:
668 proxies.popitem()[1]._handle.close()
669