Package Bio :: Module File
[hide private]
[frames] | [no frames]

Source Code for Module Bio.File

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2009-2015 by Peter Cock. All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Code for more fancy file handles. 
  9   
 10  Classes: 
 11   - UndoHandle     File object decorator with support for undo-like operations. 
 12   
 13  Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing 
 14  files are also defined under Bio.File but these are not intended for direct 
 15  use. 
 16  """ 
 17   
 18  from __future__ import print_function 
 19   
 20  import codecs 
 21  import os 
 22  import sys 
 23  import contextlib 
 24  import itertools 
 25   
 26  from Bio._py3k import basestring 
 27   
 28  try: 
 29      from collections import UserDict as _dict_base 
 30  except ImportError: 
 31      from UserDict import DictMixin as _dict_base 
 32   
 33  try: 
 34      from sqlite3 import dbapi2 as _sqlite 
 35      from sqlite3 import IntegrityError as _IntegrityError 
 36      from sqlite3 import OperationalError as _OperationalError 
 37  except ImportError: 
 38      # Not present on Jython, but should be included in Python 2.5 
 39      # or later (unless compiled from source without its dependencies) 
 40      # Still want to offer in-memory indexing. 
 41      _sqlite = None 
 42      pass 
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to
    SeqIO and AlignIO read, write, and parse methods: either file objects
    or strings.

    When given a string, returns a file handle open to handleish with the
    provided mode which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a string or file handle
     - mode - Mode to open handleish (used only if handleish is a string)
     - kwargs - Further arguments to pass to open(...)

    Example:

    >>> with as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    False
    >>> fp.close()

    Note that if the mode argument includes U (for universal new lines)
    this will be removed under Python 3 where it is redundant and has
    been deprecated (this happens automatically in text mode).
    """
    if not isinstance(handleish, basestring):
        # Already a handle (or handle-like object); the caller owns it,
        # so pass it through without closing on exit.
        yield handleish
        return
    if "U" in mode and sys.version_info[0] >= 3:
        # Universal newlines is implicit (and deprecated) on Python 3.
        mode = mode.replace("U", "")
    # codecs.open understands the encoding keyword on both Python 2 and 3.
    opener = codecs.open if 'encoding' in kwargs else open
    with opener(handleish, mode, **kwargs) as fp:
        yield fp
def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.

    Returns a BgzfReader wrapping the handle if the file is BGZF
    compressed, otherwise the plain binary handle rewound to the start.
    """
    from . import bgzf
    handle = open(filename, "rb")
    try:
        return bgzf.BgzfReader(mode="rb", fileobj=handle)
    except ValueError as err:
        # BgzfReader raises ValueError mentioning BGZF for non-BGZF input.
        assert "BGZF" in str(err)
        # Not a BGZF file after all, rewind to start:
        handle.seek(0)
        return handle
class UndoHandle(object):
    """A Python handle that adds functionality for saving lines.

    Saves lines in a LIFO fashion.

    Added methods:
     - saveline    Save a line to be returned next time.
     - peekline    Peek at the next line without consuming it.

    """

    def __init__(self, handle):
        """Initialize the class."""
        self._handle = handle
        self._saved = []  # LIFO stack of pushed-back lines, front first
        try:
            # If wrapping an online handle, this is nice to have:
            self.url = handle.url
        except AttributeError:
            pass

    def __iter__(self):
        """Iterate over the lines in the File."""
        return self

    def __next__(self):
        """Return the next line."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def readlines(self, *args, **keywds):
        """Read all the lines from the file as a list of strings."""
        remaining = self._handle.readlines(*args, **keywds)
        result = self._saved + remaining
        self._saved = []
        return result

    def readline(self, *args, **keywds):
        """Read the next line from the file as string."""
        # Any pushed-back line takes priority over the wrapped handle.
        if self._saved:
            return self._saved.pop(0)
        return self._handle.readline(*args, **keywds)

    def read(self, size=-1):
        """Read the File."""
        if size == -1:
            # Drain everything saved, then the rest of the handle.
            saved = "".join(self._saved)
            del self._saved[:]
        else:
            # Consume saved text first, decrementing size as we go.
            chunks = []
            while size > 0 and self._saved:
                head = self._saved[0]
                if len(head) <= size:
                    size -= len(head)
                    chunks.append(self._saved.pop(0))
                else:
                    chunks.append(head[:size])
                    self._saved[0] = head[size:]
                    size = 0
            saved = "".join(chunks)
        return saved + self._handle.read(size)

    def saveline(self, line):
        """Store a line in the cache memory for later use.

        This acts to undo a readline, reflecting the name of the class: UndoHandle.
        """
        # Falsy lines (e.g. "" at EOF) are deliberately not saved.
        if line:
            self._saved.insert(0, line)

    def peekline(self):
        """Return the next line in the file, but do not move forward though the file."""
        if self._saved:
            return self._saved[0]
        line = self._handle.readline()
        self.saveline(line)
        return line

    def tell(self):
        """Return the current position of the file read/write pointer within the File."""
        # The wrapped handle is ahead by however much we have saved.
        return self._handle.tell() - sum(len(line) for line in self._saved)

    def seek(self, *args):
        """Set the current position at the offset specified."""
        del self._saved[:]
        self._handle.seek(*args)

    def __getattr__(self, attr):
        """Return File attribute."""
        # Delegate anything we don't define to the wrapped handle.
        return getattr(self._handle, attr)

    def __enter__(self):
        """Call special method when opening the file using a with-statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Call special method when closing the file using a with-statement."""
        self._handle.close()
216 217 # The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO 218 # for indexing 219 220 -class _IndexedSeqFileProxy(object):
221 """Base class for file format specific random access (PRIVATE). 222 223 This is subclasses in both Bio.SeqIO for indexing as SeqRecord 224 objects, and in Bio.SearchIO for indexing QueryResult objects. 225 226 Subclasses for each file format should define '__iter__', 'get' 227 and optionally 'get_raw' methods. 228 """ 229
230 - def __iter__(self):
231 """Return (identifier, offset, length in bytes) tuples. 232 233 The length can be zero where it is not implemented or not 234 possible for a particular file format. 235 """ 236 raise NotImplementedError("Subclass should implement this")
237
238 - def get(self, offset):
239 """Return parsed object for this entry.""" 240 # Most file formats with self contained records can be handled by 241 # parsing StringIO(_bytes_to_string(self.get_raw(offset))) 242 raise NotImplementedError("Subclass should implement this")
243
244 - def get_raw(self, offset):
245 """Return the raw record from the file as a bytes string (if implemented). 246 247 If the key is not found, a KeyError exception is raised. 248 249 This may not have been implemented for all file formats. 250 """ 251 # Should be done by each sub-class (if possible) 252 raise NotImplementedError("Not available for this file format.")
253
254 255 -class _IndexedSeqFileDict(_dict_base):
256 """Read only dictionary interface to a sequential record file. 257 258 This code is used in both Bio.SeqIO for indexing as SeqRecord 259 objects, and in Bio.SearchIO for indexing QueryResult objects. 260 261 Keeps the keys and associated file offsets in memory, reads the file 262 to access entries as objects parsing them on demand. This approach 263 is memory limited, but will work even with millions of records. 264 265 Note duplicate keys are not allowed. If this happens, a ValueError 266 exception is raised. 267 268 As used in Bio.SeqIO, by default the SeqRecord's id string is used 269 as the dictionary key. In Bio.SearchIO, the query's id string is 270 used. This can be changed by suppling an optional key_function, 271 a callback function which will be given the record id and must 272 return the desired key. For example, this allows you to parse 273 NCBI style FASTA identifiers, and extract the GI number to use 274 as the dictionary key. 275 276 Note that this dictionary is essentially read only. You cannot 277 add or change values, pop values, nor clear the dictionary. 278 """ 279
280 - def __init__(self, random_access_proxy, key_function, 281 repr, obj_repr):
282 """Initialize the class.""" 283 # Use key_function=None for default value 284 self._proxy = random_access_proxy 285 self._key_function = key_function 286 self._repr = repr 287 self._obj_repr = obj_repr 288 if key_function: 289 offset_iter = ( 290 (key_function(k), o, l) for (k, o, l) in random_access_proxy) 291 else: 292 offset_iter = random_access_proxy 293 offsets = {} 294 for key, offset, length in offset_iter: 295 # Note - we don't store the length because I want to minimise the 296 # memory requirements. With the SQLite backend the length is kept 297 # and is used to speed up the get_raw method (by about 3 times). 298 # The length should be provided by all the current backends except 299 # SFF where there is an existing Roche index we can reuse (very fast 300 # but lacks the record lengths) 301 # assert length or format in ["sff", "sff-trim"], \ 302 # "%s at offset %i given length %r (%s format %s)" \ 303 # % (key, offset, length, filename, format) 304 if key in offsets: 305 self._proxy._handle.close() 306 raise ValueError("Duplicate key '%s'" % key) 307 else: 308 offsets[key] = offset 309 self._offsets = offsets
310
311 - def __repr__(self):
312 """Return a string representation of the File object.""" 313 return self._repr
314
315 - def __str__(self):
316 """Create a string representation of the File object.""" 317 # TODO - How best to handle the __str__ for SeqIO and SearchIO? 318 if self: 319 return "{%r : %s(...), ...}" % (list(self.keys())[0], self._obj_repr) 320 else: 321 return "{}"
322
323 - def __contains__(self, key):
324 """Return key if contained in the offsets dictionary.""" 325 return key in self._offsets
326
327 - def __len__(self):
328 """Return the number of records.""" 329 return len(self._offsets)
330
331 - def items(self):
332 """Iterate over the (key, SeqRecord) items. 333 334 This tries to act like a Python 3 dictionary, and does not return 335 a list of (key, value) pairs due to memory concerns. 336 """ 337 for key in self.__iter__(): 338 yield key, self.__getitem__(key)
339
340 - def values(self):
341 """Iterate over the SeqRecord items. 342 343 This tries to act like a Python 3 dictionary, and does not return 344 a list of value due to memory concerns. 345 """ 346 for key in self.__iter__(): 347 yield self.__getitem__(key)
348
349 - def keys(self):
350 """Iterate over the keys. 351 352 This tries to act like a Python 3 dictionary, and does not return 353 a list of keys due to memory concerns. 354 """ 355 return self.__iter__()
356 357 if hasattr(dict, "iteritems"): 358 # Python 2, also define iteritems etc
359 - def itervalues(self):
360 """Iterate over the SeqRecord) items.""" 361 for key in self.__iter__(): 362 yield self.__getitem__(key)
363
364 - def iteritems(self):
365 """Iterate over the (key, SeqRecord) items.""" 366 for key in self.__iter__(): 367 yield key, self.__getitem__(key)
368
369 - def iterkeys(self):
370 """Iterate over the keys.""" 371 return self.__iter__()
372
373 - def __iter__(self):
374 """Iterate over the keys.""" 375 return iter(self._offsets)
376
377 - def __getitem__(self, key):
378 """Return record for the specified key.""" 379 # Pass the offset to the proxy 380 record = self._proxy.get(self._offsets[key]) 381 if self._key_function: 382 key2 = self._key_function(record.id) 383 else: 384 key2 = record.id 385 if key != key2: 386 raise ValueError("Key did not match (%s vs %s)" % (key, key2)) 387 return record
388
389 - def get(self, k, d=None):
390 """Return the value in the dictionary. 391 392 If the key (k) is not found, this returns None unless a 393 default (d) is specified. 394 """ 395 try: 396 return self.__getitem__(k) 397 except KeyError: 398 return d
399
400 - def get_raw(self, key):
401 """Return the raw record from the file as a bytes string. 402 403 If the key is not found, a KeyError exception is raised. 404 """ 405 # Pass the offset to the proxy 406 return self._proxy.get_raw(self._offsets[key])
407
408 - def __setitem__(self, key, value):
409 """Would allow setting or replacing records, but not implemented. 410 411 Python dictionaries provide this method for modifying data in the 412 dictionary. This class mimics the dictionary interface but is read only. 413 """ 414 raise NotImplementedError("An indexed a sequence file is read only.")
415
416 - def update(self, *args, **kwargs):
417 """Would allow adding more values, but not implemented. 418 419 Python dictionaries provide this method for modifying data in the 420 dictionary. This class mimics the dictionary interface but is read only. 421 """ 422 raise NotImplementedError("An indexed a sequence file is read only.")
423
424 - def pop(self, key, default=None):
425 """Would remove specified record, but not implemented. 426 427 Python dictionaries provide this method for modifying data in the 428 dictionary. This class mimics the dictionary interface but is read only. 429 """ 430 raise NotImplementedError("An indexed a sequence file is read only.")
431
432 - def popitem(self):
433 """Would remove and return a SeqRecord, but not implemented. 434 435 Python dictionaries provide this method for modifying data in the 436 dictionary. This class mimics the dictionary interface but is read only. 437 """ 438 raise NotImplementedError("An indexed a sequence file is read only.")
439
440 - def clear(self):
441 """Would clear dictionary, but not implemented. 442 443 Python dictionaries provide this method for modifying data in the 444 dictionary. This class mimics the dictionary interface but is read only. 445 """ 446 raise NotImplementedError("An indexed a sequence file is read only.")
447
448 - def fromkeys(self, keys, value=None):
449 """Would return a new dictionary with keys and values, but not implemented. 450 451 Python dictionaries provide this method for modifying data in the 452 dictionary. This class mimics the dictionary interface but is read only. 453 """ 454 raise NotImplementedError("An indexed a sequence file doesn't " 455 "support this.")
456
457 - def copy(self):
458 """Would copy a dictionary, but not implemented. 459 460 Python dictionaries provide this method for modifying data in the 461 dictionary. This class mimics the dictionary interface but is read only. 462 """ 463 raise NotImplementedError("An indexed a sequence file doesn't " 464 "support this.")
465
466 - def close(self):
467 """Close the file handle being used to read the data. 468 469 Once called, further use of the index won't work. The sole purpose 470 of this method is to allow explicit handle closure - for example 471 if you wish to delete the file, on Windows you must first close 472 all open handles to that file. 473 """ 474 self._proxy._handle.close()
475
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool is kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """

    def __init__(self, index_filename, filenames,
                 proxy_factory, format,
                 key_function, repr, max_open=10):
        """Initialize the class.

        Arguments:
         - index_filename - SQLite database file; loaded if it exists,
           otherwise built from the given filenames.
         - filenames - iterable of sequence filenames to index (may be
           None when reusing an existing index).
         - proxy_factory - callable giving a _IndexedSeqFileProxy style
           object for a (format, filename) pair; called with just the
           format to check it is supported.
         - format - file format name stored in / checked against the DB.
         - key_function - optional callable mapping record id to key.
         - repr - string returned by __repr__.
         - max_open - maximum number of file handles kept open at once.
        """
        # TODO? - Don't keep filename list in memory (just in DB)?
        # Should save a chunk of memory if dealing with 1000s of files.
        # Furthermore could compare a generator to the DB on reloading
        # (no need to turn it into a list)

        if not _sqlite:
            # Hack for Jython (or if Python is compiled without it)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator

        # Cache the arguments as private variables
        self._index_filename = index_filename
        self._filenames = filenames
        self._format = format
        self._key_function = key_function
        self._proxy_factory = proxy_factory
        self._repr = repr
        self._max_open = max_open
        self._proxies = {}  # file_number -> open proxy (bounded pool)

        # Note if using SQLite :memory: trick index filename, this will
        # give $PWD as the relative path (which is fine).
        self._relative_path = os.path.abspath(os.path.dirname(index_filename))

        if os.path.isfile(index_filename):
            self._load_index()
        else:
            self._build_index()

    def _load_index(self):
        """Call from __init__ to re-use an existing index (PRIVATE).

        Validates the stored record count, format and filenames against
        the arguments given to __init__, raising ValueError on mismatch.
        """
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        format = self._format
        proxy_factory = self._proxy_factory

        con = _sqlite.connect(index_filename)
        self._con = con
        # Check the count...
        try:
            # A stored count of -1 means _build_index never finished.
            count, = con.execute(
                "SELECT value FROM meta_data WHERE key=?;",
                ("count",)).fetchone()
            self._length = int(count)
            if self._length == -1:
                con.close()
                raise ValueError("Unfinished/partial database")
            # Cross-check the stored count against the actual table size.
            count, = con.execute(
                "SELECT COUNT(key) FROM offset_data;").fetchone()
            if self._length != int(count):
                con.close()
                raise ValueError("Corrupt database? %i entries not %i"
                                 % (int(count), self._length))
            self._format, = con.execute(
                "SELECT value FROM meta_data WHERE key=?;",
                ("format",)).fetchone()
            if format and format != self._format:
                con.close()
                raise ValueError("Index file says format %s, not %s"
                                 % (self._format, format))
            try:
                # Newer indexes store paths relative to the index file;
                # older ones were relative to the working directory.
                filenames_relative_to_index, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("filenames_relative_to_index",)).fetchone()
                filenames_relative_to_index = (filenames_relative_to_index.upper() == "TRUE")
            except TypeError:
                # Original behaviour, assume if meta_data missing
                filenames_relative_to_index = False
            self._filenames = [row[0] for row in
                               con.execute("SELECT name FROM file_data "
                                           "ORDER BY file_number;").fetchall()]
            if filenames_relative_to_index:
                # Not implicitly relative to $PWD, explicitly relative to index file
                relative_path = os.path.abspath(os.path.dirname(index_filename))
                tmp = []
                for f in self._filenames:
                    if os.path.isabs(f):
                        tmp.append(f)
                    else:
                        # Would be stored with Unix / path separator, so convert
                        # it to the local OS path separator here:
                        tmp.append(os.path.join(relative_path, f.replace("/", os.path.sep)))
                self._filenames = tmp
                del tmp
            if filenames and len(filenames) != len(self._filenames):
                con.close()
                raise ValueError("Index file says %i files, not %i"
                                 % (len(self._filenames), len(filenames)))
            if filenames and filenames != self._filenames:
                for old, new in zip(self._filenames, filenames):
                    # Want exact match (after making relative to the index above)
                    if os.path.abspath(old) != os.path.abspath(new):
                        con.close()
                        if filenames_relative_to_index:
                            raise ValueError("Index file has different filenames, e.g. %r != %r"
                                             % (os.path.abspath(old), os.path.abspath(new)))
                        else:
                            raise ValueError("Index file has different filenames "
                                             "[This is an old index where any relative paths "
                                             "were relative to the original working directory]. "
                                             "e.g. %r != %r"
                                             % (os.path.abspath(old), os.path.abspath(new)))
                # Filenames are equal (after imposing abspath)
        except _OperationalError as err:
            con.close()
            raise ValueError("Not a Biopython index database? %s" % err)
        # Now we have the format (from the DB if not given to us),
        if not proxy_factory(self._format):
            con.close()
            raise ValueError("Unsupported format '%s'" % self._format)

    def _build_index(self):
        """Call from __init__ to create a new index (PRIVATE).

        Scans every file via its proxy, inserting (key, file_number,
        offset, length) rows in batches, then creates a unique key index
        (which doubles as the duplicate-key check).
        """
        index_filename = self._index_filename
        relative_path = self._relative_path
        filenames = self._filenames
        format = self._format
        key_function = self._key_function
        proxy_factory = self._proxy_factory
        max_open = self._max_open
        random_access_proxies = self._proxies

        if not format or not filenames:
            raise ValueError("Filenames to index and format required to build %r" % index_filename)
        if not proxy_factory(format):
            raise ValueError("Unsupported format '%s'" % format)
        # Create the index
        con = _sqlite.connect(index_filename)
        self._con = con
        # print("Creating index")
        # Sqlite PRAGMA settings for speed
        con.execute("PRAGMA synchronous=OFF")
        con.execute("PRAGMA locking_mode=EXCLUSIVE")
        # Don't index the key column until the end (faster)
        # con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
        # "offset INTEGER);")
        con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
        # Count of -1 marks the database as unfinished until the end.
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                    ("count", -1))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                    ("format", format))
        con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                    ("filenames_relative_to_index", "True"))
        # TODO - Record the alphabet?
        # TODO - Record the file size and modified date?
        con.execute(
            "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
        con.execute("CREATE TABLE offset_data (key TEXT, "
                    "file_number INTEGER, offset INTEGER, length INTEGER);")
        count = 0
        for i, filename in enumerate(filenames):
            # Default to storing as an absolute path,
            f = os.path.abspath(filename)
            if not os.path.isabs(filename) and not os.path.isabs(index_filename):
                # Since user gave BOTH filename & index as relative paths,
                # we will store this relative to the index file even though
                # if it may now start ../ (meaning up a level)
                # Note for cross platform use (e.g. shared drive over SAMBA),
                # convert any Windows slash into Unix style for rel paths.
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
            elif (os.path.dirname(os.path.abspath(filename)) +
                  os.path.sep).startswith(relative_path + os.path.sep):
                # Since sequence file is in same directory or sub directory,
                # might as well make this into a relative path:
                f = os.path.relpath(filename, relative_path).replace(os.path.sep, "/")
                assert not f.startswith("../"), f
            # print("DEBUG - storing %r as [%r] %r" % (filename, relative_path, f))
            con.execute(
                "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                (i, f))
            random_access_proxy = proxy_factory(format, filename)
            if key_function:
                offset_iter = ((key_function(k), i, o, l)
                               for (k, o, l) in random_access_proxy)
            else:
                offset_iter = ((k, i, o, l)
                               for (k, o, l) in random_access_proxy)
            # Insert in batches of 100 to bound memory use.
            while True:
                batch = list(itertools.islice(offset_iter, 100))
                if not batch:
                    break
                # print("Inserting batch of %i offsets, %s ... %s"
                #       % (len(batch), batch[0][0], batch[-1][0]))
                con.executemany(
                    "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                    batch)
                con.commit()
                count += len(batch)
            # Keep the proxy open if the pool has room, else close it.
            if len(random_access_proxies) < max_open:
                random_access_proxies[i] = random_access_proxy
            else:
                random_access_proxy._handle.close()
        self._length = count
        # print("About to index %i entries" % count)
        try:
            # The UNIQUE index also detects duplicate keys.
            con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
                        "key_index ON offset_data(key);")
        except _IntegrityError as err:
            self._proxies = random_access_proxies
            self.close()
            con.close()
            raise ValueError("Duplicate key? %s" % err)
        con.execute("PRAGMA locking_mode=NORMAL")
        # Replace the -1 placeholder now the build is complete.
        con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
                    (count, "count"))
        con.commit()
        # print("Index created")

    def __repr__(self):
        """Return a string representation of the object."""
        return self._repr

    def __contains__(self, key):
        """Return True if the key is in the index."""
        return bool(
            self._con.execute("SELECT key FROM offset_data WHERE key=?;",
                              (key,)).fetchone())

    def __len__(self):
        """Return the number of records indexed."""
        return self._length
        # return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute("SELECT key FROM offset_data;"):
            yield str(row[0])

    if hasattr(dict, "iteritems"):
        # Python 2, use iteritems but not items etc
        # Just need to override this...
        def keys(self):
            """Iterate over the keys.

            This tries to act like a Python 3 dictionary, and does not return
            a list of keys due to memory concerns.
            """
            return [str(row[0]) for row in
                    self._con.execute("SELECT key FROM offset_data;").fetchall()]

    def __getitem__(self, key):
        """Return record for the specified key.

        Raises KeyError if the key is not in the database, and ValueError
        if the record parsed from the file does not match the key.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """Return the value in the dictionary.

        If the key (k) is not found, this returns None unless a
        default (d) is specified.
        """
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Return the raw record from the file as a bytes string.

        If the key is not found, a KeyError exception is raised.
        """
        # Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                # Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            # This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                # Close an old handle...
                proxies.popitem()[1]._handle.close()
            # Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                # Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()