Package Bio :: Module File
[hide private]
[frames] | [no frames]

Source Code for Module Bio.File

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2009-2013 by Peter Cock. All rights reserved. 
  3  # 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Code for more fancy file handles. 
  9   
 10   
 11  Classes: 
 12   
 13  UndoHandle     File object decorator with support for undo-like operations. 
 14   
 15  Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing 
 16  files are also defined under Bio.File but these are not intended for direct 
 17  use. 
 18  """ 
 19  from __future__ import print_function 
 20   
 21  import codecs 
 22  import os 
 23  import sys 
 24  import contextlib 
 25  import itertools 
 26   
 27  from Bio._py3k import basestring 
 28   
 29  try: 
 30      from collections import UserDict as _dict_base 
 31  except ImportError: 
 32      from UserDict import DictMixin as _dict_base 
 33   
 34  try: 
 35      from sqlite3 import dbapi2 as _sqlite 
 36      from sqlite3 import IntegrityError as _IntegrityError 
 37      from sqlite3 import OperationalError as _OperationalError 
 38  except ImportError: 
 39      #Not present on Jython, but should be included in Python 2.5 
 40      #or later (unless compiled from source without its dependencies) 
 41      #Still want to offer in-memory indexing. 
 42      _sqlite = None 
 43      pass 
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
    r"""Context manager for arguments that can be passed to
    SeqIO and AlignIO read, write, and parse methods: either file objects or strings.

    When given a string, returns a file handle open to handleish with provided
    mode which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    - handleish - Either a string or file handle
    - mode - Mode to open handleish (used only if handleish is a string)
    - kwargs - Further arguments to pass to open(...)

    Example:

    >>> with as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    True

    >>> handle = open('seqs.fasta', 'w')
    >>> with as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    >>> fp.closed
    False
    >>> fp.close()

    Note that if the mode argument includes U (for universal new lines)
    this will be removed under Python 3 where it is redundant and has
    been deprecated (this happens automatically in text mode).
    """
    # Note: raw docstring above so the \n in the doctest examples stays a
    # two character escape sequence rather than a literal newline.
    if isinstance(handleish, basestring):
        if sys.version_info[0] >= 3 and "U" in mode:
            # Universal newline support is automatic in Python 3 text mode,
            # and passing "U" there is deprecated.
            mode = mode.replace("U", "")
        if 'encoding' in kwargs:
            # Must use codecs.open to honour an explicit encoding on Python 2.
            with codecs.open(handleish, mode, **kwargs) as fp:
                yield fp
        else:
            with open(handleish, mode, **kwargs) as fp:
                yield fp
    else:
        # Already a handle (or handle-like object); caller keeps ownership,
        # so we deliberately do NOT close it on exit.
        yield handleish
90
def _open_for_random_access(filename):
    """Open a file in binary mode, spotting BGZF compressed files (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.
    """
    stream = open(filename, "rb")
    from . import bgzf
    try:
        # If this works, the file is BGZF compressed - wrap it.
        return bgzf.BgzfReader(mode="rb", fileobj=stream)
    except ValueError as err:
        assert "BGZF" in str(err)
        # Plain file (not BGZF after all) - rewind and return it as-is.
        stream.seek(0)
        return stream
106
class UndoHandle(object):
    """A Python handle that adds functionality for saving lines.

    Saves lines in a LIFO fashion.

    Added methods:
    saveline Save a line to be returned next time.
    peekline Peek at the next line without consuming it.

    """
    def __init__(self, handle):
        """Wrap the given file-like handle."""
        self._handle = handle
        # Pushed-back lines, served (FIFO) before reading from the handle.
        self._saved = []

    def __iter__(self):
        return self

    def __next__(self):
        # Renamed the local from 'next' to 'line' to avoid shadowing the
        # builtin next().
        line = self.readline()
        if not line:
            # Empty string means EOF for file-like objects.
            raise StopIteration
        return line

    if sys.version_info[0] < 3:
        def next(self):
            """Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def readlines(self, *args, **keywds):
        """Return any saved lines plus all remaining lines from the handle."""
        lines = self._saved + self._handle.readlines(*args, **keywds)
        self._saved = []
        return lines

    def readline(self, *args, **keywds):
        """Return the next line, preferring previously saved lines."""
        if self._saved:
            line = self._saved.pop(0)
        else:
            line = self._handle.readline(*args, **keywds)
        return line

    def read(self, size=-1):
        """Read up to size characters/bytes (everything when size is -1)."""
        if size == -1:
            # Drain all saved lines, then the rest of the handle.
            saved = "".join(self._saved)
            self._saved[:] = []
        else:
            saved = ''
            while size > 0 and self._saved:
                if len(self._saved[0]) <= size:
                    # Consume an entire saved line.
                    size = size - len(self._saved[0])
                    saved = saved + self._saved.pop(0)
                else:
                    # Consume only part of the first saved line.
                    saved = saved + self._saved[0][:size]
                    self._saved[0] = self._saved[0][size:]
                    size = 0
        return saved + self._handle.read(size)

    def saveline(self, line):
        """Push a (non-empty) line back to be returned by the next read."""
        if line:
            self._saved = [line] + self._saved

    def peekline(self):
        """Return the next line without consuming it."""
        if self._saved:
            line = self._saved[0]
        else:
            line = self._handle.readline()
            self.saveline(line)
        return line

    def tell(self):
        """Return the effective file position, allowing for saved lines."""
        return self._handle.tell() - sum(len(line) for line in self._saved)

    def seek(self, *args):
        """Move to a new file position, discarding any saved lines."""
        self._saved = []
        self._handle.seek(*args)

    def __getattr__(self, attr):
        # Delegate any other attribute access to the wrapped handle.
        return getattr(self._handle, attr)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Renamed parameters to avoid shadowing the builtin type(); the
        # context manager protocol passes these positionally.
        self._handle.close()
191
192 193 #The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO 194 #for indexing 195 196 -class _IndexedSeqFileProxy(object):
197 """Base class for file format specific random access (PRIVATE). 198 199 This is subclasses in both Bio.SeqIO for indexing as SeqRecord 200 objects, and in Bio.SearchIO for indexing QueryResult objects. 201 202 Subclasses for each file format should define '__iter__', 'get' 203 and optionally 'get_raw' methods. 204 """ 205
206 - def __iter__(self):
207 """Returns (identifier, offset, length in bytes) tuples. 208 209 The length can be zero where it is not implemented or not 210 possible for a particular file format. 211 """ 212 raise NotImplementedError("Subclass should implement this")
213
214 - def get(self, offset):
215 """Returns parsed object for this entry.""" 216 #Most file formats with self contained records can be handled by 217 #parsing StringIO(_bytes_to_string(self.get_raw(offset))) 218 raise NotImplementedError("Subclass should implement this")
219
220 - def get_raw(self, offset):
221 """Returns bytes string (if implemented for this file format).""" 222 #Should be done by each sub-class (if possible) 223 raise NotImplementedError("Not available for this file format.")
224
class _IndexedSeqFileDict(_dict_base):
    """Read only dictionary interface to a sequential record file.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys and associated file offsets in memory, reads the file
    to access entries as objects parsing them on demand. This approach
    is memory limited, but will work even with millions of records.

    Note duplicate keys are not allowed. If this happens, a ValueError
    exception is raised.

    As used in Bio.SeqIO, by default the SeqRecord's id string is used
    as the dictionary key. In Bio.SearchIO, the query's id string is
    used. This can be changed by supplying an optional key_function,
    a callback function which will be given the record id and must
    return the desired key. For example, this allows you to parse
    NCBI style FASTA identifiers, and extract the GI number to use
    as the dictionary key.

    Note that this dictionary is essentially read only. You cannot
    add or change values, pop values, nor clear the dictionary.
    """
    def __init__(self, random_access_proxy, key_function,
                 repr, obj_repr):
        """Scan the proxy once, building the key to offset mapping.

        - random_access_proxy - a _IndexedSeqFileProxy subclass instance
        - key_function - optional callable mapping record id to key,
          or None for the default (use the record id itself)
        - repr - string to return from __repr__
        - obj_repr - short value type name shown by __str__
        """
        #Use key_function=None for default value
        self._proxy = random_access_proxy
        self._key_function = key_function
        self._repr = repr
        self._obj_repr = obj_repr
        if key_function:
            offset_iter = (
                (key_function(k), o, l) for (k, o, l) in random_access_proxy)
        else:
            offset_iter = random_access_proxy
        offsets = {}
        for key, offset, length in offset_iter:
            #Note - we don't store the length because I want to minimise the
            #memory requirements. With the SQLite backend the length is kept
            #and is used to speed up the get_raw method (by about 3 times).
            #The length should be provided by all the current backends except
            #SFF where there is an existing Roche index we can reuse (very fast
            #but lacks the record lengths)
            #assert length or format in ["sff", "sff-trim"], \
            #       "%s at offset %i given length %r (%s format %s)" \
            #       % (key, offset, length, filename, format)
            if key in offsets:
                self._proxy._handle.close()
                raise ValueError("Duplicate key '%s'" % key)
            else:
                offsets[key] = offset
        self._offsets = offsets

    def __repr__(self):
        return self._repr

    def __str__(self):
        #TODO - How best to handle the __str__ for SeqIO and SearchIO?
        if self:
            return "{%r : %s(...), ...}" % (list(self.keys())[0], self._obj_repr)
        else:
            return "{}"

    def __contains__(self, key):
        return key in self._offsets

    def __len__(self):
        """How many records are there?"""
        return len(self._offsets)

    def items(self):
        """Iterate over the (key, SeqRecord) items.

        This tries to act like a Python 3 dictionary, and does not return
        a list of (key, value) pairs due to memory concerns.
        """
        for key in self.__iter__():
            yield key, self.__getitem__(key)

    def values(self):
        """Iterate over the SeqRecord items.

        This tries to act like a Python 3 dictionary, and does not return
        a list of value due to memory concerns.
        """
        for key in self.__iter__():
            yield self.__getitem__(key)

    def keys(self):
        """Iterate over the keys.

        This tries to act like a Python 3 dictionary, and does not return
        a list of keys due to memory concerns.
        """
        return self.__iter__()

    if hasattr(dict, "iteritems"):
        #Python 2, also define iteritems etc
        def itervalues(self):
            """Iterate over the SeqRecord items."""
            for key in self.__iter__():
                yield self.__getitem__(key)

        def iteritems(self):
            """Iterate over the (key, SeqRecord) items."""
            for key in self.__iter__():
                yield key, self.__getitem__(key)

        def iterkeys(self):
            """Iterate over the keys."""
            return self.__iter__()

    def __iter__(self):
        """Iterate over the keys."""
        return iter(self._offsets)

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]"""
        #Pass the offset to the proxy
        record = self._proxy.get(self._offsets[key])
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        #Sanity check that the file has not changed since indexing:
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        return self._proxy.get_raw(self._offsets[key])

    #Fixed error message typo below, was "An indexed a sequence file ..."
    def __setitem__(self, key, value):
        """Would allow setting or replacing records, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def update(self, *args, **kwargs):
        """Would allow adding more values, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def pop(self, key, default=None):
        """Would remove specified record, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def popitem(self):
        """Would remove and return a SeqRecord, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def clear(self):
        """Would clear dictionary, but not implemented."""
        raise NotImplementedError("An indexed sequence file is read only.")

    def fromkeys(self, keys, value=None):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed sequence file doesn't "
                                  "support this.")

    def copy(self):
        """A dictionary method which we don't implement."""
        raise NotImplementedError("An indexed sequence file doesn't "
                                  "support this.")

    def close(self):
        """Close the file handle being used to read the data.

        Once called, further use of the index won't work. The sole purpose
        of this method is to allow explicit handle closure - for example
        if you wish to delete the file, on Windows you must first close
        all open handles to that file.
        """
        self._proxy._handle.close()
414
class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
    """Read only dictionary interface to many sequential record files.

    This code is used in both Bio.SeqIO for indexing as SeqRecord
    objects, and in Bio.SearchIO for indexing QueryResult objects.

    Keeps the keys, file-numbers and offsets in an SQLite database. To access
    a record by key, reads from the offset in the appropriate file and then
    parses the record into an object.

    There are OS limits on the number of files that can be open at once,
    so a pool are kept. If a record is required from a closed file, then
    one of the open handles is closed first.
    """
    def __init__(self, index_filename, filenames,
                 proxy_factory, format,
                 key_function, repr, max_open=10):
        #Either reuses an existing SQLite index file (validating it against
        #the filenames/format arguments), or builds a fresh one by scanning
        #every file with the supplied proxy_factory.
        #
        # - index_filename - path of the SQLite database (reused if present)
        # - filenames - list/iterable of sequence filenames, or None to take
        #   the list stored in an existing index
        # - proxy_factory - callable, proxy_factory(format, filename) gives a
        #   _IndexedSeqFileProxy; proxy_factory(format) is used as a truthy
        #   test of format support
        # - format - file format name, or None/"" to read it from the index
        # - key_function - optional record id to key mapping (None = default)
        # - repr - string returned by __repr__
        # - max_open - maximum number of file handles kept open in the pool
        self._proxy_factory = proxy_factory
        self._repr = repr
        random_access_proxies = {}
        #TODO? - Don't keep filename list in memory (just in DB)?
        #Should save a chunk of memory if dealing with 1000s of files.
        #Furthermore could compare a generator to the DB on reloading
        #(no need to turn it into a list)
        if not _sqlite:
            # Hack for Jython (of if Python is compiled without it)
            from Bio import MissingPythonDependencyError
            raise MissingPythonDependencyError("Requires sqlite3, which is "
                                               "included Python 2.5+")
        if filenames is not None:
            filenames = list(filenames)  # In case it was a generator
        if os.path.isfile(index_filename):
            #Reuse the index.
            con = _sqlite.connect(index_filename)
            self._con = con
            #Check the count...
            try:
                #The "count" meta_data row is set to -1 while indexing and
                #only updated to the real total once indexing finished, so
                #-1 here means a previous run was interrupted.
                count, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("count",)).fetchone()
                self._length = int(count)
                if self._length == -1:
                    con.close()
                    raise ValueError("Unfinished/partial database")
                #Cross-check the stored total against the actual row count:
                count, = con.execute(
                    "SELECT COUNT(key) FROM offset_data;").fetchone()
                if self._length != int(count):
                    con.close()
                    raise ValueError("Corrupt database? %i entries not %i"
                                     % (int(count), self._length))
                self._format, = con.execute(
                    "SELECT value FROM meta_data WHERE key=?;",
                    ("format",)).fetchone()
                if format and format != self._format:
                    con.close()
                    raise ValueError("Index file says format %s, not %s"
                                     % (self._format, format))
                self._filenames = [row[0] for row in
                                   con.execute("SELECT name FROM file_data "
                                               "ORDER BY file_number;").fetchall()]
                #If the caller supplied filenames, they must match the index:
                if filenames and len(filenames) != len(self._filenames):
                    con.close()
                    raise ValueError("Index file says %i files, not %i"
                                     % (len(self._filenames), len(filenames)))
                if filenames and filenames != self._filenames:
                    con.close()
                    raise ValueError("Index file has different filenames")
            except _OperationalError as err:
                #e.g. the expected tables are missing - not our schema.
                con.close()
                raise ValueError("Not a Biopython index database? %s" % err)
            #Now we have the format (from the DB if not given to us),
            if not proxy_factory(self._format):
                con.close()
                raise ValueError("Unsupported format '%s'" % self._format)
        else:
            self._filenames = filenames
            self._format = format
            if not format or not filenames:
                raise ValueError("Filenames to index and format required")
            if not proxy_factory(format):
                raise ValueError("Unsupported format '%s'" % format)
            #Create the index
            con = _sqlite.connect(index_filename)
            self._con = con
            #print("Creating index")
            # Sqlite PRAGMA settings for speed
            con.execute("PRAGMA synchronous=OFF")
            con.execute("PRAGMA locking_mode=EXCLUSIVE")
            #Don't index the key column until the end (faster)
            #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
            #            "offset INTEGER);")
            con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
            #Store count as -1 until indexing completes, so an interrupted
            #build is detected as "Unfinished/partial database" on reuse.
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("count", -1))
            con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
                        ("format", format))
            #TODO - Record the alphabet?
            #TODO - Record the file size and modified date?
            con.execute(
                "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
            con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
            count = 0
            for i, filename in enumerate(filenames):
                con.execute(
                    "INSERT INTO file_data (file_number, name) VALUES (?,?);",
                    (i, filename))
                random_access_proxy = proxy_factory(format, filename)
                #Each proxy yields (key, offset, length); add the file number
                #to give the four columns of the offset_data table.
                if key_function:
                    offset_iter = ((key_function(
                        k), i, o, l) for (k, o, l) in random_access_proxy)
                else:
                    offset_iter = (
                        (k, i, o, l) for (k, o, l) in random_access_proxy)
                #Insert in batches of 100 to limit memory and transaction size:
                while True:
                    batch = list(itertools.islice(offset_iter, 100))
                    if not batch:
                        break
                    #print("Inserting batch of %i offsets, %s ... %s" \
                    #      % (len(batch), batch[0][0], batch[-1][0]))
                    con.executemany(
                        "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
                        batch)
                    con.commit()
                    count += len(batch)
                #Keep the handle open if there is room in the pool:
                if len(random_access_proxies) < max_open:
                    random_access_proxies[i] = random_access_proxy
                else:
                    random_access_proxy._handle.close()
            self._length = count
            #print("About to index %i entries" % count)
            try:
                #Building the UNIQUE index only now is faster than declaring
                #key as PRIMARY KEY up front; it also detects duplicates.
                con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
                            "key_index ON offset_data(key);")
            except _IntegrityError as err:
                self._proxies = random_access_proxies
                self.close()
                con.close()
                raise ValueError("Duplicate key? %s" % err)
            con.execute("PRAGMA locking_mode=NORMAL")
            #Record the final count, marking the index as complete:
            con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
                        (count, "count"))
            con.commit()
            #print("Index created")
        self._proxies = random_access_proxies
        self._max_open = max_open
        self._index_filename = index_filename
        self._key_function = key_function

    def __repr__(self):
        return self._repr

    def __contains__(self, key):
        #A single indexed lookup; fetchone() is None when key is absent.
        return bool(
            self._con.execute("SELECT key FROM offset_data WHERE key=?;",
                              (key,)).fetchone())

    def __len__(self):
        """How many records are there?"""
        #Cached at load/build time; cheaper than a COUNT(*) query.
        return self._length
        #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]

    def __iter__(self):
        """Iterate over the keys."""
        for row in self._con.execute("SELECT key FROM offset_data;"):
            yield str(row[0])

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems but not items etc
        #Just need to override this...
        def keys(self):
            """Return a list of all the keys (SeqRecord identifiers)."""
            return [str(row[0]) for row in
                    self._con.execute("SELECT key FROM offset_data;").fetchall()]

    def __getitem__(self, key):
        """x.__getitem__(y) <==> x[y]"""
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset = row
        proxies = self._proxies
        if file_number in proxies:
            record = proxies[file_number].get(offset)
        else:
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            record = proxy.get(offset)
            proxies[file_number] = proxy
        #Sanity check that the file has not changed since indexing:
        if self._key_function:
            key2 = self._key_function(record.id)
        else:
            key2 = record.id
        if key != key2:
            raise ValueError("Key did not match (%s vs %s)" % (key, key2))
        return record

    def get(self, k, d=None):
        """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
        try:
            return self.__getitem__(k)
        except KeyError:
            return d

    def get_raw(self, key):
        """Similar to the get method, but returns the record as a raw string.

        If the key is not found, a KeyError exception is raised.

        Note that on Python 3 a bytes string is returned, not a typical
        unicode string.

        NOTE - This functionality is not supported for every file format.
        """
        #Pass the offset to the proxy
        row = self._con.execute(
            "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
            (key,)).fetchone()
        if not row:
            raise KeyError
        file_number, offset, length = row
        proxies = self._proxies
        if file_number in proxies:
            if length:
                #Shortcut if we have the length
                h = proxies[file_number]._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxies[file_number].get_raw(offset)
        else:
            #This code is duplicated from __getitem__ to avoid a function call
            if len(proxies) >= self._max_open:
                #Close an old handle...
                proxies.popitem()[1]._handle.close()
            #Open a new handle...
            proxy = self._proxy_factory(self._format, self._filenames[file_number])
            proxies[file_number] = proxy
            if length:
                #Shortcut if we have the length
                h = proxy._handle
                h.seek(offset)
                return h.read(length)
            else:
                return proxy.get_raw(offset)

    def close(self):
        """Close any open file handles."""
        proxies = self._proxies
        while proxies:
            proxies.popitem()[1]._handle.close()
672