Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | [no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # Revisions 2013-2014 copyright by Tiago Antao.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Note that BioSQL (including the database schema and scripts) is 
 10  # available and licensed separately.  Please consult www.biosql.org 
 11  """Connect with a BioSQL database and load Biopython like objects from it. 
 12   
 13  This provides interfaces for loading biological objects from a relational 
 14  database, and is compatible with the BioSQL standards. 
 15  """ 
 16  import os 
 17  import sys 
 18   
 19  from Bio._py3k import _universal_read_mode 
 20  from Bio._py3k import _bytes_bytearray_to_str as bytearray_to_str 
 21  from Bio import BiopythonDeprecationWarning 
 22   
 23  from . import BioSeq 
 24  from . import Loader 
 25  from . import DBUtils 
 26   
 27   
 28  _POSTGRES_RULES_PRESENT = False  # Hack for BioSQL Bug 2839 
 29   
 30   
def open_database(driver="MySQLdb", **kwargs):
    """Open a connection to an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

        >>> from BioSQL import BioSeqDatabase
        >>> server = BioSeqDatabase.open_database(user="root", db="minidb")

    Arguments:
     - driver - The name of the database driver to use for connecting. The
       driver should implement the python DB API. By default, the MySQLdb
       driver is used.
     - user - the username to connect to the database with.
     - password, passwd - the password to connect with
     - host - the hostname of the database
     - database or db - the name of the database

    Either spelling of the database and password keywords may be given;
    they are renamed below to the form passed to the selected driver.

    Returns a DBServer object wrapping the new connection.

    Raises ValueError if the "psycopg" (version one) driver is requested,
    or if running under Jython with a driver other than "MySQLdb" or
    "psycopg2" (the only two for which a JDBC connection is built here).
    """
    global _POSTGRES_RULES_PRESENT

    if driver == "psycopg":
        raise ValueError("Using BioSQL with psycopg (version one) is no "
                         "longer supported. Use psycopg2 instead.")

    on_jython = os.name == "java"

    if on_jython:
        # Under Jython every connection goes through zxJDBC; the driver
        # name only selects the JDBC driver class and URL prefix.
        if driver == "MySQLdb":
            jdbc_driver = "com.mysql.jdbc.Driver"
            url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
        elif driver == "psycopg2":
            jdbc_driver = "org.postgresql.Driver"
            url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError because no connection was ever created.
            raise ValueError("Driver %r is not supported under Jython, "
                             "use MySQLdb or psycopg2." % driver)
        from com.ziclix.python.sql import zxJDBC
        module = zxJDBC
    else:
        module = __import__(driver, fromlist=["connect"])
    connect = module.connect

    # Different drivers use different keywords...
    kw = kwargs.copy()
    if driver in ["MySQLdb", "mysql.connector"] and not on_jython:
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")

    if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
        kw["database"] = "template1"

    if on_jython:
        if driver == "MySQLdb":
            conn = connect(url_pref + kw.get("database", "mysql"),
                           kw["user"], kw["password"], jdbc_driver)
        else:  # psycopg2, anything else was rejected above
            conn = connect(url_pref + kw.get("database", "postgresql") +
                           "?stringtype=unspecified",
                           kw["user"], kw["password"], jdbc_driver)
    elif driver == "sqlite3":
        # SQLite connect takes the database name as input
        conn = connect(kw["database"])
    else:
        conn = connect(**kw)

    if on_jython:
        # The module is always zxJDBC here, so pass the requested driver
        # name along explicitly.
        server = DBServer(conn, module, driver)
    else:
        server = DBServer(conn, module)

    # TODO - Remove the following once BioSQL Bug 2839 is fixed.
    # Test for RULES in PostgreSQL schema, see also Bug 2833.
    if driver in ["psycopg2", "pgdb"]:
        sql = ("SELECT ev_class FROM pg_rewrite WHERE "
               "rulename='rule_bioentry_i1' OR "
               "rulename='rule_bioentry_i2';")
        if server.adaptor.execute_and_fetchall(sql):
            import warnings
            from Bio import BiopythonWarning
            warnings.warn("Your BioSQL PostgreSQL schema includes some rules "
                          "currently required for bioperl-db but which may "
                          "cause problems loading data using Biopython (see "
                          "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
                          "https://github.com/biosql/biosql/issues/4). "
                          "If you do not use BioPerl, please remove these "
                          "rules. Biopython should cope with the rules "
                          "present, but with a performance penalty when "
                          "loading new records.", BiopythonWarning)
            _POSTGRES_RULES_PRESENT = True
    elif driver == "sqlite3":
        # Tell SQLite that we want to use foreign keys
        # https://www.sqlite.org/foreignkeys.html#fk_enable
        server.adaptor.execute("PRAGMA foreign_keys = ON")

    return server
134 135
class DBServer(object):
    """Represents a BioSQL database containing namespaces (sub-databases).

    This acts like a Python dictionary, giving access to each namespace
    (defined by a row in the biodatabase table) as a BioSeqDatabase object.
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap an open database connection.

        Arguments:
         - conn - the open database connection
         - module - the driver module which created the connection
         - module_name - name used to pick the driver specific adaptor and
           dbutils helper, defaults to module.__name__
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        # Only mysql.connector under Python 3 gets its cursor wrapped.
        wrap_cursor = (module_name == "mysql.connector" and
                       sys.version_info[0] == 3)
        # Get module specific Adaptor or the base (general) Adaptor
        adaptor_class = _interface_specific_adaptors.get(module_name, Adaptor)
        self.adaptor = adaptor_class(conn, DBUtils.get_dbutils(module_name),
                                     wrap_cursor=wrap_cursor)
        self.module_name = module_name

    def __repr__(self):
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        """Return the named namespace as a BioSeqDatabase object."""
        return BioSeqDatabase(self.adaptor, name)

    def __len__(self):
        """Number of namespaces (sub-databases) in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase;"
        return int(self.adaptor.execute_and_fetch_col0(sql)[0])

    def __contains__(self, value):
        """Check if a namespace (sub-database) is in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
        return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])

    def __iter__(self):
        """Iterate over namespaces (sub-databases) in the database."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_biodatabase_names())

    if hasattr(dict, "iteritems"):
        # Python 2, use iteritems etc
        def keys(self):
            """List of namespaces (sub-databases) in the database."""
            return self.adaptor.list_biodatabase_names()

        def values(self):
            """List of BioSeqDatabase objects in the database."""
            return [self[key] for key in self]

        def items(self):
            """List of (namespace, BioSeqDatabase) for entries in the database."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def itervalues(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]
    else:
        # Python 3, items etc are all iterators
        def keys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def values(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]

    def __delitem__(self, name):
        """Remove a namespace and all its entries.

        Raises KeyError if there is no namespace with the given name.
        """
        if name not in self:
            raise KeyError(name)
        db_id = self.adaptor.fetch_dbid_by_dbname(name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def remove_database(self, db_name):
        """Remove a namespace and all its entries (OBSOLETE).

        Try to remove all references to items in a database.

            server.remove_database(name)

        In keeping with the dictionary interface, you can now do this:

            del server[name]
        """
        import warnings
        warnings.warn("This method is deprecated.  In keeping with the "
                      "dictionary interface, you can now use 'del "
                      "server[name]' instead", BiopythonDeprecationWarning)
        self.__delitem__(db_name)

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if the driver module is not one of those handled
        below.
        """
        # Read the file dropping blank lines and comment lines (those
        # starting with "--", or "#" for MySQL), joining what is left into
        # one space separated string.
        with open(sql_file, _universal_read_mode) as sql_handle:
            kept = [line.strip() for line in sql_handle
                    if line.strip() and not line.startswith(("--", "#"))]
        sql = "".join(text + " " for text in kept)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg2", "pgdb"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
            for statement in sql.split(";"):
                # Skip blank pieces (e.g. whatever follows the final
                # semicolon) rather than blindly dropping the last piece,
                # which lost the final statement of any file that did not
                # end with a semicolon.
                if statement.strip():
                    self.adaptor.cursor.execute(statement)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))

    def commit(self):
        """Commits the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()
305 306
307 -class _CursorWrapper(object):
308 """A wraper for mysql.connector resolving bytestring representations.""" 309
310 - def __init__(self, real_cursor):
311 self.real_cursor = real_cursor
312
313 - def execute(self, operation, params=None, multi=False):
314 """Execute a sql statement 315 """ 316 self.real_cursor.execute(operation, params, multi)
317
318 - def executemany(self, operation, params):
319 """Execute many sql statements 320 """ 321 self.real_cursor.executemany(operation, params)
322
323 - def _convert_tuple(self, tuple_):
324 """Decode any bytestrings present in the row 325 """ 326 tuple_list = list(tuple_) 327 for i, elem in enumerate(tuple_list): 328 if type(elem) is bytes: 329 tuple_list[i] = elem.decode("utf-8") 330 return tuple(tuple_list)
331
332 - def _convert_list(self, lst):
333 ret_lst = [] 334 for tuple_ in lst: 335 new_tuple = self._convert_tuple(tuple_) 336 ret_lst.append(new_tuple) 337 return ret_lst
338
339 - def fetchall(self):
340 rv = self.real_cursor.fetchall() 341 return self._convert_list(rv)
342
343 - def fetchone(self):
344 tuple_ = self.real_cursor.fetchone() 345 return self._convert_tuple(tuple_)
346 347
class Adaptor(object):
    """High level wrapper for a database connection and cursor.

    Most database calls in BioSQL are done indirectly though this adaptor
    class. This provides helper methods for fetching data and executing
    sql.
    """

    def __init__(self, conn, dbutils, wrap_cursor=False):
        """Store the connection, open a cursor on it, and keep dbutils.

        If wrap_cursor is true the cursor is wrapped in a _CursorWrapper.
        """
        self.conn = conn
        if wrap_cursor:
            self.cursor = _CursorWrapper(conn.cursor())
        else:
            self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return whatever the dbutils helper reports as last id for table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commits the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    def _fetch_bioentry_rows(self, sql, fields, dbid):
        """Run a bioentry query, limited to namespace dbid if it is true.

        Appends the biodatabase_id condition (and value) when dbid is
        given, executes the query and returns all the rows.
        """
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        return self.cursor.fetchall()

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id for a namespace name.

        Raises KeyError if there is no such namespace.
        """
        self.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id for a display id (bioentry name).

        Raises IndexError if there is no match, or more than one.
        """
        rv = self._fetch_bioentry_rows(
            r"select bioentry_id from bioentry where name = %s",
            [name], dbid)
        if not rv:
            raise IndexError("Cannot find display id %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with display id %r" % name)
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the bioentry_id for an accession.

        Raises IndexError if there is no match, or more than one.
        """
        rv = self._fetch_bioentry_rows(
            r"select bioentry_id from bioentry where accession = %s",
            [name], dbid)
        if not rv:
            raise IndexError("Cannot find accession %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with accession %r" % name)
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of all bioentry_id values for an accession."""
        sql = r"select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        # Goes via execute_and_fetch_col0 so that subclasses overriding
        # that method still get to post-process the values.
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a dot is looked up with version "0".

        Raises IndexError if the name holds more than one dot, if there
        is no match, or if there is more than one match.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        version = acc_version[1] if len(acc_version) == 2 else "0"
        rv = self._fetch_bioentry_rows(
            r"SELECT bioentry_id FROM bioentry WHERE accession = %s"
            r" AND version = %s",
            [acc, version], dbid)
        if not rv:
            raise IndexError("Cannot find version %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with version %r" % name)
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the bioentry_id for an identifier.

        If several entries match, the first row returned is used.

        Raises IndexError if there is no match.
        """
        # YB: was fetch_seqid_by_seqid
        rv = self._fetch_bioentry_rows(
            "SELECT bioentry_id FROM bioentry WHERE identifier = %s",
            [identifier], dbid)
        if not rv:
            # The message used to say "display id", copied from the
            # display id lookup.
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return a list of all the namespace names."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return a list of the bioentry_id values in a namespace."""
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return a list of the bioentry names in a namespace."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record."""
        self.execute(sql, args or ())
        rv = self.cursor.fetchall()
        assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        if os.name == "java":
            # Under Jython the placeholders are rewritten from %s to ?
            sql = sql.replace("%s", "?")
        self.dbutils.execute(self.cursor, sql, args)

    def executemany(self, sql, args):
        """Execute many sql commands."""
        if os.name == "java":
            sql = sql.replace("%s", "?")
        self.dbutils.executemany(self.cursor, sql, args)

    def get_subseq_as_string(self, seqid, start, end):
        """Return seq[start:end] of the given bioentry as a string.

        start and end are zero based (Python style); the query itself
        uses SUBSTR with a one based offset and a length.
        """
        length = end - start
        # XXX Check this on MySQL and PostgreSQL. substr should be general,
        # does it need dbutils?
        #
        # Convert to a string on returning for databases that give back
        # unicode. Shouldn't need unicode for sequences so this seems safe.
        return str(self.execute_one(
            """select SUBSTR(seq, %s, %s)
                     from biosequence where bioentry_id = %s""",
            (start + 1, length, seqid))[0])

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        self.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        self.execute(sql, args or ())
        return self.cursor.fetchall()
535 536
class MysqlConnectorAdaptor(Adaptor):
    """A BioSQL Adaptor class with fixes for the MySQL interface.

    BioSQL was failing due to returns of bytearray objects from
    the mysql-connector-python database connector. This adaptor
    class scrubs returns of bytearrays and of byte strings converting
    them to string objects instead. This adaptor class was made in
    response to backwards incompatible changes added to
    mysql-connector-python in release 2.0.0 of the package.
    """

    def execute_one(self, sql, args=None):
        """Return the single matching record with its values converted."""
        record = super(MysqlConnectorAdaptor, self).execute_one(sql, args)
        return tuple(map(bytearray_to_str, record))

    def execute_and_fetch_col0(self, sql, args=None):
        """Return the first column of every row with values converted."""
        base = super(MysqlConnectorAdaptor, self)
        return list(map(bytearray_to_str,
                        base.execute_and_fetch_col0(sql, args)))

    def execute_and_fetchall(self, sql, args=None):
        """Return every row as a tuple with its values converted."""
        base = super(MysqlConnectorAdaptor, self)
        converted = []
        for row in base.execute_and_fetchall(sql, args):
            converted.append(tuple(map(bytearray_to_str, row)))
        return converted
# If SQL interfaces require a specific adaptor, use this to map the driver
# module name to the Adaptor subclass to use for it.
_interface_specific_adaptors = {
    "mysql.connector": MysqlConnectorAdaptor,
}

# Lookup name -> name of the Adaptor method used to get the id.
_allowed_lookups = {
    "primary_id": "fetch_seqid_by_identifier",
    "gi": "fetch_seqid_by_identifier",
    "display_id": "fetch_seqid_by_display_id",
    "name": "fetch_seqid_by_display_id",
    "accession": "fetch_seqid_by_accession",
    "version": "fetch_seqid_by_version",
}
class BioSeqDatabase(object):
    """Represents a namespace (sub-database) within the BioSQL database.

    i.e. One row in the biodatabase table, and all rows in the bioentry
    table associated with it.
    """

    def __init__(self, adaptor, name):
        """Look up the namespace by name using the given adaptor.

        Raising of any error for an unknown name is left to the adaptor's
        fetch_dbid_by_dbname method.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a DBSeqRecord object by its name.

        Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a DBSeqRecord object by accession number.

        Example: seq_rec = db.get_Seq_by_acc('X77802')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a DBSeqRecord object by version number.

        Example: seq_rec = db.get_Seq_by_ver('X77802.1')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a list of DBSeqRecord objects by accession number.

        Example: seq_recs = db.get_Seqs_by_acc('X77802')

        The name of this method is misleading since it returns a list of
        DBSeqRecord objects rather than a list of DBSeq objects, and
        presumably was to mirror BioPerl.
        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_all_primary_ids(self):
        """All the primary_ids of the sequences in the database (OBSOLETE).

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.

        Please use .keys() instead of .get_all_primary_ids()
        """
        import warnings
        warnings.warn("Use bio_seq_database.keys() instead of "
                      "bio_seq_database.get_all_primary_ids()",
                      BiopythonDeprecationWarning)
        return list(self.keys())

    def __getitem__(self, key):
        """Return the DBSeqRecord with the given primary (internal) id.

        Raises KeyError if the record belongs to another namespace.
        """
        record = BioSeq.DBSeqRecord(self.adaptor, key)
        if record._biodatabase_id != self.dbid:
            raise KeyError("Entry %r does exist, but not in current name space" % key)
        return record

    def __delitem__(self, key):
        """Remove an entry and all its annotation.

        Raises KeyError if the key is not an entry of this namespace.
        """
        if key not in self:
            raise KeyError("Entry %r cannot be deleted. It was not found or is invalid" % key)
        # Assuming this will automatically cascade to the other tables...
        sql = "DELETE FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        self.adaptor.execute(sql, (self.dbid, key))

    def __len__(self):
        """Number of records in this namespace (sub database)."""
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s;"
        return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid, ))[0])

    def __contains__(self, value):
        """Check if a primary (internal) id is in this namespace (sub database).

        Anything which cannot be converted to an integer (including None)
        is reported as not present, without querying the database.
        """
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        # The bioentry_id field is an integer in the schema.
        # PostgreSQL will throw an error if we use a non integer in the query.
        try:
            bioentry_id = int(value)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects, which
            # used to escape as an exception from the "in" test.
            return False
        return bool(self.adaptor.execute_and_fetch_col0(sql,
                    (self.dbid, bioentry_id))[0])

    def __iter__(self):
        """Iterate over ids (which may not be meaningful outside this database)."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_bioentry_ids(self.dbid))

    if hasattr(dict, "iteritems"):
        # Python 2, use iteritems etc
        def keys(self):
            """List of ids which may not be meaningful outside this database."""
            return self.adaptor.list_bioentry_ids(self.dbid)

        def values(self):
            """List of DBSeqRecord objects in the namespace (sub database)."""
            return [self[key] for key in self]

        def items(self):
            """List of (id, DBSeqRecord) for the namespace (sub database)."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def itervalues(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]
    else:
        # Python 3, items etc are all iterators
        def keys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def values(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]

    def lookup(self, **kwargs):
        """Return the DBSeqRecord matching a single key=value criterion.

        The key must be one of those in _allowed_lookups, for example
        db.lookup(accession='X77802').

        Raises TypeError unless exactly one supported keyword is given.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        k, v = next(iter(kwargs.items()))
        if k not in _allowed_lookups:
            raise TypeError("lookup() expects one of %r, not %r" %
                            (list(_allowed_lookups.keys()), k))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Get a DBSeqRecord by the primary (internal) id (OBSOLETE).

        Rather than db.get_Seq_by_primary_id(my_id) use db[my_id]

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        import warnings
        warnings.warn("Use bio_seq_database[my_id] instead of "
                      "bio_seq_database.get_Seq_by_primary_id(my_id)",
                      BiopythonDeprecationWarning)
        return self[seqid]

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example:
        from Bio import SeqIO
        count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
                                          fetch_NCBI_taxonomy)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
            # the RULES are present check for a duplicate record before loading
            if _POSTGRES_RULES_PRESENT:
                # Recreate what the Loader's _load_bioentry_table will do:
                if cur_record.id.count(".") == 1:
                    accession, version = cur_record.id.split('.')
                    try:
                        version = int(version)
                    except ValueError:
                        accession = cur_record.id
                        version = 0
                else:
                    accession = cur_record.id
                    version = 0
                gi = cur_record.annotations.get("gi")
                # Let the driver quote the values; the record's id and
                # annotations come from the input data and used to be
                # pasted straight into the SQL text.
                sql = ("SELECT bioentry_id FROM bioentry WHERE (identifier "
                       "= %s AND biodatabase_id = %s) OR (accession = "
                       "%s AND version = %s AND biodatabase_id = %s)")
                self.adaptor.execute(
                    sql, (gi, self.dbid, accession, version, self.dbid))
                if self.adaptor.cursor.fetchone():
                    raise self.adaptor.conn.IntegrityError("Duplicate record "
                                     "detected: record has not been inserted")
            # End of hack
            db_loader.load_seqrecord(cur_record)
        return num_records