Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # Revisions 2013-2014 copyright by Tiago Antao.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Note that BioSQL (including the database schema and scripts) is 
 10  # available and licensed separately.  Please consult www.biosql.org 
 11  """Connect with a BioSQL database and load Biopython like objects from it. 
 12   
 13  This provides interfaces for loading biological objects from a relational 
 14  database, and is compatible with the BioSQL standards. 
 15  """ 
 16  import os 
 17  import sys 
 18   
 19  from Bio._py3k import _universal_read_mode 
 20  from Bio._py3k import _bytes_bytearray_to_str as bytearray_to_str 
 21  from Bio import BiopythonDeprecationWarning 
 22   
 23  from . import BioSeq 
 24  from . import Loader 
 25  from . import DBUtils 
 26   
 27   
 28  _POSTGRES_RULES_PRESENT = False  # Hack for BioSQL Bug 2839 
 29   
 30   
def open_database(driver="MySQLdb", **kwargs):
    """Load an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

    >>> from BioSQL import BioSeqDatabase
    >>> server = BioSeqDatabase.open_database(user="root", db="minidb")

    Arguments:
     - driver - The name of the database driver to use for connecting. The
       driver should implement the python DB API. By default, the MySQLdb
       driver is used.
     - user - the username to connect to the database with.
     - password, passwd - the password to connect with
     - host - the hostname of the database
     - database or db - the name of the database

    Returns a DBServer object wrapping the new connection.

    Raises ValueError if driver is "psycopg" (version one, no longer
    supported), or if running under Jython with a driver other than
    "MySQLdb" or "psycopg2" (the only two with a JDBC mapping here).
    """
    global _POSTGRES_RULES_PRESENT

    if driver == "psycopg":
        raise ValueError("Using BioSQL with psycopg (version one) is no "
                         "longer supported. Use psycopg2 instead.")

    on_jython = os.name == "java"

    if on_jython:
        # Under Jython the connection is made through zxJDBC with a JDBC URL.
        from com.ziclix.python.sql import zxJDBC
        module = zxJDBC
        if driver == "MySQLdb":
            jdbc_driver = "com.mysql.jdbc.Driver"
            url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
        elif driver == "psycopg2":
            jdbc_driver = "org.postgresql.Driver"
            url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
        else:
            # Previously this fell through and failed later with a NameError
            # because no connection was ever created.
            raise ValueError("Driver %r is not supported under Jython; "
                             "use 'MySQLdb' or 'psycopg2'." % driver)
    else:
        module = __import__(driver, fromlist=["connect"])
    connect = module.connect

    # Different drivers use different keywords...
    kw = kwargs.copy()
    if driver in ["MySQLdb", "mysql.connector"] and not on_jython:
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
        kw["database"] = "template1"

    if on_jython:
        if driver == "MySQLdb":
            conn = connect(url_pref + kw.get("database", "mysql"),
                           kw["user"], kw["password"], jdbc_driver)
        else:
            # Only psycopg2 can reach here, see the driver check above.
            conn = connect(url_pref + kw.get("database", "postgresql") +
                           "?stringtype=unspecified",
                           kw["user"], kw["password"], jdbc_driver)
    elif driver == "sqlite3":
        # SQLite connect takes the database name as input
        conn = connect(kw["database"])
    else:
        conn = connect(**kw)

    if on_jython:
        # zxJDBC's module name says nothing about the backend, so pass the
        # requested driver name on explicitly.
        server = DBServer(conn, module, driver)
    else:
        server = DBServer(conn, module)

    # TODO - Remove the following once BioSQL Bug 2839 is fixed.
    # Test for RULES in PostgreSQL schema, see also Bug 2833.
    if driver in ["psycopg2", "pgdb"]:
        sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
              "rulename='rule_bioentry_i1' OR " + \
              "rulename='rule_bioentry_i2';"
        if server.adaptor.execute_and_fetchall(sql):
            import warnings
            from Bio import BiopythonWarning
            warnings.warn("Your BioSQL PostgreSQL schema includes some rules "
                          "currently required for bioperl-db but which may "
                          "cause problems loading data using Biopython (see "
                          "BioSQL's RedMine Bug 2839 aka GitHub Issue 4 "
                          "https://github.com/biosql/biosql/issues/4). "
                          "If you do not use BioPerl, please remove these "
                          "rules. Biopython should cope with the rules "
                          "present, but with a performance penalty when "
                          "loading new records.", BiopythonWarning)
            _POSTGRES_RULES_PRESENT = True

    elif driver == 'sqlite3':
        # Tell SQLite that we want to use foreign keys
        # https://www.sqlite.org/foreignkeys.html#fk_enable
        server.adaptor.execute('PRAGMA foreign_keys = ON')

    return server


class DBServer(object):
    """Represents a BioSQL database containing namespaces (sub-databases).

    This acts like a Python dictionary, giving access to each namespace
    (defined by a row in the biodatabase table) as a BioSeqDatabase object.
    """

    def __init__(self, conn, module, module_name=None):
        """Create a DBServer object.

        Arguments:
         - conn - A database connection object
         - module - The module used to create the database connection
         - module_name - Optionally, the name of the module.
           Default: module.__name__

        Normally you would not want to create a DBServer object yourself.
        Instead use the open_database function, which returns an instance
        of DBServer.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        # Only mysql.connector on Python 3 gets the decoding cursor wrapper.
        wrap_cursor = (module_name == "mysql.connector" and
                       sys.version_info[0] == 3)
        # Get module specific Adaptor or the base (general) Adaptor
        Adapt = _interface_specific_adaptors.get(module_name, Adaptor)
        self.adaptor = Adapt(conn, DBUtils.get_dbutils(module_name),
                             wrap_cursor=wrap_cursor)
        self.module_name = module_name

    def __repr__(self):
        """Return a short description of the class name and database connection."""
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        """Return a BioSeqDatabase object.

        Arguments:
         - name - The name of the BioSeqDatabase

        """
        return BioSeqDatabase(self.adaptor, name)

    def __len__(self):
        """Return number of namespaces (sub-databases) in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase;"
        return int(self.adaptor.execute_and_fetch_col0(sql)[0])

    def __contains__(self, value):
        """Check if a namespace (sub-database) is in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
        return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])

    def __iter__(self):
        """Iterate over namespaces (sub-databases) in the database."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_biodatabase_names())

    if hasattr(dict, "iteritems"):
        # Python 2, use iteritems etc
        def keys(self):
            """List of namespaces (sub-databases) in the database."""
            return self.adaptor.list_biodatabase_names()

        def values(self):
            """List of BioSeqDatabase objects in the database."""
            return [self[key] for key in self]

        def items(self):
            """List of (namespace, BioSeqDatabase) for entries in database."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def itervalues(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]
    else:
        # Python 3, items etc are all iterators
        def keys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def values(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]

    def __delitem__(self, name):
        """Remove a namespace and all its entries.

        Raises KeyError if there is no namespace called name.
        """
        if name not in self:
            raise KeyError(name)
        db_id = self.adaptor.fetch_dbid_by_dbname(name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def remove_database(self, db_name):
        """Remove a namespace and all its entries (OBSOLETE).

        Try to remove all references to items in a database:

        >>> server.remove_database(name)

        In keeping with the dictionary interface, you can now do this:

        >>> del server[name]

        """
        import warnings
        warnings.warn("This method is deprecated.  In keeping with the "
                      "dictionary interface, you can now use 'del "
                      "server[name]' instead", BiopythonDeprecationWarning)
        self.__delitem__(db_name)

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    @staticmethod
    def _split_sql_statements(sql):
        """Split a SQL script on semicolons, dropping blank fragments."""
        return [part for part in sql.split(";") if part.strip()]

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if the database module is not one this loader
        knows how to feed.
        """
        # Read the file with comment lines ("--" or MySQL style "#") and
        # blank lines removed, joining the rest with single spaces.
        kept = []
        with open(sql_file, _universal_read_mode) as sql_handle:
            for line in sql_handle:
                if line.startswith(("--", "#")):
                    continue
                line = line.strip()
                if line:
                    kept.append(line + " ")
        sql = "".join(kept)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg2", "pgdb"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["mysql.connector", "MySQLdb", "sqlite3"]:
            # Skip blank fragments only, rather than always dropping the last
            # one, so a final statement lacking its semicolon is still run.
            for sql_line in self._split_sql_statements(sql):
                self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))

    def commit(self):
        """Commit the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()


class _CursorWrapper(object):
    """A wrapper for mysql.connector resolving bytestring representations.

    Calls are forwarded to the wrapped cursor; rows coming back have any
    bytes values decoded as UTF-8.
    """

    def __init__(self, real_cursor):
        """Wrap real_cursor, the cursor object that does the actual work."""
        self.real_cursor = real_cursor

    def execute(self, operation, params=None, multi=False):
        """Execute a sql statement."""
        self.real_cursor.execute(operation, params, multi)

    def executemany(self, operation, params):
        """Execute many sql statements."""
        self.real_cursor.executemany(operation, params)

    def _convert_tuple(self, tuple_):
        """Decode any bytestrings present in the row."""
        return tuple(elem.decode("utf-8") if isinstance(elem, bytes) else elem
                     for elem in tuple_)

    def _convert_list(self, lst):
        """Decode any bytestrings present in each row of a list of rows."""
        return [self._convert_tuple(tuple_) for tuple_ in lst]

    def fetchall(self):
        """Return all remaining rows, with bytestrings decoded."""
        return self._convert_list(self.real_cursor.fetchall())

    def fetchone(self):
        """Return the next row with bytestrings decoded, or None if exhausted."""
        tuple_ = self.real_cursor.fetchone()
        if tuple_ is None:
            # No more rows: pass None through rather than trying to
            # iterate over it.
            return None
        return self._convert_tuple(tuple_)


class Adaptor(object):
    """High level wrapper for a database connection and cursor.

    Most database calls in BioSQL are done indirectly though this adaptor
    class. This provides helper methods for fetching data and executing
    sql.
    """

    def __init__(self, conn, dbutils, wrap_cursor=False):
        """Create an Adaptor object.

        Arguments:
         - conn - A database connection
         - dbutils - A BioSQL.DBUtils object
         - wrap_cursor - Optional, whether to wrap the cursor object

        """
        self.conn = conn
        if wrap_cursor:
            self.cursor = _CursorWrapper(conn.cursor())
        else:
            self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return the last row id for the selected table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commit the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Roll-back the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    @staticmethod
    def _restrict_to_dbid(sql, fields, dbid):
        """Add a biodatabase_id filter to the query when dbid is true."""
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        return sql, fields

    def _fetch_unique_seqid(self, sql, fields, dbid, label, value):
        """Run a bioentry_id query which must match exactly one row."""
        sql, fields = self._restrict_to_dbid(sql, fields, dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find %s %r" % (label, value))
        if len(rv) > 1:
            raise IndexError("More than one entry with %s %r"
                             % (label, value))
        return rv[0][0]

    def fetch_dbid_by_dbname(self, dbname):
        """Return the internal id for the sub-database using its name.

        Raises KeyError if there is no sub-database with that name.
        """
        self.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the internal id for a sequence using its display id.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the name of the sequence. Corresponds to the
           name column of the bioentry table of the SQL schema

        Raises IndexError if there is no match, or more than one.
        """
        sql = r"select bioentry_id from bioentry where name = %s"
        return self._fetch_unique_seqid(sql, [name], dbid,
                                        "display id", name)

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the internal id for a sequence using its accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        Raises IndexError if there is no match, or more than one.
        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        return self._fetch_unique_seqid(sql, [name], dbid,
                                        "accession", name)

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list internal ids using an accession.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence. Corresponds to the
           accession column of the bioentry table of the SQL schema

        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        sql, fields = self._restrict_to_dbid(sql, [name], dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the internal id for a sequence using its accession and version.

        Arguments:
         - dbid - the internal id for the sub-database
         - name - the accession of the sequence containing a version number.
           Must correspond to <accession>.<version>

        A name without a version part is looked up as version "0".
        Raises IndexError if name contains more than one dot, if there is
        no match, or if there is more than one match.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        version = acc_version[1] if len(acc_version) == 2 else "0"
        sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
              r" AND version = %s"
        return self._fetch_unique_seqid(sql, [acc, version], dbid,
                                        "version", name)

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the internal id for a sequence using its identifier.

        Arguments:
         - dbid - the internal id for the sub-database
         - identifier - the identifier of the sequence. Corresponds to
           the identifier column of the bioentry table in the SQL schema.

        Raises IndexError if there is no match. Unlike the other lookups,
        several matches are not an error; the first row is used.
        """
        # YB: was fetch_seqid_by_seqid
        sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
        sql, fields = self._restrict_to_dbid(sql, [identifier], dbid)
        self.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return a list of all of the sub-databases."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return a list of internal ids for all of the sequences in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return a list of all sequence names in a sub-database.

        Arguments:
         - dbid - The internal id for a sub-database

        """
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record.

        Raises AssertionError if the query does not return exactly one row.
        """
        self.execute(sql, args or ())
        rv = self.cursor.fetchall()
        if len(rv) != 1:
            # Raised explicitly (not via assert) so the check survives -O.
            raise AssertionError("Expected 1 response, got %d" % len(rv))
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        if os.name == "java":
            # Under Jython the "%s" placeholders are swapped for "?".
            sql = sql.replace("%s", "?")
        self.dbutils.execute(self.cursor, sql, args)

    def executemany(self, sql, args):
        """Execute many sql commands."""
        if os.name == "java":
            sql = sql.replace("%s", "?")
        self.dbutils.executemany(self.cursor, sql, args)

    def get_subseq_as_string(self, seqid, start, end):
        """Return a substring of a sequence.

        Arguments:
         - seqid - The internal id for the sequence
         - start - The start position of the sequence; 0-indexed
         - end - The end position of the sequence

        """
        length = end - start
        # XXX Check this on MySQL and PostgreSQL. substr should be general,
        # does it need dbutils?
        #
        # SUBSTR is passed start + 1 because start is 0-indexed here.
        #
        # Convert to a string on returning for databases that give back
        # unicode. Shouldn't need unicode for sequences so this seems safe.
        return str(self.execute_one(
            """select SUBSTR(seq, %s, %s)
                     from biosequence where bioentry_id = %s""",
            (start + 1, length, seqid))[0])

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        self.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        self.execute(sql, args or ())
        return self.cursor.fetchall()


class MysqlConnectorAdaptor(Adaptor):
    """A BioSQL Adaptor class with fixes for the MySQL interface.

    The mysql-connector-python database connector hands back bytearray
    objects, which BioSQL could not cope with. Each fetching method here
    calls the base Adaptor implementation and then passes every returned
    value through bytearray_to_str, so callers get string objects instead
    of bytearrays or byte strings. This adaptor class was made in response
    to backwards incompatible changes added to mysql-connector-python in
    release 2.0.0 of the package.
    """

    def execute_one(self, sql, args=None):
        """Execute sql that returns 1 record, and return the record."""
        record = super(MysqlConnectorAdaptor, self).execute_one(sql, args)
        return tuple([bytearray_to_str(value) for value in record])

    def execute_and_fetch_col0(self, sql, args=None):
        """Return a list of values from the first column in the row."""
        parent = super(MysqlConnectorAdaptor, self)
        cleaned = []
        for value in parent.execute_and_fetch_col0(sql, args):
            cleaned.append(bytearray_to_str(value))
        return cleaned

    def execute_and_fetchall(self, sql, args=None):
        """Return a list of tuples of all rows."""
        parent = super(MysqlConnectorAdaptor, self)
        cleaned = []
        for row in parent.execute_and_fetchall(sql, args):
            cleaned.append(tuple([bytearray_to_str(value) for value in row]))
        return cleaned


# Maps a database module name to the Adaptor subclass it needs; any module
# not listed here gets the general Adaptor.
_interface_specific_adaptors = {
    "mysql.connector": MysqlConnectorAdaptor,
}

# Maps each keyword accepted by BioSeqDatabase.lookup() to the name of the
# Adaptor method which turns that kind of value into an internal id.
_allowed_lookups = {
    'primary_id': "fetch_seqid_by_identifier",
    'gi': "fetch_seqid_by_identifier",
    'display_id': "fetch_seqid_by_display_id",
    'name': "fetch_seqid_by_display_id",
    'accession': "fetch_seqid_by_accession",
    'version': "fetch_seqid_by_version",
}


class BioSeqDatabase(object):
    """Represents a namespace (sub-database) within the BioSQL database.

    i.e. One row in the biodatabase table, and all rows in the bioentry
    table associated with it.
    """

    def __init__(self, adaptor, name):
        """Create a BioDatabase object.

        Arguments:
         - adaptor - A BioSQL.Adaptor object
         - name - The name of the sub-database (namespace)

        The internal id of the namespace is looked up straight away, so
        this fails if the adaptor cannot find a namespace with that name.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        """Return a short summary of the BioSeqDatabase."""
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Get a DBSeqRecord object by its name.

        Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Get a DBSeqRecord object by accession number.

        Example: seq_rec = db.get_Seq_by_acc('X77802')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Get a DBSeqRecord object by version number.

        Example: seq_rec = db.get_Seq_by_ver('X77802.1')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Get a list of DBSeqRecord objects by accession number.

        Example: seq_recs = db.get_Seqs_by_acc('X77802')

        The name of this method is misleading since it returns a list of
        DBSeqRecord objects rather than a list of DBSeq objects, and
        presumably was to mirror BioPerl.
        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_all_primary_ids(self):
        """All the primary_ids of the sequences in the database (OBSOLETE).

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.

        Please use .keys() instead of .get_all_primary_ids()
        """
        import warnings
        warnings.warn("Use bio_seq_database.keys() instead of "
                      "bio_seq_database.get_all_primary_ids()",
                      BiopythonDeprecationWarning)
        return list(self.keys())

    def __getitem__(self, key):
        """Return a DBSeqRecord for one of the sequences in the sub-database.

        Arguments:
         - key - The internal id for the sequence

        Raises KeyError if the entry belongs to a different namespace.
        """
        record = BioSeq.DBSeqRecord(self.adaptor, key)
        if record._biodatabase_id != self.dbid:
            raise KeyError("Entry %r does exist, but not in current name space" % key)
        return record

    def __delitem__(self, key):
        """Remove an entry and all its annotation.

        Raises KeyError if key is not an entry of this namespace.
        """
        if key not in self:
            raise KeyError("Entry %r cannot be deleted. "
                           "It was not found or is invalid" % key)
        # Assuming this will automatically cascade to the other tables...
        sql = "DELETE FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        self.adaptor.execute(sql, (self.dbid, key))

    def __len__(self):
        """Return number of records in this namespace (sub database)."""
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s;"
        return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid, ))[0])

    def __contains__(self, value):
        """Check if a primary (internal) id is this namespace (sub database)."""
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        # The bioentry_id field is an integer in the schema.
        # PostgreSQL will throw an error if we use a non integer in the query.
        try:
            bioentry_id = int(value)
        except (TypeError, ValueError):
            # ValueError: a non-numeric string; TypeError: None and other
            # objects int() rejects. Neither can be an internal id.
            return False
        return bool(self.adaptor.execute_and_fetch_col0(sql,
                    (self.dbid, bioentry_id))[0])

    def __iter__(self):
        """Iterate over ids (which may not be meaningful outside this database)."""
        # TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_bioentry_ids(self.dbid))

    if hasattr(dict, "iteritems"):
        # Python 2, use iteritems etc
        def keys(self):
            """List of ids which may not be meaningful outside this database."""
            return self.adaptor.list_bioentry_ids(self.dbid)

        def values(self):
            """List of DBSeqRecord objects in the namespace (sub database)."""
            return [self[key] for key in self]

        def items(self):
            """List of (id, DBSeqRecord) for the namespace (sub database)."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def itervalues(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]
    else:
        # Python 3, items etc are all iterators
        def keys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def values(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]

    def lookup(self, **kwargs):
        """Return a DBSeqRecord using an acceptable identifier.

        Arguments:
         - kwargs - A single key-value pair where the key is one
           of primary_id, gi, display_id, name, accession, version

        Raises TypeError unless exactly one supported keyword is given.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            raise TypeError("lookup() expects one of %r, not %r" %
                            (list(_allowed_lookups.keys()), k))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Get a DBSeqRecord by the primary (internal) id (OBSOLETE).

        Rather than db.get_Seq_by_primary_id(my_id) use db[my_id]

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        import warnings
        warnings.warn("Use bio_seq_database[my_id] instead of "
                      "bio_seq_database.get_Seq_by_primary_id(my_id)",
                      BiopythonDeprecationWarning)
        return self[seqid]

    @staticmethod
    def _split_accession_version(record_id):
        """Split a record id into (accession, integer version).

        Only an id with exactly one dot and an integer after it is split;
        anything else is returned whole with version 0.
        """
        if record_id.count(".") == 1:
            accession, version = record_id.split('.')
            try:
                return accession, int(version)
            except ValueError:
                pass
        return record_id, 0

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example::

            from Bio import SeqIO
            count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
                                          fetch_NCBI_taxonomy)
        num_records = 0
        global _POSTGRES_RULES_PRESENT
        for cur_record in record_iterator:
            num_records += 1
            # Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
            # the RULES are present check for a duplicate record before loading
            if _POSTGRES_RULES_PRESENT:
                # Recreate what the Loader's _load_bioentry_table will do:
                accession, version = \
                    self._split_accession_version(cur_record.id)
                gi = cur_record.annotations.get("gi")
                if gi is not None:
                    # Bind the identifier as text, as the old quoted
                    # literal did; a missing gi is bound as NULL.
                    gi = "%s" % (gi,)
                # Values come from the record, so bind them as parameters
                # instead of pasting them into the SQL text.
                sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \
                      "= %s AND biodatabase_id = %s) OR (accession = " + \
                      "%s AND version = %s AND biodatabase_id = %s)"
                self.adaptor.execute(
                    sql, (gi, self.dbid, accession, version, self.dbid))
                if self.adaptor.cursor.fetchone():
                    raise self.adaptor.conn.IntegrityError(
                        "Duplicate record detected: "
                        "record has not been inserted")
            # End of hack
            db_loader.load_seqrecord(cur_record)
        return num_records