Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # Revisions 2013 copyright by Tiago Antao.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Note that BioSQL (including the database schema and scripts) is 
 10  # available and licensed separately.  Please consult www.biosql.org 
 11  """Connect with a BioSQL database and load Biopython like objects from it. 
 12   
 13  This provides interfaces for loading biological objects from a relational 
 14  database, and is compatible with the BioSQL standards. 
 15  """ 
 16  import os 
 17   
 18  from Bio import BiopythonDeprecationWarning 
 19   
 20  from . import BioSeq 
 21  from . import Loader 
 22  from . import DBUtils 
 23   
# Hack for BioSQL Bug 2839.
# Module-wide flag: open_database() sets it to True when it finds the
# 'rule_bioentry_i1' / 'rule_bioentry_i2' rules in a PostgreSQL schema, and
# BioSeqDatabase.load() reads it to decide whether to check for duplicate
# records itself before each insert.
_POSTGRES_RULES_PRESENT = False
 25   
 26   
def open_database(driver="MySQLdb", **kwargs):
    """Open a connection to an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

        >>> from BioSQL import BioSeqDatabase
        >>> server = BioSeqDatabase.open_database(user="root", db="minidb")

    The various options are:
     - driver - the name of the database driver module to use for
       connecting.  The driver should implement the Python DB API.  By
       default, the MySQLdb driver is used.
     - user - the username to connect to the database with.
     - password, passwd - the password to connect with.
     - host - the hostname of the database.
     - database, db - the name of the database.

    The ``db``/``database`` and ``passwd``/``password`` spellings are
    translated to whichever one the chosen driver expects; every other
    keyword is passed to the driver's ``connect`` function unchanged.

    Returns a DBServer wrapping the new connection.

    Raises ValueError if the unsupported "psycopg" (version one) driver is
    requested, or if running under Jython with a driver other than
    MySQLdb, psycopg2 or sqlite3.
    """
    global _POSTGRES_RULES_PRESENT

    if driver == "psycopg":
        raise ValueError("Using BioSQL with psycopg (version one) is no "
                         "longer supported. Use psycopg2 instead.")

    on_jython = os.name == "java"
    if on_jython:
        # Under Jython every database is reached through zxJDBC; the driver
        # name only selects the JDBC driver class and URL prefix.
        from com.ziclix.python.sql import zxJDBC
        module = zxJDBC
        if driver in ["MySQLdb"]:
            jdbc_driver = "com.mysql.jdbc.Driver"
            url_pref = "jdbc:mysql://" + kwargs["host"] + "/"
        elif driver in ["psycopg2"]:
            jdbc_driver = "org.postgresql.Driver"
            url_pref = "jdbc:postgresql://" + kwargs["host"] + "/"
    else:
        # A non-empty fromlist makes __import__ return the named module
        # itself rather than its top-level package, so dotted driver names
        # (e.g. "mysql.connector") resolve to the module owning connect().
        module = __import__(driver, fromlist=["connect"])
    connect = getattr(module, "connect")

    # Different drivers use different keywords...
    kw = kwargs.copy()
    if driver == "MySQLdb" and not on_jython:
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
        kw["database"] = "template1"

    if on_jython:
        if driver in ["MySQLdb"]:
            conn = connect(url_pref + kw.get("database", "mysql"),
                           kw["user"], kw["password"], jdbc_driver)
        elif driver in ["psycopg2"]:
            conn = connect(url_pref + kw.get("database", "postgresql") +
                           "?stringtype=unspecified",
                           kw["user"], kw["password"], jdbc_driver)
        elif driver in ["sqlite3"]:
            # SQLite connect takes the database name as input
            conn = connect(kw["database"])
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on 'conn'.
            raise ValueError("Driver %r is not supported under Jython."
                             % driver)
    else:
        try:
            conn = connect(**kw)
        except module.InterfaceError:
            # Ok, so let's try building a DSN
            # (older releases of psycopg need this)
            if "database" in kw:
                kw["dbname"] = kw.pop("database")
            elif "db" in kw:
                kw["dbname"] = kw.pop("db")
            # %-format each pair so non-string values (e.g. an integer
            # port) do not break the DSN construction.
            dsn = " ".join("%s=%s" % item for item in kw.items())
            conn = connect(dsn)

    if on_jython:
        # zxJDBC's module name says nothing about the backend, so pass the
        # requested driver name explicitly.
        server = DBServer(conn, module, driver)
    else:
        server = DBServer(conn, module)

    # TODO - Remove the following once BioSQL Bug 2839 is fixed.
    # Test for RULES in PostgreSQL schema, see also Bug 2833.
    if driver in ["psycopg2", "pgdb"]:
        sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
              "rulename='rule_bioentry_i1' OR " + \
              "rulename='rule_bioentry_i2';"
        if server.adaptor.execute_and_fetchall(sql):
            import warnings
            from Bio import BiopythonWarning
            warnings.warn("Your BioSQL PostgreSQL schema includes some "
                          "rules currently required for bioperl-db but "
                          "which may cause problems loading data using "
                          "Biopython (see BioSQL Bug 2839). If you do not "
                          "use BioPerl, please remove these rules. "
                          "Biopython should cope with the rules present, "
                          "but with a performance penalty when loading "
                          "new records.", BiopythonWarning)
            _POSTGRES_RULES_PRESENT = True

    return server
134 135
class DBServer:
    """Represents a BioSQL database containing namespaces (sub-databases).

    This acts like a Python dictionary, giving access to each namespace
    (defined by a row in the biodatabase table) as a BioSeqDatabase object.
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap an open DB-API connection.

        conn is the open connection and module the driver module it came
        from.  module_name is the driver name used to pick the DBUtils
        helper and the schema loading strategy; it defaults to
        module.__name__.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
        self.module_name = module_name

    def __repr__(self):
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        """Return the named namespace as a BioSeqDatabase object."""
        return BioSeqDatabase(self.adaptor, name)

    def __len__(self):
        """Number of namespaces (sub-databases) in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase;"
        return int(self.adaptor.execute_and_fetch_col0(sql)[0])

    def __contains__(self, value):
        """Check if a namespace (sub-database) is in this database."""
        sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
        return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])

    def __iter__(self):
        """Iterate over namespaces (sub-databases) in the database."""
        #TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_biodatabase_names())

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems etc
        def keys(self):
            """List of namespaces (sub-databases) in the database."""
            return self.adaptor.list_biodatabase_names()

        def values(self):
            """List of BioSeqDatabase objects in the database."""
            return [self[key] for key in self]

        def items(self):
            """List of (namespace, BioSeqDatabase) for entries in the database."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def itervalues(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]
    else:
        #Python 3, items etc are all iterators
        def keys(self):
            """Iterate over namespaces (sub-databases) in the database."""
            return iter(self)

        def values(self):
            """Iterate over BioSeqDatabase objects in the database."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (namespace, BioSeqDatabase) in the database."""
            for key in self:
                yield key, self[key]

    def __delitem__(self, name):
        """Remove a namespace and all its entries.

        Raises KeyError if there is no namespace called name.
        """
        if name not in self:
            raise KeyError(name)
        # Do the removal directly rather than via remove_database(), which
        # would emit a deprecation warning for the recommended spelling.
        self._remove_namespace(name)

    def _remove_namespace(self, db_name):
        """Delete the named namespace using Loader.DatabaseRemover."""
        db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
        Loader.DatabaseRemover(self.adaptor, db_id).remove()

    def remove_database(self, db_name):
        """Remove a namespace and all its entries (OBSOLETE).

        Try to remove all references to items in a database.

            server.remove_database(name)

        In keeping with the dictionary interface, you can now do this:

            del server[name]
        """
        import warnings
        warnings.warn("This method is deprecated.  In keeping with the "
                      "dictionary interface, you can now use 'del "
                      "server[name]' instead", BiopythonDeprecationWarning)
        self._remove_namespace(db_name)

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it."""
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if this server's driver is not one of psycopg2,
        pgdb, MySQLdb or sqlite3.
        """
        # Not sophisticated enough for PG schema. Is it needed by MySQL?
        # Looks like we need this more complicated way for both. Leaving it
        # the default and removing the simple-minded approach.

        # Read the file, dropping blank lines and lines starting with the
        # SQL ("--") or MySQL ("#") comment markers, and join the rest with
        # single spaces.  The with-block guarantees the handle is closed;
        # plain text mode is used because the "U" flag is rejected by
        # recent Python 3 releases.
        with open(sql_file, "r") as sql_handle:
            sql = "".join(line.strip() + " " for line in sql_handle
                          if line.strip()
                          and not line.startswith(("--", "#")))

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg2", "pgdb"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["MySQLdb", "sqlite3"]:
            sql_parts = sql.split(";")  # one line per sql command
            for sql_line in sql_parts[:-1]:  # don't use the last item, it's blank
                self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))

    def commit(self):
        """Commits the current transaction to the database."""
        return self.adaptor.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.adaptor.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.adaptor.close()
297 298
class Adaptor:
    """Thin SQL helper layer around one DB-API connection and one cursor.

    All statements are run on a single cursor opened at construction time,
    through the driver-specific dbutils helper object.
    """

    def __init__(self, conn, dbutils):
        """Store the connection, open a cursor on it, and keep dbutils."""
        self.conn = conn
        self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return dbutils.last_id() for the given table on our cursor."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commits the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    def _fetch_bioentry_rows(self, sql, fields, dbid):
        """Run a bioentry query and return all of its rows.

        If dbid is true the query is additionally restricted to that
        namespace; fields is extended in place with the extra parameter.
        """
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.execute(sql, fields)
        return self.cursor.fetchall()

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id of the namespace called dbname.

        Raises KeyError if there is no such namespace.
        """
        self.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        # The original author noted that more than one row "cannot happen
        # (UK)" - presumably a unique key on the name column - so only the
        # first row is used.
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id whose name column equals name.

        Raises IndexError if there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where name = %s"
        rv = self._fetch_bioentry_rows(sql, [name], dbid)
        if not rv:
            raise IndexError("Cannot find display id %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with display id %r" % name)
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the bioentry_id whose accession column equals name.

        Raises IndexError if there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        rv = self._fetch_bioentry_rows(sql, [name], dbid)
        if not rv:
            raise IndexError("Cannot find accession %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with accession %r" % name)
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of all bioentry_ids with the given accession."""
        sql = r"select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a dot is looked up with version "0".  Raises
        IndexError if name has more than one dot, or if there is no match
        or more than one.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
              r" AND version = %s"
        rv = self._fetch_bioentry_rows(sql, [acc, version], dbid)
        if not rv:
            raise IndexError("Cannot find version %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with version %r" % name)
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the bioentry_id whose identifier column equals identifier.

        Raises IndexError if there is no match.  Unlike the other single
        id lookups, multiple matches are not an error: the first row wins.
        """
        # YB: was fetch_seqid_by_seqid
        sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
        rv = self._fetch_bioentry_rows(sql, [identifier], dbid)
        if not rv:
            # This message used to say "display id", copied from
            # fetch_seqid_by_display_id.
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return a list of the names of all namespaces."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return a list of the bioentry_ids in the given namespace."""
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return a list of the bioentry names in the given namespace."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql and return its single result row.

        Raises AssertionError unless exactly one row comes back.
        """
        self.execute(sql, args or ())
        rv = self.cursor.fetchall()
        # Raised explicitly (rather than with an assert statement) so the
        # check is still made when Python runs with -O.
        if len(rv) != 1:
            raise AssertionError("Expected 1 response, got %d" % len(rv))
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command."""
        if os.name == "java":
            # Under Jython the "%s" placeholders are rewritten to "?"
            # (presumably the parameter style zxJDBC expects).
            sql = sql.replace("%s", "?")
        self.dbutils.execute(self.cursor, sql, args)

    def get_subseq_as_string(self, seqid, start, end):
        """Return seq[start:end] of the given bioentry as a string.

        start and end are zero-based, end exclusive; they are converted to
        the one-based offset and length that SQL SUBSTR takes.
        """
        length = end - start
        # XXX Check this on MySQL and PostgreSQL. substr should be general,
        # does it need dbutils?
        #
        # Convert to a string on returning for databases that give back
        # unicode. Shouldn't need unicode for sequences so this seems safe.
        return str(self.execute_one(
            """select SUBSTR(seq, %s, %s)
                     from biosequence where bioentry_id = %s""",
            (start + 1, length, seqid))[0])

    def execute_and_fetch_col0(self, sql, args=None):
        """Execute sql and return the first column of every row as a list."""
        self.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Execute sql and return all rows as given by cursor.fetchall()."""
        self.execute(sql, args or ())
        return self.cursor.fetchall()
# Maps each keyword name accepted by BioSeqDatabase.lookup() to the name of
# the Adaptor method that resolves that kind of identifier to a bioentry_id.
# Several keywords are aliases for the same method.
_allowed_lookups = {
    # lookup keyword: name of the Adaptor method used to get the id
    'primary_id': "fetch_seqid_by_identifier",
    'gi': "fetch_seqid_by_identifier",
    'display_id': "fetch_seqid_by_display_id",
    'name': "fetch_seqid_by_display_id",
    'accession': "fetch_seqid_by_accession",
    'version': "fetch_seqid_by_version",
}


class BioSeqDatabase:
    """Represents a namespace (sub-database) within the BioSQL database.

    i.e. One row in the biodatabase table, and all rows in the bioentry
    table associated with it.
    """

    def __init__(self, adaptor, name):
        """Look up the namespace called name using the given adaptor.

        Raises KeyError (from the adaptor) if the namespace does not exist.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a DBSeqRecord object by its name

        Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a DBSeqRecord object by accession number

        Example: seq_rec = db.get_Seq_by_acc('X77802')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a DBSeqRecord object by version number

        Example: seq_rec = db.get_Seq_by_ver('X77802.1')

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a list of DBSeqRecord objects by accession number

        Example: seq_recs = db.get_Seqs_by_acc('X77802')

        The name of this method is misleading since it returns a list of
        DBSeqRecord objects rather than a list of DBSeq objects, and
        presumably was to mirror BioPerl.
        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_all_primary_ids(self):
        """All the primary_ids of the sequences in the database (OBSOLETE).

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.

        Please use .keys() instead of .get_all_primary_ids()
        """
        import warnings
        warnings.warn("Use bio_seq_database.keys() instead of "
                      "bio_seq_database.get_all_primary_ids()",
                      BiopythonDeprecationWarning)
        return list(self.keys())

    def __getitem__(self, key):
        """Return a DBSeqRecord for the primary (internal) id key.

        No check is made here that key belongs to this namespace.
        """
        return BioSeq.DBSeqRecord(self.adaptor, key)

    def __delitem__(self, key):
        """Remove an entry and all its annotation.

        Raises KeyError if key is not an id in this namespace.
        """
        if key not in self:
            raise KeyError(key)
        #Assuming this will automatically cascade to the other tables...
        sql = "DELETE FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        self.adaptor.execute(sql, (self.dbid, key))

    def __len__(self):
        """Number of records in this namespace (sub database)."""
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s;"
        return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid, ))[0])

    def __contains__(self, value):
        """Check if a primary (internal) id is in this namespace (sub database).

        Values that cannot be converted to an integer are never present.
        """
        sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
              "WHERE biodatabase_id=%s AND bioentry_id=%s;"
        #The bioentry_id field is an integer in the schema.
        #PostgreSQL will throw an error if we use a non integer in the query.
        try:
            bioentry_id = int(value)
        except (TypeError, ValueError):
            # TypeError covers values such as None, which int() rejects
            # with a different exception than a malformed string.
            return False
        return bool(self.adaptor.execute_and_fetch_col0(sql,
                    (self.dbid, bioentry_id))[0])

    def __iter__(self):
        """Iterate over ids (which may not be meaningful outside this database)."""
        #TODO - Iterate over the cursor, much more efficient
        return iter(self.adaptor.list_bioentry_ids(self.dbid))

    if hasattr(dict, "iteritems"):
        #Python 2, use iteritems etc
        def keys(self):
            """List of ids which may not be meaningful outside this database."""
            return self.adaptor.list_bioentry_ids(self.dbid)

        def values(self):
            """List of DBSeqRecord objects in the namespace (sub database)."""
            return [self[key] for key in self]

        def items(self):
            """List of (id, DBSeqRecord) for the namespace (sub database)."""
            return [(key, self[key]) for key in self]

        def iterkeys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def itervalues(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def iteritems(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]
    else:
        #Python 3, items etc are all iterators
        def keys(self):
            """Iterate over ids (which may not be meaningful outside this database)."""
            return iter(self)

        def values(self):
            """Iterate over DBSeqRecord objects in the namespace (sub database)."""
            for key in self:
                yield self[key]

        def items(self):
            """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
            for key in self:
                yield key, self[key]

    def lookup(self, **kwargs):
        """Return a DBSeqRecord found by exactly one keyword=value pair.

        The keyword must be one of the keys of the module level
        _allowed_lookups table (e.g. accession="X77802"); it selects the
        adaptor method used to find the record within this namespace.

        Raises TypeError if not exactly one keyword is given or if the
        keyword is not recognised.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            raise TypeError("lookup() expects one of %r, not %r" %
                            (list(_allowed_lookups.keys()), k))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Get a DBSeqRecord by the primary (internal) id (OBSOLETE).

        Rather than db.get_Seq_by_primary_id(my_id) use db[my_id]

        The name of this method is misleading since it returns a DBSeqRecord
        rather than a DBSeq object, and presumably was to mirror BioPerl.
        """
        import warnings
        warnings.warn("Use bio_seq_database[my_id] instead of "
                      "bio_seq_database.get_Seq_by_primary_id(my_id)",
                      BiopythonDeprecationWarning)
        return self[seqid]

    def _raise_if_duplicate(self, record):
        """Raise the connection's IntegrityError if record is already stored.

        Part of the workaround for BioSQL Bug 2839: looks for a bioentry in
        this namespace with the same identifier (the record's "gi"
        annotation) or the same accession and version as record.
        """
        #Recreate what the Loader's _load_bioentry_table will do:
        if record.id.count(".") == 1:
            accession, version = record.id.split('.')
            try:
                version = int(version)
            except ValueError:
                accession = record.id
                version = 0
        else:
            accession = record.id
            version = 0
        gi = record.annotations.get("gi", None)
        if gi is not None:
            # Compare as text, as the previously string-built query did.
            gi = "%s" % (gi,)
        # Pass the values as query parameters instead of formatting them
        # into the SQL, so ids containing quotes cannot break (or inject
        # into) the statement.  A missing gi is sent as NULL and so never
        # matches.
        sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " \
              "= %s AND biodatabase_id = %s) OR (accession = " \
              "%s AND version = %s AND biodatabase_id = %s)"
        self.adaptor.execute(
            sql, (gi, self.dbid, accession, version, self.dbid))
        if self.adaptor.cursor.fetchone():
            raise self.adaptor.conn.IntegrityError("Duplicate record "
                             "detected: record has not been inserted")

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example:
        from Bio import SeqIO
        count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
                                          fetch_NCBI_taxonomy)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            #Hack to work around BioSQL Bug 2839 - If using PostgreSQL and
            #the RULES are present check for a duplicate record before loading
            if _POSTGRES_RULES_PRESENT:
                self._raise_if_duplicate(cur_record)
            #End of hack
            db_loader.load_seqrecord(cur_record)
        return num_records
707