1
2
3
4
5
6
7
8
9
10 """Connect with a BioSQL database and load Biopython like objects from it.
11
12 This provides interfaces for loading biological objects from a relational
13 database, and is compatible with the BioSQL standards.
14 """
15 from Bio import BiopythonDeprecationWarning
16
17 import BioSeq
18 import Loader
19 import DBUtils
20
21 _POSTGRES_RULES_PRESENT = False
22
23
25 """Main interface for loading a existing BioSQL-style database.
26
27 This function is the easiest way to retrieve a connection to a
28 database, doing something like:
29
30 >>> from BioSeq import BioSeqDatabase
31 >>> server = BioSeqDatabase.open_database(user="root", db="minidb")
32
33 the various options are:
34 driver -> The name of the database driver to use for connecting. The
35 driver should implement the python DB API. By default, the MySQLdb
36 driver is used.
37 user -> the username to connect to the database with.
38 password, passwd -> the password to connect with
39 host -> the hostname of the database
40 database or db -> the name of the database
41 """
42 if driver == "psycopg":
43 raise ValueError("Using BioSQL with psycopg (version one) is no "
44 "longer supported. Use psycopg2 instead.")
45
46 module = __import__(driver)
47 connect = getattr(module, "connect")
48
49
50 kw = kwargs.copy()
51 if driver == "MySQLdb":
52 if "database" in kw:
53 kw["db"] = kw["database"]
54 del kw["database"]
55 if "password" in kw:
56 kw["passwd"] = kw["password"]
57 del kw["password"]
58 else:
59
60 if "db" in kw:
61 kw["database"] = kw["db"]
62 del kw["db"]
63 if "passwd" in kw:
64 kw["password"] = kw["passwd"]
65 del kw["passwd"]
66 if driver in ["psycopg2", "pgdb"] and not kw.get("database"):
67 kw["database"] = "template1"
68
69 if driver in ["sqlite3"]:
70 conn = connect(kw["database"])
71 else:
72 try:
73 conn = connect(**kw)
74 except module.InterfaceError:
75
76
77 if "database" in kw:
78 kw["dbname"] = kw["database"]
79 del kw["database"]
80 elif "db" in kw:
81 kw["dbname"] = kw["db"]
82 del kw["db"]
83 dsn = ' '.join(['='.join(i) for i in kw.items()])
84 conn = connect(dsn)
85
86 server = DBServer(conn, module)
87
88
89
90 if driver in ["psycopg2", "pgdb"]:
91 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
92 "rulename='rule_bioentry_i1' OR " + \
93 "rulename='rule_bioentry_i2';"
94 if server.adaptor.execute_and_fetchall(sql):
95 import warnings
96 from Bio import BiopythonWarning
97 warnings.warn("Your BioSQL PostgreSQL schema includes some "
98 "rules currently required for bioperl-db but "
99 "which may cause problems loading data using "
100 "Biopython (see BioSQL Bug 2839). If you do not "
101 "use BioPerl, please remove these rules. "
102 "Biopython should cope with the rules present, "
103 "but with a performance penalty when loading "
104 "new records.", BiopythonWarning)
105 global _POSTGRES_RULES_PRESENT
106 _POSTGRES_RULES_PRESENT = True
107
108 return server
109
110
112 """Represents a BioSQL database continaing namespaces (sub-databases).
113
114 This acts like a Python dictionary, giving access to each namespace
115 (defined by a row in the biodatabase table) as a BioSeqDatabase object.
116 """
117 - def __init__(self, conn, module, module_name=None):
118 self.module = module
119 if module_name is None:
120 module_name = module.__name__
121 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
122 self.module_name = module_name
123
125 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
126
129
131 """Number of namespaces (sub-databases) in this database."""
132 sql = "SELECT COUNT(name) FROM biodatabase;"
133 return int(self.adaptor.execute_and_fetch_col0(sql)[0])
134
136 """Check if a namespace (sub-database) in this database."""
137 sql = "SELECT COUNT(name) FROM biodatabase WHERE name=%s;"
138 return bool(self.adaptor.execute_and_fetch_col0(sql, (value,))[0])
139
141 """Iterate over namespaces (sub-databases) in the database."""
142
143 return iter(self.adaptor.list_biodatabase_names())
144
145 if hasattr(dict, "iteritems"):
146
148 """List of namespaces (sub-databases) in the database."""
149 return self.adaptor.list_biodatabase_names()
150
152 """List of BioSeqDatabase objects in the database."""
153 return [self[key] for key in self.keys()]
154
156 """List of (namespace, BioSeqDatabase) for entries in the database."""
157 return [(key, self[key]) for key in self.keys()]
158
160 """Iterate over namespaces (sub-databases) in the database."""
161 return iter(self)
162
164 """Iterate over BioSeqDatabase objects in the database."""
165 for key in self:
166 yield self[key]
167
169 """Iterate over (namespace, BioSeqDatabase) in the database."""
170 for key in self:
171 yield key, self[key]
172 else:
173
175 """Iterate over namespaces (sub-databases) in the database."""
176 return iter(self)
177
179 """Iterate over BioSeqDatabase objects in the database."""
180 for key in self:
181 yield self[key]
182
184 """Iterate over (namespace, BioSeqDatabase) in the database."""
185 for key in self:
186 yield key, self[key]
187
193
195 """Remove a namespace and all its entries (OBSOLETE).
196
197 Try to remove all references to items in a database.
198
199 server.remove_database(name)
200
201 In keeping with the dictionary interface, you can now do this:
202
203 del server[name]
204 """
205 import warnings
206 warnings.warn("This method is deprecated. In keeping with the "
207 "dictionary interface, you can now use 'del "
208 "server[name]' instead", BiopythonDeprecationWarning)
209 db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
210 remover = Loader.DatabaseRemover(self.adaptor, db_id)
211 remover.remove()
212
213 - def new_database(self, db_name, authority=None, description=None):
214 """Add a new database to the server and return it.
215 """
216
217 sql = r"INSERT INTO biodatabase (name, authority, description)" \
218 r" VALUES (%s, %s, %s)"
219 self.adaptor.execute(sql, (db_name, authority, description))
220 return BioSeqDatabase(self.adaptor, db_name)
221
223 """Load a database schema into the given database.
224
225 This is used to create tables, etc when a database is first created.
226 sql_file should specify the complete path to a file containing
227 SQL entries for building the tables.
228 """
229
230
231
232
233
234 sql_handle = open(sql_file, "rU")
235 sql = r""
236 for line in sql_handle:
237 if line.startswith("--"):
238 pass
239 elif line.startswith("#"):
240 pass
241 elif line.strip():
242 sql += line.strip()
243 sql += ' '
244
245
246
247
248
249 if self.module_name in ["psycopg2", "pgdb"]:
250 self.adaptor.cursor.execute(sql)
251
252
253 elif self.module_name in ["MySQLdb", "sqlite3"]:
254 sql_parts = sql.split(";")
255 for sql_line in sql_parts[:-1]:
256 self.adaptor.cursor.execute(sql_line)
257 else:
258 raise ValueError("Module %s not supported by the loader." %
259 (self.module_name))
260
262 """Commits the current transaction to the database."""
263 return self.adaptor.commit()
264
266 """Rolls backs the current transaction."""
267 return self.adaptor.rollback()
268
270 """Close the connection. No further activity possible."""
271 return self.adaptor.close()
272
273
276 self.conn = conn
277 self.cursor = conn.cursor()
278 self.dbutils = dbutils
279
282
284 """Set the autocommit mode. True values enable; False value disable."""
285 return self.dbutils.autocommit(self.conn, y)
286
288 """Commits the current transaction."""
289 return self.conn.commit()
290
292 """Rolls backs the current transaction."""
293 return self.conn.rollback()
294
296 """Close the connection. No further activity possible."""
297 return self.conn.close()
298
300 self.execute(
301 r"select biodatabase_id from biodatabase where name = %s",
302 (dbname,))
303 rv = self.cursor.fetchall()
304 if not rv:
305 raise KeyError("Cannot find biodatabase with name %r" % dbname)
306
307
308 return rv[0][0]
309
311 sql = r"select bioentry_id from bioentry where name = %s"
312 fields = [name]
313 if dbid:
314 sql += " and biodatabase_id = %s"
315 fields.append(dbid)
316 self.execute(sql, fields)
317 rv = self.cursor.fetchall()
318 if not rv:
319 raise IndexError("Cannot find display id %r" % name)
320 if len(rv) > 1:
321 raise IndexError("More than one entry with display id %r" % name)
322 return rv[0][0]
323
325 sql = r"select bioentry_id from bioentry where accession = %s"
326 fields = [name]
327 if dbid:
328 sql += " and biodatabase_id = %s"
329 fields.append(dbid)
330 self.execute(sql, fields)
331 rv = self.cursor.fetchall()
332 if not rv:
333 raise IndexError("Cannot find accession %r" % name)
334 if len(rv) > 1:
335 raise IndexError("More than one entry with accession %r" % name)
336 return rv[0][0]
337
339 sql = r"select bioentry_id from bioentry where accession = %s"
340 fields = [name]
341 if dbid:
342 sql += " and biodatabase_id = %s"
343 fields.append(dbid)
344 return self.execute_and_fetch_col0(sql, fields)
345
347 acc_version = name.split(".")
348 if len(acc_version) > 2:
349 raise IndexError("Bad version %r" % name)
350 acc = acc_version[0]
351 if len(acc_version) == 2:
352 version = acc_version[1]
353 else:
354 version = "0"
355 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
356 r" AND version = %s"
357 fields = [acc, version]
358 if dbid:
359 sql += " and biodatabase_id = %s"
360 fields.append(dbid)
361 self.execute(sql, fields)
362 rv = self.cursor.fetchall()
363 if not rv:
364 raise IndexError("Cannot find version %r" % name)
365 if len(rv) > 1:
366 raise IndexError("More than one entry with version %r" % name)
367 return rv[0][0]
368
370
371 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
372 fields = [identifier]
373 if dbid:
374 sql += " and biodatabase_id = %s"
375 fields.append(dbid)
376 self.execute(sql, fields)
377 rv = self.cursor.fetchall()
378 if not rv:
379 raise IndexError("Cannot find display id %r" % identifier)
380 return rv[0][0]
381
385
386 - def list_bioentry_ids(self, dbid):
387 return self.execute_and_fetch_col0(
388 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
389 (dbid,))
390
392 return self.execute_and_fetch_col0(
393 "SELECT name FROM bioentry WHERE biodatabase_id = %s",
394 (dbid,))
395
397 """Return ids given a SQL statement to select for them.
398
399 This assumes that the given SQL does a SELECT statement that
400 returns a list of items. This parses them out of the 2D list
401 they come as and just returns them in a list.
402 """
403 return self.execute_and_fetch_col0(sql, args)
404
406 self.execute(sql, args or ())
407 rv = self.cursor.fetchall()
408 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
409 return rv[0]
410
411 - def execute(self, sql, args=None):
412 """Just execute an sql command.
413 """
414 self.dbutils.execute(self.cursor, sql, args)
415
417 length = end - start
418
419
420
421
422
423
424
425
426
427 return str(self.execute_one(
428 """select SUBSTR(seq, %s, %s)
429 from biosequence where bioentry_id = %s""",
430 (start + 1, length, seqid))[0])
431
433 self.execute(sql, args or ())
434 return [field[0] for field in self.cursor.fetchall()]
435
437 self.execute(sql, args or ())
438 return self.cursor.fetchall()
439
440 _allowed_lookups = {
441
442 'primary_id': "fetch_seqid_by_identifier",
443 'gi': "fetch_seqid_by_identifier",
444 'display_id': "fetch_seqid_by_display_id",
445 'name': "fetch_seqid_by_display_id",
446 'accession': "fetch_seqid_by_accession",
447 'version': "fetch_seqid_by_version",
448 }
449
450
452 """Represents a namespace (sub-database) within the BioSQL database.
453
454 i.e. One row in the biodatabase table, and all all rows in the bioentry
455 table associated with it.
456 """
461
463 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
464
466 """Gets a DBSeqRecord object by its name
467
468 Example: seq_rec = db.get_Seq_by_id('ROA1_HUMAN')
469
470 The name of this method is misleading since it returns a DBSeqRecord
471 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
472 """
473 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
474 return BioSeq.DBSeqRecord(self.adaptor, seqid)
475
477 """Gets a DBSeqRecord object by accession number
478
479 Example: seq_rec = db.get_Seq_by_acc('X77802')
480
481 The name of this method is misleading since it returns a DBSeqRecord
482 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
483 """
484 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
485 return BioSeq.DBSeqRecord(self.adaptor, seqid)
486
488 """Gets a DBSeqRecord object by version number
489
490 Example: seq_rec = db.get_Seq_by_ver('X77802.1')
491
492 The name of this method is misleading since it returns a DBSeqRecord
493 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
494 """
495 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
496 return BioSeq.DBSeqRecord(self.adaptor, seqid)
497
499 """Gets a list of DBSeqRecord objects by accession number
500
501 Example: seq_recs = db.get_Seq_by_acc('X77802')
502
503 The name of this method is misleading since it returns a list of
504 DBSeqRecord objects rather than a list of DBSeq ojbects, and presumably
505 was to mirror BioPerl.
506 """
507 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
508 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
509
511 """All the primary_ids of the sequences in the database (OBSOLETE).
512
513 These maybe ids (display style) or accession numbers or
514 something else completely different - they *are not*
515 meaningful outside of this database implementation.
516
517 Please use .keys() instead of .get_all_primary_ids()
518 """
519 import warnings
520 warnings.warn("Use bio_seq_database.keys() instead of "
521 "bio_seq_database.get_all_primary_ids()",
522 BiopythonDeprecationWarning)
523 return self.keys()
524
527
529 """Remove an entry and all its annotation."""
530 if key not in self:
531 raise KeyError(key)
532
533 sql = "DELETE FROM bioentry " + \
534 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
535 self.adaptor.execute(sql, (self.dbid, key))
536
538 """Number of records in this namespace (sub database)."""
539 sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
540 "WHERE biodatabase_id=%s;"
541 return int(self.adaptor.execute_and_fetch_col0(sql, (self.dbid, ))[0])
542
544 """Check if a primary (internal) id is this namespace (sub database)."""
545 sql = "SELECT COUNT(bioentry_id) FROM bioentry " + \
546 "WHERE biodatabase_id=%s AND bioentry_id=%s;"
547
548
549 try:
550 bioentry_id = int(value)
551 except ValueError:
552 return False
553 return bool(self.adaptor.execute_and_fetch_col0(sql,
554 (self.dbid, bioentry_id))[0])
555
557 """Iterate over ids (which may not be meaningful outside this database)."""
558
559 return iter(self.adaptor.list_bioentry_ids(self.dbid))
560
561 if hasattr(dict, "iteritems"):
562
564 """List of ids which may not be meaningful outside this database."""
565 return self.adaptor.list_bioentry_ids(self.dbid)
566
568 """List of DBSeqRecord objects in the namespace (sub database)."""
569 return [self[key] for key in self.keys()]
570
572 """List of (id, DBSeqRecord) for the namespace (sub database)."""
573 return [(key, self[key]) for key in self.keys()]
574
576 """Iterate over ids (which may not be meaningful outside this database)."""
577 return iter(self)
578
580 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
581 for key in self:
582 yield self[key]
583
585 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
586 for key in self:
587 yield key, self[key]
588 else:
589
591 """Iterate over ids (which may not be meaningful outside this database)."""
592 return iter(self)
593
595 """Iterate over DBSeqRecord objects in the namespace (sub database)."""
596 for key in self:
597 yield self[key]
598
600 """Iterate over (id, DBSeqRecord) for the namespace (sub database)."""
601 for key in self:
602 yield key, self[key]
603
605 if len(kwargs) != 1:
606 raise TypeError("single key/value parameter expected")
607 k, v = kwargs.items()[0]
608 if k not in _allowed_lookups:
609 raise TypeError("lookup() expects one of %s, not %r" %
610 (repr(_allowed_lookups.keys())[1:-1], repr(k)))
611 lookup_name = _allowed_lookups[k]
612 lookup_func = getattr(self.adaptor, lookup_name)
613 seqid = lookup_func(self.dbid, v)
614 return BioSeq.DBSeqRecord(self.adaptor, seqid)
615
617 """Get a DBSeqRecord by the primary (internal) id (OBSOLETE).
618
619 Rather than db.get_Seq_by_primary_id(my_id) use db[my_id]
620
621 The name of this method is misleading since it returns a DBSeqRecord
622 rather than a DBSeq ojbect, and presumably was to mirror BioPerl.
623 """
624 import warnings
625 warnings.warn("Use bio_seq_database[my_id] instead of "
626 "bio_seq_database.get_Seq_by_primary_id(my_id)",
627 BiopythonDeprecationWarning)
628 return self[seqid]
629
630 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
631 """Load a set of SeqRecords into the BioSQL database.
632
633 record_iterator is either a list of SeqRecord objects, or an
634 Iterator object that returns SeqRecord objects (such as the
635 output from the Bio.SeqIO.parse() function), which will be
636 used to populate the database.
637
638 fetch_NCBI_taxonomy is boolean flag allowing or preventing
639 connection to the taxonomic database on the NCBI server
640 (via Bio.Entrez) to fetch a detailed taxonomy for each
641 SeqRecord.
642
643 Example:
644 from Bio import SeqIO
645 count = db.load(SeqIO.parse(open(filename), format))
646
647 Returns the number of records loaded.
648 """
649 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
650 fetch_NCBI_taxonomy)
651 num_records = 0
652 global _POSTGRES_RULES_PRESENT
653 for cur_record in record_iterator:
654 num_records += 1
655
656
657 if _POSTGRES_RULES_PRESENT:
658
659 if cur_record.id.count(".") == 1:
660 accession, version = cur_record.id.split('.')
661 try:
662 version = int(version)
663 except ValueError:
664 accession = cur_record.id
665 version = 0
666 else:
667 accession = cur_record.id
668 version = 0
669 gi = cur_record.annotations.get("gi", None)
670 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \
671 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \
672 "'%s' AND version = '%s' AND biodatabase_id = '%s')"
673 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid))
674 if self.adaptor.cursor.fetchone():
675 raise self.adaptor.conn.IntegrityError("Duplicate record "
676 "detected: record has not been inserted")
677
678 db_loader.load_seqrecord(cur_record)
679 return num_records
680