Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33   
34 -class DatabaseLoader(object):
35 """Object used to load SeqRecord objects into a BioSQL database.""" 36
37 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
38 """Initialize with connection information for the database. 39 40 Creating a DatabaseLoader object is normally handled via the 41 BioSeqDatabase DBServer object, for example:: 42 43 from BioSQL import BioSeqDatabase 44 server = BioSeqDatabase.open_database(driver="MySQLdb", 45 user="gbrowse", 46 passwd="biosql", 47 host="localhost", 48 db="test_biosql") 49 try: 50 db = server["test"] 51 except KeyError: 52 db = server.new_database("test", 53 description="For testing GBrowse") 54 """ 55 self.adaptor = adaptor 56 self.dbid = dbid 57 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
58
59 - def load_seqrecord(self, record):
60 """Load a Biopython SeqRecord into the database.""" 61 bioentry_id = self._load_bioentry_table(record) 62 self._load_bioentry_date(record, bioentry_id) 63 self._load_biosequence(record, bioentry_id) 64 self._load_comment(record, bioentry_id) 65 self._load_dbxrefs(record, bioentry_id) 66 references = record.annotations.get('references', ()) 67 for reference, rank in zip(references, list(range(len(references)))): 68 self._load_reference(reference, rank, bioentry_id) 69 self._load_annotations(record, bioentry_id) 70 for seq_feature_num in range(len(record.features)): 71 seq_feature = record.features[seq_feature_num] 72 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
73
74 - def _get_ontology_id(self, name, definition=None):
75 """Returns the identifier for the named ontology (PRIVATE). 76 77 This looks through the onotology table for a the given entry name. 78 If it is not found, a row is added for this ontology (using the 79 definition if supplied). In either case, the id corresponding to 80 the provided name is returned, so that you can reference it in 81 another table. 82 """ 83 oids = self.adaptor.execute_and_fetch_col0( 84 "SELECT ontology_id FROM ontology WHERE name = %s", 85 (name,)) 86 if oids: 87 return oids[0] 88 self.adaptor.execute( 89 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 90 (name, definition)) 91 return self.adaptor.last_id("ontology")
92
93 - def _get_term_id(self, 94 name, 95 ontology_id=None, 96 definition=None, 97 identifier=None):
98 """Get the id that corresponds to a term (PRIVATE). 99 100 This looks through the term table for a the given term. If it 101 is not found, a new id corresponding to this term is created. 102 In either case, the id corresponding to that term is returned, so 103 that you can reference it in another table. 104 105 The ontology_id should be used to disambiguate the term. 106 """ 107 108 # try to get the term id 109 sql = r"SELECT term_id FROM term " \ 110 r"WHERE name = %s" 111 fields = [name] 112 if ontology_id: 113 sql += ' AND ontology_id = %s' 114 fields.append(ontology_id) 115 id_results = self.adaptor.execute_and_fetchall(sql, fields) 116 # something is wrong 117 if len(id_results) > 1: 118 raise ValueError("Multiple term ids for %s: %r" % 119 (name, id_results)) 120 elif len(id_results) == 1: 121 return id_results[0][0] 122 else: 123 sql = r"INSERT INTO term (name, definition," \ 124 r" identifier, ontology_id)" \ 125 r" VALUES (%s, %s, %s, %s)" 126 self.adaptor.execute(sql, (name, definition, 127 identifier, ontology_id)) 128 return self.adaptor.last_id("term")
129
130 - def _add_dbxref(self, dbname, accession, version):
131 """Insert a dbxref and return its id.""" 132 self.adaptor.execute( 133 "INSERT INTO dbxref(dbname, accession, version)" 134 " VALUES (%s, %s, %s)", (dbname, accession, version)) 135 return self.adaptor.last_id("dbxref")
136
137 - def _get_taxon_id(self, record):
138 """Get the taxon id for this record (PRIVATE). 139 140 record - a SeqRecord object 141 142 This searches the taxon/taxon_name tables using the 143 NCBI taxon ID, scientific name and common name to find 144 the matching taxon table entry's id. 145 146 If the species isn't in the taxon table, and we have at 147 least the NCBI taxon ID, scientific name or common name, 148 at least a minimal stub entry is created in the table. 149 150 Returns the taxon id (database key for the taxon table, 151 not an NCBI taxon ID), or None if the taxonomy information 152 is missing. 153 154 See also the BioSQL script load_ncbi_taxonomy.pl which 155 will populate and update the taxon/taxon_name tables 156 with the latest information from the NCBI. 157 """ 158 159 # To find the NCBI taxid, first check for a top level annotation 160 ncbi_taxon_id = None 161 if "ncbi_taxid" in record.annotations: 162 # Could be a list of IDs. 163 if isinstance(record.annotations["ncbi_taxid"], list): 164 if len(record.annotations["ncbi_taxid"]) == 1: 165 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 166 else: 167 ncbi_taxon_id = record.annotations["ncbi_taxid"] 168 if not ncbi_taxon_id: 169 # Secondly, look for a source feature 170 for f in record.features: 171 if f.type == 'source': 172 quals = getattr(f, 'qualifiers', {}) 173 if "db_xref" in quals: 174 for db_xref in f.qualifiers["db_xref"]: 175 if db_xref.startswith("taxon:"): 176 ncbi_taxon_id = int(db_xref[6:]) 177 break 178 if ncbi_taxon_id: 179 break 180 181 try: 182 scientific_name = record.annotations["organism"][:255] 183 except KeyError: 184 scientific_name = None 185 try: 186 common_name = record.annotations["source"][:255] 187 except KeyError: 188 common_name = None 189 # Note: The maximum length for taxon names in the schema is 255. 190 # Cropping it now should help in getting a match when searching, 191 # and avoids an error if we try and add these to the database. 192 193 if ncbi_taxon_id: 194 # Good, we have the NCBI taxon to go on - this is unambiguous :) 195 # Note that the scientific name and common name will only be 196 # used if we have to record a stub entry. 197 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 198 scientific_name, 199 common_name) 200 201 if not common_name and not scientific_name: 202 # Nothing to go on... and there is no point adding 203 # a new entry to the database. We'll just leave this 204 # sequence's taxon as a NULL in the database. 205 return None 206 207 # Next, we'll try to find a match based on the species name 208 # (stored in GenBank files as the organism and/or the source). 209 if scientific_name: 210 taxa = self.adaptor.execute_and_fetch_col0( 211 "SELECT taxon_id FROM taxon_name" 212 " WHERE name_class = 'scientific name' AND name = %s", 213 (scientific_name,)) 214 if taxa: 215 # Good, mapped the scientific name to a taxon table entry 216 return taxa[0] 217 218 # Last chance... 219 if common_name: 220 taxa = self.adaptor.execute_and_fetch_col0( 221 "SELECT DISTINCT taxon_id FROM taxon_name" 222 " WHERE name = %s", 223 (common_name,)) 224 # Its natural that several distinct taxa will have the same common 225 # name - in which case we can't resolve the taxon uniquely. 226 if len(taxa) > 1: 227 raise ValueError("Taxa: %d species have name %r" % ( 228 len(taxa), 229 common_name)) 230 if taxa: 231 # Good, mapped the common name to a taxon table entry 232 return taxa[0] 233 234 # At this point, as far as we can tell, this species isn't 235 # in the taxon table already. So we'll have to add it. 236 # We don't have an NCBI taxonomy ID, so if we do record just 237 # a stub entry, there is no simple way to fix this later. 238 # 239 # TODO - Should we try searching the NCBI taxonomy using the 240 # species name? 241 # 242 # OK, let's try inserting the species. 243 # Chances are we don't have enough information ... 244 # Furthermore, it won't be in the hierarchy. 245 246 lineage = [] 247 for c in record.annotations.get("taxonomy", []): 248 lineage.append([None, None, c]) 249 if lineage: 250 lineage[-1][1] = "genus" 251 lineage.append([None, "species", record.annotations["organism"]]) 252 # XXX do we have them? 253 if "subspecies" in record.annotations: 254 lineage.append([None, "subspecies", 255 record.annotations["subspecies"]]) 256 if "variant" in record.annotations: 257 lineage.append([None, "varietas", 258 record.annotations["variant"]]) 259 lineage[-1][0] = ncbi_taxon_id 260 261 left_value = self.adaptor.execute_one( 262 "SELECT MAX(left_value) FROM taxon")[0] 263 if not left_value: 264 left_value = 0 265 left_value += 1 266 267 # XXX -- Brad: Fixing this for now in an ugly way because 268 # I am getting overlaps for right_values. I need to dig into this 269 # more to actually understand how it works. I'm not sure it is 270 # actually working right anyhow. 271 right_start_value = self.adaptor.execute_one( 272 "SELECT MAX(right_value) FROM taxon")[0] 273 if not right_start_value: 274 right_start_value = 0 275 right_value = right_start_value + 2 * len(lineage) - 1 276 277 parent_taxon_id = None 278 for taxon in lineage: 279 self.adaptor.execute( 280 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 281 " left_value, right_value)" 282 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 283 taxon[0], 284 taxon[1], 285 left_value, 286 right_value)) 287 taxon_id = self.adaptor.last_id("taxon") 288 self.adaptor.execute( 289 "INSERT INTO taxon_name(taxon_id, name, name_class)" 290 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 291 # Note the name field is limited to 255, some SwissProt files 292 # have a multi-species name which can be longer. So truncate this. 293 left_value += 1 294 right_value -= 1 295 parent_taxon_id = taxon_id 296 if common_name: 297 self.adaptor.execute( 298 "INSERT INTO taxon_name(taxon_id, name, name_class)" 299 "VALUES (%s, %s, 'common name')", ( 300 taxon_id, common_name)) 301 302 return taxon_id
303
304 - def _fix_name_class(self, entrez_name):
305 """Map Entrez name terms to those used in taxdump (PRIVATE). 306 307 We need to make this conversion to match the taxon_name.name_class 308 values used by the BioSQL load_ncbi_taxonomy.pl script. 309 310 e.g. 311 "ScientificName" -> "scientific name", 312 "EquivalentName" -> "equivalent name", 313 "Synonym" -> "synonym", 314 """ 315 # Add any special cases here: 316 # 317 # known = {} 318 # try: 319 # return known[entrez_name] 320 # except KeyError: 321 # pass 322 323 # Try automatically by adding spaces before each capital 324 def add_space(letter): 325 """Adds a space before a capital letter.""" 326 if letter.isupper(): 327 return " " + letter.lower() 328 else: 329 return letter
330 answer = "".join(add_space(letter) for letter in entrez_name).strip() 331 assert answer == answer.lower() 332 return answer
333
334 - def _update_left_right_taxon_values(self, left_value):
335 """update the left and right values in the table 336 """ 337 if not left_value: 338 return 339 # Due to the UNIQUE constraint on the left and right values in the taxon 340 # table we cannot simply update them through an SQL statement as we risk 341 # colliding values. Instead we must select all of the rows that we want to 342 # update, modify the values in python and then update the rows 343 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 " 344 # "WHERE right_value >= %s", (left_value,)) 345 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 " 346 # "WHERE left_value > %s", (left_value,)) 347 348 rows = self.adaptor.execute_and_fetchall( 349 "SELECT left_value, right_value, taxon_id FROM taxon " 350 "WHERE right_value >= %s or left_value > %s", 351 (left_value, left_value)) 352 353 right_rows = [] 354 left_rows = [] 355 for row in rows: 356 new_right = row[1] 357 new_left = row[0] 358 if new_right >= left_value: 359 new_right += 2 360 361 if new_left > left_value: 362 new_left += 2 363 right_rows.append((new_right, row[2])) 364 left_rows.append((new_left, row[2])) 365 366 # sort the rows based on the value from largest to smallest 367 # should ensure no overlaps 368 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True) 369 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True) 370 371 self.adaptor.executemany("UPDATE taxon SET left_value = %s " 372 "WHERE taxon_id = %s", left_rows) 373 self.adaptor.executemany("UPDATE taxon SET right_value = %s " 374 "WHERE taxon_id = %s", right_rows)
375
376 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 377 scientific_name=None, 378 common_name=None):
379 """Get the taxon id for record from NCBI taxon ID (PRIVATE). 380 381 ncbi_taxon_id - string containing an NCBI taxon id 382 scientific_name - string, used if a stub entry is recorded 383 common_name - string, used if a stub entry is recorded 384 385 This searches the taxon table using ONLY the NCBI taxon ID 386 to find the matching taxon table entry's ID (database key). 387 388 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 389 flag is true, Biopython will attempt to go online using Bio.Entrez 390 to fetch the official NCBI lineage, recursing up the tree until an 391 existing entry is found in the database or the full lineage has been 392 fetched. 393 394 Otherwise the NCBI taxon ID, scientific name and common name are 395 recorded as a minimal stub entry in the taxon and taxon_name tables. 396 Any partial information about the lineage from the SeqRecord is NOT 397 recorded. This should mean that (re)running the BioSQL script 398 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 399 400 Returns the taxon id (database key for the taxon table, not 401 an NCBI taxon ID). 402 """ 403 assert ncbi_taxon_id 404 405 taxon_id = self.adaptor.execute_and_fetch_col0( 406 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 407 (int(ncbi_taxon_id),)) 408 if taxon_id: 409 # Good, we have mapped the NCBI taxid to a taxon table entry 410 return taxon_id[0] 411 412 # At this point, as far as we can tell, this species isn't 413 # in the taxon table already. So we'll have to add it. 414 415 parent_taxon_id = None 416 rank = "species" 417 genetic_code = None 418 mito_genetic_code = None 419 parent_left_value = None 420 parent_right_value = None 421 left_value = None 422 right_value = None 423 species_names = [] 424 if scientific_name: 425 species_names.append(("scientific name", scientific_name)) 426 if common_name: 427 species_names.append(("common name", common_name)) 428 429 if self.fetch_NCBI_taxonomy: 430 # Go online to get the parent taxon ID! 431 handle = Entrez.efetch( 432 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 433 taxonomic_record = Entrez.read(handle) 434 if len(taxonomic_record) == 1: 435 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 436 "%s versus %s" % (taxonomic_record[0]["TaxId"], 437 ncbi_taxon_id) 438 439 parent_taxon_id, parent_left_value, parent_right_value \ 440 = self._get_taxon_id_from_ncbi_lineage(taxonomic_record[0]["LineageEx"]) 441 442 left_value = parent_right_value 443 right_value = parent_right_value + 1 444 445 rank = str(taxonomic_record[0]["Rank"]) 446 447 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"]) 448 449 mito_genetic_code = int(taxonomic_record[ 450 0]["MitoGeneticCode"]["MGCId"]) 451 452 species_names = [("scientific name", 453 str(taxonomic_record[0]["ScientificName"]))] 454 try: 455 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 456 name_class = self._fix_name_class(name_class) 457 if not isinstance(names, list): 458 # The Entrez parser seems to return single entry 459 # lists as just a string which is annoying. 460 names = [names] 461 for name in names: 462 # Want to ignore complex things like ClassCDE 463 # entries 464 if isinstance(name, basestring): 465 species_names.append((name_class, name)) 466 except KeyError: 467 # OtherNames isn't always present, 468 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 469 pass 470 else: 471 pass 472 # If we are not allowed to go online, we will record the bare minimum; 473 # as long as the NCBI taxon id is present, then (re)running 474 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 475 # (and update the species names). 476 # 477 # I am NOT going to try and record the lineage, even if it 478 # is in the record annotation as a list of names, as we won't 479 # know the NCBI taxon IDs for these parent nodes. 480 481 self._update_left_right_taxon_values(left_value) 482 483 self.adaptor.execute( 484 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 485 " genetic_code, mito_genetic_code, left_value, right_value)" 486 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 487 ncbi_taxon_id, 488 rank, 489 genetic_code, 490 mito_genetic_code, 491 left_value, 492 right_value)) 493 494 taxon_id = self.adaptor.last_id("taxon") 495 496 # Record the scientific name, common name, etc 497 for name_class, name in species_names: 498 self.adaptor.execute( 499 "INSERT INTO taxon_name(taxon_id, name, name_class)" 500 " VALUES (%s, %s, %s)", (taxon_id, 501 name[:255], 502 name_class)) 503 return taxon_id
504
505 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
506 """This is recursive! (PRIVATE). 507 508 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 509 510 First dictionary in list is the taxonomy root, highest would be 511 the species. Each dictionary includes: 512 - TaxID (string, NCBI taxon id) 513 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 514 - ScientificName (string) 515 (and that is all at the time of writing) 516 517 This method will record all the lineage given, returning the taxon id 518 (database key, not NCBI taxon id) of the final entry (the species). 519 """ 520 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"]) 521 left_value = None 522 right_value = None 523 parent_left_value = None 524 parent_right_value = None 525 # Is this in the database already? Check the taxon table... 526 rows = self.adaptor.execute_and_fetchall( 527 "SELECT taxon_id, left_value, right_value FROM taxon" 528 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 529 if rows: 530 # we could verify that the Scientific Name etc in the database 531 # is the same and update it or print a warning if not... 532 assert len(rows) == 1 533 return rows[0] 534 535 # We have to record this. 536 if len(taxonomic_lineage) > 1: 537 # Use recursion to find out the taxon id (database key) of the 538 # parent. 539 parent_taxon_id, parent_left_value, parent_right_value = \ 540 self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 541 left_value = parent_right_value 542 right_value = parent_right_value + 1 543 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 544 else: 545 # we have reached the top of the lineage but no current taxonomy 546 # id has been found 547 parent_taxon_id = None 548 left_value = self.adaptor.execute_one( 549 "SELECT MAX(left_value) FROM taxon")[0] 550 if not left_value: 551 left_value = 0 552 553 right_value = left_value + 1 554 555 self._update_left_right_taxon_values(left_value) 556 557 # INSERT new taxon 558 rank = str(taxonomic_lineage[-1].get("Rank")) 559 self.adaptor.execute( 560 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, " 561 "left_value, right_value) VALUES (%s, %s, %s, %s, %s)", 562 (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value)) 563 564 taxon_id = self.adaptor.last_id("taxon") 565 # assert isinstance(taxon_id, int), repr(taxon_id) 566 # ... and its name in taxon_name 567 scientific_name = taxonomic_lineage[-1].get("ScientificName") 568 if scientific_name: 569 self.adaptor.execute( 570 "INSERT INTO taxon_name(taxon_id, name, name_class) " 571 "VALUES (%s, %s, 'scientific name')", 572 (taxon_id, scientific_name[:255])) 573 return taxon_id, left_value, right_value
574
575 - def _load_bioentry_table(self, record):
576 """Fill the bioentry table with sequence information (PRIVATE). 577 578 record - SeqRecord object to add to the database. 579 """ 580 # get the pertinent info and insert it 581 582 if record.id.count(".") == 1: # try to get a version from the id 583 # This assumes the string is something like "XXXXXXXX.123" 584 accession, version = record.id.split('.') 585 try: 586 version = int(version) 587 except ValueError: 588 accession = record.id 589 version = 0 590 else: # otherwise just use a version of 0 591 accession = record.id 592 version = 0 593 594 if "accessions" in record.annotations \ 595 and isinstance(record.annotations["accessions"], list) \ 596 and record.annotations["accessions"]: 597 # Take the first accession (one if there is more than one) 598 accession = record.annotations["accessions"][0] 599 600 # Find the taxon id (this is not just the NCBI Taxon ID) 601 # NOTE - If the species isn't defined in the taxon table, 602 # a new minimal entry is created. 603 taxon_id = self._get_taxon_id(record) 604 605 if "gi" in record.annotations: 606 identifier = record.annotations["gi"] 607 else: 608 identifier = record.id 609 610 # Allow description and division to default to NULL as in BioPerl. 611 description = getattr(record, 'description', None) 612 division = record.annotations.get("data_file_division") 613 614 sql = """ 615 INSERT INTO bioentry ( 616 biodatabase_id, 617 taxon_id, 618 name, 619 accession, 620 identifier, 621 division, 622 description, 623 version) 624 VALUES ( 625 %s, 626 %s, 627 %s, 628 %s, 629 %s, 630 %s, 631 %s, 632 %s)""" 633 # print self.dbid, taxon_id, record.name, accession, identifier, \ 634 # division, description, version 635 self.adaptor.execute(sql, (self.dbid, 636 taxon_id, 637 record.name, 638 accession, 639 identifier, 640 division, 641 description, 642 version)) 643 # now retrieve the id for the bioentry 644 bioentry_id = self.adaptor.last_id('bioentry') 645 646 return bioentry_id
647
648 - def _load_bioentry_date(self, record, bioentry_id):
649 """Add the effective date of the entry into the database. 650 651 record - a SeqRecord object with an annotated date 652 bioentry_id - corresponding database identifier 653 """ 654 # dates are GenBank style, like: 655 # 14-SEP-2000 656 date = record.annotations.get("date", 657 strftime("%d-%b-%Y", gmtime()).upper()) 658 if isinstance(date, list): 659 date = date[0] 660 annotation_tags_id = self._get_ontology_id("Annotation Tags") 661 date_id = self._get_term_id("date_changed", annotation_tags_id) 662 sql = r"INSERT INTO bioentry_qualifier_value" \ 663 r" (bioentry_id, term_id, value, rank)" \ 664 r" VALUES (%s, %s, %s, 1)" 665 self.adaptor.execute(sql, (bioentry_id, date_id, date))
666
667 - def _load_biosequence(self, record, bioentry_id):
668 """Record SeqRecord's sequence and alphabet in DB (PRIVATE). 669 670 record - a SeqRecord object with a seq property 671 bioentry_id - corresponding database identifier 672 """ 673 if record.seq is None: 674 # The biosequence table entry is optional, so if we haven't 675 # got a sequence, we don't need to write to the table. 676 return 677 678 # determine the string representation of the alphabet 679 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 680 alphabet = "dna" 681 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 682 alphabet = "rna" 683 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 684 alphabet = "protein" 685 else: 686 alphabet = "unknown" 687 688 if isinstance(record.seq, UnknownSeq): 689 seq_str = None 690 else: 691 seq_str = str(record.seq) 692 693 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 694 r"length, seq, alphabet) " \ 695 r"VALUES (%s, 0, %s, %s, %s)" 696 self.adaptor.execute(sql, (bioentry_id, 697 len(record.seq), 698 seq_str, 699 alphabet))
700
701 - def _load_comment(self, record, bioentry_id):
702 """Record a SeqRecord's annotated comment in the database (PRIVATE). 703 704 record - a SeqRecord object with an annotated comment 705 bioentry_id - corresponding database identifier 706 """ 707 comments = record.annotations.get('comment') 708 if not comments: 709 return 710 if not isinstance(comments, list): 711 # It should be a string then... 712 comments = [comments] 713 714 for index, comment in enumerate(comments): 715 comment = comment.replace('\n', ' ') 716 # TODO - Store each line as a separate entry? This would preserve 717 # the newlines, but we should check BioPerl etc to be consistent. 718 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 719 " VALUES (%s, %s, %s)" 720 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
721
722 - def _load_annotations(self, record, bioentry_id):
723 """Record a SeqRecord's misc annotations in the database (PRIVATE). 724 725 The annotation strings are recorded in the bioentry_qualifier_value 726 table, except for special cases like the reference, comment and 727 taxonomy which are handled with their own tables. 728 729 record - a SeqRecord object with an annotations dictionary 730 bioentry_id - corresponding database identifier 731 """ 732 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 733 "(bioentry_id, term_id, value)" \ 734 " VALUES (%s, %s, %s)" 735 many_sql = "INSERT INTO bioentry_qualifier_value" \ 736 "(bioentry_id, term_id, value, rank)" \ 737 " VALUES (%s, %s, %s, %s)" 738 tag_ontology_id = self._get_ontology_id('Annotation Tags') 739 for key, value in record.annotations.items(): 740 if key in ["references", "comment", "ncbi_taxid", "date"]: 741 # Handled separately 742 continue 743 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 744 if isinstance(value, (list, tuple)): 745 rank = 0 746 for entry in value: 747 if isinstance(entry, (str, int)): 748 # Easy case 749 rank += 1 750 self.adaptor.execute(many_sql, 751 (bioentry_id, term_id, 752 str(entry), rank)) 753 else: 754 pass 755 elif isinstance(value, (str, int)): 756 # Have a simple single entry, leave rank as the DB default 757 self.adaptor.execute(mono_sql, 758 (bioentry_id, term_id, str(value))) 759 else: 760 pass
761 # print "Ignoring annotation '%s' entry of type '%s'" \ 762 # % (key, type(value)) 763
764 - def _load_reference(self, reference, rank, bioentry_id):
765 """Record SeqRecord's annotated references in the database (PRIVATE). 766 767 record - a SeqRecord object with annotated references 768 bioentry_id - corresponding database identifier 769 """ 770 771 refs = None 772 if reference.medline_id: 773 refs = self.adaptor.execute_and_fetch_col0( 774 "SELECT reference_id" 775 " FROM reference JOIN dbxref USING (dbxref_id)" 776 " WHERE dbname = 'MEDLINE' AND accession = %s", 777 (reference.medline_id,)) 778 if not refs and reference.pubmed_id: 779 refs = self.adaptor.execute_and_fetch_col0( 780 "SELECT reference_id" 781 " FROM reference JOIN dbxref USING (dbxref_id)" 782 " WHERE dbname = 'PUBMED' AND accession = %s", 783 (reference.pubmed_id,)) 784 if not refs: 785 s = [] 786 for f in reference.authors, reference.title, reference.journal: 787 s.append(f or "<undef>") 788 crc = crc64("".join(s)) 789 refs = self.adaptor.execute_and_fetch_col0( 790 "SELECT reference_id FROM reference" 791 r" WHERE crc = %s", (crc,)) 792 if not refs: 793 if reference.medline_id: 794 dbxref_id = self._add_dbxref("MEDLINE", 795 reference.medline_id, 0) 796 elif reference.pubmed_id: 797 dbxref_id = self._add_dbxref("PUBMED", 798 reference.pubmed_id, 0) 799 else: 800 dbxref_id = None 801 authors = reference.authors or None 802 title = reference.title or None 803 # The location/journal field cannot be Null, so default 804 # to an empty string rather than None: 805 journal = reference.journal or "" 806 self.adaptor.execute( 807 "INSERT INTO reference (dbxref_id, location," 808 " title, authors, crc)" 809 " VALUES (%s, %s, %s, %s, %s)", 810 (dbxref_id, journal, title, 811 authors, crc)) 812 reference_id = self.adaptor.last_id("reference") 813 else: 814 reference_id = refs[0] 815 816 if reference.location: 817 start = 1 + int(str(reference.location[0].start)) 818 end = int(str(reference.location[0].end)) 819 else: 820 start = None 821 end = None 822 823 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 824 " start_pos, end_pos, rank) VALUES (%s, %s, %s, %s, %s)" 825 self.adaptor.execute(sql, (bioentry_id, reference_id, 826 start, end, rank + 1))
827
828 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
829 """Load a biopython SeqFeature into the database (PRIVATE).""" 830 # records loaded from a gff file using BCBio.GFF will contain value 831 # of 2nd column of the gff as a feature qualifier. The BioSQL wiki 832 # suggests that the source should not go in with the other feature 833 # mappings but instead be put in the term table 834 # (http://www.biosql.org/wiki/Annotation_Mapping) 835 try: 836 source = feature.qualifiers['source'] 837 if isinstance(source, list): 838 source = source[0] 839 seqfeature_id = self._load_seqfeature_basic(feature.type, 840 feature_rank, 841 bioentry_id, 842 source=source) 843 except KeyError: 844 seqfeature_id = self._load_seqfeature_basic(feature.type, 845 feature_rank, 846 bioentry_id) 847 848 self._load_seqfeature_locations(feature, seqfeature_id) 849 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
850
851 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id, 852 source='EMBL/GenBank/SwissProt'):
853 """Load the first tables of a seqfeature and returns the id (PRIVATE). 854 855 This loads the "key" of the seqfeature (ie. CDS, gene) and 856 the basic seqfeature table itself. 857 """ 858 ontology_id = self._get_ontology_id('SeqFeature Keys') 859 seqfeature_key_id = self._get_term_id(feature_type, 860 ontology_id=ontology_id) 861 source_cat_id = self._get_ontology_id('SeqFeature Sources') 862 source_term_id = self._get_term_id(source, 863 ontology_id=source_cat_id) 864 865 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 866 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 867 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 868 source_term_id, feature_rank + 1)) 869 seqfeature_id = self.adaptor.last_id('seqfeature') 870 871 return seqfeature_id
872
873 - def _load_seqfeature_locations(self, feature, seqfeature_id):
874 """Load all of the locations for a SeqFeature into tables (PRIVATE). 875 876 This adds the locations related to the SeqFeature into the 877 seqfeature_location table. Fuzzies are not handled right now. 878 For a simple location, ie (1..2), we have a single table row 879 with seq_start = 1, seq_end = 2, location_rank = 1. 880 881 For split locations, ie (1..2, 3..4, 5..6) we would have three 882 row tables with:: 883 884 start = 1, end = 2, rank = 1 885 start = 3, end = 4, rank = 2 886 start = 5, end = 6, rank = 3 887 """ 888 # TODO - Record an ontology for the locations (using location.term_id) 889 # which for now as in BioPerl we leave defaulting to NULL. 890 if feature.location_operator and feature.location_operator != "join": 891 # e.g. order locations... we don't record "order" so it 892 # will become a "join" on reloading. What does BioPerl do? 893 import warnings 894 from Bio import BiopythonWarning 895 warnings.warn("%s location operators are not fully supported" 896 % feature.location_operator, BiopythonWarning) 897 # This will be a list of length one for simple FeatureLocation: 898 parts = feature.location.parts 899 if parts and set(loc.strand for loc in parts) == set([-1]): 900 # To mimic prior behaviour of Biopython+BioSQL, reverse order 901 parts = parts[::-1] 902 # TODO - Check what BioPerl does; see also BioSeq.py code 903 for rank, loc in enumerate(parts): 904 self._insert_location(loc, rank + 1, seqfeature_id)
905
906 - def _insert_location(self, location, rank, seqfeature_id):
907 """Add SeqFeatue location to seqfeature_location table (PRIVATE). 908 909 TODO - Add location operator to location_qualifier_value? 910 """ 911 # convert biopython locations to the 1-based location system 912 # used in bioSQL 913 # XXX This could also handle fuzzies 914 start = int(location.start) + 1 915 end = int(location.end) 916 917 # Biopython uses None when we don't know strand information but 918 # BioSQL requires something (non null) and sets this as zero 919 # So we'll use the strand or 0 if Biopython spits out None 920 strand = location.strand or 0 921 922 # TODO - Record an ontology term for the location (location.term_id) 923 # which for now like BioPerl we'll leave as NULL. 924 # This might allow us to record "between" positions properly, but I 925 # don't really see how it could work for before/after fuzzy positions 926 loc_term_id = None 927 928 if location.ref: 929 # sub_feature remote locations when they are in the same db as the 930 # current record do not have a value for ref_db, which SeqFeature 931 # object stores as None. BioSQL schema requires a varchar and is 932 # not NULL 933 dbxref_id = self._get_dbxref_id( 934 location.ref_db or "", location.ref) 935 else: 936 dbxref_id = None 937 938 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 939 r"start_pos, end_pos, strand, rank) " \ 940 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 941 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 942 start, end, strand, rank)) 943 944 """ 945 # See Bug 2677 946 # TODO - Record the location_operator (e.g. "join" or "order") 947 # using the location_qualifier_value table (which we and BioPerl 948 # have historically left empty). 949 # Note this will need an ontology term for the location qualifer 950 # (location_qualifier_value.term_id) for which oddly the schema 951 # does not allow NULL. 952 if feature.location_operator: 953 #e.g. "join" (common), 954 #or "order" (see Tests/GenBank/protein_refseq2.gb) 955 location_id = self.adaptor.last_id('location') 956 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 957 sql = r"INSERT INTO location_qualifier_value" \ 958 r"(location_id, term_id, value)" \ 959 r"VALUES (%s, %s, %s)" 960 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 961 feature.location_operator)) 962 """
963
964 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
965 """Insert feature's (key, value) pair qualifiers (PRIVATE). 966 967 Qualifiers should be a dictionary of the form: 968 {key : [value1, value2]} 969 """ 970 tag_ontology_id = self._get_ontology_id('Annotation Tags') 971 for qualifier_key in qualifiers: 972 # Treat db_xref qualifiers differently to sequence annotation 973 # qualifiers by populating the seqfeature_dbxref and dbxref 974 # tables. Other qualifiers go into the seqfeature_qualifier_value 975 # and (if new) term tables. 976 if qualifier_key != 'db_xref': 977 qualifier_key_id = self._get_term_id(qualifier_key, 978 ontology_id=tag_ontology_id) 979 # now add all of the values to their table 980 entries = qualifiers[qualifier_key] 981 if not isinstance(entries, list): 982 # Could be a plain string, or an int or a float. 983 # However, we exect a list of strings here. 984 entries = [entries] 985 for qual_value_rank in range(len(entries)): 986 qualifier_value = entries[qual_value_rank] 987 sql = r"INSERT INTO seqfeature_qualifier_value "\ 988 r" (seqfeature_id, term_id, rank, value) VALUES"\ 989 r" (%s, %s, %s, %s)" 990 self.adaptor.execute(sql, (seqfeature_id, 991 qualifier_key_id, 992 qual_value_rank + 1, 993 qualifier_value)) 994 else: 995 # The dbxref_id qualifier/value sets go into the dbxref table 996 # as dbname, accession, version tuples, with dbxref.dbxref_id 997 # being automatically assigned, and into the seqfeature_dbxref 998 # table as seqfeature_id, dbxref_id, and rank tuples 999 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 1000 seqfeature_id)
1001
1002 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
1003 """Add SeqFeature's DB cross-references to the database (PRIVATE). 1004 1005 o dbxrefs List, dbxref data from the source file in the 1006 format <database>:<accession> 1007 1008 o seqfeature_id Int, the identifier for the seqfeature in the 1009 seqfeature table 1010 1011 Insert dbxref qualifier data for a seqfeature into the 1012 seqfeature_dbxref and, if required, dbxref tables. 1013 The dbxref_id qualifier/value sets go into the dbxref table 1014 as dbname, accession, version tuples, with dbxref.dbxref_id 1015 being automatically assigned, and into the seqfeature_dbxref 1016 table as seqfeature_id, dbxref_id, and rank tuples 1017 """ 1018 # NOTE - In older versions of Biopython, we would map the GenBank 1019 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 1020 # for any unknown terms. This was a long term maintainance problem, 1021 # and differed from BioPerl and BioJava's implementation. See bug 2405 1022 for rank, value in enumerate(dbxrefs): 1023 # Split the DB:accession format string at colons. We have to 1024 # account for multiple-line and multiple-accession entries 1025 try: 1026 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 1027 db = dbxref_data[0] 1028 accessions = dbxref_data[1:] 1029 except: 1030 raise ValueError("Parsing of db_xref failed: '%s'" % value) 1031 # Loop over all the grabbed accessions, and attempt to fill the 1032 # table 1033 for accession in accessions: 1034 # Get the dbxref_id value for the dbxref data 1035 dbxref_id = self._get_dbxref_id(db, accession) 1036 # Insert the seqfeature_dbxref data 1037 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
1038
1039 - def _get_dbxref_id(self, db, accession):
1040 """ _get_dbxref_id(self, db, accession) -> Int 1041 1042 o db String, the name of the external database containing 1043 the accession number 1044 1045 o accession String, the accession of the dbxref data 1046 1047 Finds and returns the dbxref_id for the passed data. The method 1048 attempts to find an existing record first, and inserts the data 1049 if there is no record. 1050 """ 1051 # Check for an existing record 1052 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 1053 r'AND accession = %s' 1054 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 1055 # If there was a record, return the dbxref_id, else create the 1056 # record and return the created dbxref_id 1057 if dbxref_id: 1058 return dbxref_id[0] 1059 return self._add_dbxref(db, accession, 0)
1060
1061 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1062 """ Check for a pre-existing seqfeature_dbxref entry with the passed 1063 seqfeature_id and dbxref_id. If one does not exist, insert new 1064 data 1065 1066 """ 1067 # Check for an existing record 1068 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 1069 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 1070 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 1071 dbxref_id)) 1072 # If there was a record, return without executing anything, else create 1073 # the record and return 1074 if result: 1075 return result 1076 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
1077
1078 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1079 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 1080 dbxref_id 1081 """ 1082 sql = r'INSERT INTO seqfeature_dbxref ' \ 1083 '(seqfeature_id, dbxref_id, rank) VALUES' \ 1084 r'(%s, %s, %s)' 1085 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 1086 return (seqfeature_id, dbxref_id)
1087
1088 - def _load_dbxrefs(self, record, bioentry_id):
1089 """Load any sequence level cross references into the database (PRIVATE). 1090 1091 See table bioentry_dbxref.""" 1092 for rank, value in enumerate(record.dbxrefs): 1093 # Split the DB:accession string at first colon. 1094 # We have to cope with things like: 1095 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1096 # "GO:GO:123" (db="GO", accession="GO:123") 1097 # 1098 # Annoyingly I have seen the NCBI use both the style 1099 # "GO:GO:123" and "GO:123" in different vintages. 1100 assert value.count("\n") == 0 1101 try: 1102 db, accession = value.split(':', 1) 1103 db = db.strip() 1104 accession = accession.strip() 1105 except: 1106 raise ValueError( 1107 "Parsing of dbxrefs list failed: '%s'" % value) 1108 # Get the dbxref_id value for the dbxref data 1109 dbxref_id = self._get_dbxref_id(db, accession) 1110 # Insert the bioentry_dbxref data 1111 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1112
1113 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1114 """Get pre-existing db-xref, or create and return it (PRIVATE). 1115 1116 Check for a pre-existing bioentry_dbxref entry with the passed 1117 seqfeature_id and dbxref_id. If one does not exist, insert new 1118 data 1119 """ 1120 # Check for an existing record 1121 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1122 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1123 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1124 dbxref_id)) 1125 # If there was a record, return without executing anything, else create 1126 # the record and return 1127 if result: 1128 return result 1129 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1130
1131 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1132 """Insert a bioentry_dbxref row (PRIVATE). 1133 1134 Returns the seqfeature_id and dbxref_id (PRIVATE). 1135 """ 1136 sql = r'INSERT INTO bioentry_dbxref ' \ 1137 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1138 '(%s, %s, %s)' 1139 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1140 return (bioentry_id, dbxref_id)
1141 1142
1143 -class DatabaseRemover(object):
1144 """Complement the Loader functionality by fully removing a database. 1145 1146 This probably isn't really useful for normal purposes, since you 1147 can just do a:: 1148 1149 DROP DATABASE db_name 1150 1151 and then recreate the database. But, it's really useful for testing 1152 purposes. 1153 """ 1154
1155 - def __init__(self, adaptor, dbid):
1156 """Initialize with a database id and adaptor connection.""" 1157 self.adaptor = adaptor 1158 self.dbid = dbid
1159
1160 - def remove(self):
1161 """Remove everything related to the given database id.""" 1162 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1163 self.adaptor.execute(sql, (self.dbid,)) 1164 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1165 self.adaptor.execute(sql, (self.dbid,))
1166