Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33  from Bio.SeqFeature import UnknownPosition 
  34   
  35   
36 -class DatabaseLoader(object):
37 """Object used to load SeqRecord objects into a BioSQL database.""" 38
39 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
40 """Initialize with connection information for the database. 41 42 Creating a DatabaseLoader object is normally handled via the 43 BioSeqDatabase DBServer object, for example:: 44 45 from BioSQL import BioSeqDatabase 46 server = BioSeqDatabase.open_database(driver="MySQLdb", 47 user="gbrowse", 48 passwd="biosql", 49 host="localhost", 50 db="test_biosql") 51 try: 52 db = server["test"] 53 except KeyError: 54 db = server.new_database("test", 55 description="For testing GBrowse") 56 57 """ 58 self.adaptor = adaptor 59 self.dbid = dbid 60 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
61
62 - def load_seqrecord(self, record):
63 """Load a Biopython SeqRecord into the database.""" 64 bioentry_id = self._load_bioentry_table(record) 65 self._load_bioentry_date(record, bioentry_id) 66 self._load_biosequence(record, bioentry_id) 67 self._load_comment(record, bioentry_id) 68 self._load_dbxrefs(record, bioentry_id) 69 references = record.annotations.get('references', ()) 70 for reference, rank in zip(references, list(range(len(references)))): 71 self._load_reference(reference, rank, bioentry_id) 72 self._load_annotations(record, bioentry_id) 73 for seq_feature_num in range(len(record.features)): 74 seq_feature = record.features[seq_feature_num] 75 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
76
77 - def _get_ontology_id(self, name, definition=None):
78 """Return identifier for the named ontology (PRIVATE). 79 80 This looks through the onotology table for a the given entry name. 81 If it is not found, a row is added for this ontology (using the 82 definition if supplied). In either case, the id corresponding to 83 the provided name is returned, so that you can reference it in 84 another table. 85 """ 86 oids = self.adaptor.execute_and_fetch_col0( 87 "SELECT ontology_id FROM ontology WHERE name = %s", 88 (name,)) 89 if oids: 90 return oids[0] 91 self.adaptor.execute( 92 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 93 (name, definition)) 94 return self.adaptor.last_id("ontology")
95
96 - def _get_term_id(self, 97 name, 98 ontology_id=None, 99 definition=None, 100 identifier=None):
101 """Get the id that corresponds to a term (PRIVATE). 102 103 This looks through the term table for a the given term. If it 104 is not found, a new id corresponding to this term is created. 105 In either case, the id corresponding to that term is returned, so 106 that you can reference it in another table. 107 108 The ontology_id should be used to disambiguate the term. 109 """ 110 # try to get the term id 111 sql = r"SELECT term_id FROM term " \ 112 r"WHERE name = %s" 113 fields = [name] 114 if ontology_id: 115 sql += ' AND ontology_id = %s' 116 fields.append(ontology_id) 117 id_results = self.adaptor.execute_and_fetchall(sql, fields) 118 # something is wrong 119 if len(id_results) > 1: 120 raise ValueError("Multiple term ids for %s: %r" % 121 (name, id_results)) 122 elif len(id_results) == 1: 123 return id_results[0][0] 124 else: 125 sql = r"INSERT INTO term (name, definition," \ 126 r" identifier, ontology_id)" \ 127 r" VALUES (%s, %s, %s, %s)" 128 self.adaptor.execute(sql, (name, definition, 129 identifier, ontology_id)) 130 return self.adaptor.last_id("term")
131
132 - def _add_dbxref(self, dbname, accession, version):
133 """Insert a dbxref and return its id.""" 134 self.adaptor.execute( 135 "INSERT INTO dbxref(dbname, accession, version)" 136 " VALUES (%s, %s, %s)", (dbname, accession, version)) 137 return self.adaptor.last_id("dbxref")
138
139 - def _get_taxon_id(self, record):
140 """Get the taxon id for this record (PRIVATE). 141 142 Arguments: 143 - record - a SeqRecord object 144 145 This searches the taxon/taxon_name tables using the 146 NCBI taxon ID, scientific name and common name to find 147 the matching taxon table entry's id. 148 149 If the species isn't in the taxon table, and we have at 150 least the NCBI taxon ID, scientific name or common name, 151 at least a minimal stub entry is created in the table. 152 153 Returns the taxon id (database key for the taxon table, 154 not an NCBI taxon ID), or None if the taxonomy information 155 is missing. 156 157 See also the BioSQL script load_ncbi_taxonomy.pl which 158 will populate and update the taxon/taxon_name tables 159 with the latest information from the NCBI. 160 """ 161 # To find the NCBI taxid, first check for a top level annotation 162 ncbi_taxon_id = None 163 if "ncbi_taxid" in record.annotations: 164 # Could be a list of IDs. 165 if isinstance(record.annotations["ncbi_taxid"], list): 166 if len(record.annotations["ncbi_taxid"]) == 1: 167 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 168 else: 169 ncbi_taxon_id = record.annotations["ncbi_taxid"] 170 if not ncbi_taxon_id: 171 # Secondly, look for a source feature 172 for f in record.features: 173 if f.type == 'source': 174 quals = getattr(f, 'qualifiers', {}) 175 if "db_xref" in quals: 176 for db_xref in f.qualifiers["db_xref"]: 177 if db_xref.startswith("taxon:"): 178 ncbi_taxon_id = int(db_xref[6:]) 179 break 180 if ncbi_taxon_id: 181 break 182 183 try: 184 scientific_name = record.annotations["organism"][:255] 185 except KeyError: 186 scientific_name = None 187 try: 188 common_name = record.annotations["source"][:255] 189 except KeyError: 190 common_name = None 191 # Note: The maximum length for taxon names in the schema is 255. 192 # Cropping it now should help in getting a match when searching, 193 # and avoids an error if we try and add these to the database. 194 195 if ncbi_taxon_id: 196 # Good, we have the NCBI taxon to go on - this is unambiguous :) 197 # Note that the scientific name and common name will only be 198 # used if we have to record a stub entry. 199 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 200 scientific_name, 201 common_name) 202 203 if not common_name and not scientific_name: 204 # Nothing to go on... and there is no point adding 205 # a new entry to the database. We'll just leave this 206 # sequence's taxon as a NULL in the database. 207 return None 208 209 # Next, we'll try to find a match based on the species name 210 # (stored in GenBank files as the organism and/or the source). 211 if scientific_name: 212 taxa = self.adaptor.execute_and_fetch_col0( 213 "SELECT taxon_id FROM taxon_name" 214 " WHERE name_class = 'scientific name' AND name = %s", 215 (scientific_name,)) 216 if taxa: 217 # Good, mapped the scientific name to a taxon table entry 218 return taxa[0] 219 220 # Last chance... 221 if common_name: 222 taxa = self.adaptor.execute_and_fetch_col0( 223 "SELECT DISTINCT taxon_id FROM taxon_name" 224 " WHERE name = %s", 225 (common_name,)) 226 # Its natural that several distinct taxa will have the same common 227 # name - in which case we can't resolve the taxon uniquely. 228 if len(taxa) > 1: 229 raise ValueError("Taxa: %d species have name %r" % ( 230 len(taxa), 231 common_name)) 232 if taxa: 233 # Good, mapped the common name to a taxon table entry 234 return taxa[0] 235 236 # At this point, as far as we can tell, this species isn't 237 # in the taxon table already. So we'll have to add it. 238 # We don't have an NCBI taxonomy ID, so if we do record just 239 # a stub entry, there is no simple way to fix this later. 240 # 241 # TODO - Should we try searching the NCBI taxonomy using the 242 # species name? 243 # 244 # OK, let's try inserting the species. 245 # Chances are we don't have enough information ... 246 # Furthermore, it won't be in the hierarchy. 247 248 lineage = [] 249 for c in record.annotations.get("taxonomy", []): 250 lineage.append([None, None, c]) 251 if lineage: 252 lineage[-1][1] = "genus" 253 lineage.append([None, "species", record.annotations["organism"]]) 254 # XXX do we have them? 255 if "subspecies" in record.annotations: 256 lineage.append([None, "subspecies", 257 record.annotations["subspecies"]]) 258 if "variant" in record.annotations: 259 lineage.append([None, "varietas", 260 record.annotations["variant"]]) 261 lineage[-1][0] = ncbi_taxon_id 262 263 left_value = self.adaptor.execute_one( 264 "SELECT MAX(left_value) FROM taxon")[0] 265 if not left_value: 266 left_value = 0 267 left_value += 1 268 269 # XXX -- Brad: Fixing this for now in an ugly way because 270 # I am getting overlaps for right_values. I need to dig into this 271 # more to actually understand how it works. I'm not sure it is 272 # actually working right anyhow. 273 right_start_value = self.adaptor.execute_one( 274 "SELECT MAX(right_value) FROM taxon")[0] 275 if not right_start_value: 276 right_start_value = 0 277 right_value = right_start_value + 2 * len(lineage) - 1 278 279 parent_taxon_id = None 280 for taxon in lineage: 281 self.adaptor.execute( 282 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 283 " left_value, right_value)" 284 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 285 taxon[0], 286 taxon[1], 287 left_value, 288 right_value)) 289 taxon_id = self.adaptor.last_id("taxon") 290 self.adaptor.execute( 291 "INSERT INTO taxon_name(taxon_id, name, name_class)" 292 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 293 # Note the name field is limited to 255, some SwissProt files 294 # have a multi-species name which can be longer. So truncate this. 295 left_value += 1 296 right_value -= 1 297 parent_taxon_id = taxon_id 298 if common_name: 299 self.adaptor.execute( 300 "INSERT INTO taxon_name(taxon_id, name, name_class)" 301 "VALUES (%s, %s, 'common name')", ( 302 taxon_id, common_name)) 303 304 return taxon_id
305
306 - def _fix_name_class(self, entrez_name):
307 """Map Entrez name terms to those used in taxdump (PRIVATE). 308 309 We need to make this conversion to match the taxon_name.name_class 310 values used by the BioSQL load_ncbi_taxonomy.pl script. 311 312 e.g.:: 313 314 "ScientificName" -> "scientific name", 315 "EquivalentName" -> "equivalent name", 316 "Synonym" -> "synonym", 317 318 """ 319 # Add any special cases here: 320 # 321 # known = {} 322 # try: 323 # return known[entrez_name] 324 # except KeyError: 325 # pass 326 327 # Try automatically by adding spaces before each capital 328 def add_space(letter): 329 """Add a space before a capital letter.""" 330 if letter.isupper(): 331 return " " + letter.lower() 332 else: 333 return letter
334 answer = "".join(add_space(letter) for letter in entrez_name).strip() 335 assert answer == answer.lower() 336 return answer
337
338 - def _update_left_right_taxon_values(self, left_value):
339 """Update the left and right taxon values in the table.""" 340 if not left_value: 341 return 342 # Due to the UNIQUE constraint on the left and right values in the taxon 343 # table we cannot simply update them through an SQL statement as we risk 344 # colliding values. Instead we must select all of the rows that we want to 345 # update, modify the values in python and then update the rows 346 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 " 347 # "WHERE right_value >= %s", (left_value,)) 348 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 " 349 # "WHERE left_value > %s", (left_value,)) 350 351 rows = self.adaptor.execute_and_fetchall( 352 "SELECT left_value, right_value, taxon_id FROM taxon " 353 "WHERE right_value >= %s or left_value > %s", 354 (left_value, left_value)) 355 356 right_rows = [] 357 left_rows = [] 358 for row in rows: 359 new_right = row[1] 360 new_left = row[0] 361 if new_right >= left_value: 362 new_right += 2 363 364 if new_left > left_value: 365 new_left += 2 366 right_rows.append((new_right, row[2])) 367 left_rows.append((new_left, row[2])) 368 369 # sort the rows based on the value from largest to smallest 370 # should ensure no overlaps 371 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True) 372 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True) 373 374 self.adaptor.executemany("UPDATE taxon SET left_value = %s " 375 "WHERE taxon_id = %s", left_rows) 376 self.adaptor.executemany("UPDATE taxon SET right_value = %s " 377 "WHERE taxon_id = %s", right_rows)
378
379 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 380 scientific_name=None, 381 common_name=None):
382 """Get the taxon id for record from NCBI taxon ID (PRIVATE). 383 384 Arguments: 385 386 - ncbi_taxon_id - string containing an NCBI taxon id 387 - scientific_name - string, used if a stub entry is recorded 388 - common_name - string, used if a stub entry is recorded 389 390 This searches the taxon table using ONLY the NCBI taxon ID 391 to find the matching taxon table entry's ID (database key). 392 393 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 394 flag is true, Biopython will attempt to go online using Bio.Entrez 395 to fetch the official NCBI lineage, recursing up the tree until an 396 existing entry is found in the database or the full lineage has been 397 fetched. 398 399 Otherwise the NCBI taxon ID, scientific name and common name are 400 recorded as a minimal stub entry in the taxon and taxon_name tables. 401 Any partial information about the lineage from the SeqRecord is NOT 402 recorded. This should mean that (re)running the BioSQL script 403 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 404 405 Returns the taxon id (database key for the taxon table, not 406 an NCBI taxon ID). 407 """ 408 assert ncbi_taxon_id 409 410 taxon_id = self.adaptor.execute_and_fetch_col0( 411 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 412 (int(ncbi_taxon_id),)) 413 if taxon_id: 414 # Good, we have mapped the NCBI taxid to a taxon table entry 415 return taxon_id[0] 416 417 # At this point, as far as we can tell, this species isn't 418 # in the taxon table already. So we'll have to add it. 419 420 parent_taxon_id = None 421 rank = "species" 422 genetic_code = None 423 mito_genetic_code = None 424 parent_left_value = None 425 parent_right_value = None 426 left_value = None 427 right_value = None 428 species_names = [] 429 if scientific_name: 430 species_names.append(("scientific name", scientific_name)) 431 if common_name: 432 species_names.append(("common name", common_name)) 433 434 if self.fetch_NCBI_taxonomy: 435 # Go online to get the parent taxon ID! 436 handle = Entrez.efetch( 437 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 438 taxonomic_record = Entrez.read(handle) 439 if len(taxonomic_record) == 1: 440 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 441 "%s versus %s" % (taxonomic_record[0]["TaxId"], 442 ncbi_taxon_id) 443 444 parent_taxon_id, parent_left_value, parent_right_value \ 445 = self._get_taxon_id_from_ncbi_lineage(taxonomic_record[0]["LineageEx"]) 446 447 left_value = parent_right_value 448 right_value = parent_right_value + 1 449 450 rank = str(taxonomic_record[0]["Rank"]) 451 452 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"]) 453 454 mito_genetic_code = int(taxonomic_record[ 455 0]["MitoGeneticCode"]["MGCId"]) 456 457 species_names = [("scientific name", 458 str(taxonomic_record[0]["ScientificName"]))] 459 try: 460 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 461 name_class = self._fix_name_class(name_class) 462 if not isinstance(names, list): 463 # The Entrez parser seems to return single entry 464 # lists as just a string which is annoying. 465 names = [names] 466 for name in names: 467 # Want to ignore complex things like ClassCDE 468 # entries 469 if isinstance(name, basestring): 470 species_names.append((name_class, name)) 471 except KeyError: 472 # OtherNames isn't always present, 473 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 474 pass 475 else: 476 pass 477 # If we are not allowed to go online, we will record the bare minimum; 478 # as long as the NCBI taxon id is present, then (re)running 479 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 480 # (and update the species names). 481 # 482 # I am NOT going to try and record the lineage, even if it 483 # is in the record annotation as a list of names, as we won't 484 # know the NCBI taxon IDs for these parent nodes. 485 486 self._update_left_right_taxon_values(left_value) 487 488 self.adaptor.execute( 489 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 490 " genetic_code, mito_genetic_code, left_value, right_value)" 491 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 492 ncbi_taxon_id, 493 rank, 494 genetic_code, 495 mito_genetic_code, 496 left_value, 497 right_value)) 498 499 taxon_id = self.adaptor.last_id("taxon") 500 501 # Record the scientific name, common name, etc 502 for name_class, name in species_names: 503 self.adaptor.execute( 504 "INSERT INTO taxon_name(taxon_id, name, name_class)" 505 " VALUES (%s, %s, %s)", (taxon_id, 506 name[:255], 507 name_class)) 508 return taxon_id
509
510 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
511 """Recursive method to get taxon ID from NCBI lineage (PRIVATE). 512 513 Arguments: 514 - taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 515 516 First dictionary in list is the taxonomy root, highest would be 517 the species. Each dictionary includes: 518 519 - TaxID (string, NCBI taxon id) 520 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 521 - ScientificName (string) 522 523 (and that is all at the time of writing) 524 525 This method will record all the lineage given, returning the taxon id 526 (database key, not NCBI taxon id) of the final entry (the species). 527 """ 528 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"]) 529 left_value = None 530 right_value = None 531 parent_left_value = None 532 parent_right_value = None 533 # Is this in the database already? Check the taxon table... 534 rows = self.adaptor.execute_and_fetchall( 535 "SELECT taxon_id, left_value, right_value FROM taxon" 536 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 537 if rows: 538 # we could verify that the Scientific Name etc in the database 539 # is the same and update it or print a warning if not... 540 assert len(rows) == 1 541 return rows[0] 542 543 # We have to record this. 544 if len(taxonomic_lineage) > 1: 545 # Use recursion to find out the taxon id (database key) of the 546 # parent. 547 parent_taxon_id, parent_left_value, parent_right_value = \ 548 self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 549 left_value = parent_right_value 550 right_value = parent_right_value + 1 551 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 552 else: 553 # we have reached the top of the lineage but no current taxonomy 554 # id has been found 555 parent_taxon_id = None 556 left_value = self.adaptor.execute_one( 557 "SELECT MAX(left_value) FROM taxon")[0] 558 if not left_value: 559 left_value = 0 560 561 right_value = left_value + 1 562 563 self._update_left_right_taxon_values(left_value) 564 565 # INSERT new taxon 566 rank = str(taxonomic_lineage[-1].get("Rank")) 567 self.adaptor.execute( 568 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, " 569 "left_value, right_value) VALUES (%s, %s, %s, %s, %s)", 570 (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value)) 571 572 taxon_id = self.adaptor.last_id("taxon") 573 # assert isinstance(taxon_id, int), repr(taxon_id) 574 # ... and its name in taxon_name 575 scientific_name = taxonomic_lineage[-1].get("ScientificName") 576 if scientific_name: 577 self.adaptor.execute( 578 "INSERT INTO taxon_name(taxon_id, name, name_class) " 579 "VALUES (%s, %s, 'scientific name')", 580 (taxon_id, scientific_name[:255])) 581 return taxon_id, left_value, right_value
582
583 - def _load_bioentry_table(self, record):
584 """Fill the bioentry table with sequence information (PRIVATE). 585 586 Arguments: 587 - record - SeqRecord object to add to the database. 588 589 """ 590 # get the pertinent info and insert it 591 592 if record.id.count(".") == 1: # try to get a version from the id 593 # This assumes the string is something like "XXXXXXXX.123" 594 accession, version = record.id.split('.') 595 try: 596 version = int(version) 597 except ValueError: 598 accession = record.id 599 version = 0 600 else: # otherwise just use a version of 0 601 accession = record.id 602 version = 0 603 604 if "accessions" in record.annotations \ 605 and isinstance(record.annotations["accessions"], list) \ 606 and record.annotations["accessions"]: 607 # Take the first accession (one if there is more than one) 608 accession = record.annotations["accessions"][0] 609 610 # Find the taxon id (this is not just the NCBI Taxon ID) 611 # NOTE - If the species isn't defined in the taxon table, 612 # a new minimal entry is created. 613 taxon_id = self._get_taxon_id(record) 614 615 if "gi" in record.annotations: 616 identifier = record.annotations["gi"] 617 else: 618 identifier = record.id 619 620 # Allow description and division to default to NULL as in BioPerl. 621 description = getattr(record, 'description', None) 622 division = record.annotations.get("data_file_division") 623 624 sql = """ 625 INSERT INTO bioentry ( 626 biodatabase_id, 627 taxon_id, 628 name, 629 accession, 630 identifier, 631 division, 632 description, 633 version) 634 VALUES ( 635 %s, 636 %s, 637 %s, 638 %s, 639 %s, 640 %s, 641 %s, 642 %s)""" 643 # print self.dbid, taxon_id, record.name, accession, identifier, \ 644 # division, description, version 645 self.adaptor.execute(sql, (self.dbid, 646 taxon_id, 647 record.name, 648 accession, 649 identifier, 650 division, 651 description, 652 version)) 653 # now retrieve the id for the bioentry 654 bioentry_id = self.adaptor.last_id('bioentry') 655 656 return bioentry_id
657
658 - def _load_bioentry_date(self, record, bioentry_id):
659 """Add the effective date of the entry into the database. 660 661 record - a SeqRecord object with an annotated date 662 bioentry_id - corresponding database identifier 663 """ 664 # dates are GenBank style, like: 665 # 14-SEP-2000 666 date = record.annotations.get("date", 667 strftime("%d-%b-%Y", gmtime()).upper()) 668 if isinstance(date, list): 669 date = date[0] 670 annotation_tags_id = self._get_ontology_id("Annotation Tags") 671 date_id = self._get_term_id("date_changed", annotation_tags_id) 672 sql = r"INSERT INTO bioentry_qualifier_value" \ 673 r" (bioentry_id, term_id, value, rank)" \ 674 r" VALUES (%s, %s, %s, 1)" 675 self.adaptor.execute(sql, (bioentry_id, date_id, date))
676
677 - def _load_biosequence(self, record, bioentry_id):
678 """Record SeqRecord's sequence and alphabet in DB (PRIVATE). 679 680 Arguments: 681 - record - a SeqRecord object with a seq property 682 - bioentry_id - corresponding database identifier 683 684 """ 685 if record.seq is None: 686 # The biosequence table entry is optional, so if we haven't 687 # got a sequence, we don't need to write to the table. 688 return 689 690 # determine the string representation of the alphabet 691 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 692 alphabet = "dna" 693 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 694 alphabet = "rna" 695 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 696 alphabet = "protein" 697 else: 698 alphabet = "unknown" 699 700 if isinstance(record.seq, UnknownSeq): 701 seq_str = None 702 else: 703 seq_str = str(record.seq) 704 705 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 706 r"length, seq, alphabet) " \ 707 r"VALUES (%s, 0, %s, %s, %s)" 708 self.adaptor.execute(sql, (bioentry_id, 709 len(record.seq), 710 seq_str, 711 alphabet))
712
713 - def _load_comment(self, record, bioentry_id):
714 """Record a SeqRecord's annotated comment in the database (PRIVATE). 715 716 Arguments: 717 - record - a SeqRecord object with an annotated comment 718 - bioentry_id - corresponding database identifier 719 720 """ 721 comments = record.annotations.get('comment') 722 if not comments: 723 return 724 if not isinstance(comments, list): 725 # It should be a string then... 726 comments = [comments] 727 728 for index, comment in enumerate(comments): 729 comment = comment.replace('\n', ' ') 730 # TODO - Store each line as a separate entry? This would preserve 731 # the newlines, but we should check BioPerl etc to be consistent. 732 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 733 " VALUES (%s, %s, %s)" 734 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
735
736 - def _load_annotations(self, record, bioentry_id):
737 """Record a SeqRecord's misc annotations in the database (PRIVATE). 738 739 The annotation strings are recorded in the bioentry_qualifier_value 740 table, except for special cases like the reference, comment and 741 taxonomy which are handled with their own tables. 742 743 Arguments: 744 - record - a SeqRecord object with an annotations dictionary 745 - bioentry_id - corresponding database identifier 746 747 """ 748 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 749 "(bioentry_id, term_id, value)" \ 750 " VALUES (%s, %s, %s)" 751 many_sql = "INSERT INTO bioentry_qualifier_value" \ 752 "(bioentry_id, term_id, value, rank)" \ 753 " VALUES (%s, %s, %s, %s)" 754 tag_ontology_id = self._get_ontology_id('Annotation Tags') 755 for key, value in record.annotations.items(): 756 if key in ["references", "comment", "ncbi_taxid", "date"]: 757 # Handled separately 758 continue 759 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 760 if isinstance(value, (list, tuple)): 761 rank = 0 762 for entry in value: 763 if isinstance(entry, (str, int)): 764 # Easy case 765 rank += 1 766 self.adaptor.execute(many_sql, 767 (bioentry_id, term_id, 768 str(entry), rank)) 769 else: 770 pass 771 elif isinstance(value, (str, int)): 772 # Have a simple single entry, leave rank as the DB default 773 self.adaptor.execute(mono_sql, 774 (bioentry_id, term_id, str(value))) 775 else: 776 pass
777 # print "Ignoring annotation '%s' entry of type '%s'" \ 778 # % (key, type(value)) 779
780 - def _load_reference(self, reference, rank, bioentry_id):
781 """Record SeqRecord's annotated references in the database (PRIVATE). 782 783 Arguments: 784 - record - a SeqRecord object with annotated references 785 - bioentry_id - corresponding database identifier 786 787 """ 788 refs = None 789 if reference.medline_id: 790 refs = self.adaptor.execute_and_fetch_col0( 791 "SELECT reference_id" 792 " FROM reference JOIN dbxref USING (dbxref_id)" 793 " WHERE dbname = 'MEDLINE' AND accession = %s", 794 (reference.medline_id,)) 795 if not refs and reference.pubmed_id: 796 refs = self.adaptor.execute_and_fetch_col0( 797 "SELECT reference_id" 798 " FROM reference JOIN dbxref USING (dbxref_id)" 799 " WHERE dbname = 'PUBMED' AND accession = %s", 800 (reference.pubmed_id,)) 801 if not refs: 802 s = [] 803 for f in reference.authors, reference.title, reference.journal: 804 s.append(f or "<undef>") 805 crc = crc64("".join(s)) 806 refs = self.adaptor.execute_and_fetch_col0( 807 "SELECT reference_id FROM reference" 808 r" WHERE crc = %s", (crc,)) 809 if not refs: 810 if reference.medline_id: 811 dbxref_id = self._add_dbxref("MEDLINE", 812 reference.medline_id, 0) 813 elif reference.pubmed_id: 814 dbxref_id = self._add_dbxref("PUBMED", 815 reference.pubmed_id, 0) 816 else: 817 dbxref_id = None 818 authors = reference.authors or None 819 title = reference.title or None 820 # The location/journal field cannot be Null, so default 821 # to an empty string rather than None: 822 journal = reference.journal or "" 823 self.adaptor.execute( 824 "INSERT INTO reference (dbxref_id, location," 825 " title, authors, crc)" 826 " VALUES (%s, %s, %s, %s, %s)", 827 (dbxref_id, journal, title, 828 authors, crc)) 829 reference_id = self.adaptor.last_id("reference") 830 else: 831 reference_id = refs[0] 832 833 if reference.location: 834 start = 1 + int(str(reference.location[0].start)) 835 end = int(str(reference.location[0].end)) 836 else: 837 start = None 838 end = None 839 840 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 841 " start_pos, end_pos, rank) VALUES (%s, %s, %s, %s, %s)" 842 self.adaptor.execute(sql, (bioentry_id, reference_id, 843 start, end, rank + 1))
844
845 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
846 """Load a biopython SeqFeature into the database (PRIVATE).""" 847 # records loaded from a gff file using BCBio.GFF will contain value 848 # of 2nd column of the gff as a feature qualifier. The BioSQL wiki 849 # suggests that the source should not go in with the other feature 850 # mappings but instead be put in the term table 851 # (http://www.biosql.org/wiki/Annotation_Mapping) 852 try: 853 source = feature.qualifiers['source'] 854 if isinstance(source, list): 855 source = source[0] 856 seqfeature_id = self._load_seqfeature_basic(feature.type, 857 feature_rank, 858 bioentry_id, 859 source=source) 860 except KeyError: 861 seqfeature_id = self._load_seqfeature_basic(feature.type, 862 feature_rank, 863 bioentry_id) 864 865 self._load_seqfeature_locations(feature, seqfeature_id) 866 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
867
868 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id, 869 source='EMBL/GenBank/SwissProt'):
870 """Load the first tables of a seqfeature and returns the id (PRIVATE). 871 872 This loads the "key" of the seqfeature (ie. CDS, gene) and 873 the basic seqfeature table itself. 874 """ 875 ontology_id = self._get_ontology_id('SeqFeature Keys') 876 seqfeature_key_id = self._get_term_id(feature_type, 877 ontology_id=ontology_id) 878 source_cat_id = self._get_ontology_id('SeqFeature Sources') 879 source_term_id = self._get_term_id(source, 880 ontology_id=source_cat_id) 881 882 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 883 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 884 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 885 source_term_id, feature_rank + 1)) 886 seqfeature_id = self.adaptor.last_id('seqfeature') 887 888 return seqfeature_id
889
890 - def _load_seqfeature_locations(self, feature, seqfeature_id):
891 """Load all of the locations for a SeqFeature into tables (PRIVATE). 892 893 This adds the locations related to the SeqFeature into the 894 seqfeature_location table. Fuzzies are not handled right now. 895 For a simple location, ie (1..2), we have a single table row 896 with seq_start = 1, seq_end = 2, location_rank = 1. 897 898 For split locations, ie (1..2, 3..4, 5..6) we would have three 899 row tables with:: 900 901 start = 1, end = 2, rank = 1 902 start = 3, end = 4, rank = 2 903 start = 5, end = 6, rank = 3 904 905 """ 906 # TODO - Record an ontology for the locations (using location.term_id) 907 # which for now as in BioPerl we leave defaulting to NULL. 908 if feature.location_operator and feature.location_operator != "join": 909 # e.g. order locations... we don't record "order" so it 910 # will become a "join" on reloading. What does BioPerl do? 911 import warnings 912 from Bio import BiopythonWarning 913 warnings.warn("%s location operators are not fully supported" 914 % feature.location_operator, BiopythonWarning) 915 # This will be a list of length one for simple FeatureLocation: 916 parts = feature.location.parts 917 if parts and set(loc.strand for loc in parts) == set([-1]): 918 # To mimic prior behaviour of Biopython+BioSQL, reverse order 919 parts = parts[::-1] 920 # TODO - Check what BioPerl does; see also BioSeq.py code 921 for rank, loc in enumerate(parts): 922 self._insert_location(loc, rank + 1, seqfeature_id)
923
924 - def _insert_location(self, location, rank, seqfeature_id):
925 """Add SeqFeatue location to seqfeature_location table (PRIVATE). 926 927 TODO - Add location operator to location_qualifier_value? 928 """ 929 # convert biopython locations to the 1-based location system 930 # used in bioSQL 931 # XXX This could also handle fuzzies 932 933 try: 934 start = int(location.start) + 1 935 except TypeError: 936 # Handle SwissProt unknown position (?) 937 if isinstance(location.start, UnknownPosition): 938 start = None 939 else: 940 raise 941 942 try: 943 end = int(location.end) 944 except TypeError: 945 # Handle SwissProt unknown position (?) 946 if isinstance(location.end, UnknownPosition): 947 end = None 948 else: 949 raise 950 951 # Biopython uses None when we don't know strand information but 952 # BioSQL requires something (non null) and sets this as zero 953 # So we'll use the strand or 0 if Biopython spits out None 954 strand = location.strand or 0 955 956 # TODO - Record an ontology term for the location (location.term_id) 957 # which for now like BioPerl we'll leave as NULL. 958 # This might allow us to record "between" positions properly, but I 959 # don't really see how it could work for before/after fuzzy positions 960 loc_term_id = None 961 962 if location.ref: 963 # sub_feature remote locations when they are in the same db as the 964 # current record do not have a value for ref_db, which SeqFeature 965 # object stores as None. BioSQL schema requires a varchar and is 966 # not NULL 967 dbxref_id = self._get_dbxref_id( 968 location.ref_db or "", location.ref) 969 else: 970 dbxref_id = None 971 972 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 973 r"start_pos, end_pos, strand, rank) " \ 974 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 975 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 976 start, end, strand, rank)) 977 978 """ 979 # See Bug 2677 980 # TODO - Record the location_operator (e.g. "join" or "order") 981 # using the location_qualifier_value table (which we and BioPerl 982 # have historically left empty). 983 # Note this will need an ontology term for the location qualifer 984 # (location_qualifier_value.term_id) for which oddly the schema 985 # does not allow NULL. 986 if feature.location_operator: 987 #e.g. "join" (common), 988 #or "order" (see Tests/GenBank/protein_refseq2.gb) 989 location_id = self.adaptor.last_id('location') 990 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 991 sql = r"INSERT INTO location_qualifier_value" \ 992 r"(location_id, term_id, value)" \ 993 r"VALUES (%s, %s, %s)" 994 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 995 feature.location_operator)) 996 """
997
998 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
999 """Insert feature's (key, value) pair qualifiers (PRIVATE). 1000 1001 Qualifiers should be a dictionary of the form:: 1002 1003 {key : [value1, value2]} 1004 1005 """ 1006 tag_ontology_id = self._get_ontology_id('Annotation Tags') 1007 for qualifier_key in qualifiers: 1008 # Treat db_xref qualifiers differently to sequence annotation 1009 # qualifiers by populating the seqfeature_dbxref and dbxref 1010 # tables. Other qualifiers go into the seqfeature_qualifier_value 1011 # and (if new) term tables. 1012 if qualifier_key != 'db_xref': 1013 qualifier_key_id = self._get_term_id(qualifier_key, 1014 ontology_id=tag_ontology_id) 1015 # now add all of the values to their table 1016 entries = qualifiers[qualifier_key] 1017 if not isinstance(entries, list): 1018 # Could be a plain string, or an int or a float. 1019 # However, we exect a list of strings here. 1020 entries = [entries] 1021 for qual_value_rank in range(len(entries)): 1022 qualifier_value = entries[qual_value_rank] 1023 sql = r"INSERT INTO seqfeature_qualifier_value "\ 1024 r" (seqfeature_id, term_id, rank, value) VALUES"\ 1025 r" (%s, %s, %s, %s)" 1026 self.adaptor.execute(sql, (seqfeature_id, 1027 qualifier_key_id, 1028 qual_value_rank + 1, 1029 qualifier_value)) 1030 else: 1031 # The dbxref_id qualifier/value sets go into the dbxref table 1032 # as dbname, accession, version tuples, with dbxref.dbxref_id 1033 # being automatically assigned, and into the seqfeature_dbxref 1034 # table as seqfeature_id, dbxref_id, and rank tuples 1035 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 1036 seqfeature_id)
1037
1038 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
1039 """Add SeqFeature's DB cross-references to the database (PRIVATE). 1040 1041 Arguments: 1042 - dbxrefs - List, dbxref data from the source file in the 1043 format <database>:<accession> 1044 - seqfeature_id - Int, the identifier for the seqfeature in the 1045 seqfeature table 1046 1047 Insert dbxref qualifier data for a seqfeature into the 1048 seqfeature_dbxref and, if required, dbxref tables. 1049 The dbxref_id qualifier/value sets go into the dbxref table 1050 as dbname, accession, version tuples, with dbxref.dbxref_id 1051 being automatically assigned, and into the seqfeature_dbxref 1052 table as seqfeature_id, dbxref_id, and rank tuples. 1053 """ 1054 # NOTE - In older versions of Biopython, we would map the GenBank 1055 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 1056 # for any unknown terms. This was a long term maintainance problem, 1057 # and differed from BioPerl and BioJava's implementation. See bug 2405 1058 for rank, value in enumerate(dbxrefs): 1059 # Split the DB:accession format string at colons. We have to 1060 # account for multiple-line and multiple-accession entries 1061 try: 1062 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 1063 db = dbxref_data[0] 1064 accessions = dbxref_data[1:] 1065 except Exception: 1066 raise ValueError("Parsing of db_xref failed: '%s'" % value) 1067 # Loop over all the grabbed accessions, and attempt to fill the 1068 # table 1069 for accession in accessions: 1070 # Get the dbxref_id value for the dbxref data 1071 dbxref_id = self._get_dbxref_id(db, accession) 1072 # Insert the seqfeature_dbxref data 1073 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
1074
1075 - def _get_dbxref_id(self, db, accession):
1076 """Get DB cross-reference for accession. 1077 1078 Arguments: 1079 - db - String, the name of the external database containing 1080 the accession number 1081 - accession - String, the accession of the dbxref data 1082 1083 Finds and returns the dbxref_id for the passed data. The method 1084 attempts to find an existing record first, and inserts the data 1085 if there is no record. 1086 """ 1087 # Check for an existing record 1088 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 1089 r'AND accession = %s' 1090 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 1091 # If there was a record, return the dbxref_id, else create the 1092 # record and return the created dbxref_id 1093 if dbxref_id: 1094 return dbxref_id[0] 1095 return self._add_dbxref(db, accession, 0)
1096
1097 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1098 """Get DB cross-reference, creating it if needed (PRIVATE). 1099 1100 Check for a pre-existing seqfeature_dbxref entry with the passed 1101 seqfeature_id and dbxref_id. If one does not exist, insert new 1102 data. 1103 """ 1104 # Check for an existing record 1105 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 1106 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 1107 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 1108 dbxref_id)) 1109 # If there was a record, return without executing anything, else create 1110 # the record and return 1111 if result: 1112 return result 1113 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
1114
1115 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1116 """Add DB cross-reference (PRIVATE). 1117 1118 Insert a seqfeature_dbxref row and return the seqfeature_id and 1119 dbxref_id 1120 """ 1121 sql = r'INSERT INTO seqfeature_dbxref ' \ 1122 '(seqfeature_id, dbxref_id, rank) VALUES' \ 1123 r'(%s, %s, %s)' 1124 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 1125 return (seqfeature_id, dbxref_id)
1126
1127 - def _load_dbxrefs(self, record, bioentry_id):
1128 """Load any sequence level cross references into the database (PRIVATE). 1129 1130 See table bioentry_dbxref. 1131 """ 1132 for rank, value in enumerate(record.dbxrefs): 1133 # Split the DB:accession string at first colon. 1134 # We have to cope with things like: 1135 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1136 # "GO:GO:123" (db="GO", accession="GO:123") 1137 # 1138 # Annoyingly I have seen the NCBI use both the style 1139 # "GO:GO:123" and "GO:123" in different vintages. 1140 assert value.count("\n") == 0 1141 try: 1142 db, accession = value.split(':', 1) 1143 db = db.strip() 1144 accession = accession.strip() 1145 except Exception: 1146 raise ValueError( 1147 "Parsing of dbxrefs list failed: '%s'" % value) 1148 # Get the dbxref_id value for the dbxref data 1149 dbxref_id = self._get_dbxref_id(db, accession) 1150 # Insert the bioentry_dbxref data 1151 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1152
1153 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1154 """Get pre-existing db-xref, or create and return it (PRIVATE). 1155 1156 Check for a pre-existing bioentry_dbxref entry with the passed 1157 seqfeature_id and dbxref_id. If one does not exist, insert new 1158 data 1159 """ 1160 # Check for an existing record 1161 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1162 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1163 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1164 dbxref_id)) 1165 # If there was a record, return without executing anything, else create 1166 # the record and return 1167 if result: 1168 return result 1169 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1170
1171 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1172 """Insert a bioentry_dbxref row (PRIVATE). 1173 1174 Returns the seqfeature_id and dbxref_id (PRIVATE). 1175 """ 1176 sql = r'INSERT INTO bioentry_dbxref ' \ 1177 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1178 '(%s, %s, %s)' 1179 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1180 return (bioentry_id, dbxref_id)
1181 1182
1183 -class DatabaseRemover(object):
1184 """Complement the Loader functionality by fully removing a database. 1185 1186 This probably isn't really useful for normal purposes, since you 1187 can just do a:: 1188 1189 DROP DATABASE db_name 1190 1191 and then recreate the database. But, it's really useful for testing 1192 purposes. 1193 """ 1194
1195 - def __init__(self, adaptor, dbid):
1196 """Initialize with a database id and adaptor connection.""" 1197 self.adaptor = adaptor 1198 self.dbid = dbid
1199
1200 - def remove(self):
1201 """Remove everything related to the given database id.""" 1202 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1203 self.adaptor.execute(sql, (self.dbid,)) 1204 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1205 self.adaptor.execute(sql, (self.dbid,))
1206