Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33   
34 -class DatabaseLoader(object):
35 """Object used to load SeqRecord objects into a BioSQL database.""" 36
37 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
38 """Initialize with connection information for the database. 39 40 Creating a DatabaseLoader object is normally handled via the 41 BioSeqDatabase DBServer object, for example:: 42 43 from BioSQL import BioSeqDatabase 44 server = BioSeqDatabase.open_database(driver="MySQLdb", 45 user="gbrowse", 46 passwd="biosql", 47 host="localhost", 48 db="test_biosql") 49 try: 50 db = server["test"] 51 except KeyError: 52 db = server.new_database("test", 53 description="For testing GBrowse") 54 """ 55 self.adaptor = adaptor 56 self.dbid = dbid 57 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
58
59 - def load_seqrecord(self, record):
60 """Load a Biopython SeqRecord into the database.""" 61 bioentry_id = self._load_bioentry_table(record) 62 self._load_bioentry_date(record, bioentry_id) 63 self._load_biosequence(record, bioentry_id) 64 self._load_comment(record, bioentry_id) 65 self._load_dbxrefs(record, bioentry_id) 66 references = record.annotations.get('references', ()) 67 for reference, rank in zip(references, list(range(len(references)))): 68 self._load_reference(reference, rank, bioentry_id) 69 self._load_annotations(record, bioentry_id) 70 for seq_feature_num in range(len(record.features)): 71 seq_feature = record.features[seq_feature_num] 72 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
73
74 - def _get_ontology_id(self, name, definition=None):
75 """Returns the identifier for the named ontology (PRIVATE). 76 77 This looks through the onotology table for a the given entry name. 78 If it is not found, a row is added for this ontology (using the 79 definition if supplied). In either case, the id corresponding to 80 the provided name is returned, so that you can reference it in 81 another table. 82 """ 83 oids = self.adaptor.execute_and_fetch_col0( 84 "SELECT ontology_id FROM ontology WHERE name = %s", 85 (name,)) 86 if oids: 87 return oids[0] 88 self.adaptor.execute( 89 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 90 (name, definition)) 91 return self.adaptor.last_id("ontology")
92
93 - def _get_term_id(self, 94 name, 95 ontology_id=None, 96 definition=None, 97 identifier=None):
98 """Get the id that corresponds to a term (PRIVATE). 99 100 This looks through the term table for a the given term. If it 101 is not found, a new id corresponding to this term is created. 102 In either case, the id corresponding to that term is returned, so 103 that you can reference it in another table. 104 105 The ontology_id should be used to disambiguate the term. 106 """ 107 # try to get the term id 108 sql = r"SELECT term_id FROM term " \ 109 r"WHERE name = %s" 110 fields = [name] 111 if ontology_id: 112 sql += ' AND ontology_id = %s' 113 fields.append(ontology_id) 114 id_results = self.adaptor.execute_and_fetchall(sql, fields) 115 # something is wrong 116 if len(id_results) > 1: 117 raise ValueError("Multiple term ids for %s: %r" % 118 (name, id_results)) 119 elif len(id_results) == 1: 120 return id_results[0][0] 121 else: 122 sql = r"INSERT INTO term (name, definition," \ 123 r" identifier, ontology_id)" \ 124 r" VALUES (%s, %s, %s, %s)" 125 self.adaptor.execute(sql, (name, definition, 126 identifier, ontology_id)) 127 return self.adaptor.last_id("term")
128
129 - def _add_dbxref(self, dbname, accession, version):
130 """Insert a dbxref and return its id.""" 131 self.adaptor.execute( 132 "INSERT INTO dbxref(dbname, accession, version)" 133 " VALUES (%s, %s, %s)", (dbname, accession, version)) 134 return self.adaptor.last_id("dbxref")
135
136 - def _get_taxon_id(self, record):
137 """Get the taxon id for this record (PRIVATE). 138 139 record - a SeqRecord object 140 141 This searches the taxon/taxon_name tables using the 142 NCBI taxon ID, scientific name and common name to find 143 the matching taxon table entry's id. 144 145 If the species isn't in the taxon table, and we have at 146 least the NCBI taxon ID, scientific name or common name, 147 at least a minimal stub entry is created in the table. 148 149 Returns the taxon id (database key for the taxon table, 150 not an NCBI taxon ID), or None if the taxonomy information 151 is missing. 152 153 See also the BioSQL script load_ncbi_taxonomy.pl which 154 will populate and update the taxon/taxon_name tables 155 with the latest information from the NCBI. 156 """ 157 # To find the NCBI taxid, first check for a top level annotation 158 ncbi_taxon_id = None 159 if "ncbi_taxid" in record.annotations: 160 # Could be a list of IDs. 161 if isinstance(record.annotations["ncbi_taxid"], list): 162 if len(record.annotations["ncbi_taxid"]) == 1: 163 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 164 else: 165 ncbi_taxon_id = record.annotations["ncbi_taxid"] 166 if not ncbi_taxon_id: 167 # Secondly, look for a source feature 168 for f in record.features: 169 if f.type == 'source': 170 quals = getattr(f, 'qualifiers', {}) 171 if "db_xref" in quals: 172 for db_xref in f.qualifiers["db_xref"]: 173 if db_xref.startswith("taxon:"): 174 ncbi_taxon_id = int(db_xref[6:]) 175 break 176 if ncbi_taxon_id: 177 break 178 179 try: 180 scientific_name = record.annotations["organism"][:255] 181 except KeyError: 182 scientific_name = None 183 try: 184 common_name = record.annotations["source"][:255] 185 except KeyError: 186 common_name = None 187 # Note: The maximum length for taxon names in the schema is 255. 188 # Cropping it now should help in getting a match when searching, 189 # and avoids an error if we try and add these to the database. 190 191 if ncbi_taxon_id: 192 # Good, we have the NCBI taxon to go on - this is unambiguous :) 193 # Note that the scientific name and common name will only be 194 # used if we have to record a stub entry. 195 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 196 scientific_name, 197 common_name) 198 199 if not common_name and not scientific_name: 200 # Nothing to go on... and there is no point adding 201 # a new entry to the database. We'll just leave this 202 # sequence's taxon as a NULL in the database. 203 return None 204 205 # Next, we'll try to find a match based on the species name 206 # (stored in GenBank files as the organism and/or the source). 207 if scientific_name: 208 taxa = self.adaptor.execute_and_fetch_col0( 209 "SELECT taxon_id FROM taxon_name" 210 " WHERE name_class = 'scientific name' AND name = %s", 211 (scientific_name,)) 212 if taxa: 213 # Good, mapped the scientific name to a taxon table entry 214 return taxa[0] 215 216 # Last chance... 217 if common_name: 218 taxa = self.adaptor.execute_and_fetch_col0( 219 "SELECT DISTINCT taxon_id FROM taxon_name" 220 " WHERE name = %s", 221 (common_name,)) 222 # Its natural that several distinct taxa will have the same common 223 # name - in which case we can't resolve the taxon uniquely. 224 if len(taxa) > 1: 225 raise ValueError("Taxa: %d species have name %r" % ( 226 len(taxa), 227 common_name)) 228 if taxa: 229 # Good, mapped the common name to a taxon table entry 230 return taxa[0] 231 232 # At this point, as far as we can tell, this species isn't 233 # in the taxon table already. So we'll have to add it. 234 # We don't have an NCBI taxonomy ID, so if we do record just 235 # a stub entry, there is no simple way to fix this later. 236 # 237 # TODO - Should we try searching the NCBI taxonomy using the 238 # species name? 239 # 240 # OK, let's try inserting the species. 241 # Chances are we don't have enough information ... 242 # Furthermore, it won't be in the hierarchy. 243 244 lineage = [] 245 for c in record.annotations.get("taxonomy", []): 246 lineage.append([None, None, c]) 247 if lineage: 248 lineage[-1][1] = "genus" 249 lineage.append([None, "species", record.annotations["organism"]]) 250 # XXX do we have them? 251 if "subspecies" in record.annotations: 252 lineage.append([None, "subspecies", 253 record.annotations["subspecies"]]) 254 if "variant" in record.annotations: 255 lineage.append([None, "varietas", 256 record.annotations["variant"]]) 257 lineage[-1][0] = ncbi_taxon_id 258 259 left_value = self.adaptor.execute_one( 260 "SELECT MAX(left_value) FROM taxon")[0] 261 if not left_value: 262 left_value = 0 263 left_value += 1 264 265 # XXX -- Brad: Fixing this for now in an ugly way because 266 # I am getting overlaps for right_values. I need to dig into this 267 # more to actually understand how it works. I'm not sure it is 268 # actually working right anyhow. 269 right_start_value = self.adaptor.execute_one( 270 "SELECT MAX(right_value) FROM taxon")[0] 271 if not right_start_value: 272 right_start_value = 0 273 right_value = right_start_value + 2 * len(lineage) - 1 274 275 parent_taxon_id = None 276 for taxon in lineage: 277 self.adaptor.execute( 278 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 279 " left_value, right_value)" 280 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 281 taxon[0], 282 taxon[1], 283 left_value, 284 right_value)) 285 taxon_id = self.adaptor.last_id("taxon") 286 self.adaptor.execute( 287 "INSERT INTO taxon_name(taxon_id, name, name_class)" 288 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 289 # Note the name field is limited to 255, some SwissProt files 290 # have a multi-species name which can be longer. So truncate this. 291 left_value += 1 292 right_value -= 1 293 parent_taxon_id = taxon_id 294 if common_name: 295 self.adaptor.execute( 296 "INSERT INTO taxon_name(taxon_id, name, name_class)" 297 "VALUES (%s, %s, 'common name')", ( 298 taxon_id, common_name)) 299 300 return taxon_id
301
302 - def _fix_name_class(self, entrez_name):
303 """Map Entrez name terms to those used in taxdump (PRIVATE). 304 305 We need to make this conversion to match the taxon_name.name_class 306 values used by the BioSQL load_ncbi_taxonomy.pl script. 307 308 e.g. 309 "ScientificName" -> "scientific name", 310 "EquivalentName" -> "equivalent name", 311 "Synonym" -> "synonym", 312 """ 313 # Add any special cases here: 314 # 315 # known = {} 316 # try: 317 # return known[entrez_name] 318 # except KeyError: 319 # pass 320 321 # Try automatically by adding spaces before each capital 322 def add_space(letter): 323 """Adds a space before a capital letter.""" 324 if letter.isupper(): 325 return " " + letter.lower() 326 else: 327 return letter
328 answer = "".join(add_space(letter) for letter in entrez_name).strip() 329 assert answer == answer.lower() 330 return answer
331
332 - def _update_left_right_taxon_values(self, left_value):
333 """update the left and right values in the table 334 """ 335 if not left_value: 336 return 337 # Due to the UNIQUE constraint on the left and right values in the taxon 338 # table we cannot simply update them through an SQL statement as we risk 339 # colliding values. Instead we must select all of the rows that we want to 340 # update, modify the values in python and then update the rows 341 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 " 342 # "WHERE right_value >= %s", (left_value,)) 343 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 " 344 # "WHERE left_value > %s", (left_value,)) 345 346 rows = self.adaptor.execute_and_fetchall( 347 "SELECT left_value, right_value, taxon_id FROM taxon " 348 "WHERE right_value >= %s or left_value > %s", 349 (left_value, left_value)) 350 351 right_rows = [] 352 left_rows = [] 353 for row in rows: 354 new_right = row[1] 355 new_left = row[0] 356 if new_right >= left_value: 357 new_right += 2 358 359 if new_left > left_value: 360 new_left += 2 361 right_rows.append((new_right, row[2])) 362 left_rows.append((new_left, row[2])) 363 364 # sort the rows based on the value from largest to smallest 365 # should ensure no overlaps 366 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True) 367 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True) 368 369 self.adaptor.executemany("UPDATE taxon SET left_value = %s " 370 "WHERE taxon_id = %s", left_rows) 371 self.adaptor.executemany("UPDATE taxon SET right_value = %s " 372 "WHERE taxon_id = %s", right_rows)
373
374 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 375 scientific_name=None, 376 common_name=None):
377 """Get the taxon id for record from NCBI taxon ID (PRIVATE). 378 379 ncbi_taxon_id - string containing an NCBI taxon id 380 scientific_name - string, used if a stub entry is recorded 381 common_name - string, used if a stub entry is recorded 382 383 This searches the taxon table using ONLY the NCBI taxon ID 384 to find the matching taxon table entry's ID (database key). 385 386 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 387 flag is true, Biopython will attempt to go online using Bio.Entrez 388 to fetch the official NCBI lineage, recursing up the tree until an 389 existing entry is found in the database or the full lineage has been 390 fetched. 391 392 Otherwise the NCBI taxon ID, scientific name and common name are 393 recorded as a minimal stub entry in the taxon and taxon_name tables. 394 Any partial information about the lineage from the SeqRecord is NOT 395 recorded. This should mean that (re)running the BioSQL script 396 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 397 398 Returns the taxon id (database key for the taxon table, not 399 an NCBI taxon ID). 400 """ 401 assert ncbi_taxon_id 402 403 taxon_id = self.adaptor.execute_and_fetch_col0( 404 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 405 (int(ncbi_taxon_id),)) 406 if taxon_id: 407 # Good, we have mapped the NCBI taxid to a taxon table entry 408 return taxon_id[0] 409 410 # At this point, as far as we can tell, this species isn't 411 # in the taxon table already. So we'll have to add it. 412 413 parent_taxon_id = None 414 rank = "species" 415 genetic_code = None 416 mito_genetic_code = None 417 parent_left_value = None 418 parent_right_value = None 419 left_value = None 420 right_value = None 421 species_names = [] 422 if scientific_name: 423 species_names.append(("scientific name", scientific_name)) 424 if common_name: 425 species_names.append(("common name", common_name)) 426 427 if self.fetch_NCBI_taxonomy: 428 # Go online to get the parent taxon ID! 429 handle = Entrez.efetch( 430 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 431 taxonomic_record = Entrez.read(handle) 432 if len(taxonomic_record) == 1: 433 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 434 "%s versus %s" % (taxonomic_record[0]["TaxId"], 435 ncbi_taxon_id) 436 437 parent_taxon_id, parent_left_value, parent_right_value \ 438 = self._get_taxon_id_from_ncbi_lineage(taxonomic_record[0]["LineageEx"]) 439 440 left_value = parent_right_value 441 right_value = parent_right_value + 1 442 443 rank = str(taxonomic_record[0]["Rank"]) 444 445 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"]) 446 447 mito_genetic_code = int(taxonomic_record[ 448 0]["MitoGeneticCode"]["MGCId"]) 449 450 species_names = [("scientific name", 451 str(taxonomic_record[0]["ScientificName"]))] 452 try: 453 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 454 name_class = self._fix_name_class(name_class) 455 if not isinstance(names, list): 456 # The Entrez parser seems to return single entry 457 # lists as just a string which is annoying. 458 names = [names] 459 for name in names: 460 # Want to ignore complex things like ClassCDE 461 # entries 462 if isinstance(name, basestring): 463 species_names.append((name_class, name)) 464 except KeyError: 465 # OtherNames isn't always present, 466 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 467 pass 468 else: 469 pass 470 # If we are not allowed to go online, we will record the bare minimum; 471 # as long as the NCBI taxon id is present, then (re)running 472 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 473 # (and update the species names). 474 # 475 # I am NOT going to try and record the lineage, even if it 476 # is in the record annotation as a list of names, as we won't 477 # know the NCBI taxon IDs for these parent nodes. 478 479 self._update_left_right_taxon_values(left_value) 480 481 self.adaptor.execute( 482 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 483 " genetic_code, mito_genetic_code, left_value, right_value)" 484 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 485 ncbi_taxon_id, 486 rank, 487 genetic_code, 488 mito_genetic_code, 489 left_value, 490 right_value)) 491 492 taxon_id = self.adaptor.last_id("taxon") 493 494 # Record the scientific name, common name, etc 495 for name_class, name in species_names: 496 self.adaptor.execute( 497 "INSERT INTO taxon_name(taxon_id, name, name_class)" 498 " VALUES (%s, %s, %s)", (taxon_id, 499 name[:255], 500 name_class)) 501 return taxon_id
502
503 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
504 """This is recursive! (PRIVATE). 505 506 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 507 508 First dictionary in list is the taxonomy root, highest would be 509 the species. Each dictionary includes: 510 - TaxID (string, NCBI taxon id) 511 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 512 - ScientificName (string) 513 (and that is all at the time of writing) 514 515 This method will record all the lineage given, returning the taxon id 516 (database key, not NCBI taxon id) of the final entry (the species). 517 """ 518 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"]) 519 left_value = None 520 right_value = None 521 parent_left_value = None 522 parent_right_value = None 523 # Is this in the database already? Check the taxon table... 524 rows = self.adaptor.execute_and_fetchall( 525 "SELECT taxon_id, left_value, right_value FROM taxon" 526 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 527 if rows: 528 # we could verify that the Scientific Name etc in the database 529 # is the same and update it or print a warning if not... 530 assert len(rows) == 1 531 return rows[0] 532 533 # We have to record this. 534 if len(taxonomic_lineage) > 1: 535 # Use recursion to find out the taxon id (database key) of the 536 # parent. 537 parent_taxon_id, parent_left_value, parent_right_value = \ 538 self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 539 left_value = parent_right_value 540 right_value = parent_right_value + 1 541 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 542 else: 543 # we have reached the top of the lineage but no current taxonomy 544 # id has been found 545 parent_taxon_id = None 546 left_value = self.adaptor.execute_one( 547 "SELECT MAX(left_value) FROM taxon")[0] 548 if not left_value: 549 left_value = 0 550 551 right_value = left_value + 1 552 553 self._update_left_right_taxon_values(left_value) 554 555 # INSERT new taxon 556 rank = str(taxonomic_lineage[-1].get("Rank")) 557 self.adaptor.execute( 558 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, " 559 "left_value, right_value) VALUES (%s, %s, %s, %s, %s)", 560 (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value)) 561 562 taxon_id = self.adaptor.last_id("taxon") 563 # assert isinstance(taxon_id, int), repr(taxon_id) 564 # ... and its name in taxon_name 565 scientific_name = taxonomic_lineage[-1].get("ScientificName") 566 if scientific_name: 567 self.adaptor.execute( 568 "INSERT INTO taxon_name(taxon_id, name, name_class) " 569 "VALUES (%s, %s, 'scientific name')", 570 (taxon_id, scientific_name[:255])) 571 return taxon_id, left_value, right_value
572
573 - def _load_bioentry_table(self, record):
574 """Fill the bioentry table with sequence information (PRIVATE). 575 576 record - SeqRecord object to add to the database. 577 """ 578 # get the pertinent info and insert it 579 580 if record.id.count(".") == 1: # try to get a version from the id 581 # This assumes the string is something like "XXXXXXXX.123" 582 accession, version = record.id.split('.') 583 try: 584 version = int(version) 585 except ValueError: 586 accession = record.id 587 version = 0 588 else: # otherwise just use a version of 0 589 accession = record.id 590 version = 0 591 592 if "accessions" in record.annotations \ 593 and isinstance(record.annotations["accessions"], list) \ 594 and record.annotations["accessions"]: 595 # Take the first accession (one if there is more than one) 596 accession = record.annotations["accessions"][0] 597 598 # Find the taxon id (this is not just the NCBI Taxon ID) 599 # NOTE - If the species isn't defined in the taxon table, 600 # a new minimal entry is created. 601 taxon_id = self._get_taxon_id(record) 602 603 if "gi" in record.annotations: 604 identifier = record.annotations["gi"] 605 else: 606 identifier = record.id 607 608 # Allow description and division to default to NULL as in BioPerl. 609 description = getattr(record, 'description', None) 610 division = record.annotations.get("data_file_division") 611 612 sql = """ 613 INSERT INTO bioentry ( 614 biodatabase_id, 615 taxon_id, 616 name, 617 accession, 618 identifier, 619 division, 620 description, 621 version) 622 VALUES ( 623 %s, 624 %s, 625 %s, 626 %s, 627 %s, 628 %s, 629 %s, 630 %s)""" 631 # print self.dbid, taxon_id, record.name, accession, identifier, \ 632 # division, description, version 633 self.adaptor.execute(sql, (self.dbid, 634 taxon_id, 635 record.name, 636 accession, 637 identifier, 638 division, 639 description, 640 version)) 641 # now retrieve the id for the bioentry 642 bioentry_id = self.adaptor.last_id('bioentry') 643 644 return bioentry_id
645
646 - def _load_bioentry_date(self, record, bioentry_id):
647 """Add the effective date of the entry into the database. 648 649 record - a SeqRecord object with an annotated date 650 bioentry_id - corresponding database identifier 651 """ 652 # dates are GenBank style, like: 653 # 14-SEP-2000 654 date = record.annotations.get("date", 655 strftime("%d-%b-%Y", gmtime()).upper()) 656 if isinstance(date, list): 657 date = date[0] 658 annotation_tags_id = self._get_ontology_id("Annotation Tags") 659 date_id = self._get_term_id("date_changed", annotation_tags_id) 660 sql = r"INSERT INTO bioentry_qualifier_value" \ 661 r" (bioentry_id, term_id, value, rank)" \ 662 r" VALUES (%s, %s, %s, 1)" 663 self.adaptor.execute(sql, (bioentry_id, date_id, date))
664
665 - def _load_biosequence(self, record, bioentry_id):
666 """Record SeqRecord's sequence and alphabet in DB (PRIVATE). 667 668 record - a SeqRecord object with a seq property 669 bioentry_id - corresponding database identifier 670 """ 671 if record.seq is None: 672 # The biosequence table entry is optional, so if we haven't 673 # got a sequence, we don't need to write to the table. 674 return 675 676 # determine the string representation of the alphabet 677 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 678 alphabet = "dna" 679 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 680 alphabet = "rna" 681 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 682 alphabet = "protein" 683 else: 684 alphabet = "unknown" 685 686 if isinstance(record.seq, UnknownSeq): 687 seq_str = None 688 else: 689 seq_str = str(record.seq) 690 691 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 692 r"length, seq, alphabet) " \ 693 r"VALUES (%s, 0, %s, %s, %s)" 694 self.adaptor.execute(sql, (bioentry_id, 695 len(record.seq), 696 seq_str, 697 alphabet))
698
699 - def _load_comment(self, record, bioentry_id):
700 """Record a SeqRecord's annotated comment in the database (PRIVATE). 701 702 record - a SeqRecord object with an annotated comment 703 bioentry_id - corresponding database identifier 704 """ 705 comments = record.annotations.get('comment') 706 if not comments: 707 return 708 if not isinstance(comments, list): 709 # It should be a string then... 710 comments = [comments] 711 712 for index, comment in enumerate(comments): 713 comment = comment.replace('\n', ' ') 714 # TODO - Store each line as a separate entry? This would preserve 715 # the newlines, but we should check BioPerl etc to be consistent. 716 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 717 " VALUES (%s, %s, %s)" 718 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
719
720 - def _load_annotations(self, record, bioentry_id):
721 """Record a SeqRecord's misc annotations in the database (PRIVATE). 722 723 The annotation strings are recorded in the bioentry_qualifier_value 724 table, except for special cases like the reference, comment and 725 taxonomy which are handled with their own tables. 726 727 record - a SeqRecord object with an annotations dictionary 728 bioentry_id - corresponding database identifier 729 """ 730 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 731 "(bioentry_id, term_id, value)" \ 732 " VALUES (%s, %s, %s)" 733 many_sql = "INSERT INTO bioentry_qualifier_value" \ 734 "(bioentry_id, term_id, value, rank)" \ 735 " VALUES (%s, %s, %s, %s)" 736 tag_ontology_id = self._get_ontology_id('Annotation Tags') 737 for key, value in record.annotations.items(): 738 if key in ["references", "comment", "ncbi_taxid", "date"]: 739 # Handled separately 740 continue 741 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 742 if isinstance(value, (list, tuple)): 743 rank = 0 744 for entry in value: 745 if isinstance(entry, (str, int)): 746 # Easy case 747 rank += 1 748 self.adaptor.execute(many_sql, 749 (bioentry_id, term_id, 750 str(entry), rank)) 751 else: 752 pass 753 elif isinstance(value, (str, int)): 754 # Have a simple single entry, leave rank as the DB default 755 self.adaptor.execute(mono_sql, 756 (bioentry_id, term_id, str(value))) 757 else: 758 pass
759 # print "Ignoring annotation '%s' entry of type '%s'" \ 760 # % (key, type(value)) 761
762 - def _load_reference(self, reference, rank, bioentry_id):
763 """Record SeqRecord's annotated references in the database (PRIVATE). 764 765 record - a SeqRecord object with annotated references 766 bioentry_id - corresponding database identifier 767 """ 768 refs = None 769 if reference.medline_id: 770 refs = self.adaptor.execute_and_fetch_col0( 771 "SELECT reference_id" 772 " FROM reference JOIN dbxref USING (dbxref_id)" 773 " WHERE dbname = 'MEDLINE' AND accession = %s", 774 (reference.medline_id,)) 775 if not refs and reference.pubmed_id: 776 refs = self.adaptor.execute_and_fetch_col0( 777 "SELECT reference_id" 778 " FROM reference JOIN dbxref USING (dbxref_id)" 779 " WHERE dbname = 'PUBMED' AND accession = %s", 780 (reference.pubmed_id,)) 781 if not refs: 782 s = [] 783 for f in reference.authors, reference.title, reference.journal: 784 s.append(f or "<undef>") 785 crc = crc64("".join(s)) 786 refs = self.adaptor.execute_and_fetch_col0( 787 "SELECT reference_id FROM reference" 788 r" WHERE crc = %s", (crc,)) 789 if not refs: 790 if reference.medline_id: 791 dbxref_id = self._add_dbxref("MEDLINE", 792 reference.medline_id, 0) 793 elif reference.pubmed_id: 794 dbxref_id = self._add_dbxref("PUBMED", 795 reference.pubmed_id, 0) 796 else: 797 dbxref_id = None 798 authors = reference.authors or None 799 title = reference.title or None 800 # The location/journal field cannot be Null, so default 801 # to an empty string rather than None: 802 journal = reference.journal or "" 803 self.adaptor.execute( 804 "INSERT INTO reference (dbxref_id, location," 805 " title, authors, crc)" 806 " VALUES (%s, %s, %s, %s, %s)", 807 (dbxref_id, journal, title, 808 authors, crc)) 809 reference_id = self.adaptor.last_id("reference") 810 else: 811 reference_id = refs[0] 812 813 if reference.location: 814 start = 1 + int(str(reference.location[0].start)) 815 end = int(str(reference.location[0].end)) 816 else: 817 start = None 818 end = None 819 820 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 821 " start_pos, end_pos, rank) VALUES (%s, %s, %s, %s, %s)" 822 self.adaptor.execute(sql, (bioentry_id, reference_id, 823 start, end, rank + 1))
824
825 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
826 """Load a biopython SeqFeature into the database (PRIVATE).""" 827 # records loaded from a gff file using BCBio.GFF will contain value 828 # of 2nd column of the gff as a feature qualifier. The BioSQL wiki 829 # suggests that the source should not go in with the other feature 830 # mappings but instead be put in the term table 831 # (http://www.biosql.org/wiki/Annotation_Mapping) 832 try: 833 source = feature.qualifiers['source'] 834 if isinstance(source, list): 835 source = source[0] 836 seqfeature_id = self._load_seqfeature_basic(feature.type, 837 feature_rank, 838 bioentry_id, 839 source=source) 840 except KeyError: 841 seqfeature_id = self._load_seqfeature_basic(feature.type, 842 feature_rank, 843 bioentry_id) 844 845 self._load_seqfeature_locations(feature, seqfeature_id) 846 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
847
848 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id, 849 source='EMBL/GenBank/SwissProt'):
850 """Load the first tables of a seqfeature and returns the id (PRIVATE). 851 852 This loads the "key" of the seqfeature (ie. CDS, gene) and 853 the basic seqfeature table itself. 854 """ 855 ontology_id = self._get_ontology_id('SeqFeature Keys') 856 seqfeature_key_id = self._get_term_id(feature_type, 857 ontology_id=ontology_id) 858 source_cat_id = self._get_ontology_id('SeqFeature Sources') 859 source_term_id = self._get_term_id(source, 860 ontology_id=source_cat_id) 861 862 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 863 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 864 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 865 source_term_id, feature_rank + 1)) 866 seqfeature_id = self.adaptor.last_id('seqfeature') 867 868 return seqfeature_id
869
870 - def _load_seqfeature_locations(self, feature, seqfeature_id):
871 """Load all of the locations for a SeqFeature into tables (PRIVATE). 872 873 This adds the locations related to the SeqFeature into the 874 seqfeature_location table. Fuzzies are not handled right now. 875 For a simple location, ie (1..2), we have a single table row 876 with seq_start = 1, seq_end = 2, location_rank = 1. 877 878 For split locations, ie (1..2, 3..4, 5..6) we would have three 879 row tables with:: 880 881 start = 1, end = 2, rank = 1 882 start = 3, end = 4, rank = 2 883 start = 5, end = 6, rank = 3 884 """ 885 # TODO - Record an ontology for the locations (using location.term_id) 886 # which for now as in BioPerl we leave defaulting to NULL. 887 if feature.location_operator and feature.location_operator != "join": 888 # e.g. order locations... we don't record "order" so it 889 # will become a "join" on reloading. What does BioPerl do? 890 import warnings 891 from Bio import BiopythonWarning 892 warnings.warn("%s location operators are not fully supported" 893 % feature.location_operator, BiopythonWarning) 894 # This will be a list of length one for simple FeatureLocation: 895 parts = feature.location.parts 896 if parts and set(loc.strand for loc in parts) == set([-1]): 897 # To mimic prior behaviour of Biopython+BioSQL, reverse order 898 parts = parts[::-1] 899 # TODO - Check what BioPerl does; see also BioSeq.py code 900 for rank, loc in enumerate(parts): 901 self._insert_location(loc, rank + 1, seqfeature_id)
902
903 - def _insert_location(self, location, rank, seqfeature_id):
904 """Add SeqFeatue location to seqfeature_location table (PRIVATE). 905 906 TODO - Add location operator to location_qualifier_value? 907 """ 908 # convert biopython locations to the 1-based location system 909 # used in bioSQL 910 # XXX This could also handle fuzzies 911 start = int(location.start) + 1 912 end = int(location.end) 913 914 # Biopython uses None when we don't know strand information but 915 # BioSQL requires something (non null) and sets this as zero 916 # So we'll use the strand or 0 if Biopython spits out None 917 strand = location.strand or 0 918 919 # TODO - Record an ontology term for the location (location.term_id) 920 # which for now like BioPerl we'll leave as NULL. 921 # This might allow us to record "between" positions properly, but I 922 # don't really see how it could work for before/after fuzzy positions 923 loc_term_id = None 924 925 if location.ref: 926 # sub_feature remote locations when they are in the same db as the 927 # current record do not have a value for ref_db, which SeqFeature 928 # object stores as None. BioSQL schema requires a varchar and is 929 # not NULL 930 dbxref_id = self._get_dbxref_id( 931 location.ref_db or "", location.ref) 932 else: 933 dbxref_id = None 934 935 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 936 r"start_pos, end_pos, strand, rank) " \ 937 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 938 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 939 start, end, strand, rank)) 940 941 """ 942 # See Bug 2677 943 # TODO - Record the location_operator (e.g. "join" or "order") 944 # using the location_qualifier_value table (which we and BioPerl 945 # have historically left empty). 946 # Note this will need an ontology term for the location qualifer 947 # (location_qualifier_value.term_id) for which oddly the schema 948 # does not allow NULL. 949 if feature.location_operator: 950 #e.g. "join" (common), 951 #or "order" (see Tests/GenBank/protein_refseq2.gb) 952 location_id = self.adaptor.last_id('location') 953 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 954 sql = r"INSERT INTO location_qualifier_value" \ 955 r"(location_id, term_id, value)" \ 956 r"VALUES (%s, %s, %s)" 957 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 958 feature.location_operator)) 959 """
960
961 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
962 """Insert feature's (key, value) pair qualifiers (PRIVATE). 963 964 Qualifiers should be a dictionary of the form: 965 {key : [value1, value2]} 966 """ 967 tag_ontology_id = self._get_ontology_id('Annotation Tags') 968 for qualifier_key in qualifiers: 969 # Treat db_xref qualifiers differently to sequence annotation 970 # qualifiers by populating the seqfeature_dbxref and dbxref 971 # tables. Other qualifiers go into the seqfeature_qualifier_value 972 # and (if new) term tables. 973 if qualifier_key != 'db_xref': 974 qualifier_key_id = self._get_term_id(qualifier_key, 975 ontology_id=tag_ontology_id) 976 # now add all of the values to their table 977 entries = qualifiers[qualifier_key] 978 if not isinstance(entries, list): 979 # Could be a plain string, or an int or a float. 980 # However, we exect a list of strings here. 981 entries = [entries] 982 for qual_value_rank in range(len(entries)): 983 qualifier_value = entries[qual_value_rank] 984 sql = r"INSERT INTO seqfeature_qualifier_value "\ 985 r" (seqfeature_id, term_id, rank, value) VALUES"\ 986 r" (%s, %s, %s, %s)" 987 self.adaptor.execute(sql, (seqfeature_id, 988 qualifier_key_id, 989 qual_value_rank + 1, 990 qualifier_value)) 991 else: 992 # The dbxref_id qualifier/value sets go into the dbxref table 993 # as dbname, accession, version tuples, with dbxref.dbxref_id 994 # being automatically assigned, and into the seqfeature_dbxref 995 # table as seqfeature_id, dbxref_id, and rank tuples 996 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 997 seqfeature_id)
998
999 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
1000 """Add SeqFeature's DB cross-references to the database (PRIVATE). 1001 1002 o dbxrefs List, dbxref data from the source file in the 1003 format <database>:<accession> 1004 1005 o seqfeature_id Int, the identifier for the seqfeature in the 1006 seqfeature table 1007 1008 Insert dbxref qualifier data for a seqfeature into the 1009 seqfeature_dbxref and, if required, dbxref tables. 1010 The dbxref_id qualifier/value sets go into the dbxref table 1011 as dbname, accession, version tuples, with dbxref.dbxref_id 1012 being automatically assigned, and into the seqfeature_dbxref 1013 table as seqfeature_id, dbxref_id, and rank tuples 1014 """ 1015 # NOTE - In older versions of Biopython, we would map the GenBank 1016 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 1017 # for any unknown terms. This was a long term maintainance problem, 1018 # and differed from BioPerl and BioJava's implementation. See bug 2405 1019 for rank, value in enumerate(dbxrefs): 1020 # Split the DB:accession format string at colons. We have to 1021 # account for multiple-line and multiple-accession entries 1022 try: 1023 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 1024 db = dbxref_data[0] 1025 accessions = dbxref_data[1:] 1026 except: 1027 raise ValueError("Parsing of db_xref failed: '%s'" % value) 1028 # Loop over all the grabbed accessions, and attempt to fill the 1029 # table 1030 for accession in accessions: 1031 # Get the dbxref_id value for the dbxref data 1032 dbxref_id = self._get_dbxref_id(db, accession) 1033 # Insert the seqfeature_dbxref data 1034 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
1035
1036 - def _get_dbxref_id(self, db, accession):
1037 """ _get_dbxref_id(self, db, accession) -> Int 1038 1039 o db String, the name of the external database containing 1040 the accession number 1041 1042 o accession String, the accession of the dbxref data 1043 1044 Finds and returns the dbxref_id for the passed data. The method 1045 attempts to find an existing record first, and inserts the data 1046 if there is no record. 1047 """ 1048 # Check for an existing record 1049 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 1050 r'AND accession = %s' 1051 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 1052 # If there was a record, return the dbxref_id, else create the 1053 # record and return the created dbxref_id 1054 if dbxref_id: 1055 return dbxref_id[0] 1056 return self._add_dbxref(db, accession, 0)
1057
1058 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1059 """ Check for a pre-existing seqfeature_dbxref entry with the passed 1060 seqfeature_id and dbxref_id. If one does not exist, insert new 1061 data 1062 1063 """ 1064 # Check for an existing record 1065 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 1066 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 1067 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 1068 dbxref_id)) 1069 # If there was a record, return without executing anything, else create 1070 # the record and return 1071 if result: 1072 return result 1073 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
1074
1075 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1076 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 1077 dbxref_id 1078 """ 1079 sql = r'INSERT INTO seqfeature_dbxref ' \ 1080 '(seqfeature_id, dbxref_id, rank) VALUES' \ 1081 r'(%s, %s, %s)' 1082 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 1083 return (seqfeature_id, dbxref_id)
1084
1085 - def _load_dbxrefs(self, record, bioentry_id):
1086 """Load any sequence level cross references into the database (PRIVATE). 1087 1088 See table bioentry_dbxref.""" 1089 for rank, value in enumerate(record.dbxrefs): 1090 # Split the DB:accession string at first colon. 1091 # We have to cope with things like: 1092 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1093 # "GO:GO:123" (db="GO", accession="GO:123") 1094 # 1095 # Annoyingly I have seen the NCBI use both the style 1096 # "GO:GO:123" and "GO:123" in different vintages. 1097 assert value.count("\n") == 0 1098 try: 1099 db, accession = value.split(':', 1) 1100 db = db.strip() 1101 accession = accession.strip() 1102 except: 1103 raise ValueError( 1104 "Parsing of dbxrefs list failed: '%s'" % value) 1105 # Get the dbxref_id value for the dbxref data 1106 dbxref_id = self._get_dbxref_id(db, accession) 1107 # Insert the bioentry_dbxref data 1108 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1109
1110 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1111 """Get pre-existing db-xref, or create and return it (PRIVATE). 1112 1113 Check for a pre-existing bioentry_dbxref entry with the passed 1114 seqfeature_id and dbxref_id. If one does not exist, insert new 1115 data 1116 """ 1117 # Check for an existing record 1118 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1119 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1120 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1121 dbxref_id)) 1122 # If there was a record, return without executing anything, else create 1123 # the record and return 1124 if result: 1125 return result 1126 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1127
1128 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1129 """Insert a bioentry_dbxref row (PRIVATE). 1130 1131 Returns the seqfeature_id and dbxref_id (PRIVATE). 1132 """ 1133 sql = r'INSERT INTO bioentry_dbxref ' \ 1134 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1135 '(%s, %s, %s)' 1136 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1137 return (bioentry_id, dbxref_id)
1138 1139
1140 -class DatabaseRemover(object):
1141 """Complement the Loader functionality by fully removing a database. 1142 1143 This probably isn't really useful for normal purposes, since you 1144 can just do a:: 1145 1146 DROP DATABASE db_name 1147 1148 and then recreate the database. But, it's really useful for testing 1149 purposes. 1150 """ 1151
1152 - def __init__(self, adaptor, dbid):
1153 """Initialize with a database id and adaptor connection.""" 1154 self.adaptor = adaptor 1155 self.dbid = dbid
1156
1157 - def remove(self):
1158 """Remove everything related to the given database id.""" 1159 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1160 self.adaptor.execute(sql, (self.dbid,)) 1161 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1162 self.adaptor.execute(sql, (self.dbid,))
1163