Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33   
34 -class DatabaseLoader(object):
35 """Object used to load SeqRecord objects into a BioSQL database.""" 36
37 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
38 """Initialize with connection information for the database. 39 40 Creating a DatabaseLoader object is normally handled via the 41 BioSeqDatabase DBServer object, for example:: 42 43 from BioSQL import BioSeqDatabase 44 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 45 passwd = "biosql", host = "localhost", db="test_biosql") 46 try: 47 db = server["test"] 48 except KeyError: 49 db = server.new_database("test", description="For testing GBrowse") 50 """ 51 self.adaptor = adaptor 52 self.dbid = dbid 53 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
54
55 - def load_seqrecord(self, record):
56 """Load a Biopython SeqRecord into the database.""" 57 bioentry_id = self._load_bioentry_table(record) 58 self._load_bioentry_date(record, bioentry_id) 59 self._load_biosequence(record, bioentry_id) 60 self._load_comment(record, bioentry_id) 61 self._load_dbxrefs(record, bioentry_id) 62 references = record.annotations.get('references', ()) 63 for reference, rank in zip(references, list(range(len(references)))): 64 self._load_reference(reference, rank, bioentry_id) 65 self._load_annotations(record, bioentry_id) 66 for seq_feature_num in range(len(record.features)): 67 seq_feature = record.features[seq_feature_num] 68 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
69
70 - def _get_ontology_id(self, name, definition=None):
71 """Returns the identifier for the named ontology (PRIVATE). 72 73 This looks through the onotology table for a the given entry name. 74 If it is not found, a row is added for this ontology (using the 75 definition if supplied). In either case, the id corresponding to 76 the provided name is returned, so that you can reference it in 77 another table. 78 """ 79 oids = self.adaptor.execute_and_fetch_col0( 80 "SELECT ontology_id FROM ontology WHERE name = %s", 81 (name,)) 82 if oids: 83 return oids[0] 84 self.adaptor.execute( 85 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 86 (name, definition)) 87 return self.adaptor.last_id("ontology")
88
89 - def _get_term_id(self, 90 name, 91 ontology_id=None, 92 definition=None, 93 identifier=None):
94 """Get the id that corresponds to a term (PRIVATE). 95 96 This looks through the term table for a the given term. If it 97 is not found, a new id corresponding to this term is created. 98 In either case, the id corresponding to that term is returned, so 99 that you can reference it in another table. 100 101 The ontology_id should be used to disambiguate the term. 102 """ 103 104 # try to get the term id 105 sql = r"SELECT term_id FROM term " \ 106 r"WHERE name = %s" 107 fields = [name] 108 if ontology_id: 109 sql += ' AND ontology_id = %s' 110 fields.append(ontology_id) 111 id_results = self.adaptor.execute_and_fetchall(sql, fields) 112 # something is wrong 113 if len(id_results) > 1: 114 raise ValueError("Multiple term ids for %s: %r" % 115 (name, id_results)) 116 elif len(id_results) == 1: 117 return id_results[0][0] 118 else: 119 sql = r"INSERT INTO term (name, definition," \ 120 r" identifier, ontology_id)" \ 121 r" VALUES (%s, %s, %s, %s)" 122 self.adaptor.execute(sql, (name, definition, 123 identifier, ontology_id)) 124 return self.adaptor.last_id("term")
125
126 - def _add_dbxref(self, dbname, accession, version):
127 """Insert a dbxref and return its id.""" 128 self.adaptor.execute( 129 "INSERT INTO dbxref(dbname, accession, version)" 130 " VALUES (%s, %s, %s)", (dbname, accession, version)) 131 return self.adaptor.last_id("dbxref")
132
133 - def _get_taxon_id(self, record):
134 """Get the taxon id for this record (PRIVATE). 135 136 record - a SeqRecord object 137 138 This searches the taxon/taxon_name tables using the 139 NCBI taxon ID, scientific name and common name to find 140 the matching taxon table entry's id. 141 142 If the species isn't in the taxon table, and we have at 143 least the NCBI taxon ID, scientific name or common name, 144 at least a minimal stub entry is created in the table. 145 146 Returns the taxon id (database key for the taxon table, 147 not an NCBI taxon ID), or None if the taxonomy information 148 is missing. 149 150 See also the BioSQL script load_ncbi_taxonomy.pl which 151 will populate and update the taxon/taxon_name tables 152 with the latest information from the NCBI. 153 """ 154 155 # To find the NCBI taxid, first check for a top level annotation 156 ncbi_taxon_id = None 157 if "ncbi_taxid" in record.annotations: 158 # Could be a list of IDs. 159 if isinstance(record.annotations["ncbi_taxid"], list): 160 if len(record.annotations["ncbi_taxid"]) == 1: 161 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 162 else: 163 ncbi_taxon_id = record.annotations["ncbi_taxid"] 164 if not ncbi_taxon_id: 165 # Secondly, look for a source feature 166 for f in record.features: 167 if f.type == 'source': 168 quals = getattr(f, 'qualifiers', {}) 169 if "db_xref" in quals: 170 for db_xref in f.qualifiers["db_xref"]: 171 if db_xref.startswith("taxon:"): 172 ncbi_taxon_id = int(db_xref[6:]) 173 break 174 if ncbi_taxon_id: 175 break 176 177 try: 178 scientific_name = record.annotations["organism"][:255] 179 except KeyError: 180 scientific_name = None 181 try: 182 common_name = record.annotations["source"][:255] 183 except KeyError: 184 common_name = None 185 # Note: The maximum length for taxon names in the schema is 255. 186 # Cropping it now should help in getting a match when searching, 187 # and avoids an error if we try and add these to the database. 188 189 if ncbi_taxon_id: 190 # Good, we have the NCBI taxon to go on - this is unambiguous :) 191 # Note that the scientific name and common name will only be 192 # used if we have to record a stub entry. 193 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 194 scientific_name, 195 common_name) 196 197 if not common_name and not scientific_name: 198 # Nothing to go on... and there is no point adding 199 # a new entry to the database. We'll just leave this 200 # sequence's taxon as a NULL in the database. 201 return None 202 203 # Next, we'll try to find a match based on the species name 204 # (stored in GenBank files as the organism and/or the source). 205 if scientific_name: 206 taxa = self.adaptor.execute_and_fetch_col0( 207 "SELECT taxon_id FROM taxon_name" 208 " WHERE name_class = 'scientific name' AND name = %s", 209 (scientific_name,)) 210 if taxa: 211 # Good, mapped the scientific name to a taxon table entry 212 return taxa[0] 213 214 # Last chance... 215 if common_name: 216 taxa = self.adaptor.execute_and_fetch_col0( 217 "SELECT DISTINCT taxon_id FROM taxon_name" 218 " WHERE name = %s", 219 (common_name,)) 220 # Its natural that several distinct taxa will have the same common 221 # name - in which case we can't resolve the taxon uniquely. 222 if len(taxa) > 1: 223 raise ValueError("Taxa: %d species have name %r" % ( 224 len(taxa), 225 common_name)) 226 if taxa: 227 # Good, mapped the common name to a taxon table entry 228 return taxa[0] 229 230 # At this point, as far as we can tell, this species isn't 231 # in the taxon table already. So we'll have to add it. 232 # We don't have an NCBI taxonomy ID, so if we do record just 233 # a stub entry, there is no simple way to fix this later. 234 # 235 # TODO - Should we try searching the NCBI taxonomy using the 236 # species name? 237 # 238 # OK, let's try inserting the species. 239 # Chances are we don't have enough information ... 240 # Furthermore, it won't be in the hierarchy. 241 242 lineage = [] 243 for c in record.annotations.get("taxonomy", []): 244 lineage.append([None, None, c]) 245 if lineage: 246 lineage[-1][1] = "genus" 247 lineage.append([None, "species", record.annotations["organism"]]) 248 # XXX do we have them? 249 if "subspecies" in record.annotations: 250 lineage.append([None, "subspecies", 251 record.annotations["subspecies"]]) 252 if "variant" in record.annotations: 253 lineage.append([None, "varietas", 254 record.annotations["variant"]]) 255 lineage[-1][0] = ncbi_taxon_id 256 257 left_value = self.adaptor.execute_one( 258 "SELECT MAX(left_value) FROM taxon")[0] 259 if not left_value: 260 left_value = 0 261 left_value += 1 262 263 # XXX -- Brad: Fixing this for now in an ugly way because 264 # I am getting overlaps for right_values. I need to dig into this 265 # more to actually understand how it works. I'm not sure it is 266 # actually working right anyhow. 267 right_start_value = self.adaptor.execute_one( 268 "SELECT MAX(right_value) FROM taxon")[0] 269 if not right_start_value: 270 right_start_value = 0 271 right_value = right_start_value + 2 * len(lineage) - 1 272 273 parent_taxon_id = None 274 for taxon in lineage: 275 self.adaptor.execute( 276 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 277 " left_value, right_value)" 278 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 279 taxon[0], 280 taxon[1], 281 left_value, 282 right_value)) 283 taxon_id = self.adaptor.last_id("taxon") 284 self.adaptor.execute( 285 "INSERT INTO taxon_name(taxon_id, name, name_class)" 286 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 287 # Note the name field is limited to 255, some SwissProt files 288 # have a multi-species name which can be longer. So truncate this. 289 left_value += 1 290 right_value -= 1 291 parent_taxon_id = taxon_id 292 if common_name: 293 self.adaptor.execute( 294 "INSERT INTO taxon_name(taxon_id, name, name_class)" 295 "VALUES (%s, %s, 'common name')", ( 296 taxon_id, common_name)) 297 298 return taxon_id
299
300 - def _fix_name_class(self, entrez_name):
301 """Map Entrez name terms to those used in taxdump (PRIVATE). 302 303 We need to make this conversion to match the taxon_name.name_class 304 values used by the BioSQL load_ncbi_taxonomy.pl script. 305 306 e.g. 307 "ScientificName" -> "scientific name", 308 "EquivalentName" -> "equivalent name", 309 "Synonym" -> "synonym", 310 """ 311 # Add any special cases here: 312 # 313 # known = {} 314 # try: 315 # return known[entrez_name] 316 # except KeyError: 317 # pass 318 319 # Try automatically by adding spaces before each capital 320 def add_space(letter): 321 """Adds a space before a capital letter.""" 322 if letter.isupper(): 323 return " " + letter.lower() 324 else: 325 return letter
326 answer = "".join(add_space(letter) for letter in entrez_name).strip() 327 assert answer == answer.lower() 328 return answer
329
330 - def _update_left_right_taxon_values(self, left_value):
331 """update the left and right values in the table 332 """ 333 if not left_value: 334 return 335 # Due to the UNIQUE constraint on the left and right values in the taxon 336 # table we cannot simply update them through an SQL statement as we risk 337 # colliding values. Instead we must select all of the rows that we want to 338 # update, modify the values in python and then update the rows 339 # self.adaptor.execute("UPDATE taxon SET right_value = right_value + 2 WHERE right_value >= %s", (left_value,)) 340 # self.adaptor.execute("UPDATE taxon SET left_value = left_value + 2 WHERE left_value > %s", (left_value,)) 341 342 rows = self.adaptor.execute_and_fetchall( 343 "SELECT left_value, right_value, taxon_id FROM taxon WHERE right_value >= %s or left_value > %s", 344 # "SELECT left_value, right_value, taxon_id FROM taxon", 345 (left_value, left_value) 346 ) 347 348 right_rows = [] 349 left_rows = [] 350 for row in rows: 351 new_right = row[1] 352 new_left = row[0] 353 if new_right >= left_value: 354 new_right += 2 355 356 if new_left > left_value: 357 new_left += 2 358 right_rows.append((new_right, row[2])) 359 left_rows.append((new_left, row[2])) 360 361 # sort the rows based on the value from largest to smallest 362 # should ensure no overlaps 363 right_rows = sorted(right_rows, key=lambda x: x[0], reverse=True) 364 left_rows = sorted(left_rows, key=lambda x: x[0], reverse=True) 365 366 self.adaptor.executemany("UPDATE taxon SET left_value = %s WHERE taxon_id = %s", left_rows) 367 self.adaptor.executemany("UPDATE taxon SET right_value = %s WHERE taxon_id = %s", right_rows)
368
369 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 370 scientific_name=None, 371 common_name=None):
372 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 373 374 ncbi_taxon_id - string containing an NCBI taxon id 375 scientific_name - string, used if a stub entry is recorded 376 common_name - string, used if a stub entry is recorded 377 378 This searches the taxon table using ONLY the NCBI taxon ID 379 to find the matching taxon table entry's ID (database key). 380 381 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 382 flag is true, Biopython will attempt to go online using Bio.Entrez 383 to fetch the official NCBI lineage, recursing up the tree until an 384 existing entry is found in the database or the full lineage has been 385 fetched. 386 387 Otherwise the NCBI taxon ID, scientific name and common name are 388 recorded as a minimal stub entry in the taxon and taxon_name tables. 389 Any partial information about the lineage from the SeqRecord is NOT 390 recorded. This should mean that (re)running the BioSQL script 391 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 392 393 Returns the taxon id (database key for the taxon table, not 394 an NCBI taxon ID). 395 """ 396 assert ncbi_taxon_id 397 398 taxon_id = self.adaptor.execute_and_fetch_col0( 399 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 400 (int(ncbi_taxon_id),)) 401 if taxon_id: 402 # Good, we have mapped the NCBI taxid to a taxon table entry 403 return taxon_id[0] 404 405 # At this point, as far as we can tell, this species isn't 406 # in the taxon table already. So we'll have to add it. 407 408 parent_taxon_id = None 409 rank = "species" 410 genetic_code = None 411 mito_genetic_code = None 412 parent_left_value = None 413 parent_right_value = None 414 left_value = None 415 right_value = None 416 species_names = [] 417 if scientific_name: 418 species_names.append(("scientific name", scientific_name)) 419 if common_name: 420 species_names.append(("common name", common_name)) 421 422 if self.fetch_NCBI_taxonomy: 423 # Go online to get the parent taxon ID! 424 handle = Entrez.efetch( 425 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 426 taxonomic_record = Entrez.read(handle) 427 if len(taxonomic_record) == 1: 428 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 429 "%s versus %s" % (taxonomic_record[0]["TaxId"], 430 ncbi_taxon_id) 431 432 parent_taxon_id, parent_left_value, parent_right_value = self._get_taxon_id_from_ncbi_lineage( 433 taxonomic_record[0]["LineageEx"]) 434 435 left_value = parent_right_value 436 right_value = parent_right_value + 1 437 438 rank = str(taxonomic_record[0]["Rank"]) 439 440 genetic_code = int(taxonomic_record[0]["GeneticCode"]["GCId"]) 441 442 mito_genetic_code = int(taxonomic_record[ 443 0]["MitoGeneticCode"]["MGCId"]) 444 445 species_names = [("scientific name", 446 str(taxonomic_record[0]["ScientificName"]))] 447 try: 448 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 449 name_class = self._fix_name_class(name_class) 450 if not isinstance(names, list): 451 # The Entrez parser seems to return single entry 452 # lists as just a string which is annoying. 453 names = [names] 454 for name in names: 455 # Want to ignore complex things like ClassCDE 456 # entries 457 if isinstance(name, basestring): 458 species_names.append((name_class, name)) 459 except KeyError: 460 # OtherNames isn't always present, 461 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 462 pass 463 else: 464 pass 465 # If we are not allowed to go online, we will record the bare minimum; 466 # as long as the NCBI taxon id is present, then (re)running 467 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 468 # (and update the species names). 469 # 470 # I am NOT going to try and record the lineage, even if it 471 # is in the record annotation as a list of names, as we won't 472 # know the NCBI taxon IDs for these parent nodes. 473 474 self._update_left_right_taxon_values(left_value) 475 476 self.adaptor.execute( 477 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 478 " genetic_code, mito_genetic_code, left_value, right_value)" 479 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 480 ncbi_taxon_id, 481 rank, 482 genetic_code, 483 mito_genetic_code, 484 left_value, 485 right_value)) 486 487 taxon_id = self.adaptor.last_id("taxon") 488 489 # Record the scientific name, common name, etc 490 for name_class, name in species_names: 491 self.adaptor.execute( 492 "INSERT INTO taxon_name(taxon_id, name, name_class)" 493 " VALUES (%s, %s, %s)", (taxon_id, 494 name[:255], 495 name_class)) 496 return taxon_id
497
498 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
499 """This is recursive! (PRIVATE). 500 501 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 502 503 First dictionary in list is the taxonomy root, highest would be the species. 504 Each dictionary includes: 505 - TaxID (string, NCBI taxon id) 506 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 507 - ScientificName (string) 508 (and that is all at the time of writing) 509 510 This method will record all the lineage given, returning the taxon id 511 (database key, not NCBI taxon id) of the final entry (the species). 512 """ 513 ncbi_taxon_id = int(taxonomic_lineage[-1]["TaxId"]) 514 left_value = None 515 right_value = None 516 parent_left_value = None 517 parent_right_value = None 518 # Is this in the database already? Check the taxon table... 519 rows = self.adaptor.execute_and_fetchall( 520 "SELECT taxon_id, left_value, right_value FROM taxon" 521 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 522 if rows: 523 # we could verify that the Scientific Name etc in the database 524 # is the same and update it or print a warning if not... 525 assert len(rows) == 1 526 return rows[0] 527 528 # We have to record this. 529 if len(taxonomic_lineage) > 1: 530 # Use recursion to find out the taxon id (database key) of the 531 # parent. 532 parent_taxon_id, parent_left_value, parent_right_value = self._get_taxon_id_from_ncbi_lineage( 533 taxonomic_lineage[:-1]) 534 left_value = parent_right_value 535 right_value = parent_right_value + 1 536 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 537 else: 538 # we have reached the top of the lineage but no current taxonomy 539 # id has been found 540 parent_taxon_id = None 541 left_value = self.adaptor.execute_one( 542 "SELECT MAX(left_value) FROM taxon")[0] 543 if not left_value: 544 left_value = 0 545 546 right_value = left_value + 1 547 548 self._update_left_right_taxon_values(left_value) 549 550 # INSERT new taxon 551 rank = str(taxonomic_lineage[-1].get("Rank")) 552 self.adaptor.execute( 553 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank, left_value, right_value)" 554 " VALUES (%s, %s, %s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank, left_value, right_value)) 555 556 taxon_id = self.adaptor.last_id("taxon") 557 # assert isinstance(taxon_id, int), repr(taxon_id) 558 # ... and its name in taxon_name 559 scientific_name = taxonomic_lineage[-1].get("ScientificName") 560 if scientific_name: 561 self.adaptor.execute( 562 "INSERT INTO taxon_name(taxon_id, name, name_class)" 563 " VALUES (%s, %s, 'scientific name')", (taxon_id, 564 scientific_name[:255])) 565 return taxon_id, left_value, right_value
566
567 - def _load_bioentry_table(self, record):
568 """Fill the bioentry table with sequence information (PRIVATE). 569 570 record - SeqRecord object to add to the database. 571 """ 572 # get the pertinent info and insert it 573 574 if record.id.count(".") == 1: # try to get a version from the id 575 # This assumes the string is something like "XXXXXXXX.123" 576 accession, version = record.id.split('.') 577 try: 578 version = int(version) 579 except ValueError: 580 accession = record.id 581 version = 0 582 else: # otherwise just use a version of 0 583 accession = record.id 584 version = 0 585 586 if "accessions" in record.annotations \ 587 and isinstance(record.annotations["accessions"], list) \ 588 and record.annotations["accessions"]: 589 # Take the first accession (one if there is more than one) 590 accession = record.annotations["accessions"][0] 591 592 # Find the taxon id (this is not just the NCBI Taxon ID) 593 # NOTE - If the species isn't defined in the taxon table, 594 # a new minimal entry is created. 595 taxon_id = self._get_taxon_id(record) 596 597 if "gi" in record.annotations: 598 identifier = record.annotations["gi"] 599 else: 600 identifier = record.id 601 602 # Allow description and division to default to NULL as in BioPerl. 603 description = getattr(record, 'description', None) 604 division = record.annotations.get("data_file_division") 605 606 sql = """ 607 INSERT INTO bioentry ( 608 biodatabase_id, 609 taxon_id, 610 name, 611 accession, 612 identifier, 613 division, 614 description, 615 version) 616 VALUES ( 617 %s, 618 %s, 619 %s, 620 %s, 621 %s, 622 %s, 623 %s, 624 %s)""" 625 # print self.dbid, taxon_id, record.name, accession, identifier, \ 626 # division, description, version 627 self.adaptor.execute(sql, (self.dbid, 628 taxon_id, 629 record.name, 630 accession, 631 identifier, 632 division, 633 description, 634 version)) 635 # now retrieve the id for the bioentry 636 bioentry_id = self.adaptor.last_id('bioentry') 637 638 return bioentry_id
639
640 - def _load_bioentry_date(self, record, bioentry_id):
641 """Add the effective date of the entry into the database. 642 643 record - a SeqRecord object with an annotated date 644 bioentry_id - corresponding database identifier 645 """ 646 # dates are GenBank style, like: 647 # 14-SEP-2000 648 date = record.annotations.get("date", 649 strftime("%d-%b-%Y", gmtime()).upper()) 650 if isinstance(date, list): 651 date = date[0] 652 annotation_tags_id = self._get_ontology_id("Annotation Tags") 653 date_id = self._get_term_id("date_changed", annotation_tags_id) 654 sql = r"INSERT INTO bioentry_qualifier_value" \ 655 r" (bioentry_id, term_id, value, rank)" \ 656 r" VALUES (%s, %s, %s, 1)" 657 self.adaptor.execute(sql, (bioentry_id, date_id, date))
658
659 - def _load_biosequence(self, record, bioentry_id):
660 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 661 662 record - a SeqRecord object with a seq property 663 bioentry_id - corresponding database identifier 664 """ 665 if record.seq is None: 666 # The biosequence table entry is optional, so if we haven't 667 # got a sequence, we don't need to write to the table. 668 return 669 670 # determine the string representation of the alphabet 671 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 672 alphabet = "dna" 673 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 674 alphabet = "rna" 675 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 676 alphabet = "protein" 677 else: 678 alphabet = "unknown" 679 680 if isinstance(record.seq, UnknownSeq): 681 seq_str = None 682 else: 683 seq_str = str(record.seq) 684 685 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 686 r"length, seq, alphabet) " \ 687 r"VALUES (%s, 0, %s, %s, %s)" 688 self.adaptor.execute(sql, (bioentry_id, 689 len(record.seq), 690 seq_str, 691 alphabet))
692
693 - def _load_comment(self, record, bioentry_id):
694 """Record a SeqRecord's annotated comment in the database (PRIVATE). 695 696 record - a SeqRecord object with an annotated comment 697 bioentry_id - corresponding database identifier 698 """ 699 comments = record.annotations.get('comment') 700 if not comments: 701 return 702 if not isinstance(comments, list): 703 # It should be a string then... 704 comments = [comments] 705 706 for index, comment in enumerate(comments): 707 comment = comment.replace('\n', ' ') 708 # TODO - Store each line as a separate entry? This would preserve 709 # the newlines, but we should check BioPerl etc to be consistent. 710 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 711 " VALUES (%s, %s, %s)" 712 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
713
714 - def _load_annotations(self, record, bioentry_id):
715 """Record a SeqRecord's misc annotations in the database (PRIVATE). 716 717 The annotation strings are recorded in the bioentry_qualifier_value 718 table, except for special cases like the reference, comment and 719 taxonomy which are handled with their own tables. 720 721 record - a SeqRecord object with an annotations dictionary 722 bioentry_id - corresponding database identifier 723 """ 724 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 725 "(bioentry_id, term_id, value)" \ 726 " VALUES (%s, %s, %s)" 727 many_sql = "INSERT INTO bioentry_qualifier_value" \ 728 "(bioentry_id, term_id, value, rank)" \ 729 " VALUES (%s, %s, %s, %s)" 730 tag_ontology_id = self._get_ontology_id('Annotation Tags') 731 for key, value in record.annotations.items(): 732 if key in ["references", "comment", "ncbi_taxid", "date"]: 733 # Handled separately 734 continue 735 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 736 if isinstance(value, (list, tuple)): 737 rank = 0 738 for entry in value: 739 if isinstance(entry, (str, int)): 740 # Easy case 741 rank += 1 742 self.adaptor.execute(many_sql, 743 (bioentry_id, term_id, str(entry), rank)) 744 else: 745 pass 746 # print "Ignoring annotation '%s' sub-entry of type '%s'" \ 747 # % (key, str(type(entry))) 748 elif isinstance(value, (str, int)): 749 # Have a simple single entry, leave rank as the DB default 750 self.adaptor.execute(mono_sql, 751 (bioentry_id, term_id, str(value))) 752 else: 753 pass
754 # print "Ignoring annotation '%s' entry of type '%s'" \ 755 # % (key, type(value)) 756
757 - def _load_reference(self, reference, rank, bioentry_id):
758 """Record a SeqRecord's annotated references in the database (PRIVATE). 759 760 record - a SeqRecord object with annotated references 761 bioentry_id - corresponding database identifier 762 """ 763 764 refs = None 765 if reference.medline_id: 766 refs = self.adaptor.execute_and_fetch_col0( 767 "SELECT reference_id" 768 " FROM reference JOIN dbxref USING (dbxref_id)" 769 " WHERE dbname = 'MEDLINE' AND accession = %s", 770 (reference.medline_id,)) 771 if not refs and reference.pubmed_id: 772 refs = self.adaptor.execute_and_fetch_col0( 773 "SELECT reference_id" 774 " FROM reference JOIN dbxref USING (dbxref_id)" 775 " WHERE dbname = 'PUBMED' AND accession = %s", 776 (reference.pubmed_id,)) 777 if not refs: 778 s = [] 779 for f in reference.authors, reference.title, reference.journal: 780 s.append(f or "<undef>") 781 crc = crc64("".join(s)) 782 refs = self.adaptor.execute_and_fetch_col0( 783 "SELECT reference_id FROM reference" 784 r" WHERE crc = %s", (crc,)) 785 if not refs: 786 if reference.medline_id: 787 dbxref_id = self._add_dbxref("MEDLINE", 788 reference.medline_id, 0) 789 elif reference.pubmed_id: 790 dbxref_id = self._add_dbxref("PUBMED", 791 reference.pubmed_id, 0) 792 else: 793 dbxref_id = None 794 authors = reference.authors or None 795 title = reference.title or None 796 # The location/journal field cannot be Null, so default 797 # to an empty string rather than None: 798 journal = reference.journal or "" 799 self.adaptor.execute( 800 "INSERT INTO reference (dbxref_id, location," 801 " title, authors, crc)" 802 " VALUES (%s, %s, %s, %s, %s)", 803 (dbxref_id, journal, title, 804 authors, crc)) 805 reference_id = self.adaptor.last_id("reference") 806 else: 807 reference_id = refs[0] 808 809 if reference.location: 810 start = 1 + int(str(reference.location[0].start)) 811 end = int(str(reference.location[0].end)) 812 else: 813 start = None 814 end = None 815 816 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 817 " start_pos, end_pos, rank)" \ 818 " VALUES (%s, %s, %s, %s, %s)" 819 self.adaptor.execute(sql, (bioentry_id, reference_id, 820 start, end, rank + 1))
821
822 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
823 """Load a biopython SeqFeature into the database (PRIVATE). 824 """ 825 # records loaded from a gff file using BCBio.GFF will contain the value 826 # of the 2nd column of the gff as a feature qualifier. The BioSQL wiki 827 # suggests that the source should not go in with the other feature 828 # mappings but instead be put in the term table 829 # (http://www.biosql.org/wiki/Annotation_Mapping) 830 try: 831 source = feature.qualifiers['source'] 832 if isinstance(source, list): 833 source = source[0] 834 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 835 bioentry_id, source=source) 836 except KeyError: 837 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 838 bioentry_id) 839 840 self._load_seqfeature_locations(feature, seqfeature_id) 841 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
842
843 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id, source='EMBL/GenBank/SwissProt'):
844 """Load the first tables of a seqfeature and returns the id (PRIVATE). 845 846 This loads the "key" of the seqfeature (ie. CDS, gene) and 847 the basic seqfeature table itself. 848 """ 849 ontology_id = self._get_ontology_id('SeqFeature Keys') 850 seqfeature_key_id = self._get_term_id(feature_type, 851 ontology_id=ontology_id) 852 source_cat_id = self._get_ontology_id('SeqFeature Sources') 853 source_term_id = self._get_term_id(source, 854 ontology_id=source_cat_id) 855 856 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 857 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 858 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 859 source_term_id, feature_rank + 1)) 860 seqfeature_id = self.adaptor.last_id('seqfeature') 861 862 return seqfeature_id
863
864 - def _load_seqfeature_locations(self, feature, seqfeature_id):
865 """Load all of the locations for a SeqFeature into tables (PRIVATE). 866 867 This adds the locations related to the SeqFeature into the 868 seqfeature_location table. Fuzzies are not handled right now. 869 For a simple location, ie (1..2), we have a single table row 870 with seq_start = 1, seq_end = 2, location_rank = 1. 871 872 For split locations, ie (1..2, 3..4, 5..6) we would have three 873 row tables with:: 874 875 start = 1, end = 2, rank = 1 876 start = 3, end = 4, rank = 2 877 start = 5, end = 6, rank = 3 878 """ 879 # TODO - Record an ontology for the locations (using location.term_id) 880 # which for now as in BioPerl we leave defaulting to NULL. 881 if feature.location_operator and feature.location_operator != "join": 882 # e.g. order locations... we don't record "order" so it 883 # will become a "join" on reloading. What does BioPerl do? 884 import warnings 885 from Bio import BiopythonWarning 886 warnings.warn("%s location operators are not fully supported" 887 % feature.location_operator, BiopythonWarning) 888 # This will be a list of length one for simple FeatureLocation: 889 parts = feature.location.parts 890 if parts and set(loc.strand for loc in parts) == set([-1]): 891 # To mimic prior behaviour of Biopython+BioSQL, reverse order 892 parts = parts[::-1] 893 # TODO - Check what BioPerl does; see also BioSeq.py code 894 for rank, loc in enumerate(parts): 895 self._insert_location(loc, rank + 1, seqfeature_id)
896
897 - def _insert_location(self, location, rank, seqfeature_id):
898 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 899 900 TODO - Add location operator to location_qualifier_value? 901 """ 902 # convert biopython locations to the 1-based location system 903 # used in bioSQL 904 # XXX This could also handle fuzzies 905 start = int(location.start) + 1 906 end = int(location.end) 907 908 # Biopython uses None when we don't know strand information but 909 # BioSQL requires something (non null) and sets this as zero 910 # So we'll use the strand or 0 if Biopython spits out None 911 strand = location.strand or 0 912 913 # TODO - Record an ontology term for the location (location.term_id) 914 # which for now like BioPerl we'll leave as NULL. 915 # This might allow us to record "between" positions properly, but I 916 # doesn't really see how it could work for before/after fuzzy positions 917 loc_term_id = None 918 919 if location.ref: 920 # sub_feature remote locations when they are in the same db as the current 921 # record do not have a value for ref_db, which the SeqFeature object 922 # stores as None. BioSQL schema requires a varchar and is not NULL 923 dbxref_id = self._get_dbxref_id( 924 location.ref_db or "", location.ref) 925 else: 926 dbxref_id = None 927 928 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 929 r"start_pos, end_pos, strand, rank) " \ 930 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 931 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 932 start, end, strand, rank)) 933 934 """ 935 # See Bug 2677 936 # TODO - Record the location_operator (e.g. "join" or "order") 937 # using the location_qualifier_value table (which we and BioPerl 938 # have historically left empty). 939 # Note this will need an ontology term for the location qualifer 940 # (location_qualifier_value.term_id) for which oddly the schema 941 # does not allow NULL. 942 if feature.location_operator: 943 #e.g. "join" (common), 944 #or "order" (see Tests/GenBank/protein_refseq2.gb) 945 location_id = self.adaptor.last_id('location') 946 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 947 sql = r"INSERT INTO location_qualifier_value" \ 948 r"(location_id, term_id, value)" \ 949 r"VALUES (%s, %s, %s)" 950 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 951 feature.location_operator)) 952 """
953
954 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
955 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 956 957 Qualifiers should be a dictionary of the form: 958 {key : [value1, value2]} 959 """ 960 tag_ontology_id = self._get_ontology_id('Annotation Tags') 961 for qualifier_key in qualifiers: 962 # Treat db_xref qualifiers differently to sequence annotation 963 # qualifiers by populating the seqfeature_dbxref and dbxref 964 # tables. Other qualifiers go into the seqfeature_qualifier_value 965 # and (if new) term tables. 966 if qualifier_key != 'db_xref': 967 qualifier_key_id = self._get_term_id(qualifier_key, 968 ontology_id=tag_ontology_id) 969 # now add all of the values to their table 970 entries = qualifiers[qualifier_key] 971 if not isinstance(entries, list): 972 # Could be a plain string, or an int or a float. 973 # However, we exect a list of strings here. 974 entries = [entries] 975 for qual_value_rank in range(len(entries)): 976 qualifier_value = entries[qual_value_rank] 977 sql = r"INSERT INTO seqfeature_qualifier_value "\ 978 r" (seqfeature_id, term_id, rank, value) VALUES"\ 979 r" (%s, %s, %s, %s)" 980 self.adaptor.execute(sql, (seqfeature_id, 981 qualifier_key_id, 982 qual_value_rank + 1, 983 qualifier_value)) 984 else: 985 # The dbxref_id qualifier/value sets go into the dbxref table 986 # as dbname, accession, version tuples, with dbxref.dbxref_id 987 # being automatically assigned, and into the seqfeature_dbxref 988 # table as seqfeature_id, dbxref_id, and rank tuples 989 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 990 seqfeature_id)
991
992 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
993 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 994 995 o dbxrefs List, dbxref data from the source file in the 996 format <database>:<accession> 997 998 o seqfeature_id Int, the identifier for the seqfeature in the 999 seqfeature table 1000 1001 Insert dbxref qualifier data for a seqfeature into the 1002 seqfeature_dbxref and, if required, dbxref tables. 1003 The dbxref_id qualifier/value sets go into the dbxref table 1004 as dbname, accession, version tuples, with dbxref.dbxref_id 1005 being automatically assigned, and into the seqfeature_dbxref 1006 table as seqfeature_id, dbxref_id, and rank tuples 1007 """ 1008 # NOTE - In older versions of Biopython, we would map the GenBank 1009 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 1010 # for any unknown terms. This was a long term maintainance problem, 1011 # and differed from BioPerl and BioJava's implementation. See bug 2405 1012 for rank, value in enumerate(dbxrefs): 1013 # Split the DB:accession format string at colons. We have to 1014 # account for multiple-line and multiple-accession entries 1015 try: 1016 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 1017 db = dbxref_data[0] 1018 accessions = dbxref_data[1:] 1019 except: 1020 raise ValueError("Parsing of db_xref failed: '%s'" % value) 1021 # Loop over all the grabbed accessions, and attempt to fill the 1022 # table 1023 for accession in accessions: 1024 # Get the dbxref_id value for the dbxref data 1025 dbxref_id = self._get_dbxref_id(db, accession) 1026 # Insert the seqfeature_dbxref data 1027 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
1028
1029 - def _get_dbxref_id(self, db, accession):
1030 """ _get_dbxref_id(self, db, accession) -> Int 1031 1032 o db String, the name of the external database containing 1033 the accession number 1034 1035 o accession String, the accession of the dbxref data 1036 1037 Finds and returns the dbxref_id for the passed data. The method 1038 attempts to find an existing record first, and inserts the data 1039 if there is no record. 1040 """ 1041 # Check for an existing record 1042 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 1043 r'AND accession = %s' 1044 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 1045 # If there was a record, return the dbxref_id, else create the 1046 # record and return the created dbxref_id 1047 if dbxref_id: 1048 return dbxref_id[0] 1049 return self._add_dbxref(db, accession, 0)
1050
1051 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1052 """ Check for a pre-existing seqfeature_dbxref entry with the passed 1053 seqfeature_id and dbxref_id. If one does not exist, insert new 1054 data 1055 1056 """ 1057 # Check for an existing record 1058 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 1059 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 1060 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 1061 dbxref_id)) 1062 # If there was a record, return without executing anything, else create 1063 # the record and return 1064 if result: 1065 return result 1066 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
1067
1068 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
1069 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 1070 dbxref_id 1071 """ 1072 sql = r'INSERT INTO seqfeature_dbxref ' \ 1073 '(seqfeature_id, dbxref_id, rank) VALUES' \ 1074 r'(%s, %s, %s)' 1075 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 1076 return (seqfeature_id, dbxref_id)
1077
1078 - def _load_dbxrefs(self, record, bioentry_id):
1079 """Load any sequence level cross references into the database (PRIVATE). 1080 1081 See table bioentry_dbxref.""" 1082 for rank, value in enumerate(record.dbxrefs): 1083 # Split the DB:accession string at first colon. 1084 # We have to cope with things like: 1085 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1086 # "GO:GO:123" (db="GO", accession="GO:123") 1087 # 1088 # Annoyingly I have seen the NCBI use both the style 1089 # "GO:GO:123" and "GO:123" in different vintages. 1090 assert value.count("\n") == 0 1091 try: 1092 db, accession = value.split(':', 1) 1093 db = db.strip() 1094 accession = accession.strip() 1095 except: 1096 raise ValueError( 1097 "Parsing of dbxrefs list failed: '%s'" % value) 1098 # Get the dbxref_id value for the dbxref data 1099 dbxref_id = self._get_dbxref_id(db, accession) 1100 # Insert the bioentry_dbxref data 1101 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1102
1103 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1104 """ Check for a pre-existing bioentry_dbxref entry with the passed 1105 seqfeature_id and dbxref_id. If one does not exist, insert new 1106 data 1107 1108 """ 1109 # Check for an existing record 1110 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1111 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1112 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1113 dbxref_id)) 1114 # If there was a record, return without executing anything, else create 1115 # the record and return 1116 if result: 1117 return result 1118 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1119
1120 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1121 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1122 dbxref_id 1123 """ 1124 sql = r'INSERT INTO bioentry_dbxref ' \ 1125 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1126 '(%s, %s, %s)' 1127 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1128 return (bioentry_id, dbxref_id)
1129 1130
1131 -class DatabaseRemover(object):
1132 """Complement the Loader functionality by fully removing a database. 1133 1134 This probably isn't really useful for normal purposes, since you 1135 can just do a:: 1136 1137 DROP DATABASE db_name 1138 1139 and then recreate the database. But, it's really useful for testing 1140 purposes. 1141 """ 1142
1143 - def __init__(self, adaptor, dbid):
1144 """Initialize with a database id and adaptor connection.""" 1145 self.adaptor = adaptor 1146 self.dbid = dbid
1147
1148 - def remove(self):
1149 """Remove everything related to the given database id.""" 1150 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1151 self.adaptor.execute(sql, (self.dbid,)) 1152 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1153 self.adaptor.execute(sql, (self.dbid,))
1154