Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from __future__ import print_function 
  20   
  21  from time import gmtime, strftime 
  22   
  23  # biopython 
  24  from Bio import Alphabet 
  25  from Bio.SeqUtils.CheckSum import crc64 
  26  from Bio import Entrez 
  27  from Bio.Seq import UnknownSeq 
  28   
  29  from Bio._py3k import _is_int_or_long 
  30  from Bio._py3k import range 
  31  from Bio._py3k import basestring 
  32   
  33  __docformat__ = "restructuredtext en" 
  34   
35 -class DatabaseLoader:
36 """Object used to load SeqRecord objects into a BioSQL database.""" 37
38 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
39 """Initialize with connection information for the database. 40 41 Creating a DatabaseLoader object is normally handled via the 42 BioSeqDatabase DBServer object, for example:: 43 44 from BioSQL import BioSeqDatabase 45 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 46 passwd = "biosql", host = "localhost", db="test_biosql") 47 try: 48 db = server["test"] 49 except KeyError: 50 db = server.new_database("test", description="For testing GBrowse") 51 """ 52 self.adaptor = adaptor 53 self.dbid = dbid 54 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
55
56 - def load_seqrecord(self, record):
57 """Load a Biopython SeqRecord into the database.""" 58 bioentry_id = self._load_bioentry_table(record) 59 self._load_bioentry_date(record, bioentry_id) 60 self._load_biosequence(record, bioentry_id) 61 self._load_comment(record, bioentry_id) 62 self._load_dbxrefs(record, bioentry_id) 63 references = record.annotations.get('references', ()) 64 for reference, rank in zip(references, list(range(len(references)))): 65 self._load_reference(reference, rank, bioentry_id) 66 self._load_annotations(record, bioentry_id) 67 for seq_feature_num in range(len(record.features)): 68 seq_feature = record.features[seq_feature_num] 69 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
70
71 - def _get_ontology_id(self, name, definition=None):
72 """Returns the identifier for the named ontology (PRIVATE). 73 74 This looks through the onotology table for a the given entry name. 75 If it is not found, a row is added for this ontology (using the 76 definition if supplied). In either case, the id corresponding to 77 the provided name is returned, so that you can reference it in 78 another table. 79 """ 80 oids = self.adaptor.execute_and_fetch_col0( 81 "SELECT ontology_id FROM ontology WHERE name = %s", 82 (name,)) 83 if oids: 84 return oids[0] 85 self.adaptor.execute( 86 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 87 (name, definition)) 88 return self.adaptor.last_id("ontology")
89
90 - def _get_term_id(self, 91 name, 92 ontology_id=None, 93 definition=None, 94 identifier=None):
95 """Get the id that corresponds to a term (PRIVATE). 96 97 This looks through the term table for a the given term. If it 98 is not found, a new id corresponding to this term is created. 99 In either case, the id corresponding to that term is returned, so 100 that you can reference it in another table. 101 102 The ontology_id should be used to disambiguate the term. 103 """ 104 105 # try to get the term id 106 sql = r"SELECT term_id FROM term " \ 107 r"WHERE name = %s" 108 fields = [name] 109 if ontology_id: 110 sql += ' AND ontology_id = %s' 111 fields.append(ontology_id) 112 id_results = self.adaptor.execute_and_fetchall(sql, fields) 113 # something is wrong 114 if len(id_results) > 1: 115 raise ValueError("Multiple term ids for %s: %r" % 116 (name, id_results)) 117 elif len(id_results) == 1: 118 return id_results[0][0] 119 else: 120 sql = r"INSERT INTO term (name, definition," \ 121 r" identifier, ontology_id)" \ 122 r" VALUES (%s, %s, %s, %s)" 123 self.adaptor.execute(sql, (name, definition, 124 identifier, ontology_id)) 125 return self.adaptor.last_id("term")
126
127 - def _add_dbxref(self, dbname, accession, version):
128 """Insert a dbxref and return its id.""" 129 self.adaptor.execute( 130 "INSERT INTO dbxref(dbname, accession, version)" 131 " VALUES (%s, %s, %s)", (dbname, accession, version)) 132 return self.adaptor.last_id("dbxref")
133
134 - def _get_taxon_id(self, record):
135 """Get the taxon id for this record (PRIVATE). 136 137 record - a SeqRecord object 138 139 This searches the taxon/taxon_name tables using the 140 NCBI taxon ID, scientific name and common name to find 141 the matching taxon table entry's id. 142 143 If the species isn't in the taxon table, and we have at 144 least the NCBI taxon ID, scientific name or common name, 145 at least a minimal stub entry is created in the table. 146 147 Returns the taxon id (database key for the taxon table, 148 not an NCBI taxon ID), or None if the taxonomy information 149 is missing. 150 151 See also the BioSQL script load_ncbi_taxonomy.pl which 152 will populate and update the taxon/taxon_name tables 153 with the latest information from the NCBI. 154 """ 155 156 # To find the NCBI taxid, first check for a top level annotation 157 ncbi_taxon_id = None 158 if "ncbi_taxid" in record.annotations: 159 # Could be a list of IDs. 160 if isinstance(record.annotations["ncbi_taxid"], list): 161 if len(record.annotations["ncbi_taxid"]) == 1: 162 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 163 else: 164 ncbi_taxon_id = record.annotations["ncbi_taxid"] 165 if not ncbi_taxon_id: 166 # Secondly, look for a source feature 167 for f in record.features: 168 if f.type == 'source': 169 quals = getattr(f, 'qualifiers', {}) 170 if "db_xref" in quals: 171 for db_xref in f.qualifiers["db_xref"]: 172 if db_xref.startswith("taxon:"): 173 ncbi_taxon_id = int(db_xref[6:]) 174 break 175 if ncbi_taxon_id: 176 break 177 178 try: 179 scientific_name = record.annotations["organism"][:255] 180 except KeyError: 181 scientific_name = None 182 try: 183 common_name = record.annotations["source"][:255] 184 except KeyError: 185 common_name = None 186 # Note: The maximum length for taxon names in the schema is 255. 187 # Cropping it now should help in getting a match when searching, 188 # and avoids an error if we try and add these to the database. 189 190 if ncbi_taxon_id: 191 # Good, we have the NCBI taxon to go on - this is unambiguous :) 192 # Note that the scientific name and common name will only be 193 # used if we have to record a stub entry. 194 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 195 scientific_name, 196 common_name) 197 198 if not common_name and not scientific_name: 199 # Nothing to go on... and there is no point adding 200 # a new entry to the database. We'll just leave this 201 # sequence's taxon as a NULL in the database. 202 return None 203 204 # Next, we'll try to find a match based on the species name 205 # (stored in GenBank files as the organism and/or the source). 206 if scientific_name: 207 taxa = self.adaptor.execute_and_fetch_col0( 208 "SELECT taxon_id FROM taxon_name" 209 " WHERE name_class = 'scientific name' AND name = %s", 210 (scientific_name,)) 211 if taxa: 212 # Good, mapped the scientific name to a taxon table entry 213 return taxa[0] 214 215 # Last chance... 216 if common_name: 217 taxa = self.adaptor.execute_and_fetch_col0( 218 "SELECT DISTINCT taxon_id FROM taxon_name" 219 " WHERE name = %s", 220 (common_name,)) 221 # Its natural that several distinct taxa will have the same common 222 # name - in which case we can't resolve the taxon uniquely. 223 if len(taxa) > 1: 224 raise ValueError("Taxa: %d species have name %r" % ( 225 len(taxa), 226 common_name)) 227 if taxa: 228 # Good, mapped the common name to a taxon table entry 229 return taxa[0] 230 231 # At this point, as far as we can tell, this species isn't 232 # in the taxon table already. So we'll have to add it. 233 # We don't have an NCBI taxonomy ID, so if we do record just 234 # a stub entry, there is no simple way to fix this later. 235 # 236 # TODO - Should we try searching the NCBI taxonomy using the 237 # species name? 238 # 239 # OK, let's try inserting the species. 240 # Chances are we don't have enough information ... 241 # Furthermore, it won't be in the hierarchy. 242 243 lineage = [] 244 for c in record.annotations.get("taxonomy", []): 245 lineage.append([None, None, c]) 246 if lineage: 247 lineage[-1][1] = "genus" 248 lineage.append([None, "species", record.annotations["organism"]]) 249 # XXX do we have them? 250 if "subspecies" in record.annotations: 251 lineage.append([None, "subspecies", 252 record.annotations["subspecies"]]) 253 if "variant" in record.annotations: 254 lineage.append([None, "varietas", 255 record.annotations["variant"]]) 256 lineage[-1][0] = ncbi_taxon_id 257 258 left_value = self.adaptor.execute_one( 259 "SELECT MAX(left_value) FROM taxon")[0] 260 if not left_value: 261 left_value = 0 262 left_value += 1 263 264 # XXX -- Brad: Fixing this for now in an ugly way because 265 # I am getting overlaps for right_values. I need to dig into this 266 # more to actually understand how it works. I'm not sure it is 267 # actually working right anyhow. 268 right_start_value = self.adaptor.execute_one( 269 "SELECT MAX(right_value) FROM taxon")[0] 270 if not right_start_value: 271 right_start_value = 0 272 right_value = right_start_value + 2 * len(lineage) - 1 273 274 parent_taxon_id = None 275 for taxon in lineage: 276 self.adaptor.execute( 277 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 278 " left_value, right_value)" 279 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 280 taxon[0], 281 taxon[1], 282 left_value, 283 right_value)) 284 taxon_id = self.adaptor.last_id("taxon") 285 self.adaptor.execute( 286 "INSERT INTO taxon_name(taxon_id, name, name_class)" 287 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 288 # Note the name field is limited to 255, some SwissProt files 289 # have a multi-species name which can be longer. So truncate this. 290 left_value += 1 291 right_value -= 1 292 parent_taxon_id = taxon_id 293 if common_name: 294 self.adaptor.execute( 295 "INSERT INTO taxon_name(taxon_id, name, name_class)" 296 "VALUES (%s, %s, 'common name')", ( 297 taxon_id, common_name)) 298 299 return taxon_id
300
301 - def _fix_name_class(self, entrez_name):
302 """Map Entrez name terms to those used in taxdump (PRIVATE). 303 304 We need to make this conversion to match the taxon_name.name_class 305 values used by the BioSQL load_ncbi_taxonomy.pl script. 306 307 e.g. 308 "ScientificName" -> "scientific name", 309 "EquivalentName" -> "equivalent name", 310 "Synonym" -> "synonym", 311 """ 312 # Add any special cases here: 313 # 314 # known = {} 315 # try: 316 # return known[entrez_name] 317 # except KeyError: 318 # pass 319 320 # Try automatically by adding spaces before each capital 321 def add_space(letter): 322 """Adds a space before a capital letter.""" 323 if letter.isupper(): 324 return " " + letter.lower() 325 else: 326 return letter
327 answer = "".join(add_space(letter) for letter in entrez_name).strip() 328 assert answer == answer.lower() 329 return answer
330
331 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 332 scientific_name=None, 333 common_name=None):
334 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 335 336 ncbi_taxon_id - string containing an NCBI taxon id 337 scientific_name - string, used if a stub entry is recorded 338 common_name - string, used if a stub entry is recorded 339 340 This searches the taxon table using ONLY the NCBI taxon ID 341 to find the matching taxon table entry's ID (database key). 342 343 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 344 flag is true, Biopython will attempt to go online using Bio.Entrez 345 to fetch the official NCBI lineage, recursing up the tree until an 346 existing entry is found in the database or the full lineage has been 347 fetched. 348 349 Otherwise the NCBI taxon ID, scientific name and common name are 350 recorded as a minimal stub entry in the taxon and taxon_name tables. 351 Any partial information about the lineage from the SeqRecord is NOT 352 recorded. This should mean that (re)running the BioSQL script 353 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 354 355 Returns the taxon id (database key for the taxon table, not 356 an NCBI taxon ID). 357 """ 358 assert ncbi_taxon_id 359 360 taxon_id = self.adaptor.execute_and_fetch_col0( 361 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 362 (int(ncbi_taxon_id),)) 363 if taxon_id: 364 # Good, we have mapped the NCBI taxid to a taxon table entry 365 return taxon_id[0] 366 367 # At this point, as far as we can tell, this species isn't 368 # in the taxon table already. So we'll have to add it. 369 370 parent_taxon_id = None 371 rank = "species" 372 genetic_code = None 373 mito_genetic_code = None 374 species_names = [] 375 if scientific_name: 376 species_names.append(("scientific name", scientific_name)) 377 if common_name: 378 species_names.append(("common name", common_name)) 379 380 if self.fetch_NCBI_taxonomy: 381 # Go online to get the parent taxon ID! 382 handle = Entrez.efetch( 383 db="taxonomy", id=ncbi_taxon_id, retmode="XML") 384 taxonomic_record = Entrez.read(handle) 385 if len(taxonomic_record) == 1: 386 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 387 "%s versus %s" % (taxonomic_record[0]["TaxId"], 388 ncbi_taxon_id) 389 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( 390 taxonomic_record[0]["LineageEx"]) 391 rank = taxonomic_record[0]["Rank"] 392 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"] 393 mito_genetic_code = taxonomic_record[ 394 0]["MitoGeneticCode"]["MGCId"] 395 species_names = [("scientific name", 396 taxonomic_record[0]["ScientificName"])] 397 try: 398 for name_class, names in taxonomic_record[0]["OtherNames"].items(): 399 name_class = self._fix_name_class(name_class) 400 if not isinstance(names, list): 401 # The Entrez parser seems to return single entry 402 # lists as just a string which is annoying. 403 names = [names] 404 for name in names: 405 # Want to ignore complex things like ClassCDE 406 # entries 407 if isinstance(name, basestring): 408 species_names.append((name_class, name)) 409 except KeyError: 410 # OtherNames isn't always present, 411 # e.g. NCBI taxon 41205, Bromheadia finlaysoniana 412 pass 413 else: 414 pass 415 # If we are not allowed to go online, we will record the bare minimum; 416 # as long as the NCBI taxon id is present, then (re)running 417 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 418 # (and update the species names). 419 # 420 # I am NOT going to try and record the lineage, even if it 421 # is in the record annotation as a list of names, as we won't 422 # know the NCBI taxon IDs for these parent nodes. 423 424 self.adaptor.execute( 425 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank," 426 " genetic_code, mito_genetic_code, left_value, right_value)" 427 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 428 ncbi_taxon_id, 429 rank, 430 genetic_code, 431 mito_genetic_code, 432 None, 433 None)) 434 taxon_id = self.adaptor.last_id("taxon") 435 436 # Record the scientific name, common name, etc 437 for name_class, name in species_names: 438 self.adaptor.execute( 439 "INSERT INTO taxon_name(taxon_id, name, name_class)" 440 " VALUES (%s, %s, %s)", (taxon_id, 441 name[:255], 442 name_class)) 443 return taxon_id
444
445 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage):
446 """This is recursive! (PRIVATE). 447 448 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 449 450 First dictionary in list is the taxonomy root, highest would be the species. 451 Each dictionary includes: 452 - TaxID (string, NCBI taxon id) 453 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 454 - ScientificName (string) 455 (and that is all at the time of writing) 456 457 This method will record all the lineage given, returning the taxon id 458 (database key, not NCBI taxon id) of the final entry (the species). 459 """ 460 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"] 461 462 # Is this in the database already? Check the taxon table... 463 taxon_id = self.adaptor.execute_and_fetch_col0( 464 "SELECT taxon_id FROM taxon" 465 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 466 if taxon_id: 467 # we could verify that the Scientific Name etc in the database 468 # is the same and update it or print a warning if not... 469 if isinstance(taxon_id, list): 470 assert len(taxon_id) == 1 471 return taxon_id[0] 472 else: 473 return taxon_id 474 475 # We have to record this. 476 if len(taxonomic_lineage) > 1: 477 # Use recursion to find out the taxon id (database key) of the 478 # parent. 479 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( 480 taxonomic_lineage[:-1]) 481 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id) 482 else: 483 parent_taxon_id = None 484 485 # INSERT new taxon 486 rank = taxonomic_lineage[-1].get("Rank", None) 487 self.adaptor.execute( 488 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)" 489 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank)) 490 taxon_id = self.adaptor.last_id("taxon") 491 assert isinstance(taxon_id, (int, long)), repr(taxon_id) 492 # ... and its name in taxon_name 493 scientific_name = taxonomic_lineage[-1].get("ScientificName", None) 494 if scientific_name: 495 self.adaptor.execute( 496 "INSERT INTO taxon_name(taxon_id, name, name_class)" 497 " VALUES (%s, %s, 'scientific name')", (taxon_id, 498 scientific_name[:255])) 499 return taxon_id
500
501 - def _load_bioentry_table(self, record):
502 """Fill the bioentry table with sequence information (PRIVATE). 503 504 record - SeqRecord object to add to the database. 505 """ 506 # get the pertinent info and insert it 507 508 if record.id.count(".") == 1: # try to get a version from the id 509 # This assumes the string is something like "XXXXXXXX.123" 510 accession, version = record.id.split('.') 511 try: 512 version = int(version) 513 except ValueError: 514 accession = record.id 515 version = 0 516 else: # otherwise just use a version of 0 517 accession = record.id 518 version = 0 519 520 if "accessions" in record.annotations \ 521 and isinstance(record.annotations["accessions"], list) \ 522 and record.annotations["accessions"]: 523 # Take the first accession (one if there is more than one) 524 accession = record.annotations["accessions"][0] 525 526 # Find the taxon id (this is not just the NCBI Taxon ID) 527 # NOTE - If the species isn't defined in the taxon table, 528 # a new minimal entry is created. 529 taxon_id = self._get_taxon_id(record) 530 531 if "gi" in record.annotations: 532 identifier = record.annotations["gi"] 533 else: 534 identifier = record.id 535 536 # Allow description and division to default to NULL as in BioPerl. 537 description = getattr(record, 'description', None) 538 division = record.annotations.get("data_file_division", None) 539 540 sql = """ 541 INSERT INTO bioentry ( 542 biodatabase_id, 543 taxon_id, 544 name, 545 accession, 546 identifier, 547 division, 548 description, 549 version) 550 VALUES ( 551 %s, 552 %s, 553 %s, 554 %s, 555 %s, 556 %s, 557 %s, 558 %s)""" 559 # print self.dbid, taxon_id, record.name, accession, identifier, \ 560 # division, description, version 561 self.adaptor.execute(sql, (self.dbid, 562 taxon_id, 563 record.name, 564 accession, 565 identifier, 566 division, 567 description, 568 version)) 569 # now retrieve the id for the bioentry 570 bioentry_id = self.adaptor.last_id('bioentry') 571 572 return bioentry_id
573
574 - def _load_bioentry_date(self, record, bioentry_id):
575 """Add the effective date of the entry into the database. 576 577 record - a SeqRecord object with an annotated date 578 bioentry_id - corresponding database identifier 579 """ 580 # dates are GenBank style, like: 581 # 14-SEP-2000 582 date = record.annotations.get("date", 583 strftime("%d-%b-%Y", gmtime()).upper()) 584 if isinstance(date, list): 585 date = date[0] 586 annotation_tags_id = self._get_ontology_id("Annotation Tags") 587 date_id = self._get_term_id("date_changed", annotation_tags_id) 588 sql = r"INSERT INTO bioentry_qualifier_value" \ 589 r" (bioentry_id, term_id, value, rank)" \ 590 r" VALUES (%s, %s, %s, 1)" 591 self.adaptor.execute(sql, (bioentry_id, date_id, date))
592
593 - def _load_biosequence(self, record, bioentry_id):
594 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 595 596 record - a SeqRecord object with a seq property 597 bioentry_id - corresponding database identifier 598 """ 599 if record.seq is None: 600 # The biosequence table entry is optional, so if we haven't 601 # got a sequence, we don't need to write to the table. 602 return 603 604 # determine the string representation of the alphabet 605 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 606 alphabet = "dna" 607 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 608 alphabet = "rna" 609 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 610 alphabet = "protein" 611 else: 612 alphabet = "unknown" 613 614 if isinstance(record.seq, UnknownSeq): 615 seq_str = None 616 else: 617 seq_str = str(record.seq) 618 619 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 620 r"length, seq, alphabet) " \ 621 r"VALUES (%s, 0, %s, %s, %s)" 622 self.adaptor.execute(sql, (bioentry_id, 623 len(record.seq), 624 seq_str, 625 alphabet))
626
627 - def _load_comment(self, record, bioentry_id):
628 """Record a SeqRecord's annotated comment in the database (PRIVATE). 629 630 record - a SeqRecord object with an annotated comment 631 bioentry_id - corresponding database identifier 632 """ 633 comments = record.annotations.get('comment') 634 if not comments: 635 return 636 if not isinstance(comments, list): 637 # It should be a string then... 638 comments = [comments] 639 640 for index, comment in enumerate(comments): 641 comment = comment.replace('\n', ' ') 642 # TODO - Store each line as a separate entry? This would preserve 643 # the newlines, but we should check BioPerl etc to be consistent. 644 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 645 " VALUES (%s, %s, %s)" 646 self.adaptor.execute(sql, (bioentry_id, comment, index + 1))
647
648 - def _load_annotations(self, record, bioentry_id):
649 """Record a SeqRecord's misc annotations in the database (PRIVATE). 650 651 The annotation strings are recorded in the bioentry_qualifier_value 652 table, except for special cases like the reference, comment and 653 taxonomy which are handled with their own tables. 654 655 record - a SeqRecord object with an annotations dictionary 656 bioentry_id - corresponding database identifier 657 """ 658 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 659 "(bioentry_id, term_id, value)" \ 660 " VALUES (%s, %s, %s)" 661 many_sql = "INSERT INTO bioentry_qualifier_value" \ 662 "(bioentry_id, term_id, value, rank)" \ 663 " VALUES (%s, %s, %s, %s)" 664 tag_ontology_id = self._get_ontology_id('Annotation Tags') 665 for key, value in record.annotations.items(): 666 if key in ["references", "comment", "ncbi_taxid", "date"]: 667 # Handled separately 668 continue 669 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 670 if isinstance(value, list) or isinstance(value, tuple): 671 rank = 0 672 for entry in value: 673 if isinstance(entry, str) or isinstance(entry, int): 674 # Easy case 675 rank += 1 676 self.adaptor.execute(many_sql, 677 (bioentry_id, term_id, str(entry), rank)) 678 else: 679 pass 680 # print "Ignoring annotation '%s' sub-entry of type '%s'" \ 681 # % (key, str(type(entry))) 682 elif isinstance(value, str) or isinstance(value, int): 683 # Have a simple single entry, leave rank as the DB default 684 self.adaptor.execute(mono_sql, 685 (bioentry_id, term_id, str(value))) 686 else: 687 pass
688 # print "Ignoring annotation '%s' entry of type '%s'" \ 689 # % (key, type(value)) 690
691 - def _load_reference(self, reference, rank, bioentry_id):
692 """Record a SeqRecord's annotated references in the database (PRIVATE). 693 694 record - a SeqRecord object with annotated references 695 bioentry_id - corresponding database identifier 696 """ 697 698 refs = None 699 if reference.medline_id: 700 refs = self.adaptor.execute_and_fetch_col0( 701 "SELECT reference_id" 702 " FROM reference JOIN dbxref USING (dbxref_id)" 703 " WHERE dbname = 'MEDLINE' AND accession = %s", 704 (reference.medline_id,)) 705 if not refs and reference.pubmed_id: 706 refs = self.adaptor.execute_and_fetch_col0( 707 "SELECT reference_id" 708 " FROM reference JOIN dbxref USING (dbxref_id)" 709 " WHERE dbname = 'PUBMED' AND accession = %s", 710 (reference.pubmed_id,)) 711 if not refs: 712 s = [] 713 for f in reference.authors, reference.title, reference.journal: 714 s.append(f or "<undef>") 715 crc = crc64("".join(s)) 716 refs = self.adaptor.execute_and_fetch_col0( 717 "SELECT reference_id FROM reference" 718 r" WHERE crc = %s", (crc,)) 719 if not refs: 720 if reference.medline_id: 721 dbxref_id = self._add_dbxref("MEDLINE", 722 reference.medline_id, 0) 723 elif reference.pubmed_id: 724 dbxref_id = self._add_dbxref("PUBMED", 725 reference.pubmed_id, 0) 726 else: 727 dbxref_id = None 728 authors = reference.authors or None 729 title = reference.title or None 730 # The location/journal field cannot be Null, so default 731 # to an empty string rather than None: 732 journal = reference.journal or "" 733 self.adaptor.execute( 734 "INSERT INTO reference (dbxref_id, location," 735 " title, authors, crc)" 736 " VALUES (%s, %s, %s, %s, %s)", 737 (dbxref_id, journal, title, 738 authors, crc)) 739 reference_id = self.adaptor.last_id("reference") 740 else: 741 reference_id = refs[0] 742 743 if reference.location: 744 start = 1 + int(str(reference.location[0].start)) 745 end = int(str(reference.location[0].end)) 746 else: 747 start = None 748 end = None 749 750 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 751 " start_pos, end_pos, rank)" \ 752 " VALUES (%s, %s, %s, %s, %s)" 753 self.adaptor.execute(sql, (bioentry_id, reference_id, 754 start, end, rank + 1))
755
756 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
757 """Load a biopython SeqFeature into the database (PRIVATE). 758 """ 759 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 760 bioentry_id) 761 self._load_seqfeature_locations(feature, seqfeature_id) 762 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
763
764 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
765 """Load the first tables of a seqfeature and returns the id (PRIVATE). 766 767 This loads the "key" of the seqfeature (ie. CDS, gene) and 768 the basic seqfeature table itself. 769 """ 770 ontology_id = self._get_ontology_id('SeqFeature Keys') 771 seqfeature_key_id = self._get_term_id(feature_type, 772 ontology_id=ontology_id) 773 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 774 # the record (how?) 775 source_cat_id = self._get_ontology_id('SeqFeature Sources') 776 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 777 ontology_id=source_cat_id) 778 779 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 780 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 781 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 782 source_term_id, feature_rank + 1)) 783 seqfeature_id = self.adaptor.last_id('seqfeature') 784 785 return seqfeature_id
786
787 - def _load_seqfeature_locations(self, feature, seqfeature_id):
788 """Load all of the locations for a SeqFeature into tables (PRIVATE). 789 790 This adds the locations related to the SeqFeature into the 791 seqfeature_location table. Fuzzies are not handled right now. 792 For a simple location, ie (1..2), we have a single table row 793 with seq_start = 1, seq_end = 2, location_rank = 1. 794 795 For split locations, ie (1..2, 3..4, 5..6) we would have three 796 row tables with:: 797 798 start = 1, end = 2, rank = 1 799 start = 3, end = 4, rank = 2 800 start = 5, end = 6, rank = 3 801 """ 802 # TODO - Record an ontology for the locations (using location.term_id) 803 # which for now as in BioPerl we leave defaulting to NULL. 804 if feature.location_operator and feature.location_operator != "join": 805 # e.g. order locations... we don't record "order" so it 806 # will become a "join" on reloading. What does BioPerl do? 807 import warnings 808 from Bio import BiopythonWarning 809 warnings.warn("%s location operators are not fully supported" 810 % feature.location_operator, BiopythonWarning) 811 # This will be a list of length one for simple FeatureLocation: 812 parts = feature.location.parts 813 if parts and set(loc.strand for loc in parts) == set([-1]): 814 # To mimic prior behaviour of Biopython+BioSQL, reverse order 815 parts = parts[::-1] 816 # TODO - Check what BioPerl does; see also BioSeq.py code 817 for rank, loc in enumerate(parts): 818 self._insert_location(loc, rank + 1, seqfeature_id)
819
820 - def _insert_location(self, location, rank, seqfeature_id):
821 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 822 823 TODO - Add location operator to location_qualifier_value? 824 """ 825 # convert biopython locations to the 1-based location system 826 # used in bioSQL 827 # XXX This could also handle fuzzies 828 start = int(location.start) + 1 829 end = int(location.end) 830 831 # Biopython uses None when we don't know strand information but 832 # BioSQL requires something (non null) and sets this as zero 833 # So we'll use the strand or 0 if Biopython spits out None 834 strand = location.strand or 0 835 836 # TODO - Record an ontology term for the location (location.term_id) 837 # which for now like BioPerl we'll leave as NULL. 838 # This might allow us to record "between" positions properly, but I 839 # doesn't really see how it could work for before/after fuzzy positions 840 loc_term_id = None 841 842 if location.ref: 843 # sub_feature remote locations when they are in the same db as the current 844 # record do not have a value for ref_db, which the SeqFeature object 845 # stores as None. BioSQL schema requires a varchar and is not NULL 846 dbxref_id = self._get_dbxref_id( 847 location.ref_db or "", location.ref) 848 else: 849 dbxref_id = None 850 851 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 852 r"start_pos, end_pos, strand, rank) " \ 853 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 854 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 855 start, end, strand, rank)) 856 857 """ 858 # See Bug 2677 859 # TODO - Record the location_operator (e.g. "join" or "order") 860 # using the location_qualifier_value table (which we and BioPerl 861 # have historically left empty). 862 # Note this will need an ontology term for the location qualifer 863 # (location_qualifier_value.term_id) for which oddly the schema 864 # does not allow NULL. 865 if feature.location_operator: 866 #e.g. "join" (common), 867 #or "order" (see Tests/GenBank/protein_refseq2.gb) 868 location_id = self.adaptor.last_id('location') 869 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 870 sql = r"INSERT INTO location_qualifier_value" \ 871 r"(location_id, term_id, value)" \ 872 r"VALUES (%s, %s, %s)" 873 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 874 feature.location_operator)) 875 """
876
877 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
878 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 879 880 Qualifiers should be a dictionary of the form: 881 {key : [value1, value2]} 882 """ 883 tag_ontology_id = self._get_ontology_id('Annotation Tags') 884 for qualifier_key in qualifiers: 885 # Treat db_xref qualifiers differently to sequence annotation 886 # qualifiers by populating the seqfeature_dbxref and dbxref 887 # tables. Other qualifiers go into the seqfeature_qualifier_value 888 # and (if new) term tables. 889 if qualifier_key != 'db_xref': 890 qualifier_key_id = self._get_term_id(qualifier_key, 891 ontology_id=tag_ontology_id) 892 # now add all of the values to their table 893 entries = qualifiers[qualifier_key] 894 if not isinstance(entries, list): 895 # Could be a plain string, or an int or a float. 896 # However, we exect a list of strings here. 897 entries = [entries] 898 for qual_value_rank in range(len(entries)): 899 qualifier_value = entries[qual_value_rank] 900 sql = r"INSERT INTO seqfeature_qualifier_value "\ 901 r" (seqfeature_id, term_id, rank, value) VALUES"\ 902 r" (%s, %s, %s, %s)" 903 self.adaptor.execute(sql, (seqfeature_id, 904 qualifier_key_id, 905 qual_value_rank + 1, 906 qualifier_value)) 907 else: 908 # The dbxref_id qualifier/value sets go into the dbxref table 909 # as dbname, accession, version tuples, with dbxref.dbxref_id 910 # being automatically assigned, and into the seqfeature_dbxref 911 # table as seqfeature_id, dbxref_id, and rank tuples 912 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 913 seqfeature_id)
914
915 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
916 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 917 918 o dbxrefs List, dbxref data from the source file in the 919 format <database>:<accession> 920 921 o seqfeature_id Int, the identifier for the seqfeature in the 922 seqfeature table 923 924 Insert dbxref qualifier data for a seqfeature into the 925 seqfeature_dbxref and, if required, dbxref tables. 926 The dbxref_id qualifier/value sets go into the dbxref table 927 as dbname, accession, version tuples, with dbxref.dbxref_id 928 being automatically assigned, and into the seqfeature_dbxref 929 table as seqfeature_id, dbxref_id, and rank tuples 930 """ 931 # NOTE - In older versions of Biopython, we would map the GenBank 932 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 933 # for any unknown terms. This was a long term maintainance problem, 934 # and differed from BioPerl and BioJava's implementation. See bug 2405 935 for rank, value in enumerate(dbxrefs): 936 # Split the DB:accession format string at colons. We have to 937 # account for multiple-line and multiple-accession entries 938 try: 939 dbxref_data = value.replace(' ', '').replace('\n', '').split(':') 940 db = dbxref_data[0] 941 accessions = dbxref_data[1:] 942 except: 943 raise ValueError("Parsing of db_xref failed: '%s'" % value) 944 # Loop over all the grabbed accessions, and attempt to fill the 945 # table 946 for accession in accessions: 947 # Get the dbxref_id value for the dbxref data 948 dbxref_id = self._get_dbxref_id(db, accession) 949 # Insert the seqfeature_dbxref data 950 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
951
952 - def _get_dbxref_id(self, db, accession):
953 """ _get_dbxref_id(self, db, accession) -> Int 954 955 o db String, the name of the external database containing 956 the accession number 957 958 o accession String, the accession of the dbxref data 959 960 Finds and returns the dbxref_id for the passed data. The method 961 attempts to find an existing record first, and inserts the data 962 if there is no record. 963 """ 964 # Check for an existing record 965 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 966 r'AND accession = %s' 967 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 968 # If there was a record, return the dbxref_id, else create the 969 # record and return the created dbxref_id 970 if dbxref_id: 971 return dbxref_id[0] 972 return self._add_dbxref(db, accession, 0)
973
974 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
975 """ Check for a pre-existing seqfeature_dbxref entry with the passed 976 seqfeature_id and dbxref_id. If one does not exist, insert new 977 data 978 979 """ 980 # Check for an existing record 981 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 982 r"WHERE seqfeature_id = %s AND dbxref_id = %s" 983 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 984 dbxref_id)) 985 # If there was a record, return without executing anything, else create 986 # the record and return 987 if result: 988 return result 989 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
990
991 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
992 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 993 dbxref_id 994 """ 995 sql = r'INSERT INTO seqfeature_dbxref ' \ 996 '(seqfeature_id, dbxref_id, rank) VALUES' \ 997 r'(%s, %s, %s)' 998 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 999 return (seqfeature_id, dbxref_id)
1000
1001 - def _load_dbxrefs(self, record, bioentry_id):
1002 """Load any sequence level cross references into the database (PRIVATE). 1003 1004 See table bioentry_dbxref.""" 1005 for rank, value in enumerate(record.dbxrefs): 1006 # Split the DB:accession string at first colon. 1007 # We have to cope with things like: 1008 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 1009 # "GO:GO:123" (db="GO", accession="GO:123") 1010 # 1011 # Annoyingly I have seen the NCBI use both the style 1012 # "GO:GO:123" and "GO:123" in different vintages. 1013 assert value.count("\n") == 0 1014 try: 1015 db, accession = value.split(':', 1) 1016 db = db.strip() 1017 accession = accession.strip() 1018 except: 1019 raise ValueError( 1020 "Parsing of dbxrefs list failed: '%s'" % value) 1021 # Get the dbxref_id value for the dbxref data 1022 dbxref_id = self._get_dbxref_id(db, accession) 1023 # Insert the bioentry_dbxref data 1024 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank + 1)
1025
1026 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1027 """ Check for a pre-existing bioentry_dbxref entry with the passed 1028 seqfeature_id and dbxref_id. If one does not exist, insert new 1029 data 1030 1031 """ 1032 # Check for an existing record 1033 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1034 r"WHERE bioentry_id = %s AND dbxref_id = %s" 1035 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1036 dbxref_id)) 1037 # If there was a record, return without executing anything, else create 1038 # the record and return 1039 if result: 1040 return result 1041 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1042
1043 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1044 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1045 dbxref_id 1046 """ 1047 sql = r'INSERT INTO bioentry_dbxref ' \ 1048 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1049 '(%s, %s, %s)' 1050 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1051 return (bioentry_id, dbxref_id)
1052 1053
1054 -class DatabaseRemover:
1055 """Complement the Loader functionality by fully removing a database. 1056 1057 This probably isn't really useful for normal purposes, since you 1058 can just do a:: 1059 1060 DROP DATABASE db_name 1061 1062 and then recreate the database. But, it's really useful for testing 1063 purposes. 1064 """ 1065
1066 - def __init__(self, adaptor, dbid):
1067 """Initialize with a database id and adaptor connection.""" 1068 self.adaptor = adaptor 1069 self.dbid = dbid
1070
1071 - def remove(self):
1072 """Remove everything related to the given database id.""" 1073 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1074 self.adaptor.execute(sql, (self.dbid,)) 1075 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1076 self.adaptor.execute(sql, (self.dbid,))
1077