1
2
3
4
5
6
7
8
9
10
11 """Load biopython objects into a BioSQL database for persistent storage.
12
13 This code makes it possible to store biopython objects in a relational
14 database and then retrieve them back. You shouldn't use any of the
15 classes in this module directly. Rather, call the load() method on
16 a database object.
17 """
18
19 from time import gmtime, strftime
20
21
22 from Bio import Alphabet
23 from Bio.SeqUtils.CheckSum import crc64
24 from Bio import Entrez
25 from Bio.Seq import UnknownSeq
26
27 from Bio._py3k import _is_int_or_long
28
29
31 """Object used to load SeqRecord objects into a BioSQL database."""
32 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
33 """Initialize with connection information for the database.
34
35 Creating a DatabaseLoader object is normally handled via the
36 BioSeqDatabase DBServer object, for example:
37
38 from BioSQL import BioSeqDatabase
39 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse",
40 passwd = "biosql", host = "localhost", db="test_biosql")
41 try:
42 db = server["test"]
43 except KeyError:
44 db = server.new_database("test", description="For testing GBrowse")
45 """
46 self.adaptor = adaptor
47 self.dbid = dbid
48 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
49
51 """Load a Biopython SeqRecord into the database.
52 """
53 bioentry_id = self._load_bioentry_table(record)
54 self._load_bioentry_date(record, bioentry_id)
55 self._load_biosequence(record, bioentry_id)
56 self._load_comment(record, bioentry_id)
57 self._load_dbxrefs(record, bioentry_id)
58 references = record.annotations.get('references', ())
59 for reference, rank in zip(references, range(len(references))):
60 self._load_reference(reference, rank, bioentry_id)
61 self._load_annotations(record, bioentry_id)
62 for seq_feature_num in range(len(record.features)):
63 seq_feature = record.features[seq_feature_num]
64 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
65
67 """Returns the identifier for the named ontology (PRIVATE).
68
69 This looks through the onotology table for a the given entry name.
70 If it is not found, a row is added for this ontology (using the
71 definition if supplied). In either case, the id corresponding to
72 the provided name is returned, so that you can reference it in
73 another table.
74 """
75 oids = self.adaptor.execute_and_fetch_col0(
76 "SELECT ontology_id FROM ontology WHERE name = %s",
77 (name,))
78 if oids:
79 return oids[0]
80 self.adaptor.execute(
81 "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
82 (name, definition))
83 return self.adaptor.last_id("ontology")
84
85 - def _get_term_id(self,
86 name,
87 ontology_id=None,
88 definition=None,
89 identifier=None):
90 """Get the id that corresponds to a term (PRIVATE).
91
92 This looks through the term table for a the given term. If it
93 is not found, a new id corresponding to this term is created.
94 In either case, the id corresponding to that term is returned, so
95 that you can reference it in another table.
96
97 The ontology_id should be used to disambiguate the term.
98 """
99
100
101 sql = r"SELECT term_id FROM term " \
102 r"WHERE name = %s"
103 fields = [name]
104 if ontology_id:
105 sql += ' AND ontology_id = %s'
106 fields.append(ontology_id)
107 id_results = self.adaptor.execute_and_fetchall(sql, fields)
108
109 if len(id_results) > 1:
110 raise ValueError("Multiple term ids for %s: %r" %
111 (name, id_results))
112 elif len(id_results) == 1:
113 return id_results[0][0]
114 else:
115 sql = r"INSERT INTO term (name, definition," \
116 r" identifier, ontology_id)" \
117 r" VALUES (%s, %s, %s, %s)"
118 self.adaptor.execute(sql, (name, definition,
119 identifier, ontology_id))
120 return self.adaptor.last_id("term")
121
123 """Insert a dbxref and return its id."""
124
125 self.adaptor.execute(
126 "INSERT INTO dbxref(dbname, accession, version)"
127 " VALUES (%s, %s, %s)", (dbname, accession, version))
128 return self.adaptor.last_id("dbxref")
129
131 """Get the taxon id for this record (PRIVATE).
132
133 record - a SeqRecord object
134
135 This searches the taxon/taxon_name tables using the
136 NCBI taxon ID, scientific name and common name to find
137 the matching taxon table entry's id.
138
139 If the species isn't in the taxon table, and we have at
140 least the NCBI taxon ID, scientific name or common name,
141 at least a minimal stub entry is created in the table.
142
143 Returns the taxon id (database key for the taxon table,
144 not an NCBI taxon ID), or None if the taxonomy information
145 is missing.
146
147 See also the BioSQL script load_ncbi_taxonomy.pl which
148 will populate and update the taxon/taxon_name tables
149 with the latest information from the NCBI.
150 """
151
152
153 ncbi_taxon_id = None
154 if "ncbi_taxid" in record.annotations:
155
156 if isinstance(record.annotations["ncbi_taxid"], list):
157 if len(record.annotations["ncbi_taxid"]) == 1:
158 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
159 else:
160 ncbi_taxon_id = record.annotations["ncbi_taxid"]
161 if not ncbi_taxon_id:
162
163 for f in record.features:
164 if f.type == 'source':
165 quals = getattr(f, 'qualifiers', {})
166 if "db_xref" in quals:
167 for db_xref in f.qualifiers["db_xref"]:
168 if db_xref.startswith("taxon:"):
169 ncbi_taxon_id = int(db_xref[6:])
170 break
171 if ncbi_taxon_id:
172 break
173
174 try:
175 scientific_name = record.annotations["organism"][:255]
176 except KeyError:
177 scientific_name = None
178 try:
179 common_name = record.annotations["source"][:255]
180 except KeyError:
181 common_name = None
182
183
184
185
186 if ncbi_taxon_id:
187
188
189
190 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id,
191 scientific_name,
192 common_name)
193
194 if not common_name and not scientific_name:
195
196
197
198 return None
199
200
201
202 if scientific_name:
203 taxa = self.adaptor.execute_and_fetch_col0(
204 "SELECT taxon_id FROM taxon_name"
205 " WHERE name_class = 'scientific name' AND name = %s",
206 (scientific_name,))
207 if taxa:
208
209 return taxa[0]
210
211
212 if common_name:
213 taxa = self.adaptor.execute_and_fetch_col0(
214 "SELECT DISTINCT taxon_id FROM taxon_name"
215 " WHERE name = %s",
216 (common_name,))
217
218
219 if len(taxa) > 1:
220 raise ValueError("Taxa: %d species have name %r" % (
221 len(taxa),
222 common_name))
223 if taxa:
224
225 return taxa[0]
226
227
228
229
230
231
232
233
234
235
236
237
238
239 lineage = []
240 for c in record.annotations.get("taxonomy", []):
241 lineage.append([None, None, c])
242 if lineage:
243 lineage[-1][1] = "genus"
244 lineage.append([None, "species", record.annotations["organism"]])
245
246 if "subspecies" in record.annotations:
247 lineage.append([None, "subspecies",
248 record.annotations["subspecies"]])
249 if "variant" in record.annotations:
250 lineage.append([None, "varietas",
251 record.annotations["variant"]])
252 lineage[-1][0] = ncbi_taxon_id
253
254 left_value = self.adaptor.execute_one(
255 "SELECT MAX(left_value) FROM taxon")[0]
256 if not left_value:
257 left_value = 0
258 left_value += 1
259
260
261
262
263
264 right_start_value = self.adaptor.execute_one(
265 "SELECT MAX(right_value) FROM taxon")[0]
266 if not right_start_value:
267 right_start_value = 0
268 right_value = right_start_value + 2 * len(lineage) - 1
269
270 parent_taxon_id = None
271 for taxon in lineage:
272 self.adaptor.execute(
273 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
274 " left_value, right_value)"
275 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
276 taxon[0],
277 taxon[1],
278 left_value,
279 right_value))
280 taxon_id = self.adaptor.last_id("taxon")
281 self.adaptor.execute(
282 "INSERT INTO taxon_name(taxon_id, name, name_class)"
283 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
284
285
286 left_value += 1
287 right_value -= 1
288 parent_taxon_id = taxon_id
289 if common_name:
290 self.adaptor.execute(
291 "INSERT INTO taxon_name(taxon_id, name, name_class)"
292 "VALUES (%s, %s, 'common name')", (
293 taxon_id, common_name))
294
295 return taxon_id
296
298 """Map Entrez name terms to those used in taxdump (PRIVATE).
299
300 We need to make this conversion to match the taxon_name.name_class
301 values used by the BioSQL load_ncbi_taxonomy.pl script.
302
303 e.g.
304 "ScientificName" -> "scientific name",
305 "EquivalentName" -> "equivalent name",
306 "Synonym" -> "synonym",
307 """
308
309
310
311
312
313
314
315
316
317 def add_space(letter):
318 """Adds a space before a capital letter."""
319 if letter.isupper():
320 return " " + letter.lower()
321 else:
322 return letter
323 answer = "".join([add_space(letter) for letter in entrez_name]).strip()
324 assert answer == answer.lower()
325 return answer
326
330 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE).
331
332 ncbi_taxon_id - string containing an NCBI taxon id
333 scientific_name - string, used if a stub entry is recorded
334 common_name - string, used if a stub entry is recorded
335
336 This searches the taxon table using ONLY the NCBI taxon ID
337 to find the matching taxon table entry's ID (database key).
338
339 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy
340 flag is true, Biopython will attempt to go online using Bio.Entrez
341 to fetch the official NCBI lineage, recursing up the tree until an
342 existing entry is found in the database or the full lineage has been
343 fetched.
344
345 Otherwise the NCBI taxon ID, scientific name and common name are
346 recorded as a minimal stub entry in the taxon and taxon_name tables.
347 Any partial information about the lineage from the SeqRecord is NOT
348 recorded. This should mean that (re)running the BioSQL script
349 load_ncbi_taxonomy.pl can fill in the taxonomy lineage.
350
351 Returns the taxon id (database key for the taxon table, not
352 an NCBI taxon ID).
353 """
354 assert ncbi_taxon_id
355
356 taxon_id = self.adaptor.execute_and_fetch_col0(
357 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
358 (ncbi_taxon_id,))
359 if taxon_id:
360
361 return taxon_id[0]
362
363
364
365
366 parent_taxon_id = None
367 rank = "species"
368 genetic_code = None
369 mito_genetic_code = None
370 species_names = []
371 if scientific_name:
372 species_names.append(("scientific name", scientific_name))
373 if common_name:
374 species_names.append(("common name", common_name))
375
376 if self.fetch_NCBI_taxonomy:
377
378 handle = Entrez.efetch(db="taxonomy", id=ncbi_taxon_id, retmode="XML")
379 taxonomic_record = Entrez.read(handle)
380 if len(taxonomic_record) == 1:
381 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \
382 "%s versus %s" % (taxonomic_record[0]["TaxId"],
383 ncbi_taxon_id)
384 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(
385 taxonomic_record[0]["LineageEx"])
386 rank = taxonomic_record[0]["Rank"]
387 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"]
388 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"]
389 species_names = [("scientific name",
390 taxonomic_record[0]["ScientificName"])]
391 try:
392 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems():
393 name_class = self._fix_name_class(name_class)
394 if not isinstance(names, list):
395
396
397 names = [names]
398 for name in names:
399
400 if isinstance(name, basestring):
401 species_names.append((name_class, name))
402 except KeyError:
403
404
405 pass
406 else:
407 pass
408
409
410
411
412
413
414
415
416
417 self.adaptor.execute(
418 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"
419 " genetic_code, mito_genetic_code, left_value, right_value)"
420 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id,
421 ncbi_taxon_id,
422 rank,
423 genetic_code,
424 mito_genetic_code,
425 None,
426 None))
427 taxon_id = self.adaptor.last_id("taxon")
428
429
430 for name_class, name in species_names:
431 self.adaptor.execute(
432 "INSERT INTO taxon_name(taxon_id, name, name_class)"
433 " VALUES (%s, %s, %s)", (taxon_id,
434 name[:255],
435 name_class))
436 return taxon_id
437
439 """This is recursive! (PRIVATE).
440
441 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez
442
443 First dictionary in list is the taxonomy root, highest would be the species.
444 Each dictionary includes:
445 - TaxID (string, NCBI taxon id)
446 - Rank (string, e.g. "species", "genus", ..., "phylum", ...)
447 - ScientificName (string)
448 (and that is all at the time of writing)
449
450 This method will record all the lineage given, returning the taxon id
451 (database key, not NCBI taxon id) of the final entry (the species).
452 """
453 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"]
454
455
456 taxon_id = self.adaptor.execute_and_fetch_col0(
457 "SELECT taxon_id FROM taxon"
458 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id)
459 if taxon_id:
460
461
462 if isinstance(taxon_id, list):
463 assert len(taxon_id) == 1
464 return taxon_id[0]
465 else:
466 return taxon_id
467
468
469 if len(taxonomic_lineage) > 1:
470
471 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1])
472 assert _is_int_or_long(parent_taxon_id), repr(parent_taxon_id)
473 else:
474 parent_taxon_id = None
475
476
477 rank = taxonomic_lineage[-1].get("Rank", None)
478 self.adaptor.execute(
479 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"
480 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank))
481 taxon_id = self.adaptor.last_id("taxon")
482 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id)
483
484 scientific_name = taxonomic_lineage[-1].get("ScientificName", None)
485 if scientific_name:
486 self.adaptor.execute(
487 "INSERT INTO taxon_name(taxon_id, name, name_class)"
488 " VALUES (%s, %s, 'scientific name')", (taxon_id,
489 scientific_name[:255]))
490 return taxon_id
491
492 - def _load_bioentry_table(self, record):
493 """Fill the bioentry table with sequence information (PRIVATE).
494
495 record - SeqRecord object to add to the database.
496 """
497
498
499 if record.id.count(".") == 1:
500
501 accession, version = record.id.split('.')
502 try:
503 version = int(version)
504 except ValueError:
505 accession = record.id
506 version = 0
507 else:
508 accession = record.id
509 version = 0
510
511 if "accessions" in record.annotations \
512 and isinstance(record.annotations["accessions"], list) \
513 and record.annotations["accessions"]:
514
515 accession = record.annotations["accessions"][0]
516
517
518
519
520 taxon_id = self._get_taxon_id(record)
521
522 if "gi" in record.annotations:
523 identifier = record.annotations["gi"]
524 else:
525 identifier = record.id
526
527
528 description = getattr(record, 'description', None)
529 division = record.annotations.get("data_file_division", None)
530
531 sql = """
532 INSERT INTO bioentry (
533 biodatabase_id,
534 taxon_id,
535 name,
536 accession,
537 identifier,
538 division,
539 description,
540 version)
541 VALUES (
542 %s,
543 %s,
544 %s,
545 %s,
546 %s,
547 %s,
548 %s,
549 %s)"""
550
551
552 self.adaptor.execute(sql, (self.dbid,
553 taxon_id,
554 record.name,
555 accession,
556 identifier,
557 division,
558 description,
559 version))
560
561 bioentry_id = self.adaptor.last_id('bioentry')
562
563 return bioentry_id
564
565 - def _load_bioentry_date(self, record, bioentry_id):
566 """Add the effective date of the entry into the database.
567
568 record - a SeqRecord object with an annotated date
569 bioentry_id - corresponding database identifier
570 """
571
572
573 date = record.annotations.get("date",
574 strftime("%d-%b-%Y", gmtime()).upper())
575 if isinstance(date, list):
576 date = date[0]
577 annotation_tags_id = self._get_ontology_id("Annotation Tags")
578 date_id = self._get_term_id("date_changed", annotation_tags_id)
579 sql = r"INSERT INTO bioentry_qualifier_value" \
580 r" (bioentry_id, term_id, value, rank)" \
581 r" VALUES (%s, %s, %s, 1)"
582 self.adaptor.execute(sql, (bioentry_id, date_id, date))
583
585 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE).
586
587 record - a SeqRecord object with a seq property
588 bioentry_id - corresponding database identifier
589 """
590 if record.seq is None:
591
592
593 return
594
595
596 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
597 alphabet = "dna"
598 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
599 alphabet = "rna"
600 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
601 alphabet = "protein"
602 else:
603 alphabet = "unknown"
604
605 if isinstance(record.seq, UnknownSeq):
606 seq_str = None
607 else:
608 seq_str = str(record.seq)
609
610 sql = r"INSERT INTO biosequence (bioentry_id, version, " \
611 r"length, seq, alphabet) " \
612 r"VALUES (%s, 0, %s, %s, %s)"
613 self.adaptor.execute(sql, (bioentry_id,
614 len(record.seq),
615 seq_str,
616 alphabet))
617
638
640 """Record a SeqRecord's misc annotations in the database (PRIVATE).
641
642 The annotation strings are recorded in the bioentry_qualifier_value
643 table, except for special cases like the reference, comment and
644 taxonomy which are handled with their own tables.
645
646 record - a SeqRecord object with an annotations dictionary
647 bioentry_id - corresponding database identifier
648 """
649 mono_sql = "INSERT INTO bioentry_qualifier_value" \
650 "(bioentry_id, term_id, value)" \
651 " VALUES (%s, %s, %s)"
652 many_sql = "INSERT INTO bioentry_qualifier_value" \
653 "(bioentry_id, term_id, value, rank)" \
654 " VALUES (%s, %s, %s, %s)"
655 tag_ontology_id = self._get_ontology_id('Annotation Tags')
656 for key, value in record.annotations.iteritems():
657 if key in ["references", "comment", "ncbi_taxid", "date"]:
658
659 continue
660 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
661 if isinstance(value, list) or isinstance(value, tuple):
662 rank = 0
663 for entry in value:
664 if isinstance(entry, str) or isinstance(entry, int):
665
666 rank += 1
667 self.adaptor.execute(many_sql,
668 (bioentry_id, term_id, str(entry), rank))
669 else:
670 pass
671
672
673 elif isinstance(value, str) or isinstance(value, int):
674
675 self.adaptor.execute(mono_sql,
676 (bioentry_id, term_id, str(value)))
677 else:
678 pass
679
680
681
683 """Record a SeqRecord's annotated references in the database (PRIVATE).
684
685 record - a SeqRecord object with annotated references
686 bioentry_id - corresponding database identifier
687 """
688
689 refs = None
690 if reference.medline_id:
691 refs = self.adaptor.execute_and_fetch_col0(
692 "SELECT reference_id"
693 " FROM reference JOIN dbxref USING (dbxref_id)"
694 " WHERE dbname = 'MEDLINE' AND accession = %s",
695 (reference.medline_id,))
696 if not refs and reference.pubmed_id:
697 refs = self.adaptor.execute_and_fetch_col0(
698 "SELECT reference_id"
699 " FROM reference JOIN dbxref USING (dbxref_id)"
700 " WHERE dbname = 'PUBMED' AND accession = %s",
701 (reference.pubmed_id,))
702 if not refs:
703 s = []
704 for f in reference.authors, reference.title, reference.journal:
705 s.append(f or "<undef>")
706 crc = crc64("".join(s))
707 refs = self.adaptor.execute_and_fetch_col0(
708 "SELECT reference_id FROM reference"
709 r" WHERE crc = %s", (crc,))
710 if not refs:
711 if reference.medline_id:
712 dbxref_id = self._add_dbxref("MEDLINE",
713 reference.medline_id, 0)
714 elif reference.pubmed_id:
715 dbxref_id = self._add_dbxref("PUBMED",
716 reference.pubmed_id, 0)
717 else:
718 dbxref_id = None
719 authors = reference.authors or None
720 title = reference.title or None
721
722
723 journal = reference.journal or ""
724 self.adaptor.execute(
725 "INSERT INTO reference (dbxref_id, location,"
726 " title, authors, crc)"
727 " VALUES (%s, %s, %s, %s, %s)",
728 (dbxref_id, journal, title,
729 authors, crc))
730 reference_id = self.adaptor.last_id("reference")
731 else:
732 reference_id = refs[0]
733
734 if reference.location:
735 start = 1 + int(str(reference.location[0].start))
736 end = int(str(reference.location[0].end))
737 else:
738 start = None
739 end = None
740
741 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
742 " start_pos, end_pos, rank)" \
743 " VALUES (%s, %s, %s, %s, %s)"
744 self.adaptor.execute(sql, (bioentry_id, reference_id,
745 start, end, rank + 1))
746
754
756 """Load the first tables of a seqfeature and returns the id (PRIVATE).
757
758 This loads the "key" of the seqfeature (ie. CDS, gene) and
759 the basic seqfeature table itself.
760 """
761 ontology_id = self._get_ontology_id('SeqFeature Keys')
762 seqfeature_key_id = self._get_term_id(feature_type,
763 ontology_id=ontology_id)
764
765
766 source_cat_id = self._get_ontology_id('SeqFeature Sources')
767 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
768 ontology_id=source_cat_id)
769
770 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
771 r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
772 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
773 source_term_id, feature_rank + 1))
774 seqfeature_id = self.adaptor.last_id('seqfeature')
775
776 return seqfeature_id
777
779 """Load all of the locations for a SeqFeature into tables (PRIVATE).
780
781 This adds the locations related to the SeqFeature into the
782 seqfeature_location table. Fuzzies are not handled right now.
783 For a simple location, ie (1..2), we have a single table row
784 with seq_start = 1, seq_end = 2, location_rank = 1.
785
786 For split locations, ie (1..2, 3..4, 5..6) we would have three
787 row tables with:
788 start = 1, end = 2, rank = 1
789 start = 3, end = 4, rank = 2
790 start = 5, end = 6, rank = 3
791 """
792
793
794 if feature.location_operator and feature.location_operator != "join":
795
796
797 import warnings
798 from Bio import BiopythonWarning
799 warnings.warn("%s location operators are not fully supported"
800 % feature.location_operator, BiopythonWarning)
801
802
803 if not feature.sub_features:
804 self._insert_seqfeature_location(feature, 1, seqfeature_id)
805 else:
806 for rank, cur_feature in enumerate(feature.sub_features):
807 self._insert_seqfeature_location(cur_feature,
808 rank + 1,
809 seqfeature_id)
810
812 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE).
813
814 TODO - Add location_operators to location_qualifier_value.
815 """
816
817
818
819 start = int(feature.location.start) + 1
820 end = int(feature.location.end)
821
822
823
824
825 strand = feature.strand or 0
826
827
828
829
830
831 loc_term_id = None
832
833 if feature.ref:
834
835
836
837 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref)
838 else:
839 dbxref_id = None
840
841 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \
842 r"start_pos, end_pos, strand, rank) " \
843 r"VALUES (%s, %s, %s, %s, %s, %s, %s)"
844 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id,
845 start, end, strand, rank))
846
847 """
848 # See Bug 2677
849 # TODO - Record the location_operator (e.g. "join" or "order")
850 # using the location_qualifier_value table (which we and BioPerl
851 # have historically left empty).
852 # Note this will need an ontology term for the location qualifer
853 # (location_qualifier_value.term_id) for which oddly the schema
854 # does not allow NULL.
855 if feature.location_operator:
856 #e.g. "join" (common),
857 #or "order" (see Tests/GenBank/protein_refseq2.gb)
858 location_id = self.adaptor.last_id('location')
859 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1
860 sql = r"INSERT INTO location_qualifier_value" \
861 r"(location_id, term_id, value)" \
862 r"VALUES (%s, %s, %s)"
863 self.adaptor.execute(sql, (location_id, loc_qual_term_id,
864 feature.location_operator))
865 """
866
868 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE).
869
870 Qualifiers should be a dictionary of the form:
871 {key : [value1, value2]}
872 """
873 tag_ontology_id = self._get_ontology_id('Annotation Tags')
874 for qualifier_key in qualifiers:
875
876
877
878
879 if qualifier_key != 'db_xref':
880 qualifier_key_id = self._get_term_id(qualifier_key,
881 ontology_id=tag_ontology_id)
882
883 entries = qualifiers[qualifier_key]
884 if not isinstance(entries, list):
885
886
887 entries = [entries]
888 for qual_value_rank in range(len(entries)):
889 qualifier_value = entries[qual_value_rank]
890 sql = r"INSERT INTO seqfeature_qualifier_value "\
891 r" (seqfeature_id, term_id, rank, value) VALUES"\
892 r" (%s, %s, %s, %s)"
893 self.adaptor.execute(sql, (seqfeature_id,
894 qualifier_key_id,
895 qual_value_rank + 1,
896 qualifier_value))
897 else:
898
899
900
901
902 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
903 seqfeature_id)
904
906 """Add database crossreferences of a SeqFeature to the database (PRIVATE).
907
908 o dbxrefs List, dbxref data from the source file in the
909 format <database>:<accession>
910
911 o seqfeature_id Int, the identifier for the seqfeature in the
912 seqfeature table
913
914 Insert dbxref qualifier data for a seqfeature into the
915 seqfeature_dbxref and, if required, dbxref tables.
916 The dbxref_id qualifier/value sets go into the dbxref table
917 as dbname, accession, version tuples, with dbxref.dbxref_id
918 being automatically assigned, and into the seqfeature_dbxref
919 table as seqfeature_id, dbxref_id, and rank tuples
920 """
921
922
923
924
925 for rank, value in enumerate(dbxrefs):
926
927
928 try:
929 dbxref_data = value.replace(' ', '').replace('\n', '').split(':')
930 db = dbxref_data[0]
931 accessions = dbxref_data[1:]
932 except:
933 raise ValueError("Parsing of db_xref failed: '%s'" % value)
934
935
936 for accession in accessions:
937
938 dbxref_id = self._get_dbxref_id(db, accession)
939
940 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank + 1)
941
943 """ _get_dbxref_id(self, db, accession) -> Int
944
945 o db String, the name of the external database containing
946 the accession number
947
948 o accession String, the accession of the dbxref data
949
950 Finds and returns the dbxref_id for the passed data. The method
951 attempts to find an existing record first, and inserts the data
952 if there is no record.
953 """
954
955 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
956 r'AND accession = %s'
957 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
958
959
960 if dbxref_id:
961 return dbxref_id[0]
962 return self._add_dbxref(db, accession, 0)
963
965 """ Check for a pre-existing seqfeature_dbxref entry with the passed
966 seqfeature_id and dbxref_id. If one does not exist, insert new
967 data
968
969 """
970
971 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
972 r"WHERE seqfeature_id = %s AND dbxref_id = %s"
973 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
974 dbxref_id))
975
976
977 if result:
978 return result
979 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
980
982 """ Insert a seqfeature_dbxref row and return the seqfeature_id and
983 dbxref_id
984 """
985 sql = r'INSERT INTO seqfeature_dbxref ' \
986 '(seqfeature_id, dbxref_id, rank) VALUES' \
987 r'(%s, %s, %s)'
988 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
989 return (seqfeature_id, dbxref_id)
990
1014
1015 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1016 """ Check for a pre-existing bioentry_dbxref entry with the passed
1017 seqfeature_id and dbxref_id. If one does not exist, insert new
1018 data
1019
1020 """
1021
1022 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
1023 r"WHERE bioentry_id = %s AND dbxref_id = %s"
1024 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
1025 dbxref_id))
1026
1027
1028 if result:
1029 return result
1030 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1031
1032 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1033 """ Insert a bioentry_dbxref row and return the seqfeature_id and
1034 dbxref_id
1035 """
1036 sql = r'INSERT INTO bioentry_dbxref ' \
1037 '(bioentry_id,dbxref_id,rank) VALUES ' \
1038 '(%s, %s, %s)'
1039 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
1040 return (bioentry_id, dbxref_id)
1041
1042
1044 """Complement the Loader functionality by fully removing a database.
1045
1046 This probably isn't really useful for normal purposes, since you
1047 can just do a:
1048 DROP DATABASE db_name
1049 and then recreate the database. But, it's really useful for testing
1050 purposes.
1051
1052 YB: now use the cascaded deletions
1053 """
1055 """Initialize with a database id and adaptor connection.
1056 """
1057 self.adaptor = adaptor
1058 self.dbid = dbid
1059
1061 """Remove everything related to the given database id.
1062 """
1063 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
1064 self.adaptor.execute(sql, (self.dbid,))
1065 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
1066 self.adaptor.execute(sql, (self.dbid,))
1067