Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # 
  5  # This file is part of the Biopython distribution and governed by your 
  6  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  7  # Please see the LICENSE file that should have been included as part of this 
  8  # package. 
  9  # 
 10  # Note that BioSQL (including the database schema and scripts) is 
 11  # available and licensed separately.  Please consult www.biosql.org 
 12  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 13   
 14  This allows retrival of items stored in a BioSQL database using 
 15  a biopython-like SeqRecord and Seq interface. 
 16   
 17  Note: Currently we do not support recording per-letter-annotations 
 18  (like quality scores) in BioSQL. 
 19  """ 
 20   
 21  from Bio._py3k import unicode 
 22   
 23  from Bio import Alphabet 
 24  from Bio.Seq import Seq, UnknownSeq 
 25  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 26  from Bio import SeqFeature 
 27   
 28   
29 -class DBSeq(Seq):
30 """BioSQL equivalent of the Biopython Seq object.""" 31
32 - def __init__(self, primary_id, adaptor, alphabet, start, length):
33 """Create a new DBSeq object referring to a BioSQL entry. 34 35 You wouldn't normally create a DBSeq object yourself, this is done 36 for you when retreiving a DBSeqRecord object from the database. 37 """ 38 self.primary_id = primary_id 39 self.adaptor = adaptor 40 self.alphabet = alphabet 41 self._length = length 42 self.start = start
43
44 - def __len__(self):
45 """Return the length of the sequence.""" 46 return self._length
47
48 - def __getitem__(self, index): # Seq API requirement
49 """Return a subsequence or single letter.""" 50 # Note since Python 2.0, __getslice__ is deprecated 51 # and __getitem__ is used instead. 52 # See http://docs.python.org/ref/sequence-methods.html 53 if isinstance(index, int): 54 # Return a single letter as a string 55 i = index 56 if i < 0: 57 if -i > self._length: 58 raise IndexError(i) 59 i = i + self._length 60 elif i >= self._length: 61 raise IndexError(i) 62 return self.adaptor.get_subseq_as_string(self.primary_id, 63 self.start + i, 64 self.start + i + 1) 65 if not isinstance(index, slice): 66 raise TypeError("Unexpected index type") 67 68 # Return the (sub)sequence as another DBSeq or Seq object 69 # (see the Seq obect's __getitem__ method) 70 if index.start is None: 71 i = 0 72 else: 73 i = index.start 74 if i < 0: 75 # Map to equavilent positive index 76 if -i > self._length: 77 raise IndexError(i) 78 i = i + self._length 79 elif i >= self._length: 80 # Trivial case, should return empty string! 81 i = self._length 82 83 if index.stop is None: 84 j = self._length 85 else: 86 j = index.stop 87 if j < 0: 88 # Map to equavilent positive index 89 if -j > self._length: 90 raise IndexError(j) 91 j = j + self._length 92 elif j >= self._length: 93 j = self._length 94 95 if i >= j: 96 # Trivial case, empty string. 97 return Seq("", self.alphabet) 98 elif index.step is None or index.step == 1: 99 # Easy case - can return a DBSeq with the start and end adjusted 100 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 101 self.start + i, j - i) 102 else: 103 # Tricky. Will have to create a Seq object because of the stride 104 full = self.adaptor.get_subseq_as_string(self.primary_id, 105 self.start + i, 106 self.start + j) 107 return Seq(full[::index.step], self.alphabet)
108
109 - def tostring(self):
110 """Return the full sequence as a python string (DEPRECATED). 111 112 You are now encouraged to use str(my_seq) instead of 113 my_seq.tostring(). 114 """ 115 import warnings 116 warnings.warn("This method is obsolete; please use str(my_seq) " 117 "instead of my_seq.tostring().", 118 PendingDeprecationWarning) 119 return self.adaptor.get_subseq_as_string(self.primary_id, 120 self.start, 121 self.start + self._length)
122
123 - def __str__(self):
124 """Return the full sequence as a python string.""" 125 return self.adaptor.get_subseq_as_string(self.primary_id, 126 self.start, 127 self.start + self._length)
128 129 data = property(tostring, doc="Sequence as string (DEPRECATED)") 130
131 - def toseq(self):
132 """Return the full sequence as a Seq object.""" 133 # Note - the method name copies that of the MutableSeq object 134 return Seq(str(self), self.alphabet)
135
136 - def __add__(self, other):
137 """Add another sequence or string to this sequence. 138 139 The sequence is first converted to a Seq object before the addition. 140 The returned object is a Seq object, not a DBSeq object. 141 """ 142 # Let the Seq object deal with the alphabet issues etc 143 return self.toseq() + other
144
145 - def __radd__(self, other):
146 """Add another sequence or string to the left. 147 148 The sequence is first converted to a Seq object before the addition. 149 The returned object is a Seq object, not a DBSeq object. 150 """ 151 # Let the Seq object deal with the alphabet issues etc 152 return other + self.toseq()
153
154 - def __mul__(self, other):
155 """Multiply sequence by an integer. 156 157 The sequence is first converted to a Seq object before multiplication. 158 The returned object is a Seq object, not a DBSeq object. 159 """ 160 # Let the Seq object deal with the alphabet issues etc 161 return self.toseq() * other
162
163 - def __rmul__(self, other):
164 """Multiply integer by a sequence. 165 166 The sequence is first converted to a Seq object before multiplication. 167 The returned object is a Seq object, not a DBSeq object. 168 """ 169 # Let the Seq object deal with the alphabet issues etc 170 return other * self.toseq()
171
172 - def __imul__(self, other):
173 """Multiply sequence by integer in-place. 174 175 The sequence is first converted to a Seq object before multiplication. 176 The returned object is a Seq object, not a DBSeq object. 177 """ 178 # Let the Seq object deal with the alphabet issues etc 179 return self.toseq() * other
180 181
182 -def _retrieve_seq_len(adaptor, primary_id):
183 # The database schema ensures there will be only one matching row 184 seqs = adaptor.execute_and_fetchall( 185 "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)) 186 if not seqs: 187 return None 188 assert len(seqs) == 1 189 given_length, = seqs[0] 190 return int(given_length)
191 192
193 -def _retrieve_seq(adaptor, primary_id):
194 # The database schema ensures there will be only one matching 195 # row in the table. 196 197 # If an UnknownSeq was recorded, seq will be NULL, 198 # but length will be populated. This means length(seq) 199 # will return None. 200 seqs = adaptor.execute_and_fetchall( 201 "SELECT alphabet, length, length(seq) FROM biosequence" 202 " WHERE bioentry_id = %s", (primary_id,)) 203 if not seqs: 204 return 205 assert len(seqs) == 1 206 moltype, given_length, length = seqs[0] 207 208 try: 209 length = int(length) 210 given_length = int(length) 211 assert length == given_length 212 have_seq = True 213 except TypeError: 214 assert length is None 215 seqs = adaptor.execute_and_fetchall( 216 "SELECT alphabet, length, seq FROM biosequence" 217 " WHERE bioentry_id = %s", (primary_id,)) 218 assert len(seqs) == 1 219 moltype, given_length, seq = seqs[0] 220 assert seq is None or seq == "" 221 length = int(given_length) 222 have_seq = False 223 del seq 224 del given_length 225 226 moltype = moltype.lower() # might be upper case in database 227 # We have no way of knowing if these sequences will use IUPAC 228 # alphabets, and we certainly can't assume they are unambiguous! 229 if moltype == "dna": 230 alphabet = Alphabet.generic_dna 231 elif moltype == "rna": 232 alphabet = Alphabet.generic_rna 233 elif moltype == "protein": 234 alphabet = Alphabet.generic_protein 235 elif moltype == "unknown": 236 # This is used in BioSQL/Loader.py and would happen 237 # for any generic or nucleotide alphabets. 238 alphabet = Alphabet.single_letter_alphabet 239 else: 240 raise AssertionError("Unknown moltype: %s" % moltype) 241 242 if have_seq: 243 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 244 else: 245 return UnknownSeq(length, alphabet)
246 247
248 -def _retrieve_dbxrefs(adaptor, primary_id):
249 """Retrieve the database cross references for the sequence (PRIVATE).""" 250 _dbxrefs = [] 251 dbxrefs = adaptor.execute_and_fetchall( 252 "SELECT dbname, accession, version" 253 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 254 " WHERE bioentry_id = %s" 255 " ORDER BY rank", (primary_id,)) 256 for dbname, accession, version in dbxrefs: 257 if version and version != "0": 258 v = "%s.%s" % (accession, version) 259 else: 260 v = accession 261 _dbxrefs.append("%s:%s" % (dbname, v)) 262 return _dbxrefs
263 264
265 -def _retrieve_features(adaptor, primary_id):
266 sql = "SELECT seqfeature_id, type.name, rank" \ 267 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 268 " WHERE bioentry_id = %s" \ 269 " ORDER BY rank" 270 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 271 seq_feature_list = [] 272 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 273 # Get qualifiers [except for db_xref which is stored separately] 274 qvs = adaptor.execute_and_fetchall( 275 "SELECT name, value" 276 " FROM seqfeature_qualifier_value join term using (term_id)" 277 " WHERE seqfeature_id = %s" 278 " ORDER BY rank", (seqfeature_id,)) 279 qualifiers = {} 280 for qv_name, qv_value in qvs: 281 qualifiers.setdefault(qv_name, []).append(qv_value) 282 # Get db_xrefs [special case of qualifiers] 283 qvs = adaptor.execute_and_fetchall( 284 "SELECT dbxref.dbname, dbxref.accession" 285 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 286 " WHERE seqfeature_dbxref.seqfeature_id = %s" 287 " ORDER BY rank", (seqfeature_id,)) 288 for qv_name, qv_value in qvs: 289 value = "%s:%s" % (qv_name, qv_value) 290 qualifiers.setdefault("db_xref", []).append(value) 291 # Get locations 292 results = adaptor.execute_and_fetchall( 293 "SELECT location_id, start_pos, end_pos, strand" 294 " FROM location" 295 " WHERE seqfeature_id = %s" 296 " ORDER BY rank", (seqfeature_id,)) 297 locations = [] 298 # convert to Python standard form 299 # Convert strand = 0 to strand = None 300 # re: comment in Loader.py: 301 # Biopython uses None when we don't know strand information but 302 # BioSQL requires something (non null) and sets this as zero 303 # So we'll use the strand or 0 if Biopython spits out None 304 for location_id, start, end, strand in results: 305 if start: 306 start -= 1 307 if strand == 0: 308 strand = None 309 if strand not in (+1, -1, None): 310 raise ValueError("Invalid strand %s found in database for " 311 "seqfeature_id %s" % (strand, seqfeature_id)) 312 if start is not None and end is not None and end < start: 313 import warnings 314 from Bio import BiopythonWarning 315 warnings.warn("Inverted location start/end (%i and %i) for " 316 "seqfeature_id %s" % (start, end, seqfeature_id), 317 BiopythonWarning) 318 319 # For SwissProt unknown positions (?) 320 if start is None: 321 start = SeqFeature.UnknownPosition() 322 if end is None: 323 end = SeqFeature.UnknownPosition() 324 325 locations.append((location_id, start, end, strand)) 326 # Get possible remote reference information 327 remote_results = adaptor.execute_and_fetchall( 328 "SELECT location_id, dbname, accession, version" 329 " FROM location join dbxref using (dbxref_id)" 330 " WHERE seqfeature_id = %s", (seqfeature_id,)) 331 lookup = {} 332 for location_id, dbname, accession, version in remote_results: 333 if version and version != "0": 334 v = "%s.%s" % (accession, version) 335 else: 336 v = accession 337 # subfeature remote location db_ref are stored as a empty string 338 # when not present 339 if dbname == "": 340 dbname = None 341 lookup[location_id] = (dbname, v) 342 343 feature = SeqFeature.SeqFeature(type=seqfeature_type) 344 # Store the key as a private property 345 feature._seqfeature_id = seqfeature_id 346 feature.qualifiers = qualifiers 347 if len(locations) == 0: 348 pass 349 elif len(locations) == 1: 350 location_id, start, end, strand = locations[0] 351 # See Bug 2677, we currently don't record the location_operator 352 # For consistency with older versions Biopython, default to "". 353 feature.location_operator = \ 354 _retrieve_location_qualifier_value(adaptor, location_id) 355 dbname, version = lookup.get(location_id, (None, None)) 356 feature.location = SeqFeature.FeatureLocation(start, end) 357 feature.strand = strand 358 feature.ref_db = dbname 359 feature.ref = version 360 else: 361 locs = [] 362 for location in locations: 363 location_id, start, end, strand = location 364 dbname, version = lookup.get(location_id, (None, None)) 365 locs.append(SeqFeature.FeatureLocation(start, end, 366 strand=strand, 367 ref=version, 368 ref_db=dbname)) 369 # Locations are typically in biological in order (see negative 370 # strands below), but because of remote locations for 371 # sub-features they are not necessarily in numerical order: 372 strands = set(l.strand for l in locs) 373 if len(strands) == 1 and -1 in strands: 374 # Evil hack time for backwards compatibility 375 # TODO - Check if BioPerl and (old) Biopython did the same, 376 # we may have an existing incompatibility lurking here... 377 locs = locs[::-1] 378 feature.location = SeqFeature.CompoundLocation(locs, "join") 379 # TODO - See Bug 2677 - we don't yet record location operator, 380 # so for consistency with older versions of Biopython default 381 # to assuming its a join. 382 seq_feature_list.append(feature) 383 return seq_feature_list
384 385
386 -def _retrieve_location_qualifier_value(adaptor, location_id):
387 value = adaptor.execute_and_fetch_col0( 388 "SELECT value FROM location_qualifier_value" 389 " WHERE location_id = %s", (location_id,)) 390 try: 391 return value[0] 392 except IndexError: 393 return ""
394 395
396 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
397 annotations = {} 398 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 399 annotations.update(_retrieve_reference(adaptor, primary_id)) 400 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 401 annotations.update(_retrieve_comment(adaptor, primary_id)) 402 # Convert values into strings in cases of unicode from the database. 403 # BioSQL could eventually be expanded to be unicode aware. 404 str_anns = {} 405 for key, val in annotations.items(): 406 if isinstance(val, list): 407 val = [_make_unicode_into_string(x) for x in val] 408 elif isinstance(val, unicode): 409 val = str(val) 410 str_anns[key] = val 411 return str_anns
412 413
414 -def _make_unicode_into_string(text):
415 if isinstance(text, unicode): 416 return str(text) 417 else: 418 return text
419 420
421 -def _retrieve_qualifier_value(adaptor, primary_id):
422 qvs = adaptor.execute_and_fetchall( 423 "SELECT name, value" 424 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 425 " WHERE bioentry_id = %s" 426 " ORDER BY rank", (primary_id,)) 427 qualifiers = {} 428 for name, value in qvs: 429 if name == "keyword": 430 name = "keywords" 431 # See handling of "date" in Loader.py 432 elif name == "date_changed": 433 name = "date" 434 elif name == "secondary_accession": 435 name = "accessions" 436 qualifiers.setdefault(name, []).append(value) 437 return qualifiers
438 439
440 -def _retrieve_reference(adaptor, primary_id):
441 # XXX dbxref_qualifier_value 442 443 refs = adaptor.execute_and_fetchall( 444 "SELECT start_pos, end_pos, " 445 " location, title, authors," 446 " dbname, accession" 447 " FROM bioentry_reference" 448 " JOIN reference USING (reference_id)" 449 " LEFT JOIN dbxref USING (dbxref_id)" 450 " WHERE bioentry_id = %s" 451 " ORDER BY rank", (primary_id,)) 452 references = [] 453 for start, end, location, title, authors, dbname, accession in refs: 454 reference = SeqFeature.Reference() 455 # If the start/end are missing, reference.location is an empty list 456 if (start is not None) or (end is not None): 457 if start is not None: 458 start -= 1 # python counting 459 reference.location = [SeqFeature.FeatureLocation(start, end)] 460 # Don't replace the default "" with None. 461 if authors: 462 reference.authors = authors 463 if title: 464 reference.title = title 465 reference.journal = location 466 if dbname == 'PUBMED': 467 reference.pubmed_id = accession 468 elif dbname == 'MEDLINE': 469 reference.medline_id = accession 470 references.append(reference) 471 if references: 472 return {'references': references} 473 else: 474 return {}
475 476
477 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
478 a = {} 479 common_names = adaptor.execute_and_fetch_col0( 480 "SELECT name FROM taxon_name WHERE taxon_id = %s" 481 " AND name_class = 'genbank common name'", (taxon_id,)) 482 if common_names: 483 a['source'] = common_names[0] 484 scientific_names = adaptor.execute_and_fetch_col0( 485 "SELECT name FROM taxon_name WHERE taxon_id = %s" 486 " AND name_class = 'scientific name'", (taxon_id,)) 487 if scientific_names: 488 a['organism'] = scientific_names[0] 489 ncbi_taxids = adaptor.execute_and_fetch_col0( 490 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 491 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 492 a['ncbi_taxid'] = ncbi_taxids[0] 493 494 # Old code used the left/right values in the taxon table to get the 495 # taxonomy lineage in one SQL command. This was actually very slow, 496 # and would fail if the (optional) left/right values were missing. 497 # 498 # The following code is based on a contribution from Eric Gibert, and 499 # relies on the taxon table's parent_taxon_id field only (ignoring the 500 # optional left/right values). This means that it has to make a 501 # separate SQL query for each entry in the lineage, but it does still 502 # appear to be *much* faster. See Bug 2494. 503 taxonomy = [] 504 while taxon_id: 505 name, rank, parent_taxon_id = adaptor.execute_one( 506 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 507 " FROM taxon, taxon_name" 508 " WHERE taxon.taxon_id=taxon_name.taxon_id" 509 " AND taxon_name.name_class='scientific name'" 510 " AND taxon.taxon_id = %s", (taxon_id,)) 511 if taxon_id == parent_taxon_id: 512 # If the taxon table has been populated by the BioSQL script 513 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 514 # Personally, I would have used a NULL parent_taxon_id here. 515 break 516 517 taxonomy.insert(0, name) 518 taxon_id = parent_taxon_id 519 520 if taxonomy: 521 a['taxonomy'] = taxonomy 522 return a
523 524
525 -def _retrieve_comment(adaptor, primary_id):
526 qvs = adaptor.execute_and_fetchall( 527 "SELECT comment_text FROM comment" 528 " WHERE bioentry_id=%s" 529 " ORDER BY rank", (primary_id,)) 530 comments = [comm[0] for comm in qvs] 531 # Don't want to add an empty list... 532 if comments: 533 return {"comment": comments} 534 else: 535 return {}
536 537
538 -class DBSeqRecord(SeqRecord):
539 """BioSQL equivalent of the Biopython SeqRecord object.""" 540
541 - def __init__(self, adaptor, primary_id):
542 """Create a DBSeqRecord object. 543 544 Arguments: 545 - adaptor - A BioSQL.BioSeqDatabase.Adaptor object 546 - primary_id - An internal integer ID used by BioSQL 547 548 You wouldn't normally create a DBSeqRecord object yourself, 549 this is done for you when using a BioSeqDatabase object 550 """ 551 self._adaptor = adaptor 552 self._primary_id = primary_id 553 554 (self._biodatabase_id, self._taxon_id, self.name, 555 accession, version, self._identifier, 556 self._division, self.description) = self._adaptor.execute_one( 557 "SELECT biodatabase_id, taxon_id, name, accession, version," 558 " identifier, division, description" 559 " FROM bioentry" 560 " WHERE bioentry_id = %s", (self._primary_id,)) 561 if version and version != "0": 562 self.id = "%s.%s" % (accession, version) 563 else: 564 self.id = accession 565 # We don't yet record any per-letter-annotations in the 566 # BioSQL database, but we should set this property up 567 # for completeness (and the __str__ method). 568 # We do NOT want to load the sequence from the DB here! 569 length = _retrieve_seq_len(adaptor, primary_id) 570 self._per_letter_annotations = _RestrictedDict(length=length)
571
572 - def __get_seq(self):
573 if not hasattr(self, "_seq"): 574 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 575 return self._seq
576
577 - def __set_seq(self, seq):
578 # TODO - Check consistent with self._per_letter_annotations 579 self._seq = seq
580
581 - def __del_seq(self):
582 del self._seq
583 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 584
585 - def __get_dbxrefs(self):
586 if not hasattr(self, "_dbxrefs"): 587 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 588 return self._dbxrefs
589
590 - def __set_dbxrefs(self, dbxrefs):
591 self._dbxrefs = dbxrefs
592
593 - def __del_dbxrefs(self):
594 del self._dbxrefs
595 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 596 "Database cross references") 597
598 - def __get_features(self):
599 if not hasattr(self, "_features"): 600 self._features = _retrieve_features(self._adaptor, 601 self._primary_id) 602 return self._features
603
604 - def __set_features(self, features):
605 self._features = features
606
607 - def __del_features(self):
608 del self._features
609 features = property(__get_features, __set_features, __del_features, 610 "Features") 611
612 - def __get_annotations(self):
613 if not hasattr(self, "_annotations"): 614 self._annotations = _retrieve_annotations(self._adaptor, 615 self._primary_id, 616 self._taxon_id) 617 if self._identifier: 618 self._annotations["gi"] = self._identifier 619 if self._division: 620 self._annotations["data_file_division"] = self._division 621 return self._annotations
622
623 - def __set_annotations(self, annotations):
624 self._annotations = annotations
625
626 - def __del_annotations(self):
627 del self._annotations
628 annotations = property(__get_annotations, __set_annotations, 629 __del_annotations, "Annotations")
630