Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2016 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio._py3k import unicode 
 20   
 21  from Bio import Alphabet 
 22  from Bio.Seq import Seq, UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 24  from Bio import SeqFeature 
 25   
 26   
27 -class DBSeq(Seq):
28 """BioSQL equivalent of the Biopython Seq object.""" 29
30 - def __init__(self, primary_id, adaptor, alphabet, start, length):
31 """Create a new DBSeq object referring to a BioSQL entry. 32 33 You wouldn't normally create a DBSeq object yourself, this is done 34 for you when retreiving a DBSeqRecord object from the database. 35 """ 36 self.primary_id = primary_id 37 self.adaptor = adaptor 38 self.alphabet = alphabet 39 self._length = length 40 self.start = start
41
42 - def __len__(self):
43 """Return the length of the sequence.""" 44 return self._length
45
46 - def __getitem__(self, index): # Seq API requirement
47 """Return a subsequence or single letter.""" 48 # Note since Python 2.0, __getslice__ is deprecated 49 # and __getitem__ is used instead. 50 # See http://docs.python.org/ref/sequence-methods.html 51 if isinstance(index, int): 52 # Return a single letter as a string 53 i = index 54 if i < 0: 55 if -i > self._length: 56 raise IndexError(i) 57 i = i + self._length 58 elif i >= self._length: 59 raise IndexError(i) 60 return self.adaptor.get_subseq_as_string(self.primary_id, 61 self.start + i, 62 self.start + i + 1) 63 if not isinstance(index, slice): 64 raise TypeError("Unexpected index type") 65 66 # Return the (sub)sequence as another DBSeq or Seq object 67 # (see the Seq obect's __getitem__ method) 68 if index.start is None: 69 i = 0 70 else: 71 i = index.start 72 if i < 0: 73 # Map to equavilent positive index 74 if -i > self._length: 75 raise IndexError(i) 76 i = i + self._length 77 elif i >= self._length: 78 # Trivial case, should return empty string! 79 i = self._length 80 81 if index.stop is None: 82 j = self._length 83 else: 84 j = index.stop 85 if j < 0: 86 # Map to equavilent positive index 87 if -j > self._length: 88 raise IndexError(j) 89 j = j + self._length 90 elif j >= self._length: 91 j = self._length 92 93 if i >= j: 94 # Trivial case, empty string. 95 return Seq("", self.alphabet) 96 elif index.step is None or index.step == 1: 97 # Easy case - can return a DBSeq with the start and end adjusted 98 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 99 self.start + i, j - i) 100 else: 101 # Tricky. Will have to create a Seq object because of the stride 102 full = self.adaptor.get_subseq_as_string(self.primary_id, 103 self.start + i, 104 self.start + j) 105 return Seq(full[::index.step], self.alphabet)
106
107 - def tostring(self):
108 """Return the full sequence as a python string (DEPRECATED). 109 110 You are now encouraged to use str(my_seq) instead of 111 my_seq.tostring(). 112 """ 113 import warnings 114 warnings.warn("This method is obsolete; please use str(my_seq) " 115 "instead of my_seq.tostring().", 116 PendingDeprecationWarning) 117 return self.adaptor.get_subseq_as_string(self.primary_id, 118 self.start, 119 self.start + self._length)
120
121 - def __str__(self):
122 """Return the full sequence as a python string.""" 123 return self.adaptor.get_subseq_as_string(self.primary_id, 124 self.start, 125 self.start + self._length)
126 127 data = property(tostring, doc="Sequence as string (DEPRECATED)") 128
129 - def toseq(self):
130 """Return the full sequence as a Seq object.""" 131 # Note - the method name copies that of the MutableSeq object 132 return Seq(str(self), self.alphabet)
133
134 - def __add__(self, other):
135 """Add another sequence or string to this sequence. 136 137 The sequence is first converted to a Seq object before the addition. 138 The returned object is a Seq object, not a DBSeq object 139 """ 140 # Let the Seq object deal with the alphabet issues etc 141 return self.toseq() + other
142
143 - def __radd__(self, other):
144 """Add another sequence or string to the left. 145 146 The sequence is first converted to a Seq object before the addition. 147 The returned object is a Seq object, not a DBSeq object 148 """ 149 # Let the Seq object deal with the alphabet issues etc 150 return other + self.toseq()
151 152
153 -def _retrieve_seq_len(adaptor, primary_id):
154 # The database schema ensures there will be only one matching row 155 seqs = adaptor.execute_and_fetchall( 156 "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)) 157 if not seqs: 158 return None 159 assert len(seqs) == 1 160 given_length, = seqs[0] 161 return int(given_length)
162 163
164 -def _retrieve_seq(adaptor, primary_id):
165 # The database schema ensures there will be only one matching 166 # row in the table. 167 168 # If an UnknownSeq was recorded, seq will be NULL, 169 # but length will be populated. This means length(seq) 170 # will return None. 171 seqs = adaptor.execute_and_fetchall( 172 "SELECT alphabet, length, length(seq) FROM biosequence" 173 " WHERE bioentry_id = %s", (primary_id,)) 174 if not seqs: 175 return 176 assert len(seqs) == 1 177 moltype, given_length, length = seqs[0] 178 179 try: 180 length = int(length) 181 given_length = int(length) 182 assert length == given_length 183 have_seq = True 184 except TypeError: 185 assert length is None 186 seqs = adaptor.execute_and_fetchall( 187 "SELECT alphabet, length, seq FROM biosequence" 188 " WHERE bioentry_id = %s", (primary_id,)) 189 assert len(seqs) == 1 190 moltype, given_length, seq = seqs[0] 191 assert seq is None or seq == "" 192 length = int(given_length) 193 have_seq = False 194 del seq 195 del given_length 196 197 moltype = moltype.lower() # might be upper case in database 198 # We have no way of knowing if these sequences will use IUPAC 199 # alphabets, and we certainly can't assume they are unambiguous! 200 if moltype == "dna": 201 alphabet = Alphabet.generic_dna 202 elif moltype == "rna": 203 alphabet = Alphabet.generic_rna 204 elif moltype == "protein": 205 alphabet = Alphabet.generic_protein 206 elif moltype == "unknown": 207 # This is used in BioSQL/Loader.py and would happen 208 # for any generic or nucleotide alphabets. 209 alphabet = Alphabet.single_letter_alphabet 210 else: 211 raise AssertionError("Unknown moltype: %s" % moltype) 212 213 if have_seq: 214 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 215 else: 216 return UnknownSeq(length, alphabet)
217 218
219 -def _retrieve_dbxrefs(adaptor, primary_id):
220 """Retrieve the database cross references for the sequence.""" 221 _dbxrefs = [] 222 dbxrefs = adaptor.execute_and_fetchall( 223 "SELECT dbname, accession, version" 224 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 225 " WHERE bioentry_id = %s" 226 " ORDER BY rank", (primary_id,)) 227 for dbname, accession, version in dbxrefs: 228 if version and version != "0": 229 v = "%s.%s" % (accession, version) 230 else: 231 v = accession 232 _dbxrefs.append("%s:%s" % (dbname, v)) 233 return _dbxrefs
234 235
236 -def _retrieve_features(adaptor, primary_id):
237 sql = "SELECT seqfeature_id, type.name, rank" \ 238 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 239 " WHERE bioentry_id = %s" \ 240 " ORDER BY rank" 241 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 242 seq_feature_list = [] 243 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 244 # Get qualifiers [except for db_xref which is stored separately] 245 qvs = adaptor.execute_and_fetchall( 246 "SELECT name, value" 247 " FROM seqfeature_qualifier_value join term using (term_id)" 248 " WHERE seqfeature_id = %s" 249 " ORDER BY rank", (seqfeature_id,)) 250 qualifiers = {} 251 for qv_name, qv_value in qvs: 252 qualifiers.setdefault(qv_name, []).append(qv_value) 253 # Get db_xrefs [special case of qualifiers] 254 qvs = adaptor.execute_and_fetchall( 255 "SELECT dbxref.dbname, dbxref.accession" 256 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 257 " WHERE seqfeature_dbxref.seqfeature_id = %s" 258 " ORDER BY rank", (seqfeature_id,)) 259 for qv_name, qv_value in qvs: 260 value = "%s:%s" % (qv_name, qv_value) 261 qualifiers.setdefault("db_xref", []).append(value) 262 # Get locations 263 results = adaptor.execute_and_fetchall( 264 "SELECT location_id, start_pos, end_pos, strand" 265 " FROM location" 266 " WHERE seqfeature_id = %s" 267 " ORDER BY rank", (seqfeature_id,)) 268 locations = [] 269 # convert to Python standard form 270 # Convert strand = 0 to strand = None 271 # re: comment in Loader.py: 272 # Biopython uses None when we don't know strand information but 273 # BioSQL requires something (non null) and sets this as zero 274 # So we'll use the strand or 0 if Biopython spits out None 275 for location_id, start, end, strand in results: 276 if start: 277 start -= 1 278 if strand == 0: 279 strand = None 280 if strand not in (+1, -1, None): 281 raise ValueError("Invalid strand %s found in database for " 282 "seqfeature_id %s" % (strand, seqfeature_id)) 283 if start is not None and end is not None and end < start: 284 import warnings 285 from Bio import BiopythonWarning 286 warnings.warn("Inverted location start/end (%i and %i) for " 287 "seqfeature_id %s" % (start, end, seqfeature_id), 288 BiopythonWarning) 289 290 # For SwissProt unknown positions (?) 291 if start is None: 292 start = SeqFeature.UnknownPosition() 293 if end is None: 294 end = SeqFeature.UnknownPosition() 295 296 locations.append((location_id, start, end, strand)) 297 # Get possible remote reference information 298 remote_results = adaptor.execute_and_fetchall( 299 "SELECT location_id, dbname, accession, version" 300 " FROM location join dbxref using (dbxref_id)" 301 " WHERE seqfeature_id = %s", (seqfeature_id,)) 302 lookup = {} 303 for location_id, dbname, accession, version in remote_results: 304 if version and version != "0": 305 v = "%s.%s" % (accession, version) 306 else: 307 v = accession 308 # subfeature remote location db_ref are stored as a empty string 309 # when not present 310 if dbname == "": 311 dbname = None 312 lookup[location_id] = (dbname, v) 313 314 feature = SeqFeature.SeqFeature(type=seqfeature_type) 315 # Store the key as a private property 316 feature._seqfeature_id = seqfeature_id 317 feature.qualifiers = qualifiers 318 if len(locations) == 0: 319 pass 320 elif len(locations) == 1: 321 location_id, start, end, strand = locations[0] 322 # See Bug 2677, we currently don't record the location_operator 323 # For consistency with older versions Biopython, default to "". 324 feature.location_operator = \ 325 _retrieve_location_qualifier_value(adaptor, location_id) 326 dbname, version = lookup.get(location_id, (None, None)) 327 feature.location = SeqFeature.FeatureLocation(start, end) 328 feature.strand = strand 329 feature.ref_db = dbname 330 feature.ref = version 331 else: 332 locs = [] 333 for location in locations: 334 location_id, start, end, strand = location 335 dbname, version = lookup.get(location_id, (None, None)) 336 locs.append(SeqFeature.FeatureLocation(start, end, 337 strand=strand, 338 ref=version, 339 ref_db=dbname)) 340 # Locations are typically in biological in order (see negative 341 # strands below), but because of remote locations for 342 # sub-features they are not necessarily in numerical order: 343 strands = set(l.strand for l in locs) 344 if len(strands) == 1 and -1 in strands: 345 # Evil hack time for backwards compatibility 346 # TODO - Check if BioPerl and (old) Biopython did the same, 347 # we may have an existing incompatibility lurking here... 348 locs = locs[::-1] 349 feature.location = SeqFeature.CompoundLocation(locs, "join") 350 # TODO - See Bug 2677 - we don't yet record location operator, 351 # so for consistency with older versions of Biopython default 352 # to assuming its a join. 353 seq_feature_list.append(feature) 354 return seq_feature_list
355 356
357 -def _retrieve_location_qualifier_value(adaptor, location_id):
358 value = adaptor.execute_and_fetch_col0( 359 "SELECT value FROM location_qualifier_value" 360 " WHERE location_id = %s", (location_id,)) 361 try: 362 return value[0] 363 except IndexError: 364 return ""
365 366
367 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
368 annotations = {} 369 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 370 annotations.update(_retrieve_reference(adaptor, primary_id)) 371 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 372 annotations.update(_retrieve_comment(adaptor, primary_id)) 373 # Convert values into strings in cases of unicode from the database. 374 # BioSQL could eventually be expanded to be unicode aware. 375 str_anns = {} 376 for key, val in annotations.items(): 377 if isinstance(val, list): 378 val = [_make_unicode_into_string(x) for x in val] 379 elif isinstance(val, unicode): 380 val = str(val) 381 str_anns[key] = val 382 return str_anns
383 384
385 -def _make_unicode_into_string(text):
386 if isinstance(text, unicode): 387 return str(text) 388 else: 389 return text
390 391
392 -def _retrieve_qualifier_value(adaptor, primary_id):
393 qvs = adaptor.execute_and_fetchall( 394 "SELECT name, value" 395 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 396 " WHERE bioentry_id = %s" 397 " ORDER BY rank", (primary_id,)) 398 qualifiers = {} 399 for name, value in qvs: 400 if name == "keyword": 401 name = "keywords" 402 # See handling of "date" in Loader.py 403 elif name == "date_changed": 404 name = "date" 405 elif name == "secondary_accession": 406 name = "accessions" 407 qualifiers.setdefault(name, []).append(value) 408 return qualifiers
409 410
411 -def _retrieve_reference(adaptor, primary_id):
412 # XXX dbxref_qualifier_value 413 414 refs = adaptor.execute_and_fetchall( 415 "SELECT start_pos, end_pos, " 416 " location, title, authors," 417 " dbname, accession" 418 " FROM bioentry_reference" 419 " JOIN reference USING (reference_id)" 420 " LEFT JOIN dbxref USING (dbxref_id)" 421 " WHERE bioentry_id = %s" 422 " ORDER BY rank", (primary_id,)) 423 references = [] 424 for start, end, location, title, authors, dbname, accession in refs: 425 reference = SeqFeature.Reference() 426 # If the start/end are missing, reference.location is an empty list 427 if (start is not None) or (end is not None): 428 if start is not None: 429 start -= 1 # python counting 430 reference.location = [SeqFeature.FeatureLocation(start, end)] 431 # Don't replace the default "" with None. 432 if authors: 433 reference.authors = authors 434 if title: 435 reference.title = title 436 reference.journal = location 437 if dbname == 'PUBMED': 438 reference.pubmed_id = accession 439 elif dbname == 'MEDLINE': 440 reference.medline_id = accession 441 references.append(reference) 442 if references: 443 return {'references': references} 444 else: 445 return {}
446 447
448 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
449 a = {} 450 common_names = adaptor.execute_and_fetch_col0( 451 "SELECT name FROM taxon_name WHERE taxon_id = %s" 452 " AND name_class = 'genbank common name'", (taxon_id,)) 453 if common_names: 454 a['source'] = common_names[0] 455 scientific_names = adaptor.execute_and_fetch_col0( 456 "SELECT name FROM taxon_name WHERE taxon_id = %s" 457 " AND name_class = 'scientific name'", (taxon_id,)) 458 if scientific_names: 459 a['organism'] = scientific_names[0] 460 ncbi_taxids = adaptor.execute_and_fetch_col0( 461 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 462 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 463 a['ncbi_taxid'] = ncbi_taxids[0] 464 465 # Old code used the left/right values in the taxon table to get the 466 # taxonomy lineage in one SQL command. This was actually very slow, 467 # and would fail if the (optional) left/right values were missing. 468 # 469 # The following code is based on a contribution from Eric Gibert, and 470 # relies on the taxon table's parent_taxon_id field only (ignoring the 471 # optional left/right values). This means that it has to make a 472 # separate SQL query for each entry in the lineage, but it does still 473 # appear to be *much* faster. See Bug 2494. 474 taxonomy = [] 475 while taxon_id: 476 name, rank, parent_taxon_id = adaptor.execute_one( 477 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 478 " FROM taxon, taxon_name" 479 " WHERE taxon.taxon_id=taxon_name.taxon_id" 480 " AND taxon_name.name_class='scientific name'" 481 " AND taxon.taxon_id = %s", (taxon_id,)) 482 if taxon_id == parent_taxon_id: 483 # If the taxon table has been populated by the BioSQL script 484 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 485 # Personally, I would have used a NULL parent_taxon_id here. 486 break 487 488 taxonomy.insert(0, name) 489 taxon_id = parent_taxon_id 490 491 if taxonomy: 492 a['taxonomy'] = taxonomy 493 return a
494 495
496 -def _retrieve_comment(adaptor, primary_id):
497 qvs = adaptor.execute_and_fetchall( 498 "SELECT comment_text FROM comment" 499 " WHERE bioentry_id=%s" 500 " ORDER BY rank", (primary_id,)) 501 comments = [comm[0] for comm in qvs] 502 # Don't want to add an empty list... 503 if comments: 504 return {"comment": comments} 505 else: 506 return {}
507 508
509 -class DBSeqRecord(SeqRecord):
510 """BioSQL equivalent of the Biopython SeqRecord object.""" 511
512 - def __init__(self, adaptor, primary_id):
513 """Create a DBSeqRecord object. 514 515 Arguments: 516 - adaptor - A BioSQL.BioSeqDatabase.Adaptor object 517 - primary_id - An internal integer ID used by BioSQL 518 519 You wouldn't normally create a DBSeqRecord object yourself, 520 this is done for you when using a BioSeqDatabase object 521 """ 522 self._adaptor = adaptor 523 self._primary_id = primary_id 524 525 (self._biodatabase_id, self._taxon_id, self.name, 526 accession, version, self._identifier, 527 self._division, self.description) = self._adaptor.execute_one( 528 "SELECT biodatabase_id, taxon_id, name, accession, version," 529 " identifier, division, description" 530 " FROM bioentry" 531 " WHERE bioentry_id = %s", (self._primary_id,)) 532 if version and version != "0": 533 self.id = "%s.%s" % (accession, version) 534 else: 535 self.id = accession 536 # We don't yet record any per-letter-annotations in the 537 # BioSQL database, but we should set this property up 538 # for completeness (and the __str__ method). 539 # We do NOT want to load the sequence from the DB here! 540 length = _retrieve_seq_len(adaptor, primary_id) 541 self._per_letter_annotations = _RestrictedDict(length=length)
542
543 - def __get_seq(self):
544 if not hasattr(self, "_seq"): 545 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 546 return self._seq
547
548 - def __set_seq(self, seq):
549 # TODO - Check consistent with self._per_letter_annotations 550 self._seq = seq
551
552 - def __del_seq(self):
553 del self._seq
554 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 555
556 - def __get_dbxrefs(self):
557 if not hasattr(self, "_dbxrefs"): 558 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 559 return self._dbxrefs
560
561 - def __set_dbxrefs(self, dbxrefs):
562 self._dbxrefs = dbxrefs
563
564 - def __del_dbxrefs(self):
565 del self._dbxrefs
566 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 567 "Database cross references") 568
569 - def __get_features(self):
570 if not hasattr(self, "_features"): 571 self._features = _retrieve_features(self._adaptor, 572 self._primary_id) 573 return self._features
574
575 - def __set_features(self, features):
576 self._features = features
577
578 - def __del_features(self):
579 del self._features
580 features = property(__get_features, __set_features, __del_features, 581 "Features") 582
583 - def __get_annotations(self):
584 if not hasattr(self, "_annotations"): 585 self._annotations = _retrieve_annotations(self._adaptor, 586 self._primary_id, 587 self._taxon_id) 588 if self._identifier: 589 self._annotations["gi"] = self._identifier 590 if self._division: 591 self._annotations["data_file_division"] = self._division 592 return self._annotations
593
594 - def __set_annotations(self, annotations):
595 self._annotations = annotations
596
597 - def __del_annotations(self):
598 del self._annotations
599 annotations = property(__get_annotations, __set_annotations, 600 __del_annotations, "Annotations")
601