Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio._py3k import unicode 
 20   
 21  from Bio import Alphabet 
 22  from Bio.Seq import Seq, UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 24  from Bio import SeqFeature 
 25   
 26   
27 -class DBSeq(Seq):
28 """BioSQL equivalent of the Biopython Seq object.""" 29
30 - def __init__(self, primary_id, adaptor, alphabet, start, length):
31 """Create a new DBSeq object referring to a BioSQL entry. 32 33 You wouldn't normally create a DBSeq object yourself, this is done 34 for you when retreiving a DBSeqRecord object from the database. 35 """ 36 self.primary_id = primary_id 37 self.adaptor = adaptor 38 self.alphabet = alphabet 39 self._length = length 40 self.start = start
41
42 - def __len__(self):
43 return self._length
44
45 - def __getitem__(self, index): # Seq API requirement
46 # Note since Python 2.0, __getslice__ is deprecated 47 # and __getitem__ is used instead. 48 # See http://docs.python.org/ref/sequence-methods.html 49 if isinstance(index, int): 50 # Return a single letter as a string 51 i = index 52 if i < 0: 53 if -i > self._length: 54 raise IndexError(i) 55 i = i + self._length 56 elif i >= self._length: 57 raise IndexError(i) 58 return self.adaptor.get_subseq_as_string(self.primary_id, 59 self.start + i, 60 self.start + i + 1) 61 if not isinstance(index, slice): 62 raise ValueError("Unexpected index type") 63 64 # Return the (sub)sequence as another DBSeq or Seq object 65 # (see the Seq obect's __getitem__ method) 66 if index.start is None: 67 i = 0 68 else: 69 i = index.start 70 if i < 0: 71 # Map to equavilent positive index 72 if -i > self._length: 73 raise IndexError(i) 74 i = i + self._length 75 elif i >= self._length: 76 # Trivial case, should return empty string! 77 i = self._length 78 79 if index.stop is None: 80 j = self._length 81 else: 82 j = index.stop 83 if j < 0: 84 # Map to equavilent positive index 85 if -j > self._length: 86 raise IndexError(j) 87 j = j + self._length 88 elif j >= self._length: 89 j = self._length 90 91 if i >= j: 92 # Trivial case, empty string. 93 return Seq("", self.alphabet) 94 elif index.step is None or index.step == 1: 95 # Easy case - can return a DBSeq with the start and end adjusted 96 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 97 self.start + i, j - i) 98 else: 99 # Tricky. Will have to create a Seq object because of the stride 100 full = self.adaptor.get_subseq_as_string(self.primary_id, 101 self.start + i, 102 self.start + j) 103 return Seq(full[::index.step], self.alphabet)
104
105 - def tostring(self):
106 """Returns the full sequence as a python string (DEPRECATED). 107 108 You are now encouraged to use str(my_seq) instead of 109 my_seq.tostring().""" 110 import warnings 111 warnings.warn("This method is obsolete; please use str(my_seq) " 112 "instead of my_seq.tostring().", 113 PendingDeprecationWarning) 114 return self.adaptor.get_subseq_as_string(self.primary_id, 115 self.start, 116 self.start + self._length)
117
118 - def __str__(self):
119 """Returns the full sequence as a python string.""" 120 return self.adaptor.get_subseq_as_string(self.primary_id, 121 self.start, 122 self.start + self._length)
123 124 data = property(tostring, doc="Sequence as string (DEPRECATED)") 125
126 - def toseq(self):
127 """Returns the full sequence as a Seq object.""" 128 # Note - the method name copies that of the MutableSeq object 129 return Seq(str(self), self.alphabet)
130
131 - def __add__(self, other):
132 # Let the Seq object deal with the alphabet issues etc 133 return self.toseq() + other
134
135 - def __radd__(self, other):
136 # Let the Seq object deal with the alphabet issues etc 137 return other + self.toseq()
138 139
140 -def _retrieve_seq_len(adaptor, primary_id):
141 # The database schema ensures there will be only one matching row 142 seqs = adaptor.execute_and_fetchall( 143 "SELECT length FROM biosequence WHERE bioentry_id = %s", (primary_id,)) 144 if not seqs: 145 return None 146 assert len(seqs) == 1 147 given_length, = seqs[0] 148 return int(given_length)
149 150
151 -def _retrieve_seq(adaptor, primary_id):
152 # The database schema ensures there will be only one matching 153 # row in the table. 154 155 # If an UnknownSeq was recorded, seq will be NULL, 156 # but length will be populated. This means length(seq) 157 # will return None. 158 seqs = adaptor.execute_and_fetchall( 159 "SELECT alphabet, length, length(seq) FROM biosequence" 160 " WHERE bioentry_id = %s", (primary_id,)) 161 if not seqs: 162 return 163 assert len(seqs) == 1 164 moltype, given_length, length = seqs[0] 165 166 try: 167 length = int(length) 168 given_length = int(length) 169 assert length == given_length 170 have_seq = True 171 except TypeError: 172 assert length is None 173 seqs = adaptor.execute_and_fetchall( 174 "SELECT alphabet, length, seq FROM biosequence" 175 " WHERE bioentry_id = %s", (primary_id,)) 176 assert len(seqs) == 1 177 moltype, given_length, seq = seqs[0] 178 assert seq is None or seq == "" 179 length = int(given_length) 180 have_seq = False 181 del seq 182 del given_length 183 184 moltype = moltype.lower() # might be upper case in database 185 # We have no way of knowing if these sequences will use IUPAC 186 # alphabets, and we certainly can't assume they are unambiguous! 187 if moltype == "dna": 188 alphabet = Alphabet.generic_dna 189 elif moltype == "rna": 190 alphabet = Alphabet.generic_rna 191 elif moltype == "protein": 192 alphabet = Alphabet.generic_protein 193 elif moltype == "unknown": 194 # This is used in BioSQL/Loader.py and would happen 195 # for any generic or nucleotide alphabets. 196 alphabet = Alphabet.single_letter_alphabet 197 else: 198 raise AssertionError("Unknown moltype: %s" % moltype) 199 200 if have_seq: 201 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 202 else: 203 return UnknownSeq(length, alphabet)
204 205
206 -def _retrieve_dbxrefs(adaptor, primary_id):
207 """Retrieve the database cross references for the sequence.""" 208 _dbxrefs = [] 209 dbxrefs = adaptor.execute_and_fetchall( 210 "SELECT dbname, accession, version" 211 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 212 " WHERE bioentry_id = %s" 213 " ORDER BY rank", (primary_id,)) 214 for dbname, accession, version in dbxrefs: 215 if version and version != "0": 216 v = "%s.%s" % (accession, version) 217 else: 218 v = accession 219 _dbxrefs.append("%s:%s" % (dbname, v)) 220 return _dbxrefs
221 222
223 -def _retrieve_features(adaptor, primary_id):
224 sql = "SELECT seqfeature_id, type.name, rank" \ 225 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 226 " WHERE bioentry_id = %s" \ 227 " ORDER BY rank" 228 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 229 seq_feature_list = [] 230 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 231 # Get qualifiers [except for db_xref which is stored separately] 232 qvs = adaptor.execute_and_fetchall( 233 "SELECT name, value" 234 " FROM seqfeature_qualifier_value join term using (term_id)" 235 " WHERE seqfeature_id = %s" 236 " ORDER BY rank", (seqfeature_id,)) 237 qualifiers = {} 238 for qv_name, qv_value in qvs: 239 qualifiers.setdefault(qv_name, []).append(qv_value) 240 # Get db_xrefs [special case of qualifiers] 241 qvs = adaptor.execute_and_fetchall( 242 "SELECT dbxref.dbname, dbxref.accession" 243 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 244 " WHERE seqfeature_dbxref.seqfeature_id = %s" 245 " ORDER BY rank", (seqfeature_id,)) 246 for qv_name, qv_value in qvs: 247 value = "%s:%s" % (qv_name, qv_value) 248 qualifiers.setdefault("db_xref", []).append(value) 249 # Get locations 250 results = adaptor.execute_and_fetchall( 251 "SELECT location_id, start_pos, end_pos, strand" 252 " FROM location" 253 " WHERE seqfeature_id = %s" 254 " ORDER BY rank", (seqfeature_id,)) 255 locations = [] 256 # convert to Python standard form 257 # Convert strand = 0 to strand = None 258 # re: comment in Loader.py: 259 # Biopython uses None when we don't know strand information but 260 # BioSQL requires something (non null) and sets this as zero 261 # So we'll use the strand or 0 if Biopython spits out None 262 for location_id, start, end, strand in results: 263 if start: 264 start -= 1 265 if strand == 0: 266 strand = None 267 if strand not in (+1, -1, None): 268 raise ValueError("Invalid strand %s found in database for " 269 "seqfeature_id %s" % (strand, seqfeature_id)) 270 if end < start: 271 import warnings 272 from Bio import BiopythonWarning 273 warnings.warn("Inverted location start/end (%i and %i) for " 274 "seqfeature_id %s" % (start, end, seqfeature_id), 275 BiopythonWarning) 276 locations.append((location_id, start, end, strand)) 277 # Get possible remote reference information 278 remote_results = adaptor.execute_and_fetchall( 279 "SELECT location_id, dbname, accession, version" 280 " FROM location join dbxref using (dbxref_id)" 281 " WHERE seqfeature_id = %s", (seqfeature_id,)) 282 lookup = {} 283 for location_id, dbname, accession, version in remote_results: 284 if version and version != "0": 285 v = "%s.%s" % (accession, version) 286 else: 287 v = accession 288 # subfeature remote location db_ref are stored as a empty string when 289 # not present 290 if dbname == "": 291 dbname = None 292 lookup[location_id] = (dbname, v) 293 294 feature = SeqFeature.SeqFeature(type=seqfeature_type) 295 # Store the key as a private property 296 feature._seqfeature_id = seqfeature_id 297 feature.qualifiers = qualifiers 298 if len(locations) == 0: 299 pass 300 elif len(locations) == 1: 301 location_id, start, end, strand = locations[0] 302 # See Bug 2677, we currently don't record the location_operator 303 # For consistency with older versions Biopython, default to "". 304 feature.location_operator = \ 305 _retrieve_location_qualifier_value(adaptor, location_id) 306 dbname, version = lookup.get(location_id, (None, None)) 307 feature.location = SeqFeature.FeatureLocation(start, end) 308 feature.strand = strand 309 feature.ref_db = dbname 310 feature.ref = version 311 else: 312 sub_features = feature.sub_features 313 assert sub_features == [] 314 for location in locations: 315 location_id, start, end, strand = location 316 dbname, version = lookup.get(location_id, (None, None)) 317 subfeature = SeqFeature.SeqFeature() 318 subfeature.type = seqfeature_type 319 subfeature.location = SeqFeature.FeatureLocation(start, end) 320 # subfeature.location_operator = \ 321 # _retrieve_location_qualifier_value(adaptor, location_id) 322 subfeature.strand = strand 323 subfeature.ref_db = dbname 324 subfeature.ref = version 325 sub_features.append(subfeature) 326 # Locations are in order, but because of remote locations for 327 # sub-features they are not necessarily in numerical order: 328 strands = set(sf.strand for sf in sub_features) 329 if len(strands) == 1 and -1 in strands: 330 # Evil hack time for backwards compatibility 331 # TODO - Check if BioPerl and (old) Biopython did the same, 332 # we may have an existing incompatibility lurking here... 333 locs = [f.location for f in sub_features[::-1]] 334 else: 335 # All forward, or mixed strands 336 locs = [f.location for f in sub_features] 337 feature.location = SeqFeature.CompoundLocation( 338 locs, seqfeature_type) 339 # TODO - See Bug 2677 - we don't yet record location_operator, 340 # so for consistency with older versions of Biopython default 341 # to assuming its a join. 342 feature.location_operator = "join" 343 seq_feature_list.append(feature) 344 345 return seq_feature_list
346 347
348 -def _retrieve_location_qualifier_value(adaptor, location_id):
349 value = adaptor.execute_and_fetch_col0( 350 "SELECT value FROM location_qualifier_value" 351 " WHERE location_id = %s", (location_id,)) 352 try: 353 return value[0] 354 except IndexError: 355 return ""
356 357
358 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
359 annotations = {} 360 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 361 annotations.update(_retrieve_reference(adaptor, primary_id)) 362 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 363 annotations.update(_retrieve_comment(adaptor, primary_id)) 364 # Convert values into strings in cases of unicode from the database. 365 # BioSQL could eventually be expanded to be unicode aware. 366 str_anns = {} 367 for key, val in annotations.items(): 368 if isinstance(val, list): 369 val = [_make_unicode_into_string(x) for x in val] 370 elif isinstance(val, unicode): 371 val = str(val) 372 str_anns[key] = val 373 return str_anns
374 375
376 -def _make_unicode_into_string(text):
377 if isinstance(text, unicode): 378 return str(text) 379 else: 380 return text
381 382
383 -def _retrieve_qualifier_value(adaptor, primary_id):
384 qvs = adaptor.execute_and_fetchall( 385 "SELECT name, value" 386 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 387 " WHERE bioentry_id = %s" 388 " ORDER BY rank", (primary_id,)) 389 qualifiers = {} 390 for name, value in qvs: 391 if name == "keyword": 392 name = "keywords" 393 # See handling of "date" in Loader.py 394 elif name == "date_changed": 395 name = "date" 396 elif name == "secondary_accession": 397 name = "accessions" 398 qualifiers.setdefault(name, []).append(value) 399 return qualifiers
400 401
402 -def _retrieve_reference(adaptor, primary_id):
403 # XXX dbxref_qualifier_value 404 405 refs = adaptor.execute_and_fetchall( 406 "SELECT start_pos, end_pos, " 407 " location, title, authors," 408 " dbname, accession" 409 " FROM bioentry_reference" 410 " JOIN reference USING (reference_id)" 411 " LEFT JOIN dbxref USING (dbxref_id)" 412 " WHERE bioentry_id = %s" 413 " ORDER BY rank", (primary_id,)) 414 references = [] 415 for start, end, location, title, authors, dbname, accession in refs: 416 reference = SeqFeature.Reference() 417 # If the start/end are missing, reference.location is an empty list 418 if (start is not None) or (end is not None): 419 if start is not None: 420 start -= 1 # python counting 421 reference.location = [SeqFeature.FeatureLocation(start, end)] 422 # Don't replace the default "" with None. 423 if authors: 424 reference.authors = authors 425 if title: 426 reference.title = title 427 reference.journal = location 428 if dbname == 'PUBMED': 429 reference.pubmed_id = accession 430 elif dbname == 'MEDLINE': 431 reference.medline_id = accession 432 references.append(reference) 433 if references: 434 return {'references': references} 435 else: 436 return {}
437 438
439 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
440 a = {} 441 common_names = adaptor.execute_and_fetch_col0( 442 "SELECT name FROM taxon_name WHERE taxon_id = %s" 443 " AND name_class = 'genbank common name'", (taxon_id,)) 444 if common_names: 445 a['source'] = common_names[0] 446 scientific_names = adaptor.execute_and_fetch_col0( 447 "SELECT name FROM taxon_name WHERE taxon_id = %s" 448 " AND name_class = 'scientific name'", (taxon_id,)) 449 if scientific_names: 450 a['organism'] = scientific_names[0] 451 ncbi_taxids = adaptor.execute_and_fetch_col0( 452 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 453 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 454 a['ncbi_taxid'] = ncbi_taxids[0] 455 456 # Old code used the left/right values in the taxon table to get the 457 # taxonomy lineage in one SQL command. This was actually very slow, 458 # and would fail if the (optional) left/right values were missing. 459 # 460 # The following code is based on a contribution from Eric Gibert, and 461 # relies on the taxon table's parent_taxon_id field only (ignoring the 462 # optional left/right values). This means that it has to make a 463 # separate SQL query for each entry in the lineage, but it does still 464 # appear to be *much* faster. See Bug 2494. 465 taxonomy = [] 466 while taxon_id: 467 name, rank, parent_taxon_id = adaptor.execute_one( 468 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 469 " FROM taxon, taxon_name" 470 " WHERE taxon.taxon_id=taxon_name.taxon_id" 471 " AND taxon_name.name_class='scientific name'" 472 " AND taxon.taxon_id = %s", (taxon_id,)) 473 if taxon_id == parent_taxon_id: 474 # If the taxon table has been populated by the BioSQL script 475 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 476 # Personally, I would have used a NULL parent_taxon_id here. 477 break 478 if rank != "no rank": 479 # For consistency with older versions of Biopython, we are only 480 # interested in taxonomy entries with a stated rank. 481 # Add this to the start of the lineage list. 482 taxonomy.insert(0, name) 483 taxon_id = parent_taxon_id 484 485 if taxonomy: 486 a['taxonomy'] = taxonomy 487 return a
488 489
490 -def _retrieve_comment(adaptor, primary_id):
491 qvs = adaptor.execute_and_fetchall( 492 "SELECT comment_text FROM comment" 493 " WHERE bioentry_id=%s" 494 " ORDER BY rank", (primary_id,)) 495 comments = [comm[0] for comm in qvs] 496 # Don't want to add an empty list... 497 if comments: 498 return {"comment": comments} 499 else: 500 return {}
501 502
503 -class DBSeqRecord(SeqRecord):
504 """BioSQL equivalent of the Biopython SeqRecord object.""" 505
506 - def __init__(self, adaptor, primary_id):
507 self._adaptor = adaptor 508 self._primary_id = primary_id 509 510 (self._biodatabase_id, self._taxon_id, self.name, 511 accession, version, self._identifier, 512 self._division, self.description) = self._adaptor.execute_one( 513 "SELECT biodatabase_id, taxon_id, name, accession, version," 514 " identifier, division, description" 515 " FROM bioentry" 516 " WHERE bioentry_id = %s", (self._primary_id,)) 517 if version and version != "0": 518 self.id = "%s.%s" % (accession, version) 519 else: 520 self.id = accession 521 # We don't yet record any per-letter-annotations in the 522 # BioSQL database, but we should set this property up 523 # for completeness (and the __str__ method). 524 # We do NOT want to load the sequence from the DB here! 525 length = _retrieve_seq_len(adaptor, primary_id) 526 self._per_letter_annotations = _RestrictedDict(length=length)
527
528 - def __get_seq(self):
529 if not hasattr(self, "_seq"): 530 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 531 return self._seq
532
533 - def __set_seq(self, seq):
534 # TODO - Check consistent with self._per_letter_annotations 535 self._seq = seq
536
537 - def __del_seq(self):
538 del self._seq
539 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 540
541 - def __get_dbxrefs(self):
542 if not hasattr(self, "_dbxrefs"): 543 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 544 return self._dbxrefs
545
546 - def __set_dbxrefs(self, dbxrefs):
547 self._dbxrefs = dbxrefs
548
549 - def __del_dbxrefs(self):
550 del self._dbxrefs
551 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 552 "Database cross references") 553
554 - def __get_features(self):
555 if not hasattr(self, "_features"): 556 self._features = _retrieve_features(self._adaptor, 557 self._primary_id) 558 return self._features
559
560 - def __set_features(self, features):
561 self._features = features
562
563 - def __del_features(self):
564 del self._features
565 features = property(__get_features, __set_features, __del_features, 566 "Features") 567
568 - def __get_annotations(self):
569 if not hasattr(self, "_annotations"): 570 self._annotations = _retrieve_annotations(self._adaptor, 571 self._primary_id, 572 self._taxon_id) 573 if self._identifier: 574 self._annotations["gi"] = self._identifier 575 if self._division: 576 self._annotations["data_file_division"] = self._division 577 return self._annotations
578
579 - def __set_annotations(self, annotations):
580 self._annotations = annotations
581
582 - def __del_annotations(self):
583 del self._annotations
584 annotations = property(__get_annotations, __set_annotations, 585 __del_annotations, "Annotations")
586