Package BioSQL :: Module BioSeq
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeq

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2008-2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Implementations of Biopython-like Seq objects on top of BioSQL. 
 11   
 12  This allows retrival of items stored in a BioSQL database using 
 13  a biopython-like SeqRecord and Seq interface. 
 14   
 15  Note: Currently we do not support recording per-letter-annotations 
 16  (like quality scores) in BioSQL. 
 17  """ 
 18   
 19  from Bio._py3k import unicode 
 20   
 21  from Bio import Alphabet 
 22  from Bio.Seq import Seq, UnknownSeq 
 23  from Bio.SeqRecord import SeqRecord, _RestrictedDict 
 24  from Bio import SeqFeature 
 25   
 26   
27 -class DBSeq(Seq): # This implements the biopython Seq interface
28 - def __init__(self, primary_id, adaptor, alphabet, start, length):
29 """Create a new DBSeq object referring to a BioSQL entry. 30 31 You wouldn't normally create a DBSeq object yourself, this is done 32 for you when retreiving a DBSeqRecord object from the database. 33 """ 34 self.primary_id = primary_id 35 self.adaptor = adaptor 36 self.alphabet = alphabet 37 self._length = length 38 self.start = start
39
40 - def __len__(self):
41 return self._length
42
43 - def __getitem__(self, index): # Seq API requirement
44 #Note since Python 2.0, __getslice__ is deprecated 45 #and __getitem__ is used instead. 46 #See http://docs.python.org/ref/sequence-methods.html 47 if isinstance(index, int): 48 #Return a single letter as a string 49 i = index 50 if i < 0: 51 if -i > self._length: 52 raise IndexError(i) 53 i = i + self._length 54 elif i >= self._length: 55 raise IndexError(i) 56 return self.adaptor.get_subseq_as_string(self.primary_id, 57 self.start + i, 58 self.start + i + 1) 59 if not isinstance(index, slice): 60 raise ValueError("Unexpected index type") 61 62 #Return the (sub)sequence as another DBSeq or Seq object 63 #(see the Seq obect's __getitem__ method) 64 if index.start is None: 65 i = 0 66 else: 67 i = index.start 68 if i < 0: 69 #Map to equavilent positive index 70 if -i > self._length: 71 raise IndexError(i) 72 i = i + self._length 73 elif i >= self._length: 74 #Trivial case, should return empty string! 75 i = self._length 76 77 if index.stop is None: 78 j = self._length 79 else: 80 j = index.stop 81 if j < 0: 82 #Map to equavilent positive index 83 if -j > self._length: 84 raise IndexError(j) 85 j = j + self._length 86 elif j >= self._length: 87 j = self._length 88 89 if i >= j: 90 #Trivial case, empty string. 91 return Seq("", self.alphabet) 92 elif index.step is None or index.step == 1: 93 #Easy case - can return a DBSeq with the start and end adjusted 94 return self.__class__(self.primary_id, self.adaptor, self.alphabet, 95 self.start + i, j - i) 96 else: 97 #Tricky. Will have to create a Seq object because of the stride 98 full = self.adaptor.get_subseq_as_string(self.primary_id, 99 self.start + i, 100 self.start + j) 101 return Seq(full[::index.step], self.alphabet) 102
103 - def tostring(self):
104 """Returns the full sequence as a python string. 105 106 Although not formally deprecated, you are now encouraged to use 107 str(my_seq) instead of my_seq.tostring().""" 108 return self.adaptor.get_subseq_as_string(self.primary_id, 109 self.start, 110 self.start + self._length)
111
112 - def __str__(self):
113 """Returns the full sequence as a python string.""" 114 return self.adaptor.get_subseq_as_string(self.primary_id, 115 self.start, 116 self.start + self._length)
117 118 data = property(tostring, doc="Sequence as string (DEPRECATED)") 119
120 - def toseq(self):
121 """Returns the full sequence as a Seq object.""" 122 #Note - the method name copies that of the MutableSeq object 123 return Seq(str(self), self.alphabet)
124
125 - def __add__(self, other):
126 #Let the Seq object deal with the alphabet issues etc 127 return self.toseq() + other
128
129 - def __radd__(self, other):
130 #Let the Seq object deal with the alphabet issues etc 131 return other + self.toseq()
132 133
134 -def _retrieve_seq(adaptor, primary_id):
135 #The database schema ensures there will be only one matching 136 #row in the table. 137 138 #If an UnknownSeq was recorded, seq will be NULL, 139 #but length will be populated. This means length(seq) 140 #will return None. 141 seqs = adaptor.execute_and_fetchall( 142 "SELECT alphabet, length, length(seq) FROM biosequence" 143 " WHERE bioentry_id = %s", (primary_id,)) 144 if not seqs: 145 return 146 assert len(seqs) == 1 147 moltype, given_length, length = seqs[0] 148 149 try: 150 length = int(length) 151 given_length = int(length) 152 assert length == given_length 153 have_seq = True 154 except TypeError: 155 assert length is None 156 seqs = adaptor.execute_and_fetchall( 157 "SELECT alphabet, length, seq FROM biosequence" 158 " WHERE bioentry_id = %s", (primary_id,)) 159 assert len(seqs) == 1 160 moltype, given_length, seq = seqs[0] 161 assert seq is None or seq == "" 162 length = int(given_length) 163 have_seq = False 164 del seq 165 del given_length 166 167 moltype = moltype.lower() # might be upper case in database 168 #We have no way of knowing if these sequences will use IUPAC 169 #alphabets, and we certainly can't assume they are unambiguous! 170 if moltype == "dna": 171 alphabet = Alphabet.generic_dna 172 elif moltype == "rna": 173 alphabet = Alphabet.generic_rna 174 elif moltype == "protein": 175 alphabet = Alphabet.generic_protein 176 elif moltype == "unknown": 177 #This is used in BioSQL/Loader.py and would happen 178 #for any generic or nucleotide alphabets. 179 alphabet = Alphabet.single_letter_alphabet 180 else: 181 raise AssertionError("Unknown moltype: %s" % moltype) 182 183 if have_seq: 184 return DBSeq(primary_id, adaptor, alphabet, 0, int(length)) 185 else: 186 return UnknownSeq(length, alphabet)
187 188
189 -def _retrieve_dbxrefs(adaptor, primary_id):
190 """Retrieve the database cross references for the sequence.""" 191 _dbxrefs = [] 192 dbxrefs = adaptor.execute_and_fetchall( 193 "SELECT dbname, accession, version" 194 " FROM bioentry_dbxref join dbxref using (dbxref_id)" 195 " WHERE bioentry_id = %s" 196 " ORDER BY rank", (primary_id,)) 197 for dbname, accession, version in dbxrefs: 198 if version and version != "0": 199 v = "%s.%s" % (accession, version) 200 else: 201 v = accession 202 _dbxrefs.append("%s:%s" % (dbname, v)) 203 return _dbxrefs
204 205
206 -def _retrieve_features(adaptor, primary_id):
207 sql = "SELECT seqfeature_id, type.name, rank" \ 208 " FROM seqfeature join term type on (type_term_id = type.term_id)" \ 209 " WHERE bioentry_id = %s" \ 210 " ORDER BY rank" 211 results = adaptor.execute_and_fetchall(sql, (primary_id,)) 212 seq_feature_list = [] 213 for seqfeature_id, seqfeature_type, seqfeature_rank in results: 214 # Get qualifiers [except for db_xref which is stored separately] 215 qvs = adaptor.execute_and_fetchall( 216 "SELECT name, value" 217 " FROM seqfeature_qualifier_value join term using (term_id)" 218 " WHERE seqfeature_id = %s" 219 " ORDER BY rank", (seqfeature_id,)) 220 qualifiers = {} 221 for qv_name, qv_value in qvs: 222 qualifiers.setdefault(qv_name, []).append(qv_value) 223 # Get db_xrefs [special case of qualifiers] 224 qvs = adaptor.execute_and_fetchall( 225 "SELECT dbxref.dbname, dbxref.accession" 226 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" 227 " WHERE seqfeature_dbxref.seqfeature_id = %s" 228 " ORDER BY rank", (seqfeature_id,)) 229 for qv_name, qv_value in qvs: 230 value = "%s:%s" % (qv_name, qv_value) 231 qualifiers.setdefault("db_xref", []).append(value) 232 # Get locations 233 results = adaptor.execute_and_fetchall( 234 "SELECT location_id, start_pos, end_pos, strand" 235 " FROM location" 236 " WHERE seqfeature_id = %s" 237 " ORDER BY rank", (seqfeature_id,)) 238 locations = [] 239 # convert to Python standard form 240 # Convert strand = 0 to strand = None 241 # re: comment in Loader.py: 242 # Biopython uses None when we don't know strand information but 243 # BioSQL requires something (non null) and sets this as zero 244 # So we'll use the strand or 0 if Biopython spits out None 245 for location_id, start, end, strand in results: 246 if start: 247 start -= 1 248 if strand == 0: 249 strand = None 250 if strand not in (+1, -1, None): 251 raise ValueError("Invalid strand %s found in database for " 252 "seqfeature_id %s" % (strand, seqfeature_id)) 253 if end < start: 254 import warnings 255 from Bio import BiopythonWarning 256 warnings.warn("Inverted location start/end (%i and %i) for " 257 "seqfeature_id %s" % (start, end, seqfeature_id), 258 BiopythonWarning) 259 locations.append((location_id, start, end, strand)) 260 # Get possible remote reference information 261 remote_results = adaptor.execute_and_fetchall( 262 "SELECT location_id, dbname, accession, version" 263 " FROM location join dbxref using (dbxref_id)" 264 " WHERE seqfeature_id = %s", (seqfeature_id,)) 265 lookup = {} 266 for location_id, dbname, accession, version in remote_results: 267 if version and version != "0": 268 v = "%s.%s" % (accession, version) 269 else: 270 v = accession 271 # subfeature remote location db_ref are stored as a empty string when 272 # not present 273 if dbname == "": 274 dbname = None 275 lookup[location_id] = (dbname, v) 276 277 feature = SeqFeature.SeqFeature(type=seqfeature_type) 278 feature._seqfeature_id = seqfeature_id # Store the key as a private property 279 feature.qualifiers = qualifiers 280 if len(locations) == 0: 281 pass 282 elif len(locations) == 1: 283 location_id, start, end, strand = locations[0] 284 #See Bug 2677, we currently don't record the location_operator 285 #For consistency with older versions Biopython, default to "". 286 feature.location_operator = \ 287 _retrieve_location_qualifier_value(adaptor, location_id) 288 dbname, version = lookup.get(location_id, (None, None)) 289 feature.location = SeqFeature.FeatureLocation(start, end) 290 feature.strand = strand 291 feature.ref_db = dbname 292 feature.ref = version 293 else: 294 sub_features = feature.sub_features 295 assert sub_features == [] 296 for location in locations: 297 location_id, start, end, strand = location 298 dbname, version = lookup.get(location_id, (None, None)) 299 subfeature = SeqFeature.SeqFeature() 300 subfeature.type = seqfeature_type 301 subfeature.location = SeqFeature.FeatureLocation(start, end) 302 #subfeature.location_operator = \ 303 # _retrieve_location_qualifier_value(adaptor, location_id) 304 subfeature.strand = strand 305 subfeature.ref_db = dbname 306 subfeature.ref = version 307 sub_features.append(subfeature) 308 # Locations are in order, but because of remote locations for 309 # sub-features they are not necessarily in numerical order: 310 strands = set(sf.strand for sf in sub_features) 311 if len(strands)==1 and -1 in strands: 312 #Evil hack time for backwards compatibility 313 #TODO - Check if BioPerl and (old) Biopython did the same, 314 #we may have an existing incompatibility lurking here... 315 locs = [f.location for f in sub_features[::-1]] 316 else: 317 #All forward, or mixed strands 318 locs = [f.location for f in sub_features] 319 feature.location = SeqFeature.CompoundLocation(locs, seqfeature_type) 320 #TODO - See Bug 2677 - we don't yet record location_operator, 321 #so for consistency with older versions of Biopython default 322 #to assuming its a join. 323 feature.location_operator = "join" 324 seq_feature_list.append(feature) 325 326 return seq_feature_list
327 328
329 -def _retrieve_location_qualifier_value(adaptor, location_id):
330 value = adaptor.execute_and_fetch_col0( 331 "SELECT value FROM location_qualifier_value" 332 " WHERE location_id = %s", (location_id,)) 333 try: 334 return value[0] 335 except IndexError: 336 return ""
337 338
339 -def _retrieve_annotations(adaptor, primary_id, taxon_id):
340 annotations = {} 341 annotations.update(_retrieve_qualifier_value(adaptor, primary_id)) 342 annotations.update(_retrieve_reference(adaptor, primary_id)) 343 annotations.update(_retrieve_taxon(adaptor, primary_id, taxon_id)) 344 annotations.update(_retrieve_comment(adaptor, primary_id)) 345 # Convert values into strings in cases of unicode from the database. 346 # BioSQL could eventually be expanded to be unicode aware. 347 str_anns = {} 348 for key, val in annotations.items(): 349 if isinstance(val, list): 350 val = [_make_unicode_into_string(x) for x in val] 351 elif isinstance(val, unicode): 352 val = str(val) 353 str_anns[key] = val 354 return str_anns
355 356
357 -def _make_unicode_into_string(text):
358 if isinstance(text, unicode): 359 return str(text) 360 else: 361 return text
362 363
364 -def _retrieve_qualifier_value(adaptor, primary_id):
365 qvs = adaptor.execute_and_fetchall( 366 "SELECT name, value" 367 " FROM bioentry_qualifier_value JOIN term USING (term_id)" 368 " WHERE bioentry_id = %s" 369 " ORDER BY rank", (primary_id,)) 370 qualifiers = {} 371 for name, value in qvs: 372 if name == "keyword": 373 name = "keywords" 374 #See handling of "date" in Loader.py 375 elif name == "date_changed": 376 name = "date" 377 elif name == "secondary_accession": 378 name = "accessions" 379 qualifiers.setdefault(name, []).append(value) 380 return qualifiers
381 382
383 -def _retrieve_reference(adaptor, primary_id):
384 # XXX dbxref_qualifier_value 385 386 refs = adaptor.execute_and_fetchall( 387 "SELECT start_pos, end_pos, " 388 " location, title, authors," 389 " dbname, accession" 390 " FROM bioentry_reference" 391 " JOIN reference USING (reference_id)" 392 " LEFT JOIN dbxref USING (dbxref_id)" 393 " WHERE bioentry_id = %s" 394 " ORDER BY rank", (primary_id,)) 395 references = [] 396 for start, end, location, title, authors, dbname, accession in refs: 397 reference = SeqFeature.Reference() 398 #If the start/end are missing, reference.location is an empty list 399 if (start is not None) or (end is not None): 400 if start is not None: 401 start -= 1 # python counting 402 reference.location = [SeqFeature.FeatureLocation(start, end)] 403 #Don't replace the default "" with None. 404 if authors: 405 reference.authors = authors 406 if title: 407 reference.title = title 408 reference.journal = location 409 if dbname == 'PUBMED': 410 reference.pubmed_id = accession 411 elif dbname == 'MEDLINE': 412 reference.medline_id = accession 413 references.append(reference) 414 if references: 415 return {'references': references} 416 else: 417 return {}
418 419
420 -def _retrieve_taxon(adaptor, primary_id, taxon_id):
421 a = {} 422 common_names = adaptor.execute_and_fetch_col0( 423 "SELECT name FROM taxon_name WHERE taxon_id = %s" 424 " AND name_class = 'genbank common name'", (taxon_id,)) 425 if common_names: 426 a['source'] = common_names[0] 427 scientific_names = adaptor.execute_and_fetch_col0( 428 "SELECT name FROM taxon_name WHERE taxon_id = %s" 429 " AND name_class = 'scientific name'", (taxon_id,)) 430 if scientific_names: 431 a['organism'] = scientific_names[0] 432 ncbi_taxids = adaptor.execute_and_fetch_col0( 433 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,)) 434 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0": 435 a['ncbi_taxid'] = ncbi_taxids[0] 436 437 #Old code used the left/right values in the taxon table to get the 438 #taxonomy lineage in one SQL command. This was actually very slow, 439 #and would fail if the (optional) left/right values were missing. 440 # 441 #The following code is based on a contribution from Eric Gibert, and 442 #relies on the taxon table's parent_taxon_id field only (ignoring the 443 #optional left/right values). This means that it has to make a 444 #separate SQL query for each entry in the lineage, but it does still 445 #appear to be *much* faster. See Bug 2494. 446 taxonomy = [] 447 while taxon_id: 448 name, rank, parent_taxon_id = adaptor.execute_one( 449 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" 450 " FROM taxon, taxon_name" 451 " WHERE taxon.taxon_id=taxon_name.taxon_id" 452 " AND taxon_name.name_class='scientific name'" 453 " AND taxon.taxon_id = %s", (taxon_id,)) 454 if taxon_id == parent_taxon_id: 455 # If the taxon table has been populated by the BioSQL script 456 # load_ncbi_taxonomy.pl this is how top parent nodes are stored. 457 # Personally, I would have used a NULL parent_taxon_id here. 458 break 459 if rank != "no rank": 460 #For consistency with older versions of Biopython, we are only 461 #interested in taxonomy entries with a stated rank. 462 #Add this to the start of the lineage list. 463 taxonomy.insert(0, name) 464 taxon_id = parent_taxon_id 465 466 if taxonomy: 467 a['taxonomy'] = taxonomy 468 return a
469 470
471 -def _retrieve_comment(adaptor, primary_id):
472 qvs = adaptor.execute_and_fetchall( 473 "SELECT comment_text FROM comment" 474 " WHERE bioentry_id=%s" 475 " ORDER BY rank", (primary_id,)) 476 comments = [comm[0] for comm in qvs] 477 #Don't want to add an empty list... 478 if comments: 479 return {"comment": comments} 480 else: 481 return {}
482 483
484 -class DBSeqRecord(SeqRecord):
485 """BioSQL equivalent of the biopython SeqRecord object. 486 """ 487
488 - def __init__(self, adaptor, primary_id):
489 self._adaptor = adaptor 490 self._primary_id = primary_id 491 492 (self._biodatabase_id, self._taxon_id, self.name, 493 accession, version, self._identifier, 494 self._division, self.description) = self._adaptor.execute_one( 495 "SELECT biodatabase_id, taxon_id, name, accession, version," 496 " identifier, division, description" 497 " FROM bioentry" 498 " WHERE bioentry_id = %s", (self._primary_id,)) 499 if version and version != "0": 500 self.id = "%s.%s" % (accession, version) 501 else: 502 self.id = accession 503 #We don't yet record any per-letter-annotations in the 504 #BioSQL database, but we should set this property up 505 #for completeness (and the __str__ method). 506 try: 507 length = len(self.seq) 508 except: 509 #Could be no sequence in the database! 510 length = 0 511 self._per_letter_annotations = _RestrictedDict(length=length)
512
513 - def __get_seq(self):
514 if not hasattr(self, "_seq"): 515 self._seq = _retrieve_seq(self._adaptor, self._primary_id) 516 return self._seq
517
518 - def __set_seq(self, seq):
519 self._seq = seq
520
521 - def __del_seq(self):
522 del self._seq
523 seq = property(__get_seq, __set_seq, __del_seq, "Seq object") 524
525 - def __get_dbxrefs(self):
526 if not hasattr(self, "_dbxrefs"): 527 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id) 528 return self._dbxrefs
529
530 - def __set_dbxrefs(self, dbxrefs):
531 self._dbxrefs = dbxrefs
532
533 - def __del_dbxrefs(self):
534 del self._dbxrefs
535 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs, 536 "Database cross references") 537
538 - def __get_features(self):
539 if not hasattr(self, "_features"): 540 self._features = _retrieve_features(self._adaptor, 541 self._primary_id) 542 return self._features
543
544 - def __set_features(self, features):
545 self._features = features
546
547 - def __del_features(self):
548 del self._features
549 features = property(__get_features, __set_features, __del_features, 550 "Features") 551
552 - def __get_annotations(self):
553 if not hasattr(self, "_annotations"): 554 self._annotations = _retrieve_annotations(self._adaptor, 555 self._primary_id, 556 self._taxon_id) 557 if self._identifier: 558 self._annotations["gi"] = self._identifier 559 if self._division: 560 self._annotations["data_file_division"] = self._division 561 return self._annotations
562
563 - def __set_annotations(self, annotations):
564 self._annotations = annotations
565
566 - def __del_annotations(self):
567 del self._annotations
568 annotations = property(__get_annotations, __set_annotations, 569 __del_annotations, "Annotations")
570