Package Bio :: Package SearchIO :: Package _model :: Module hsp
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO._model.hsp

   1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Bio.SearchIO objects to model high scoring regions between query and hit.""" 
   7   
   8  from __future__ import print_function 
   9  from Bio._py3k import basestring 
  10   
  11  import warnings 
  12  from operator import ge, le 
  13   
  14  from Bio import BiopythonWarning 
  15  from Bio.Align import MultipleSeqAlignment 
  16  from Bio.Alphabet import single_letter_alphabet 
  17  from Bio.Seq import Seq 
  18  from Bio.SeqRecord import SeqRecord 
  19   
  20  from Bio._utils import getattr_str, trim_str 
  21  from Bio.SearchIO._utils import singleitem, allitems, fullcascade, \ 
  22          fragcascade 
  23   
  24  from ._base import _BaseHSP 
  25   
  26   
27 -class HSP(_BaseHSP):
28 """Class representing high-scoring region(s) between query and hit. 29 30 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 31 In most cases, HSP objects store the bulk of the statistics and results 32 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 33 program. 34 35 Depending on the search output file format, a given HSP will contain one 36 or more HSPFragment object(s). Examples of search programs that produce HSP 37 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 38 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 39 However, their native terminologies may differ: in BLAT these fragments 40 are called 'blocks' while in Exonerate they are called exons or NER. 41 42 Here are examples from each type of HSP. The first one comes from a BLAST 43 search:: 44 45 >>> from Bio import SearchIO 46 >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 47 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 48 >>> blast_hsp 49 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 50 >>> print(blast_hsp) 51 Query: 33211 mir_1 52 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 53 Query range: [1:61] (1) 54 Hit range: [0:60] (1) 55 Quick stats: evalue 1.7e-22; bitscore 109.49 56 Fragments: 1 (60 columns) 57 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 58 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 59 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 60 61 For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the 62 underlying sequence alignment, if it exists. This is not the case for HSPs 63 with more than one HSPFragment. Below is an example, using an HSP from a 64 BLAT search. Invoking ``print`` on these HSPs will instead show a table of the 65 HSPFragment objects it contains:: 66 67 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 68 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 69 >>> blat_hsp 70 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 71 >>> print(blat_hsp) 72 Query: blat_1 <unknown description> 73 Hit: chr11 <unknown description> 74 Query range: [42:67] (-1) 75 Hit range: [59018929:59018955] (1) 76 Quick stats: evalue ?; bitscore ? 77 Fragments: --- -------------- ---------------------- ---------------------- 78 # Span Query range Hit range 79 --- -------------- ---------------------- ---------------------- 80 0 6 [61:67] [59018929:59018935] 81 1 16 [42:58] [59018939:59018955] 82 83 Notice that in HSPs with more than one HSPFragments, the HSP's ``query_range`` 84 ``hit_range`` properties encompasses all fragments it contains. 85 86 You can check whether an HSP has more than one HSPFragments or not using the 87 ``is_fragmented`` property:: 88 89 >>> blast_hsp.is_fragmented 90 False 91 >>> blat_hsp.is_fragmented 92 True 93 94 Since HSP objects are also containers similar to Python lists, you can 95 access a single fragment in an HSP using its integer index:: 96 97 >>> blat_fragment = blat_hsp[0] 98 >>> print(blat_fragment) 99 Query: blat_1 <unknown description> 100 Hit: chr11 <unknown description> 101 Query range: [61:67] (-1) 102 Hit range: [59018929:59018935] (1) 103 Fragments: 1 (6 columns) 104 Query - tatagt 105 Hit - tatagt 106 107 This applies to HSPs objects with a single fragment as well:: 108 109 >>> blast_fragment = blast_hsp[0] 110 >>> print(blast_fragment) 111 Query: 33211 mir_1 112 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 113 Query range: [1:61] (1) 114 Hit range: [0:60] (1) 115 Fragments: 1 (60 columns) 116 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 117 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 118 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 119 120 Regardless of the search output file format, HSP objects provide the 121 properties listed below. These properties always return values in a list, 122 due to the HSP object itself being a list-like container. However, for 123 HSP objects with a single HSPFragment, shortcut properties that fetches 124 the item from the list are also provided. 125 126 +----------------------+---------------------+-----------------------------+ 127 | Property | Shortcut | Value | 128 +======================+=====================+=============================+ 129 | aln_all | aln | HSP alignments as | 130 | | | MultipleSeqAlignment object | 131 +----------------------+---------------------+-----------------------------+ 132 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 133 | | | of all fragments' alignments| 134 +----------------------+---------------------+-----------------------------+ 135 | fragments | fragment | HSPFragment objects | 136 +----------------------+---------------------+-----------------------------+ 137 | hit_all | hit | hit sequence as SeqRecord | 138 | | | objects | 139 +----------------------+---------------------+-----------------------------+ 140 | hit_features_all | hit_features | SeqFeatures of all hit | 141 | | | fragments | 142 +----------------------+---------------------+-----------------------------+ 143 | hit_start_all | hit_start* | start coordinates of the | 144 | | | hit fragments | 145 +----------------------+---------------------+-----------------------------+ 146 | hit_end_all | hit_end* | end coordinates of the hit | 147 | | | fragments | 148 +----------------------+---------------------+-----------------------------+ 149 | hit_span_all | hit_span* | sizes of each hit fragments | 150 +----------------------+---------------------+-----------------------------+ 151 | hit_strand_all | hit_strand | strand orientations of the | 152 | | | hit fragments | 153 +----------------------+---------------------+-----------------------------+ 154 | hit_frame_all | hit_frame | reading frames of the hit | 155 | | | fragments | 156 +----------------------+---------------------+-----------------------------+ 157 | hit_range_all | hit_range | tuples of start and end | 158 | | | coordinates of each hit | 159 | | | fragment | 160 +----------------------+---------------------+-----------------------------+ 161 | query_all | query | query sequence as SeqRecord | 162 | | | object | 163 +----------------------+---------------------+-----------------------------+ 164 | query_features_all | query_features | SeqFeatures of all query | 165 | | | fragments | 166 +----------------------+---------------------+-----------------------------+ 167 | query_start_all | query_start* | start coordinates of the | 168 | | | fragments | 169 +----------------------+---------------------+-----------------------------+ 170 | query_end_all | query_end* | end coordinates of the | 171 | | | query fragments | 172 +----------------------+---------------------+-----------------------------+ 173 | query_span_all | query_span* | sizes of each query | 174 | | | fragments | 175 +----------------------+---------------------+-----------------------------+ 176 | query_strand_all | query_strand | strand orientations of the | 177 | | | query fragments | 178 +----------------------+---------------------+-----------------------------+ 179 | query_frame_all | query_frame | reading frames of the query | 180 | | | fragments | 181 +----------------------+---------------------+-----------------------------+ 182 | query_range_all | query_range | tuples of start and end | 183 | | | coordinates of each query | 184 | | | fragment | 185 +----------------------+---------------------+-----------------------------+ 186 187 For all types of HSP objects, the property will return the values in a list. 188 Shorcuts are only applicable for HSPs with one fragment. Except the ones 189 noted, if they are used on an HSP with more than one fragments, an exception 190 will be raised. 191 192 For properties that may be used in HSPs with multiple or single fragments 193 (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends 194 on how many fragment the HSP has: 195 196 +------------+---------------------------------------------------+ 197 | Property | Value | 198 +============+===================================================+ 199 | hit_start | smallest coordinate value of all hit fragments | 200 +------------+---------------------------------------------------+ 201 | hit_end | largest coordinate value of all hit fragments | 202 +------------+---------------------------------------------------+ 203 | hit_span | difference between ``hit_start`` and ``hit_end`` | 204 +------------+---------------------------------------------------+ 205 | query_start| smallest coordinate value of all query fragments | 206 +------------+---------------------------------------------------+ 207 | query_end | largest coordinate value of all query fragments | 208 +------------+---------------------------------------------------+ 209 | query_span | difference between ``query_start`` and | 210 | | ``query_end`` | 211 +------------+---------------------------------------------------+ 212 213 In addition to the objects listed above, HSP objects also provide the 214 following properties: 215 216 +--------------------+------------------------------------------------------+ 217 | Property | Value | 218 +====================+======================================================+ 219 | aln_span | total number of residues in all HSPFragment objects | 220 +--------------------+------------------------------------------------------+ 221 | alphabet | alphabet used in hit and query SeqRecord objects | 222 +--------------------+------------------------------------------------------+ 223 | is_fragmented | boolean, whether there are multiple fragments or not | 224 +--------------------+------------------------------------------------------+ 225 | hit_id | ID of the hit sequence | 226 +--------------------+------------------------------------------------------+ 227 | hit_description | description of the hit sequence | 228 +--------------------+------------------------------------------------------+ 229 | hit_inter_ranges | list of hit sequence coordinates of the regions | 230 | | between fragments | 231 +--------------------+------------------------------------------------------+ 232 | hit_inter_spans | list of lengths of the regions between hit fragments | 233 +--------------------+------------------------------------------------------+ 234 | query_id | ID of the query sequence | 235 +--------------------+------------------------------------------------------+ 236 | query_description | description of the query sequence | 237 +--------------------+------------------------------------------------------+ 238 | query_inter_ranges | list of query sequence coordinates of the regions | 239 | | between fragments | 240 +--------------------+------------------------------------------------------+ 241 | query_inter_spans | list of lengths of the regions between query | 242 | | fragments | 243 +--------------------+------------------------------------------------------+ 244 245 .. [1] may be used in HSPs with multiple fragments 246 247 """ 248 249 # attributes we don't want to transfer when creating a new Hit class 250 # from this one 251 _NON_STICKY_ATTRS = ('_items', ) 252
253 - def __init__(self, fragments=()):
254 """Initializes an HSP object. 255 256 :param fragments: fragments contained in the HSP object 257 :type fragments: iterable yielding HSPFragment 258 259 HSP objects must be initialized with a list containing at least one 260 HSPFragment object. If multiple HSPFragment objects are used for 261 initialization, they must all have the same ``query_id``, 262 ``query_description``, ``hit_id``, ``hit_description``, and alphabet 263 properties. 264 265 """ 266 if not fragments: 267 raise ValueError("HSP objects must have at least one HSPFragment " 268 "object.") 269 # TODO - Move this into the for look in case hsps is a single use 270 # iterable? 271 # check that all fragments contain the same IDs, descriptions, alphabet 272 for attr in ('query_id', 'query_description', 'hit_id', 273 'hit_description', 'alphabet'): 274 if len(set(getattr(frag, attr) for frag in fragments)) != 1: 275 raise ValueError("HSP object can not contain fragments with " 276 "more than one %s." % attr) 277 278 self._items = [] 279 for fragment in fragments: 280 self._validate_fragment(fragment) 281 self._items.append(fragment)
282
283 - def __repr__(self):
284 return "%s(hit_id=%r, query_id=%r, %r fragments)" % \ 285 (self.__class__.__name__, self.hit_id, self.query_id, len(self))
286
287 - def __iter__(self):
288 return iter(self._items)
289
290 - def __contains__(self, fragment):
291 return fragment in self._items
292
293 - def __len__(self):
294 return len(self._items)
295 296 # Python 3:
297 - def __bool__(self):
298 return bool(self._items)
299 300 # Python 2: 301 __nonzero__ = __bool__ 302
303 - def __str__(self):
304 305 lines = [] 306 # set hsp info line 307 statline = [] 308 # evalue 309 evalue = getattr_str(self, 'evalue', fmt='%.2g') 310 statline.append('evalue ' + evalue) 311 # bitscore 312 bitscore = getattr_str(self, 'bitscore', fmt='%.2f') 313 statline.append('bitscore ' + bitscore) 314 lines.append('Quick stats: ' + '; '.join(statline)) 315 316 if len(self.fragments) == 1: 317 return '\n'.join([self._str_hsp_header(), '\n'.join(lines), 318 self.fragments[0]._str_aln()]) 319 else: 320 lines.append(' Fragments: %s %s %s %s' % 321 ('-' * 3, '-' * 14, '-' * 22, '-' * 22)) 322 pattern = '%16s %14s %22s %22s' 323 lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range')) 324 lines.append(pattern % ('-' * 3, '-' * 14, '-' * 22, '-' * 22)) 325 for idx, block in enumerate(self.fragments): 326 # set hsp line and table 327 # alignment span 328 aln_span = getattr_str(block, 'aln_span') 329 # query region 330 query_start = getattr_str(block, 'query_start') 331 query_end = getattr_str(block, 'query_end') 332 query_range = '[%s:%s]' % (query_start, query_end) 333 # max column length is 20 334 query_range = trim_str(query_range, 22, '~]') 335 # hit region 336 hit_start = getattr_str(block, 'hit_start') 337 hit_end = getattr_str(block, 'hit_end') 338 hit_range = '[%s:%s]' % (hit_start, hit_end) 339 hit_range = trim_str(hit_range, 22, '~]') 340 # append the hsp row 341 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 342 343 return self._str_hsp_header() + '\n' + '\n'.join(lines)
344
345 - def __getitem__(self, idx):
346 # if key is slice, return a new HSP instance 347 if isinstance(idx, slice): 348 obj = self.__class__(self._items[idx]) 349 self._transfer_attrs(obj) 350 return obj 351 return self._items[idx]
352
353 - def __setitem__(self, idx, fragments):
354 # handle case if hsps is a list of hsp 355 if isinstance(fragments, (list, tuple)): 356 for fragment in fragments: 357 self._validate_fragment(fragment) 358 else: 359 self._validate_fragment(fragments) 360 361 self._items[idx] = fragments
362
363 - def __delitem__(self, idx):
364 # note that this may result in an empty HSP object, which should be 365 # invalid 366 del self._items[idx]
367
368 - def _validate_fragment(self, fragment):
369 if not isinstance(fragment, HSPFragment): 370 raise TypeError("HSP objects can only contain HSPFragment " 371 "objects.") 372 # HACK: to make validation during __init__ work 373 if self._items: 374 if fragment.hit_id != self.hit_id: 375 raise ValueError("Expected HSPFragment with hit ID %r, " 376 "found %r instead." % (self.id, fragment.hit_id)) 377 378 if fragment.hit_description != self.hit_description: 379 raise ValueError("Expected HSPFragment with hit " 380 "description %r, found %r instead." % 381 (self.description, fragment.hit_description)) 382 383 if fragment.query_id != self.query_id: 384 raise ValueError("Expected HSPFragment with query ID %r, " 385 "found %r instead." % (self.query_id, 386 fragment.query_id)) 387 388 if fragment.query_description != self.query_description: 389 raise ValueError("Expected HSP with query description %r, " 390 "found %r instead." % (self.query_description, 391 fragment.query_description))
392
393 - def _aln_span_get(self):
394 # length of all alignments 395 # alignment span can be its own attribute, or computed from 396 # query / hit length 397 return sum(frg.aln_span for frg in self.fragments)
398 399 aln_span = property(fget=_aln_span_get, 400 doc="""Total number of columns in all HSPFragment objects.""") 401 402 # coordinate properties #
403 - def _get_coords(self, seq_type, coord_type):
404 assert seq_type in ('hit', 'query') 405 assert coord_type in ('start', 'end') 406 coord_name = '%s_%s' % (seq_type, coord_type) 407 coords = [getattr(frag, coord_name) for frag in self.fragments] 408 if None in coords: 409 warnings.warn("'None' exist in %s coordinates; ignored" % 410 (coord_name), BiopythonWarning) 411 return coords
412
413 - def _hit_start_get(self):
414 return min(self._get_coords('hit', 'start'))
415 416 hit_start = property(fget=_hit_start_get, 417 doc="""Smallest coordinate value of all hit fragments""") 418
419 - def _query_start_get(self):
420 return min(self._get_coords('query', 'start'))
421 422 query_start = property(fget=_query_start_get, 423 doc="""Smallest coordinate value of all query fragments""") 424
425 - def _hit_end_get(self):
426 return max(self._get_coords('hit', 'end'))
427 428 hit_end = property(fget=_hit_end_get, 429 doc="""Largest coordinate value of all hit fragments""") 430
431 - def _query_end_get(self):
432 return max(self._get_coords('query', 'end'))
433 434 query_end = property(fget=_query_end_get, 435 doc="""Largest coordinate value of all hit fragments""") 436 437 # coordinate-dependent properties #
438 - def _hit_span_get(self):
439 try: 440 return self.hit_end - self.hit_start 441 except TypeError: # triggered if any of the coordinates are None 442 return None
443 444 hit_span = property(fget=_hit_span_get, 445 doc="""The number of hit residues covered by the HSP.""") 446
447 - def _query_span_get(self):
448 try: 449 return self.query_end - self.query_start 450 except TypeError: # triggered if any of the coordinates are None 451 return None
452 453 query_span = property(fget=_query_span_get, 454 doc="""The number of query residues covered by the HSP.""") 455
456 - def _hit_range_get(self):
457 return (self.hit_start, self.hit_end)
458 459 hit_range = property(fget=_hit_range_get, 460 doc="""Tuple of HSP hit start and end coordinates.""") 461
462 - def _query_range_get(self):
463 return (self.query_start, self.query_end)
464 465 query_range = property(fget=_query_range_get, 466 doc="""Tuple of HSP query start and end coordinates.""") 467
468 - def _inter_ranges_get(self, seq_type):
469 # this property assumes that there are no mixed strands in a hit/query 470 assert seq_type in ('query', 'hit') 471 strand = getattr(self, '%s_strand_all' % seq_type)[0] 472 coords = getattr(self, '%s_range_all' % seq_type) 473 # determine function used to set inter range 474 # start and end coordinates, given two pairs 475 # of fragment start and end coordinates 476 if strand == -1: 477 startfunc, endfunc = min, max 478 else: 479 startfunc, endfunc = max, min 480 inter_coords = [] 481 for idx, coord in enumerate(coords[:-1]): 482 start = startfunc(coords[idx]) 483 end = endfunc(coords[idx + 1]) 484 inter_coords.append((min(start, end), max(start, end))) 485 486 return inter_coords
487
488 - def _hit_inter_ranges_get(self):
489 return self._inter_ranges_get('hit')
490 491 hit_inter_ranges = property(fget=_hit_inter_ranges_get, 492 doc="""Hit sequence coordinates of the regions between fragments""") 493
494 - def _query_inter_ranges_get(self):
495 return self._inter_ranges_get('query')
496 497 query_inter_ranges = property(fget=_query_inter_ranges_get, 498 doc="""Query sequence coordinates of the regions between fragments""") 499
500 - def _inter_spans_get(self, seq_type):
501 assert seq_type in ('query', 'hit') 502 attr_name = '%s_inter_ranges' % seq_type 503 return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
504
505 - def _hit_inter_spans_get(self):
506 return self._inter_spans_get('hit')
507 508 hit_inter_spans = property(fget=_hit_inter_spans_get, 509 doc="""Lengths of regions between hit fragments""") 510
511 - def _query_inter_spans_get(self):
512 return self._inter_spans_get('query')
513 514 query_inter_spans = property(fget=_query_inter_spans_get, 515 doc="""Lengths of regions between query fragments""") 516 517 # shortcuts for fragments' properties # 518 519 # bool check if there's more than one fragments 520 is_fragmented = property(lambda self: len(self) > 1, 521 doc="""Whether the HSP has more than one HSPFragment objects""") 522 523 # first item properties with setters 524 hit_description = fullcascade('hit_description', 525 doc="""Description of the hit sequence""") 526 527 query_description = fullcascade('query_description', 528 doc="""Description of the query sequence""") 529 530 hit_id = fullcascade('hit_id', 531 doc="""ID of the hit sequence""") 532 533 query_id = fullcascade('query_id', 534 doc="""ID of the query sequence""") 535 536 alphabet = fullcascade('alphabet', 537 doc="""Alphabet used in hit and query SeqRecord objects""") 538 539 # properties for single-fragment HSPs 540 fragment = singleitem( 541 doc="""HSPFragment object, first fragment""") 542 543 hit = singleitem('hit', 544 doc="""Hit sequence as a SeqRecord object, first fragment""") 545 546 query = singleitem('query', 547 doc="""Query sequence as a SeqRecord object, first fragment""") 548 549 aln = singleitem('aln', 550 doc="""Alignment of the first fragment as a MultipleSeqAlignment object""") 551 552 aln_annotation = singleitem('aln_annotation', 553 doc="""Dictionary of annotation(s) of the first fragment's alignment""") 554 555 hit_features = singleitem('hit_features', 556 doc="""Hit sequence features, first fragment""") 557 558 query_features = singleitem('query_features', 559 doc="""Query sequence features, first fragment""") 560 561 hit_strand = singleitem('hit_strand', 562 doc="""Hit strand orientation, first fragment""") 563 564 query_strand = singleitem('query_strand', 565 doc="""Query strand orientation, first fragment""") 566 567 hit_frame = singleitem('hit_frame', 568 doc="""Hit sequence reading frame, first fragment""") 569 570 query_frame = singleitem('query_frame', 571 doc="""Query sequence reading frame, first fragment""") 572 573 # properties for multi-fragment HSPs 574 fragments = allitems(doc="""List of all HSPFragment objects""") 575 576 hit_all = allitems('hit', 577 doc="""List of all fragments' hit sequences as SeqRecord objects""") 578 579 query_all = allitems('query', 580 doc="""List of all fragments' query sequences as SeqRecord objects""") 581 582 aln_all = allitems('aln', 583 doc="""List of all fragments' alignments as MultipleSeqAlignment objects""") 584 585 aln_annotation_all = allitems('aln_annotation', 586 doc="""Dictionary of annotation(s) of all fragments' alignments""") 587 588 hit_features_all = allitems('hit_features', 589 doc="""List of all hit sequence features""") 590 591 query_features_all = allitems('query_features', 592 doc="""List of all query sequence features""") 593 594 hit_strand_all = allitems('hit_strand', 595 doc="""List of all fragments' hit sequence strands""") 596 597 query_strand_all = allitems('query_strand', 598 doc="""List of all fragments' query sequence strands""") 599 600 hit_frame_all = allitems('hit_frame', 601 doc="""List of all fragments' hit sequence reading frames""") 602 603 query_frame_all = allitems('query_frame', 604 doc="""List of all fragments' query sequence reading frames""") 605 606 hit_start_all = allitems('hit_start', 607 doc="""List of all fragments' hit start coordinates""") 608 609 query_start_all = allitems('query_start', 610 doc="""List of all fragments' query start coordinates""") 611 612 hit_end_all = allitems('hit_end', 613 doc="""List of all fragments' hit end coordinates""") 614 615 query_end_all = allitems('query_end', 616 doc="""List of all fragments' query end coordinates""") 617 618 hit_span_all = allitems('hit_span', 619 doc="""List of all fragments' hit sequence size""") 620 621 query_span_all = allitems('query_span', 622 doc="""List of all fragments' query sequence size""") 623 624 hit_range_all = allitems('hit_range', 625 doc="""List of all fragments' hit start and end coordinates""") 626 627 query_range_all = allitems('query_range', 628 doc="""List of all fragments' query start and end coordinates""")
629 630
631 -class HSPFragment(_BaseHSP):
632 """Class representing a contiguous alignment of hit-query sequence. 633 634 HSPFragment forms the core of any parsed search output file. Depending on 635 the search output file format, it may contain the actual query and/or hit 636 sequences that produces the search hits. These sequences are stored as 637 SeqRecord objects (see SeqRecord): 638 639 >>> from Bio import SearchIO 640 >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 641 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 642 >>> print(fragment) 643 Query: 33211 mir_1 644 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 645 Query range: [0:61] (1) 646 Hit range: [0:61] (1) 647 Fragments: 1 (61 columns) 648 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 649 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 650 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 651 652 # the query sequence is a SeqRecord object 653 >>> fragment.query.__class__ 654 <class 'Bio.SeqRecord.SeqRecord'> 655 >>> print(fragment.query) 656 ID: 33211 657 Name: aligned query sequence 658 Description: mir_1 659 Number of features: 0 660 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 661 662 # the hit sequence is a SeqRecord object as well 663 >>> fragment.hit.__class__ 664 <class 'Bio.SeqRecord.SeqRecord'> 665 >>> print(fragment.hit) 666 ID: gi|262205317|ref|NR_030195.1| 667 Name: aligned hit sequence 668 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 669 Number of features: 0 670 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 671 672 # when both query and hit are present, we get a MultipleSeqAlignment object 673 >>> fragment.aln.__class__ 674 <class 'Bio.Align.MultipleSeqAlignment'> 675 >>> print(fragment.aln) 676 DNAAlphabet() alignment with 2 rows and 61 columns 677 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 678 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 679 680 """ 681
682 - def __init__(self, hit_id='<unknown id>', query_id='<unknown id>', 683 hit=None, query=None, alphabet=single_letter_alphabet):
684 685 self._alphabet = alphabet 686 self.aln_annotation = {} 687 688 self._hit_id = hit_id 689 self._query_id = query_id 690 691 for seq_type in ('query', 'hit'): 692 # query or hit attributes default attributes 693 setattr(self, '_%s_description' % seq_type, '<unknown description>') 694 setattr(self, '_%s_features' % seq_type, []) 695 # query or hit attributes whose default attribute is None 696 for attr in ('strand', 'frame', 'start', 'end'): 697 setattr(self, '%s_%s' % (seq_type, attr), None) 698 # self.query or self.hit 699 if eval(seq_type): 700 setattr(self, seq_type, eval(seq_type)) 701 else: 702 setattr(self, seq_type, None)
703
704 - def __repr__(self):
705 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 706 try: 707 info += ", %i columns" % len(self) 708 except AttributeError: 709 pass 710 return "%s(%s)" % (self.__class__.__name__, info)
711
712 - def __len__(self):
713 return self.aln_span
714
715 - def __str__(self):
716 return self._str_hsp_header() + '\n' + self._str_aln()
717
718 - def __getitem__(self, idx):
719 if self.aln is not None: 720 obj = self.__class__( 721 hit_id=self.hit_id, query_id=self.query_id, 722 alphabet=self.alphabet) 723 # transfer query and hit attributes 724 # let SeqRecord handle feature slicing, then retrieve the sliced 725 # features into the sliced HSPFragment 726 if self.query is not None: 727 obj.query = self.query[idx] 728 obj.query_features = obj.query.features 729 if self.hit is not None: 730 obj.hit = self.hit[idx] 731 obj.hit_features = obj.hit.features 732 # description, strand, frame 733 for attr in ('description', 'strand', 'frame'): 734 for seq_type in ('hit', 'query'): 735 attr_name = '%s_%s' % (seq_type, attr) 736 self_val = getattr(self, attr_name) 737 setattr(obj, attr_name, self_val) 738 # alignment annotation should be transferred, since we can compute 739 # the resulting annotation 740 obj.aln_annotation = {} 741 for key, value in self.aln_annotation.items(): 742 assert len(value[idx]) == len(obj) 743 obj.aln_annotation[key] = value[idx] 744 return obj 745 else: 746 raise TypeError("Slicing for HSP objects without " 747 "alignment is not supported.")
748
749 - def _str_aln(self):
750 lines = [] 751 # alignment length 752 aln_span = getattr_str(self, 'aln_span') 753 lines.append(' Fragments: 1 (%s columns)' % aln_span) 754 # sequences 755 if self.query is not None and self.hit is not None: 756 try: 757 qseq = str(self.query.seq) 758 except AttributeError: # query is None 759 qseq = '?' 760 try: 761 hseq = str(self.hit.seq) 762 except AttributeError: # hit is None 763 hseq = '?' 764 765 # similarity line 766 simil = '' 767 if 'similarity' in self.aln_annotation and \ 768 isinstance(self.aln_annotation.get('similarity'), basestring): 769 simil = self.aln_annotation['similarity'] 770 771 if self.aln_span <= 67: 772 lines.append("%10s - %s" % ('Query', qseq)) 773 if simil: 774 lines.append(" %s" % simil) 775 lines.append("%10s - %s" % ('Hit', hseq)) 776 else: 777 # adjust continuation character length, so we don't display 778 # the same residues twice 779 if self.aln_span - 66 > 3: 780 cont = '~' * 3 781 else: 782 cont = '~' * (self.aln_span - 66) 783 lines.append("%10s - %s%s%s" % ('Query', 784 qseq[:59], cont, qseq[-5:])) 785 if simil: 786 lines.append(" %s%s%s" % 787 (simil[:59], cont, simil[-5:])) 788 lines.append("%10s - %s%s%s" % ('Hit', 789 hseq[:59], cont, hseq[-5:])) 790 791 return '\n'.join(lines)
792 793 # sequence properties #
794 - def _set_seq(self, seq, seq_type):
795 """Checks the given sequence for attribute setting 796 797 :param seq: sequence to check 798 :type seq: string or SeqRecord 799 :param seq_type: sequence type 800 :type seq_type: string, choice of 'hit' or 'query' 801 802 """ 803 assert seq_type in ('hit', 'query') 804 if seq is None: 805 return seq # return immediately if seq is None 806 else: 807 if not isinstance(seq, (basestring, SeqRecord)): 808 raise TypeError("%s sequence must be a string or a SeqRecord" 809 " object." % seq_type) 810 # check length if the opposite sequence is not None 811 opp_type = 'hit' if seq_type == 'query' else 'query' 812 opp_seq = getattr(self, '_%s' % opp_type, None) 813 if opp_seq is not None: 814 if len(seq) != len(opp_seq): 815 raise ValueError("Sequence lengths do not match. Expected: " 816 "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type, 817 len(seq), seq_type)) 818 819 seq_id = getattr(self, '%s_id' % seq_type) 820 seq_desc = getattr(self, '%s_description' % seq_type) 821 seq_feats = getattr(self, '%s_features' % seq_type) 822 seq_name = 'aligned %s sequence' % seq_type 823 824 if isinstance(seq, SeqRecord): 825 seq.id = seq_id 826 seq.description = seq_desc 827 seq.name = seq_name 828 seq.features = seq_feats 829 seq.seq.alphabet = self.alphabet 830 elif isinstance(seq, basestring): 831 seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name, 832 description=seq_desc, features=seq_feats) 833 834 return seq
835
836 - def _hit_get(self):
837 return self._hit
838
839 - def _hit_set(self, value):
840 self._hit = self._set_seq(value, 'hit')
841 842 hit = property(fget=_hit_get, fset=_hit_set, 843 doc="""Hit sequence as a SeqRecord object, defaults to None""") 844
845 - def _query_get(self):
846 return self._query
847
848 - def _query_set(self, value):
849 self._query = self._set_seq(value, 'query')
850 851 query = property(fget=_query_get, fset=_query_set, 852 doc="""Query sequence as a SeqRecord object, defaults to None""") 853
854 - def _aln_get(self):
855 if self.query is None and self.hit is None: 856 return None 857 elif self.hit is None: 858 return MultipleSeqAlignment([self.query], self.alphabet) 859 elif self.query is None: 860 return MultipleSeqAlignment([self.hit], self.alphabet) 861 else: 862 return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
863 864 aln = property(fget=_aln_get, 865 doc="""Query-hit alignment as a MultipleSeqAlignment object, 866 defaults to None""") 867
868 - def _alphabet_get(self):
869 return self._alphabet
870
871 - def _alphabet_set(self, value):
872 self._alphabet = value 873 try: 874 self.query.seq.alphabet = value 875 except AttributeError: 876 pass 877 try: 878 self.hit.seq.alphabet = value 879 except AttributeError: 880 pass
881 882 alphabet = property(fget=_alphabet_get, fset=_alphabet_set, 883 doc="""Alphabet object used in the fragment's sequences and alignment, 884 defaults to single_letter_alphabet""") 885
886 - def _aln_span_get(self):
887 # length of alignment (gaps included) 888 # alignment span can be its own attribute, or computed from 889 # query / hit length 890 if not hasattr(self, '_aln_span'): 891 if self.query is not None: 892 self._aln_span = len(self.query) 893 elif self.hit is not None: 894 self._aln_span = len(self.hit) 895 896 return self._aln_span
897
898 - def _aln_span_set(self, value):
899 self._aln_span = value
900 901 aln_span = property(fget=_aln_span_get, fset=_aln_span_set, 902 doc="""The number of alignment columns covered by the fragment""") 903 904 # id, description, and features properties # 905 hit_description = fragcascade('description', 'hit', 906 doc="""Hit sequence description""") 907 908 query_description = fragcascade('description', 'query', 909 doc="""Query sequence description""") 910 911 hit_id = fragcascade('id', 'hit', 912 doc="""Hit sequence ID""") 913 914 query_id = fragcascade('id', 'query', 915 doc="""Query sequence ID""") 916 917 hit_features = fragcascade('features', 'hit', 918 doc="""Hit sequence features""") 919 920 query_features = fragcascade('features', 'query', 921 doc="""Query sequence features""") 922 923 # strand properties #
924 - def _prep_strand(self, strand):
925 # follow SeqFeature's convention 926 if strand not in (-1, 0, 1, None): 927 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % 928 strand) 929 return strand
930
931 - def _get_strand(self, seq_type):
932 assert seq_type in ('hit', 'query') 933 strand = getattr(self, '_%s_strand' % seq_type) 934 935 if strand is None: 936 # try to compute strand from frame 937 frame = getattr(self, '%s_frame' % seq_type) 938 if frame is not None: 939 try: 940 strand = frame // abs(frame) 941 except ZeroDivisionError: 942 strand = 0 943 setattr(self, '%s_strand' % seq_type, strand) 944 945 return strand
946
947 - def _hit_strand_get(self):
948 return self._get_strand('hit')
949
950 - def _hit_strand_set(self, value):
951 self._hit_strand = self._prep_strand(value)
952 953 hit_strand = property(fget=_hit_strand_get, fset=_hit_strand_set, 954 doc="""Hit sequence strand, defaults to None""") 955
956 - def _query_strand_get(self):
957 return self._get_strand('query')
958
959 - def _query_strand_set(self, value):
960 self._query_strand = self._prep_strand(value)
961 962 query_strand = property(fget=_query_strand_get, fset=_query_strand_set, 963 doc="""Query sequence strand, defaults to None""") 964 965 # frame properties #
966 - def _prep_frame(self, frame):
967 if frame not in (-3, -2, -1, 0, 1, 2, 3, None): 968 raise ValueError("Strand should be an integer between -3 and 3, " 969 "or None; not %r" % frame) 970 return frame
971
972 - def _hit_frame_get(self):
973 return self._hit_frame
974
975 - def _hit_frame_set(self, value):
976 self._hit_frame = self._prep_frame(value)
977 978 hit_frame = property(fget=_hit_frame_get, fset=_hit_frame_set, 979 doc="""Hit sequence reading frame, defaults to None""") 980
981 - def _query_frame_get(self):
982 return self._query_frame
983
984 - def _query_frame_set(self, value):
985 self._query_frame = self._prep_frame(value)
986 987 query_frame = property(fget=_query_frame_get, fset=_query_frame_set, 988 doc="""Query sequence reading frame, defaults to None""") 989 990 # coordinate properties #
991 - def _prep_coord(self, coord, opp_coord_name, op):
992 # coord must either be None or int 993 if coord is None: 994 return coord 995 assert isinstance(coord, int) 996 # try to get opposite coordinate, if it's not present, return 997 try: 998 opp_coord = getattr(self, opp_coord_name) 999 except AttributeError: 1000 return coord 1001 # if opposite coordinate is None, return 1002 if opp_coord is None: 1003 return coord 1004 # otherwise compare it to coord ('>=' or '<=') 1005 else: 1006 assert op(coord, opp_coord) 1007 return coord
1008
1009 - def _hit_start_get(self):
1010 return self._hit_start
1011
1012 - def _hit_start_set(self, value):
1013 self._hit_start = self._prep_coord(value, 'hit_end', le)
1014 1015 hit_start = property(fget=_hit_start_get, fset=_hit_start_set, 1016 doc="""Hit sequence start coordinate, defaults to None""") 1017
1018 - def _query_start_get(self):
1019 return self._query_start
1020
1021 - def _query_start_set(self, value):
1022 self._query_start = self._prep_coord(value, 'query_end', le)
1023 1024 query_start = property(fget=_query_start_get, fset=_query_start_set, 1025 doc="""Query sequence start coordinate, defaults to None""") 1026
1027 - def _hit_end_get(self):
1028 return self._hit_end
1029
1030 - def _hit_end_set(self, value):
1031 self._hit_end = self._prep_coord(value, 'hit_start', ge)
1032 1033 hit_end = property(fget=_hit_end_get, fset=_hit_end_set, 1034 doc="""Hit sequence start coordinate, defaults to None""") 1035
1036 - def _query_end_get(self):
1037 return self._query_end
1038
1039 - def _query_end_set(self, value):
1040 self._query_end = self._prep_coord(value, 'query_start', ge)
1041 1042 query_end = property(fget=_query_end_get, fset=_query_end_set, 1043 doc="""Query sequence end coordinate, defaults to None""") 1044 1045 # coordinate-dependent properties #
1046 - def _hit_span_get(self):
1047 try: 1048 return self.hit_end - self.hit_start 1049 except TypeError: # triggered if any of the coordinates are None 1050 return None
1051 1052 hit_span = property(fget=_hit_span_get, 1053 doc="""The number of residues covered by the hit sequence""") 1054
1055 - def _query_span_get(self):
1056 try: 1057 return self.query_end - self.query_start 1058 except TypeError: # triggered if any of the coordinates are None 1059 return None
1060 1061 query_span = property(fget=_query_span_get, 1062 doc="""The number of residues covered by the query sequence""") 1063
1064 - def _hit_range_get(self):
1065 return (self.hit_start, self.hit_end)
1066 1067 hit_range = property(fget=_hit_range_get, 1068 doc="""Tuple of hit start and end coordinates""") 1069
1070 - def _query_range_get(self):
1071 return (self.query_start, self.query_end)
1072 1073 query_range = property(fget=_query_range_get, 1074 doc="""Tuple of query start and end coordinates""")
1075 1076 1077 # if not used as a module, run the doctest 1078 if __name__ == "__main__": 1079 from Bio._utils import run_doctest 1080 run_doctest() 1081