Package Bio :: Package SearchIO :: Package _model :: Module hsp
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO._model.hsp

   1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Bio.SearchIO objects to model high scoring regions between query and hit.""" 
   7   
   8  from __future__ import print_function 
   9  from Bio._py3k import basestring 
  10   
  11  import warnings 
  12  from operator import ge, le 
  13   
  14  from Bio import BiopythonWarning 
  15  from Bio.Align import MultipleSeqAlignment 
  16  from Bio.Alphabet import single_letter_alphabet 
  17  from Bio.Seq import Seq 
  18  from Bio.SeqRecord import SeqRecord 
  19   
  20  from Bio._utils import getattr_str, trim_str 
  21  from Bio.SearchIO._utils import singleitem, allitems, fullcascade, \ 
  22          fragcascade 
  23   
  24  from ._base import _BaseHSP 
  25   
  26   
27 -class HSP(_BaseHSP):
28 29 """Class representing high-scoring region(s) between query and hit. 30 31 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 32 In most cases, HSP objects store the bulk of the statistics and results 33 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 34 program. 35 36 Depending on the search output file format, a given HSP will contain one 37 or more HSPFragment object(s). Examples of search programs that produce HSP 38 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 39 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 40 However, their native terminologies may differ: in BLAT these fragments 41 are called 'blocks' while in Exonerate they are called exons or NER. 42 43 Here are examples from each type of HSP. The first one comes from a BLAST 44 search:: 45 46 >>> from Bio import SearchIO 47 >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 48 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 49 >>> blast_hsp 50 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 51 >>> print(blast_hsp) 52 Query: 33211 mir_1 53 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 54 Query range: [1:61] (1) 55 Hit range: [0:60] (1) 56 Quick stats: evalue 1.7e-22; bitscore 109.49 57 Fragments: 1 (60 columns) 58 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 59 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 60 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 61 62 For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the 63 underlying sequence alignment, if it exists. This is not the case for HSPs 64 with more than one HSPFragment. Below is an example, using an HSP from a 65 BLAT search. Invoking ``print`` on these HSPs will instead show a table of the 66 HSPFragment objects it contains:: 67 68 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 69 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 70 >>> blat_hsp 71 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 72 >>> print(blat_hsp) 73 Query: blat_1 <unknown description> 74 Hit: chr11 <unknown description> 75 Query range: [42:67] (-1) 76 Hit range: [59018929:59018955] (1) 77 Quick stats: evalue ?; bitscore ? 78 Fragments: --- -------------- ---------------------- ---------------------- 79 # Span Query range Hit range 80 --- -------------- ---------------------- ---------------------- 81 0 6 [61:67] [59018929:59018935] 82 1 16 [42:58] [59018939:59018955] 83 84 Notice that in HSPs with more than one HSPFragments, the HSP's ``query_range`` 85 ``hit_range`` properties encompasses all fragments it contains. 86 87 You can check whether an HSP has more than one HSPFragments or not using the 88 ``is_fragmented`` property:: 89 90 >>> blast_hsp.is_fragmented 91 False 92 >>> blat_hsp.is_fragmented 93 True 94 95 Since HSP objects are also containers similar to Python lists, you can 96 access a single fragment in an HSP using its integer index:: 97 98 >>> blat_fragment = blat_hsp[0] 99 >>> print(blat_fragment) 100 Query: blat_1 <unknown description> 101 Hit: chr11 <unknown description> 102 Query range: [61:67] (-1) 103 Hit range: [59018929:59018935] (1) 104 Fragments: 1 (6 columns) 105 Query - tatagt 106 Hit - tatagt 107 108 This applies to HSPs objects with a single fragment as well:: 109 110 >>> blast_fragment = blast_hsp[0] 111 >>> print(blast_fragment) 112 Query: 33211 mir_1 113 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 114 Query range: [1:61] (1) 115 Hit range: [0:60] (1) 116 Fragments: 1 (60 columns) 117 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 118 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 119 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 120 121 Regardless of the search output file format, HSP objects provide the 122 properties listed below. These properties always return values in a list, 123 due to the HSP object itself being a list-like container. However, for 124 HSP objects with a single HSPFragment, shortcut properties that fetches 125 the item from the list are also provided. 126 127 +----------------------+---------------------+-----------------------------+ 128 | Property | Shortcut | Value | 129 +======================+=====================+=============================+ 130 | aln_all | aln | HSP alignments as | 131 | | | MultipleSeqAlignment object | 132 +----------------------+---------------------+-----------------------------+ 133 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 134 | | | of all fragments' alignments| 135 +----------------------+---------------------+-----------------------------+ 136 | fragments | fragment | HSPFragment objects | 137 +----------------------+---------------------+-----------------------------+ 138 | hit_all | hit | hit sequence as SeqRecord | 139 | | | objects | 140 +----------------------+---------------------+-----------------------------+ 141 | hit_features_all | hit_features | SeqFeatures of all hit | 142 | | | fragments | 143 +----------------------+---------------------+-----------------------------+ 144 | hit_start_all | hit_start* | start coordinates of the | 145 | | | hit fragments | 146 +----------------------+---------------------+-----------------------------+ 147 | hit_end_all | hit_end* | end coordinates of the hit | 148 | | | fragments | 149 +----------------------+---------------------+-----------------------------+ 150 | hit_span_all | hit_span* | sizes of each hit fragments | 151 +----------------------+---------------------+-----------------------------+ 152 | hit_strand_all | hit_strand | strand orientations of the | 153 | | | hit fragments | 154 +----------------------+---------------------+-----------------------------+ 155 | hit_frame_all | hit_frame | reading frames of the hit | 156 | | | fragments | 157 +----------------------+---------------------+-----------------------------+ 158 | hit_range_all | hit_range | tuples of start and end | 159 | | | coordinates of each hit | 160 | | | fragment | 161 +----------------------+---------------------+-----------------------------+ 162 | query_all | query | query sequence as SeqRecord | 163 | | | object | 164 +----------------------+---------------------+-----------------------------+ 165 | query_features_all | query_features | SeqFeatures of all query | 166 | | | fragments | 167 +----------------------+---------------------+-----------------------------+ 168 | query_start_all | query_start* | start coordinates of the | 169 | | | fragments | 170 +----------------------+---------------------+-----------------------------+ 171 | query_end_all | query_end* | end coordinates of the | 172 | | | query fragments | 173 +----------------------+---------------------+-----------------------------+ 174 | query_span_all | query_span* | sizes of each query | 175 | | | fragments | 176 +----------------------+---------------------+-----------------------------+ 177 | query_strand_all | query_strand | strand orientations of the | 178 | | | query fragments | 179 +----------------------+---------------------+-----------------------------+ 180 | query_frame_all | query_frame | reading frames of the query | 181 | | | fragments | 182 +----------------------+---------------------+-----------------------------+ 183 | query_range_all | query_range | tuples of start and end | 184 | | | coordinates of each query | 185 | | | fragment | 186 +----------------------+---------------------+-----------------------------+ 187 188 For all types of HSP objects, the property will return the values in a list. 189 Shorcuts are only applicable for HSPs with one fragment. Except the ones 190 noted, if they are used on an HSP with more than one fragments, an exception 191 will be raised. 192 193 For properties that may be used in HSPs with multiple or single fragments 194 (``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends 195 on how many fragment the HSP has: 196 197 +------------+---------------------------------------------------+ 198 | Property | Value | 199 +============+===================================================+ 200 | hit_start | smallest coordinate value of all hit fragments | 201 +------------+---------------------------------------------------+ 202 | hit_end | largest coordinate value of all hit fragments | 203 +------------+---------------------------------------------------+ 204 | hit_span | difference between ``hit_start`` and ``hit_end`` | 205 +------------+---------------------------------------------------+ 206 | query_start| smallest coordinate value of all query fragments | 207 +------------+---------------------------------------------------+ 208 | query_end | largest coordinate value of all query fragments | 209 +------------+---------------------------------------------------+ 210 | query_span | difference between ``query_start`` and | 211 | | ``query_end`` | 212 +------------+---------------------------------------------------+ 213 214 In addition to the objects listed above, HSP objects also provide the 215 following properties: 216 217 +--------------------+------------------------------------------------------+ 218 | Property | Value | 219 +====================+======================================================+ 220 | aln_span | total number of residues in all HSPFragment objects | 221 +--------------------+------------------------------------------------------+ 222 | alphabet | alphabet used in hit and query SeqRecord objects | 223 +--------------------+------------------------------------------------------+ 224 | is_fragmented | boolean, whether there are multiple fragments or not | 225 +--------------------+------------------------------------------------------+ 226 | hit_id | ID of the hit sequence | 227 +--------------------+------------------------------------------------------+ 228 | hit_description | description of the hit sequence | 229 +--------------------+------------------------------------------------------+ 230 | hit_inter_ranges | list of hit sequence coordinates of the regions | 231 | | between fragments | 232 +--------------------+------------------------------------------------------+ 233 | hit_inter_spans | list of lengths of the regions between hit fragments | 234 +--------------------+------------------------------------------------------+ 235 | query_id | ID of the query sequence | 236 +--------------------+------------------------------------------------------+ 237 | query_description | description of the query sequence | 238 +--------------------+------------------------------------------------------+ 239 | query_inter_ranges | list of query sequence coordinates of the regions | 240 | | between fragments | 241 +--------------------+------------------------------------------------------+ 242 | query_inter_spans | list of lengths of the regions between query | 243 | | fragments | 244 +--------------------+------------------------------------------------------+ 245 246 .. [1] may be used in HSPs with multiple fragments 247 248 """ 249 # attributes we don't want to transfer when creating a new Hit class 250 # from this one 251 _NON_STICKY_ATTRS = ('_items', ) 252
253 - def __init__(self, fragments=()):
254 """Initializes an HSP object. 255 256 :param fragments: fragments contained in the HSP object 257 :type fragments: iterable yielding HSPFragment 258 259 HSP objects must be initialized with a list containing at least one 260 HSPFragment object. If multiple HSPFragment objects are used for 261 initialization, they must all have the same ``query_id``, 262 ``query_description``, ``hit_id``, ``hit_description``, and alphabet 263 properties. 264 265 """ 266 if not fragments: 267 raise ValueError("HSP objects must have at least one HSPFragment " 268 "object.") 269 # TODO - Move this into the for look in case hsps is a single use 270 # iterable? 271 # check that all fragments contain the same IDs, descriptions, alphabet 272 for attr in ('query_id', 'query_description', 'hit_id', 273 'hit_description', 'alphabet'): 274 if len(set(getattr(frag, attr) for frag in fragments)) != 1: 275 raise ValueError("HSP object can not contain fragments with " 276 "more than one %s." % attr) 277 278 self._items = [] 279 for fragment in fragments: 280 self._validate_fragment(fragment) 281 self._items.append(fragment)
282
283 - def __repr__(self):
284 return "%s(hit_id=%r, query_id=%r, %r fragments)" % \ 285 (self.__class__.__name__, self.hit_id, self.query_id, len(self))
286
287 - def __iter__(self):
288 return iter(self._items)
289
290 - def __contains__(self, fragment):
291 return fragment in self._items
292
293 - def __len__(self):
294 return len(self._items)
295 296 # Python 3:
297 - def __bool__(self):
298 return bool(self._items)
299 300 # Python 2: 301 __nonzero__ = __bool__ 302
303 - def __str__(self):
304 305 lines = [] 306 # set hsp info line 307 statline = [] 308 # evalue 309 evalue = getattr_str(self, 'evalue', fmt='%.2g') 310 statline.append('evalue ' + evalue) 311 # bitscore 312 bitscore = getattr_str(self, 'bitscore', fmt='%.2f') 313 statline.append('bitscore ' + bitscore) 314 lines.append('Quick stats: ' + '; '.join(statline)) 315 316 if len(self.fragments) == 1: 317 return '\n'.join([self._str_hsp_header(), '\n'.join(lines), 318 self.fragments[0]._str_aln()]) 319 else: 320 lines.append(' Fragments: %s %s %s %s' % 321 ('-' * 3, '-' * 14, '-' * 22, '-' * 22)) 322 pattern = '%16s %14s %22s %22s' 323 lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range')) 324 lines.append(pattern % ('-' * 3, '-' * 14, '-' * 22, '-' * 22)) 325 for idx, block in enumerate(self.fragments): 326 # set hsp line and table 327 # alignment span 328 aln_span = getattr_str(block, 'aln_span') 329 # query region 330 query_start = getattr_str(block, 'query_start') 331 query_end = getattr_str(block, 'query_end') 332 query_range = '[%s:%s]' % (query_start, query_end) 333 # max column length is 20 334 query_range = trim_str(query_range, 22, '~]') 335 # hit region 336 hit_start = getattr_str(block, 'hit_start') 337 hit_end = getattr_str(block, 'hit_end') 338 hit_range = '[%s:%s]' % (hit_start, hit_end) 339 hit_range = trim_str(hit_range, 22, '~]') 340 # append the hsp row 341 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 342 343 return self._str_hsp_header() + '\n' + '\n'.join(lines)
344
345 - def __getitem__(self, idx):
346 # if key is slice, return a new HSP instance 347 if isinstance(idx, slice): 348 obj = self.__class__(self._items[idx]) 349 self._transfer_attrs(obj) 350 return obj 351 return self._items[idx]
352
353 - def __setitem__(self, idx, fragments):
354 # handle case if hsps is a list of hsp 355 if isinstance(fragments, (list, tuple)): 356 for fragment in fragments: 357 self._validate_fragment(fragment) 358 else: 359 self._validate_fragment(fragments) 360 361 self._items[idx] = fragments
362
363 - def __delitem__(self, idx):
364 # note that this may result in an empty HSP object, which should be 365 # invalid 366 del self._items[idx]
367
368 - def _validate_fragment(self, fragment):
369 if not isinstance(fragment, HSPFragment): 370 raise TypeError("HSP objects can only contain HSPFragment " 371 "objects.") 372 # HACK: to make validation during __init__ work 373 if self._items: 374 if fragment.hit_id != self.hit_id: 375 raise ValueError("Expected HSPFragment with hit ID %r, " 376 "found %r instead." % (self.id, fragment.hit_id)) 377 378 if fragment.hit_description != self.hit_description: 379 raise ValueError("Expected HSPFragment with hit " 380 "description %r, found %r instead." % 381 (self.description, fragment.hit_description)) 382 383 if fragment.query_id != self.query_id: 384 raise ValueError("Expected HSPFragment with query ID %r, " 385 "found %r instead." % (self.query_id, 386 fragment.query_id)) 387 388 if fragment.query_description != self.query_description: 389 raise ValueError("Expected HSP with query description %r, " 390 "found %r instead." % (self.query_description, 391 fragment.query_description))
392
393 - def _aln_span_get(self):
394 # length of all alignments 395 # alignment span can be its own attribute, or computed from 396 # query / hit length 397 return sum(frg.aln_span for frg in self.fragments)
398 399 aln_span = property(fget=_aln_span_get, 400 doc="""Total number of columns in all HSPFragment objects.""") 401 402 # coordinate properties #
403 - def _get_coords(self, seq_type, coord_type):
404 assert seq_type in ('hit', 'query') 405 assert coord_type in ('start', 'end') 406 coord_name = '%s_%s' % (seq_type, coord_type) 407 coords = [getattr(frag, coord_name) for frag in self.fragments] 408 if None in coords: 409 warnings.warn("'None' exist in %s coordinates; ignored" % 410 (coord_name), BiopythonWarning) 411 return coords
412
413 - def _hit_start_get(self):
414 return min(self._get_coords('hit', 'start'))
415 416 hit_start = property(fget=_hit_start_get, 417 doc="""Smallest coordinate value of all hit fragments""") 418
419 - def _query_start_get(self):
420 return min(self._get_coords('query', 'start'))
421 422 query_start = property(fget=_query_start_get, 423 doc="""Smallest coordinate value of all query fragments""") 424
425 - def _hit_end_get(self):
426 return max(self._get_coords('hit', 'end'))
427 428 hit_end = property(fget=_hit_end_get, 429 doc="""Largest coordinate value of all hit fragments""") 430
431 - def _query_end_get(self):
432 return max(self._get_coords('query', 'end'))
433 434 query_end = property(fget=_query_end_get, 435 doc="""Largest coordinate value of all hit fragments""") 436 437 # coordinate-dependent properties #
438 - def _hit_span_get(self):
439 try: 440 return self.hit_end - self.hit_start 441 except TypeError: # triggered if any of the coordinates are None 442 return None
443 444 hit_span = property(fget=_hit_span_get, 445 doc="""The number of hit residues covered by the HSP.""") 446
447 - def _query_span_get(self):
448 try: 449 return self.query_end - self.query_start 450 except TypeError: # triggered if any of the coordinates are None 451 return None
452 453 query_span = property(fget=_query_span_get, 454 doc="""The number of query residues covered by the HSP.""") 455
456 - def _hit_range_get(self):
457 return (self.hit_start, self.hit_end)
458 459 hit_range = property(fget=_hit_range_get, 460 doc="""Tuple of HSP hit start and end coordinates.""") 461
462 - def _query_range_get(self):
463 return (self.query_start, self.query_end)
464 465 query_range = property(fget=_query_range_get, 466 doc="""Tuple of HSP query start and end coordinates.""") 467
468 - def _inter_ranges_get(self, seq_type):
469 # this property assumes that there are no mixed strands in a hit/query 470 assert seq_type in ('query', 'hit') 471 strand = getattr(self, '%s_strand_all' % seq_type)[0] 472 coords = getattr(self, '%s_range_all' % seq_type) 473 # determine function used to set inter range 474 # start and end coordinates, given two pairs 475 # of fragment start and end coordinates 476 if strand == -1: 477 startfunc, endfunc = min, max 478 else: 479 startfunc, endfunc = max, min 480 inter_coords = [] 481 for idx, coord in enumerate(coords[:-1]): 482 start = startfunc(coords[idx]) 483 end = endfunc(coords[idx + 1]) 484 inter_coords.append((min(start, end), max(start, end))) 485 486 return inter_coords
487
488 - def _hit_inter_ranges_get(self):
489 return self._inter_ranges_get('hit')
490 491 hit_inter_ranges = property(fget=_hit_inter_ranges_get, 492 doc="""Hit sequence coordinates of the regions between fragments""") 493
494 - def _query_inter_ranges_get(self):
495 return self._inter_ranges_get('query')
496 497 query_inter_ranges = property(fget=_query_inter_ranges_get, 498 doc="""Query sequence coordinates of the regions between fragments""") 499
500 - def _inter_spans_get(self, seq_type):
501 assert seq_type in ('query', 'hit') 502 attr_name = '%s_inter_ranges' % seq_type 503 return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
504
505 - def _hit_inter_spans_get(self):
506 return self._inter_spans_get('hit')
507 508 hit_inter_spans = property(fget=_hit_inter_spans_get, 509 doc="""Lengths of regions between hit fragments""") 510
511 - def _query_inter_spans_get(self):
512 return self._inter_spans_get('query')
513 514 query_inter_spans = property(fget=_query_inter_spans_get, 515 doc="""Lengths of regions between query fragments""") 516 517 # shortcuts for fragments' properties # 518 519 # bool check if there's more than one fragments 520 is_fragmented = property(lambda self: len(self) > 1, 521 doc="""Whether the HSP has more than one HSPFragment objects""") 522 523 # first item properties with setters 524 hit_description = fullcascade('hit_description', 525 doc="""Description of the hit sequence""") 526 527 query_description = fullcascade('query_description', 528 doc="""Description of the query sequence""") 529 530 hit_id = fullcascade('hit_id', 531 doc="""ID of the hit sequence""") 532 533 query_id = fullcascade('query_id', 534 doc="""ID of the query sequence""") 535 536 alphabet = fullcascade('alphabet', 537 doc="""Alphabet used in hit and query SeqRecord objects""") 538 539 # properties for single-fragment HSPs 540 fragment = singleitem( 541 doc="""HSPFragment object, first fragment""") 542 543 hit = singleitem('hit', 544 doc="""Hit sequence as a SeqRecord object, first fragment""") 545 546 query = singleitem('query', 547 doc="""Query sequence as a SeqRecord object, first fragment""") 548 549 aln = singleitem('aln', 550 doc="""Alignment of the first fragment as a MultipleSeqAlignment object""") 551 552 aln_annotation = singleitem('aln_annotation', 553 doc="""Dictionary of annotation(s) of the first fragment's alignment""") 554 555 hit_features = singleitem('hit_features', 556 doc="""Hit sequence features, first fragment""") 557 558 query_features = singleitem('query_features', 559 doc="""Query sequence features, first fragment""") 560 561 hit_strand = singleitem('hit_strand', 562 doc="""Hit strand orientation, first fragment""") 563 564 query_strand = singleitem('query_strand', 565 doc="""Query strand orientation, first fragment""") 566 567 hit_frame = singleitem('hit_frame', 568 doc="""Hit sequence reading frame, first fragment""") 569 570 query_frame = singleitem('query_frame', 571 doc="""Query sequence reading frame, first fragment""") 572 573 # properties for multi-fragment HSPs 574 fragments = allitems(doc="""List of all HSPFragment objects""") 575 576 hit_all = allitems('hit', 577 doc="""List of all fragments' hit sequences as SeqRecord objects""") 578 579 query_all = allitems('query', 580 doc="""List of all fragments' query sequences as SeqRecord objects""") 581 582 aln_all = allitems('aln', 583 doc="""List of all fragments' alignments as MultipleSeqAlignment objects""") 584 585 aln_annotation_all = allitems('aln_annotation', 586 doc="""Dictionary of annotation(s) of all fragments' alignments""") 587 588 hit_features_all = allitems('hit_features', 589 doc="""List of all hit sequence features""") 590 591 query_features_all = allitems('query_features', 592 doc="""List of all query sequence features""") 593 594 hit_strand_all = allitems('hit_strand', 595 doc="""List of all fragments' hit sequence strands""") 596 597 query_strand_all = allitems('query_strand', 598 doc="""List of all fragments' query sequence strands""") 599 600 hit_frame_all = allitems('hit_frame', 601 doc="""List of all fragments' hit sequence reading frames""") 602 603 query_frame_all = allitems('query_frame', 604 doc="""List of all fragments' query sequence reading frames""") 605 606 hit_start_all = allitems('hit_start', 607 doc="""List of all fragments' hit start coordinates""") 608 609 query_start_all = allitems('query_start', 610 doc="""List of all fragments' query start coordinates""") 611 612 hit_end_all = allitems('hit_end', 613 doc="""List of all fragments' hit end coordinates""") 614 615 query_end_all = allitems('query_end', 616 doc="""List of all fragments' query end coordinates""") 617 618 hit_span_all = allitems('hit_span', 619 doc="""List of all fragments' hit sequence size""") 620 621 query_span_all = allitems('query_span', 622 doc="""List of all fragments' query sequence size""") 623 624 hit_range_all = allitems('hit_range', 625 doc="""List of all fragments' hit start and end coordinates""") 626 627 query_range_all = allitems('query_range', 628 doc="""List of all fragments' query start and end coordinates""")
629 630
631 -class HSPFragment(_BaseHSP):
632 633 """Class representing a contiguous alignment of hit-query sequence. 634 635 HSPFragment forms the core of any parsed search output file. Depending on 636 the search output file format, it may contain the actual query and/or hit 637 sequences that produces the search hits. These sequences are stored as 638 SeqRecord objects (see SeqRecord): 639 640 >>> from Bio import SearchIO 641 >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 642 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 643 >>> print(fragment) 644 Query: 33211 mir_1 645 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 646 Query range: [0:61] (1) 647 Hit range: [0:61] (1) 648 Fragments: 1 (61 columns) 649 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 650 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 651 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 652 653 # the query sequence is a SeqRecord object 654 >>> fragment.query.__class__ 655 <class 'Bio.SeqRecord.SeqRecord'> 656 >>> print(fragment.query) 657 ID: 33211 658 Name: aligned query sequence 659 Description: mir_1 660 Number of features: 0 661 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 662 663 # the hit sequence is a SeqRecord object as well 664 >>> fragment.hit.__class__ 665 <class 'Bio.SeqRecord.SeqRecord'> 666 >>> print(fragment.hit) 667 ID: gi|262205317|ref|NR_030195.1| 668 Name: aligned hit sequence 669 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 670 Number of features: 0 671 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 672 673 # when both query and hit are present, we get a MultipleSeqAlignment object 674 >>> fragment.aln.__class__ 675 <class 'Bio.Align.MultipleSeqAlignment'> 676 >>> print(fragment.aln) 677 DNAAlphabet() alignment with 2 rows and 61 columns 678 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 679 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 680 681 """ 682
683 - def __init__(self, hit_id='<unknown id>', query_id='<unknown id>', 684 hit=None, query=None, alphabet=single_letter_alphabet):
685 686 self._alphabet = alphabet 687 self.aln_annotation = {} 688 689 self._hit_id = hit_id 690 self._query_id = query_id 691 692 for seq_type in ('query', 'hit'): 693 # query or hit attributes default attributes 694 setattr(self, '_%s_description' % seq_type, '<unknown description>') 695 setattr(self, '_%s_features' % seq_type, []) 696 # query or hit attributes whose default attribute is None 697 for attr in ('strand', 'frame', 'start', 'end'): 698 setattr(self, '%s_%s' % (seq_type, attr), None) 699 # self.query or self.hit 700 if eval(seq_type): 701 setattr(self, seq_type, eval(seq_type)) 702 else: 703 setattr(self, seq_type, None)
704
705 - def __repr__(self):
706 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 707 try: 708 info += ", %i columns" % len(self) 709 except AttributeError: 710 pass 711 return "%s(%s)" % (self.__class__.__name__, info)
712
713 - def __len__(self):
714 return self.aln_span
715
716 - def __str__(self):
717 return self._str_hsp_header() + '\n' + self._str_aln()
718
719 - def __getitem__(self, idx):
720 if self.aln is not None: 721 obj = self.__class__( 722 hit_id=self.hit_id, query_id=self.query_id, 723 alphabet=self.alphabet) 724 # transfer query and hit attributes 725 # let SeqRecord handle feature slicing, then retrieve the sliced 726 # features into the sliced HSPFragment 727 if self.query is not None: 728 obj.query = self.query[idx] 729 obj.query_features = obj.query.features 730 if self.hit is not None: 731 obj.hit = self.hit[idx] 732 obj.hit_features = obj.hit.features 733 # description, strand, frame 734 for attr in ('description', 'strand', 'frame'): 735 for seq_type in ('hit', 'query'): 736 attr_name = '%s_%s' % (seq_type, attr) 737 self_val = getattr(self, attr_name) 738 setattr(obj, attr_name, self_val) 739 # alignment annotation should be transferred, since we can compute 740 # the resulting annotation 741 obj.aln_annotation = {} 742 for key, value in self.aln_annotation.items(): 743 assert len(value[idx]) == len(obj) 744 obj.aln_annotation[key] = value[idx] 745 return obj 746 else: 747 raise TypeError("Slicing for HSP objects without " 748 "alignment is not supported.")
749
750 - def _str_aln(self):
751 lines = [] 752 # alignment length 753 aln_span = getattr_str(self, 'aln_span') 754 lines.append(' Fragments: 1 (%s columns)' % aln_span) 755 # sequences 756 if self.query is not None and self.hit is not None: 757 try: 758 qseq = str(self.query.seq) 759 except AttributeError: # query is None 760 qseq = '?' 761 try: 762 hseq = str(self.hit.seq) 763 except AttributeError: # hit is None 764 hseq = '?' 765 766 # similarity line 767 simil = '' 768 if 'similarity' in self.aln_annotation and \ 769 isinstance(self.aln_annotation.get('similarity'), basestring): 770 simil = self.aln_annotation['similarity'] 771 772 if self.aln_span <= 67: 773 lines.append("%10s - %s" % ('Query', qseq)) 774 if simil: 775 lines.append(" %s" % simil) 776 lines.append("%10s - %s" % ('Hit', hseq)) 777 else: 778 # adjust continuation character length, so we don't display 779 # the same residues twice 780 if self.aln_span - 66 > 3: 781 cont = '~' * 3 782 else: 783 cont = '~' * (self.aln_span - 66) 784 lines.append("%10s - %s%s%s" % ('Query', 785 qseq[:59], cont, qseq[-5:])) 786 if simil: 787 lines.append(" %s%s%s" % 788 (simil[:59], cont, simil[-5:])) 789 lines.append("%10s - %s%s%s" % ('Hit', 790 hseq[:59], cont, hseq[-5:])) 791 792 return '\n'.join(lines)
793 794 # sequence properties #
795 - def _set_seq(self, seq, seq_type):
796 """Checks the given sequence for attribute setting 797 798 :param seq: sequence to check 799 :type seq: string or SeqRecord 800 :param seq_type: sequence type 801 :type seq_type: string, choice of 'hit' or 'query' 802 803 """ 804 assert seq_type in ('hit', 'query') 805 if seq is None: 806 return seq # return immediately if seq is None 807 else: 808 if not isinstance(seq, (basestring, SeqRecord)): 809 raise TypeError("%s sequence must be a string or a SeqRecord" 810 " object." % seq_type) 811 # check length if the opposite sequence is not None 812 opp_type = 'hit' if seq_type == 'query' else 'query' 813 opp_seq = getattr(self, '_%s' % opp_type, None) 814 if opp_seq is not None: 815 if len(seq) != len(opp_seq): 816 raise ValueError("Sequence lengths do not match. Expected: " 817 "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type, 818 len(seq), seq_type)) 819 820 seq_id = getattr(self, '%s_id' % seq_type) 821 seq_desc = getattr(self, '%s_description' % seq_type) 822 seq_feats = getattr(self, '%s_features' % seq_type) 823 seq_name = 'aligned %s sequence' % seq_type 824 825 if isinstance(seq, SeqRecord): 826 seq.id = seq_id 827 seq.description = seq_desc 828 seq.name = seq_name 829 seq.features = seq_feats 830 seq.seq.alphabet = self.alphabet 831 elif isinstance(seq, basestring): 832 seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name, 833 description=seq_desc, features=seq_feats) 834 835 return seq
836
837 - def _hit_get(self):
838 return self._hit
839
840 - def _hit_set(self, value):
841 self._hit = self._set_seq(value, 'hit')
842 843 hit = property(fget=_hit_get, fset=_hit_set, 844 doc="""Hit sequence as a SeqRecord object, defaults to None""") 845
846 - def _query_get(self):
847 return self._query
848
849 - def _query_set(self, value):
850 self._query = self._set_seq(value, 'query')
851 852 query = property(fget=_query_get, fset=_query_set, 853 doc="""Query sequence as a SeqRecord object, defaults to None""") 854
855 - def _aln_get(self):
856 if self.query is None and self.hit is None: 857 return None 858 elif self.hit is None: 859 return MultipleSeqAlignment([self.query], self.alphabet) 860 elif self.query is None: 861 return MultipleSeqAlignment([self.hit], self.alphabet) 862 else: 863 return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
864 865 aln = property(fget=_aln_get, 866 doc="""Query-hit alignment as a MultipleSeqAlignment object, 867 defaults to None""") 868
869 - def _alphabet_get(self):
870 return self._alphabet
871
872 - def _alphabet_set(self, value):
873 self._alphabet = value 874 try: 875 self.query.seq.alphabet = value 876 except AttributeError: 877 pass 878 try: 879 self.hit.seq.alphabet = value 880 except AttributeError: 881 pass
882 883 alphabet = property(fget=_alphabet_get, fset=_alphabet_set, 884 doc="""Alphabet object used in the fragment's sequences and alignment, 885 defaults to single_letter_alphabet""") 886
887 - def _aln_span_get(self):
888 # length of alignment (gaps included) 889 # alignment span can be its own attribute, or computed from 890 # query / hit length 891 if not hasattr(self, '_aln_span'): 892 if self.query is not None: 893 self._aln_span = len(self.query) 894 elif self.hit is not None: 895 self._aln_span = len(self.hit) 896 897 return self._aln_span
898
899 - def _aln_span_set(self, value):
900 self._aln_span = value
901 902 aln_span = property(fget=_aln_span_get, fset=_aln_span_set, 903 doc="""The number of alignment columns covered by the fragment""") 904 905 # id, description, and features properties # 906 hit_description = fragcascade('description', 'hit', 907 doc="""Hit sequence description""") 908 909 query_description = fragcascade('description', 'query', 910 doc="""Query sequence description""") 911 912 hit_id = fragcascade('id', 'hit', 913 doc="""Hit sequence ID""") 914 915 query_id = fragcascade('id', 'query', 916 doc="""Query sequence ID""") 917 918 hit_features = fragcascade('features', 'hit', 919 doc="""Hit sequence features""") 920 921 query_features = fragcascade('features', 'query', 922 doc="""Query sequence features""") 923 924 # strand properties #
925 - def _prep_strand(self, strand):
926 # follow SeqFeature's convention 927 if strand not in (-1, 0, 1, None): 928 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % 929 strand) 930 return strand
931
932 - def _get_strand(self, seq_type):
933 assert seq_type in ('hit', 'query') 934 strand = getattr(self, '_%s_strand' % seq_type) 935 936 if strand is None: 937 # try to compute strand from frame 938 frame = getattr(self, '%s_frame' % seq_type) 939 if frame is not None: 940 try: 941 strand = frame // abs(frame) 942 except ZeroDivisionError: 943 strand = 0 944 setattr(self, '%s_strand' % seq_type, strand) 945 946 return strand
947
948 - def _hit_strand_get(self):
949 return self._get_strand('hit')
950
951 - def _hit_strand_set(self, value):
952 self._hit_strand = self._prep_strand(value)
953 954 hit_strand = property(fget=_hit_strand_get, fset=_hit_strand_set, 955 doc="""Hit sequence strand, defaults to None""") 956
957 - def _query_strand_get(self):
958 return self._get_strand('query')
959
960 - def _query_strand_set(self, value):
961 self._query_strand = self._prep_strand(value)
962 963 query_strand = property(fget=_query_strand_get, fset=_query_strand_set, 964 doc="""Query sequence strand, defaults to None""") 965 966 # frame properties #
967 - def _prep_frame(self, frame):
968 if frame not in (-3, -2, -1, 0, 1, 2, 3, None): 969 raise ValueError("Strand should be an integer between -3 and 3, " 970 "or None; not %r" % frame) 971 return frame
972
973 - def _hit_frame_get(self):
974 return self._hit_frame
975
976 - def _hit_frame_set(self, value):
977 self._hit_frame = self._prep_frame(value)
978 979 hit_frame = property(fget=_hit_frame_get, fset=_hit_frame_set, 980 doc="""Hit sequence reading frame, defaults to None""") 981
982 - def _query_frame_get(self):
983 return self._query_frame
984
985 - def _query_frame_set(self, value):
986 self._query_frame = self._prep_frame(value)
987 988 query_frame = property(fget=_query_frame_get, fset=_query_frame_set, 989 doc="""Query sequence reading frame, defaults to None""") 990 991 # coordinate properties #
992 - def _prep_coord(self, coord, opp_coord_name, op):
993 # coord must either be None or int 994 if coord is None: 995 return coord 996 assert isinstance(coord, int) 997 # try to get opposite coordinate, if it's not present, return 998 try: 999 opp_coord = getattr(self, opp_coord_name) 1000 except AttributeError: 1001 return coord 1002 # if opposite coordinate is None, return 1003 if opp_coord is None: 1004 return coord 1005 # otherwise compare it to coord ('>=' or '<=') 1006 else: 1007 assert op(coord, opp_coord) 1008 return coord
1009
1010 - def _hit_start_get(self):
1011 return self._hit_start
1012
1013 - def _hit_start_set(self, value):
1014 self._hit_start = self._prep_coord(value, 'hit_end', le)
1015 1016 hit_start = property(fget=_hit_start_get, fset=_hit_start_set, 1017 doc="""Hit sequence start coordinate, defaults to None""") 1018
1019 - def _query_start_get(self):
1020 return self._query_start
1021
1022 - def _query_start_set(self, value):
1023 self._query_start = self._prep_coord(value, 'query_end', le)
1024 1025 query_start = property(fget=_query_start_get, fset=_query_start_set, 1026 doc="""Query sequence start coordinate, defaults to None""") 1027
1028 - def _hit_end_get(self):
1029 return self._hit_end
1030
1031 - def _hit_end_set(self, value):
1032 self._hit_end = self._prep_coord(value, 'hit_start', ge)
1033 1034 hit_end = property(fget=_hit_end_get, fset=_hit_end_set, 1035 doc="""Hit sequence start coordinate, defaults to None""") 1036
1037 - def _query_end_get(self):
1038 return self._query_end
1039
1040 - def _query_end_set(self, value):
1041 self._query_end = self._prep_coord(value, 'query_start', ge)
1042 1043 query_end = property(fget=_query_end_get, fset=_query_end_set, 1044 doc="""Query sequence end coordinate, defaults to None""") 1045 1046 # coordinate-dependent properties #
1047 - def _hit_span_get(self):
1048 try: 1049 return self.hit_end - self.hit_start 1050 except TypeError: # triggered if any of the coordinates are None 1051 return None
1052 1053 hit_span = property(fget=_hit_span_get, 1054 doc="""The number of residues covered by the hit sequence""") 1055
1056 - def _query_span_get(self):
1057 try: 1058 return self.query_end - self.query_start 1059 except TypeError: # triggered if any of the coordinates are None 1060 return None
1061 1062 query_span = property(fget=_query_span_get, 1063 doc="""The number of residues covered by the query sequence""") 1064
1065 - def _hit_range_get(self):
1066 return (self.hit_start, self.hit_end)
1067 1068 hit_range = property(fget=_hit_range_get, 1069 doc="""Tuple of hit start and end coordinates""") 1070
1071 - def _query_range_get(self):
1072 return (self.query_start, self.query_end)
1073 1074 query_range = property(fget=_query_range_get, 1075 doc="""Tuple of query start and end coordinates""")
1076 1077 1078 # if not used as a module, run the doctest 1079 if __name__ == "__main__": 1080 from Bio._utils import run_doctest 1081 run_doctest() 1082