Package Bio :: Package SearchIO :: Package _model :: Module hsp
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO._model.hsp

   1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Bio.SearchIO objects to model high scoring regions between query and hit.""" 
   7   
   8  from __future__ import print_function 
   9  from Bio._py3k import basestring 
  10   
  11  import warnings 
  12  from operator import ge, le 
  13   
  14  from Bio import BiopythonWarning 
  15  from Bio.Align import MultipleSeqAlignment 
  16  from Bio.Alphabet import single_letter_alphabet 
  17  from Bio.Seq import Seq 
  18  from Bio.SeqRecord import SeqRecord 
  19   
  20  from Bio._utils import getattr_str, trim_str 
  21  from Bio.SearchIO._utils import singleitem, allitems, fullcascade, \ 
  22          fragcascade 
  23   
  24  from ._base import _BaseHSP 
  25   
  26   
27 -class HSP(_BaseHSP):
28 29 """Class representing high-scoring region(s) between query and hit. 30 31 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 32 In most cases, HSP objects store the bulk of the statistics and results 33 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 34 program. 35 36 Depending on the search output file format, a given HSP will contain one 37 or more HSPFragment object(s). Examples of search programs that produce HSP 38 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 39 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 40 However, their native terminologies may differ: in BLAT these fragments 41 are called 'blocks' while in Exonerate they are called exons or NER. 42 43 Here are examples from each type of HSP. The first one comes from a BLAST 44 search: 45 46 >>> from Bio import SearchIO 47 >>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 48 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 49 >>> blast_hsp 50 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 51 >>> print(blast_hsp) 52 Query: 33211 mir_1 53 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 54 Query range: [1:61] (1) 55 Hit range: [0:60] (1) 56 Quick stats: evalue 1.7e-22; bitscore 109.49 57 Fragments: 1 (60 columns) 58 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 59 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 60 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 61 62 For HSPs with a single HSPFragment, you can invoke `print` on it and see the 63 underlying sequence alignment, if it exists. This is not the case for HSPs 64 with more than one HSPFragment. Below is an example, using an HSP from a 65 BLAT search. Invoking `print` on these HSPs will instead show a table of the 66 HSPFragment objects it contains: 67 68 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 69 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 70 >>> blat_hsp 71 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 72 >>> print(blat_hsp) 73 Query: blat_1 <unknown description> 74 Hit: chr11 <unknown description> 75 Query range: [42:67] (-1) 76 Hit range: [59018929:59018955] (1) 77 Quick stats: evalue ?; bitscore ? 78 Fragments: --- -------------- ---------------------- ---------------------- 79 # Span Query range Hit range 80 --- -------------- ---------------------- ---------------------- 81 0 6 [61:67] [59018929:59018935] 82 1 16 [42:58] [59018939:59018955] 83 84 Notice that in HSPs with more than one HSPFragments, the HSP's `query_range` 85 `hit_range` properties encompasses all fragments it contains. 86 87 You can check whether an HSP has more than one HSPFragments or not using the 88 `is_fragmented` property: 89 90 >>> blast_hsp.is_fragmented 91 False 92 >>> blat_hsp.is_fragmented 93 True 94 95 Since HSP objects are also containers similar to Python lists, you can 96 access a single fragment in an HSP using its integer index: 97 98 >>> blat_fragment = blat_hsp[0] 99 >>> print(blat_fragment) 100 Query: blat_1 <unknown description> 101 Hit: chr11 <unknown description> 102 Query range: [61:67] (-1) 103 Hit range: [59018929:59018935] (1) 104 Fragments: 1 (6 columns) 105 Query - tatagt 106 Hit - tatagt 107 108 This applies to HSPs objects with a single fragment as well: 109 110 >>> blast_fragment = blast_hsp[0] 111 >>> print(blast_fragment) 112 Query: 33211 mir_1 113 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 114 Query range: [1:61] (1) 115 Hit range: [0:60] (1) 116 Fragments: 1 (60 columns) 117 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 118 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 119 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 120 121 Regardless of the search output file format, HSP objects provide the 122 properties listed below. These properties always return values in a list, 123 due to the HSP object itself being a list-like container. However, for 124 HSP objects with a single HSPFragment, shortcut properties that fetches 125 the item from the list are also provided. 126 127 +----------------------+---------------------+-----------------------------+ 128 | Property | Shortcut | Value | 129 +======================+=====================+=============================+ 130 | aln_all | aln | HSP alignments as | 131 | | | MultipleSeqAlignment object | 132 +----------------------+---------------------+-----------------------------+ 133 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 134 | | | of all fragments' alignments| 135 +----------------------+---------------------+-----------------------------+ 136 | fragments | fragment | HSPFragment objects | 137 +----------------------+---------------------+-----------------------------+ 138 | hit_all | hit | hit sequence as SeqRecord | 139 | | | objects | 140 +----------------------+---------------------+-----------------------------+ 141 | hit_features_all | hit_features | SeqFeatures of all hit | 142 | | | fragments | 143 +----------------------+---------------------+-----------------------------+ 144 | hit_start_all | hit_start* | start coordinates of the | 145 | | | hit fragments | 146 +----------------------+---------------------+-----------------------------+ 147 | hit_end_all | hit_end* | end coordinates of the hit | 148 | | | fragments | 149 +----------------------+---------------------+-----------------------------+ 150 | hit_span_all | hit_span* | sizes of each hit fragments | 151 +----------------------+---------------------+-----------------------------+ 152 | hit_strand_all | hit_strand | strand orientations of the | 153 | | | hit fragments | 154 +----------------------+---------------------+-----------------------------+ 155 | hit_frame_all | hit_frame | reading frames of the hit | 156 | | | fragments | 157 +----------------------+---------------------+-----------------------------+ 158 | hit_range_all | hit_range | tuples of start and end | 159 | | | coordinates of each hit | 160 | | | fragment | 161 +----------------------+---------------------+-----------------------------+ 162 | query_all | query | query sequence as SeqRecord | 163 | | | object | 164 +----------------------+---------------------+-----------------------------+ 165 | query_features_all | query_features | SeqFeatures of all query | 166 | | | fragments | 167 +----------------------+---------------------+-----------------------------+ 168 | query_start_all | query_start* | start coordinates of the | 169 | | | fragments | 170 +----------------------+---------------------+-----------------------------+ 171 | query_end_all | query_end* | end coordinates of the | 172 | | | query fragments | 173 +----------------------+---------------------+-----------------------------+ 174 | query_span_all | query_span* | sizes of each query | 175 | | | fragments | 176 +----------------------+---------------------+-----------------------------+ 177 | query_strand_all | query_strand | strand orientations of the | 178 | | | query fragments | 179 +----------------------+---------------------+-----------------------------+ 180 | query_frame_all | query_frame | reading frames of the query | 181 | | | fragments | 182 +----------------------+---------------------+-----------------------------+ 183 | query_range_all | query_range | tuples of start and end | 184 | | | coordinates of each query | 185 | | | fragment | 186 +----------------------+---------------------+-----------------------------+ 187 * may be used in HSPs with multiple fragments 188 189 For all types of HSP objects, the property will return the values in a list. 190 Shorcuts are only applicable for HSPs with one fragment. Except the ones 191 noted, if they are used on an HSP with more than one fragments, an exception 192 will be raised. 193 194 For properties that may be used in HSPs with multiple or single fragments 195 (`*_start`, `*_end`, and `*_span` properties), their interpretation depends 196 on how many fragment the HSP has: 197 198 +------------+---------------------------------------------------+ 199 | Property | Value | 200 +============+===================================================+ 201 | hit_start | smallest coordinate value of all hit fragments | 202 +------------+---------------------------------------------------+ 203 | hit_end | largest coordinate value of all hit fragments | 204 +------------+---------------------------------------------------+ 205 | hit_span | difference between `hit_start` and `hit_end` | 206 +------------+---------------------------------------------------+ 207 | query_start| smallest coordinate value of all query fragments | 208 +------------+---------------------------------------------------+ 209 | query_end | largest coordinate value of all query fragments | 210 +------------+---------------------------------------------------+ 211 | query_span | difference between `query_start` and `query_end` | 212 +------------+---------------------------------------------------+ 213 214 In addition to the objects listed above, HSP objects also provide the 215 following properties: 216 217 +--------------------+------------------------------------------------------+ 218 | Property | Value | 219 +====================+======================================================+ 220 | aln_span | total number of residues in all HSPFragment objects | 221 +--------------------+------------------------------------------------------+ 222 | alphabet | alphabet used in hit and query SeqRecord objects | 223 +--------------------+------------------------------------------------------+ 224 | is_fragmented | boolean, whether there are multiple fragments or not | 225 +--------------------+------------------------------------------------------+ 226 | hit_id | ID of the hit sequence | 227 +--------------------+------------------------------------------------------+ 228 | hit_description | description of the hit sequence | 229 +--------------------+------------------------------------------------------+ 230 | hit_inter_ranges | list of hit sequence coordinates of the regions | 231 | | between fragments | 232 +--------------------+------------------------------------------------------+ 233 | hit_inter_spans | list of lengths of the regions between hit fragments | 234 +--------------------+------------------------------------------------------+ 235 | query_id | ID of the query sequence | 236 +--------------------+------------------------------------------------------+ 237 | query_description | description of the query sequence | 238 +--------------------+------------------------------------------------------+ 239 | query_inter_ranges | list of query sequence coordinates of the regions | 240 | | between fragments | 241 +--------------------+------------------------------------------------------+ 242 | query_inter_spans | list of lengths of the regions between query | 243 | |fragments | 244 +--------------------+------------------------------------------------------+ 245 246 """ 247 # attributes we don't want to transfer when creating a new Hit class 248 # from this one 249 _NON_STICKY_ATTRS = ('_items', ) 250
251 - def __init__(self, fragments=[]):
252 """Initializes an HSP object. 253 254 Arguments: 255 fragments -- List of HSPFragment objects. 256 257 HSP objects must be initialized with a list containing at least one 258 HSPFragment object. If multiple HSPFragment objects are used for 259 initialization, they must all have the same `query_id`, 260 `query_description`, `hit_id`, `hit_description`, and alphabet 261 properties. 262 263 """ 264 if not fragments: 265 raise ValueError("HSP objects must have at least one HSPFragment " 266 "object.") 267 # check that all fragments contain the same IDs, descriptions, alphabet 268 for attr in ('query_id', 'query_description', 'hit_id', 269 'hit_description', 'alphabet'): 270 if len(set(getattr(frag, attr) for frag in fragments)) != 1: 271 raise ValueError("HSP object can not contain fragments with " 272 "more than one %s." % attr) 273 274 self._items = [] 275 for fragment in fragments: 276 self._validate_fragment(fragment) 277 self._items.append(fragment)
278
279 - def __repr__(self):
280 return "%s(hit_id=%r, query_id=%r, %r fragments)" % \ 281 (self.__class__.__name__, self.hit_id, self.query_id, len(self))
282
283 - def __iter__(self):
284 return iter(self._items)
285
286 - def __contains__(self, fragment):
287 return fragment in self._items
288
289 - def __len__(self):
290 return len(self._items)
291 292 #Python 3:
293 - def __bool__(self):
294 return bool(self._items)
295 296 #Python 2: 297 __nonzero__= __bool__ 298
299 - def __str__(self):
300 301 lines = [] 302 # set hsp info line 303 statline = [] 304 # evalue 305 evalue = getattr_str(self, 'evalue', fmt='%.2g') 306 statline.append('evalue ' + evalue) 307 # bitscore 308 bitscore = getattr_str(self, 'bitscore', fmt='%.2f') 309 statline.append('bitscore ' + bitscore) 310 lines.append('Quick stats: ' + '; '.join(statline)) 311 312 if len(self.fragments) == 1: 313 return '\n'.join([self._str_hsp_header(), '\n'.join(lines), 314 self.fragments[0]._str_aln()]) 315 else: 316 lines.append(' Fragments: %s %s %s %s' % 317 ('-'*3, '-'*14, '-'*22, '-'*22)) 318 pattern = '%16s %14s %22s %22s' 319 lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range')) 320 lines.append(pattern % ('-'*3, '-'*14, '-'*22, '-'*22)) 321 for idx, block in enumerate(self.fragments): 322 # set hsp line and table 323 # alignment span 324 aln_span = getattr_str(block, 'aln_span') 325 # query region 326 query_start = getattr_str(block, 'query_start') 327 query_end = getattr_str(block, 'query_end') 328 query_range = '[%s:%s]' % (query_start, query_end) 329 # max column length is 20 330 query_range = trim_str(query_range, 22, '~]') 331 # hit region 332 hit_start = getattr_str(block, 'hit_start') 333 hit_end = getattr_str(block, 'hit_end') 334 hit_range = '[%s:%s]' % (hit_start, hit_end) 335 hit_range = trim_str(hit_range, 22, '~]') 336 # append the hsp row 337 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 338 339 return self._str_hsp_header() + '\n' + '\n'.join(lines)
340
341 - def __getitem__(self, idx):
342 # if key is slice, return a new HSP instance 343 if isinstance(idx, slice): 344 obj = self.__class__(self._items[idx]) 345 self._transfer_attrs(obj) 346 return obj 347 return self._items[idx]
348
349 - def __setitem__(self, idx, fragments):
350 # handle case if hsps is a list of hsp 351 if isinstance(fragments, (list, tuple)): 352 for fragment in fragments: 353 self._validate_fragment(fragment) 354 else: 355 self._validate_fragment(fragments) 356 357 self._items[idx] = fragments
358
359 - def __delitem__(self, idx):
360 # note that this may result in an empty HSP object, which should be 361 # invalid 362 del self._items[idx]
363
364 - def _validate_fragment(self, fragment):
365 if not isinstance(fragment, HSPFragment): 366 raise TypeError("HSP objects can only contain HSPFragment " 367 "objects.") 368 # HACK: to make validation during __init__ work 369 if self._items: 370 if fragment.hit_id != self.hit_id: 371 raise ValueError("Expected HSPFragment with hit ID %r, " 372 "found %r instead." % (self.id, fragment.hit_id)) 373 374 if fragment.hit_description != self.hit_description: 375 raise ValueError("Expected HSPFragment with hit " 376 "description %r, found %r instead." % \ 377 (self.description, fragment.hit_description)) 378 379 if fragment.query_id != self.query_id: 380 raise ValueError("Expected HSPFragment with query ID %r, " 381 "found %r instead." % (self.query_id, 382 fragment.query_id)) 383 384 if fragment.query_description != self.query_description: 385 raise ValueError("Expected HSP with query description %r, " 386 "found %r instead." % (self.query_description, 387 fragment.query_description))
388 389
390 - def _aln_span_get(self):
391 # length of all alignments 392 # alignment span can be its own attribute, or computed from 393 # query / hit length 394 return sum(frg.aln_span for frg in self.fragments)
395 396 aln_span = property(fget=_aln_span_get, 397 doc="""Total number of columns in all HSPFragment objects.""") 398 399 ## coordinate properties ##
400 - def _get_coords(self, seq_type, coord_type):
401 assert seq_type in ('hit', 'query') 402 assert coord_type in ('start', 'end') 403 coord_name = '%s_%s' % (seq_type, coord_type) 404 coords = [getattr(frag, coord_name) for frag in self.fragments] 405 if None in coords: 406 warnings.warn("'None' exist in %s coordinates; ignored" % 407 (coord_name), BiopythonWarning) 408 return coords
409
410 - def _hit_start_get(self):
411 return min(self._get_coords('hit', 'start'))
412 413 hit_start = property(fget=_hit_start_get, 414 doc="""Smallest coordinate value of all hit fragments""") 415
416 - def _query_start_get(self):
417 return min(self._get_coords('query', 'start'))
418 419 query_start = property(fget=_query_start_get, 420 doc="""Smallest coordinate value of all query fragments""") 421
422 - def _hit_end_get(self):
423 return max(self._get_coords('hit', 'end'))
424 425 hit_end = property(fget=_hit_end_get, 426 doc="""Largest coordinate value of all hit fragments""") 427
428 - def _query_end_get(self):
429 return max(self._get_coords('query', 'end'))
430 431 query_end = property(fget=_query_end_get, 432 doc="""Largest coordinate value of all hit fragments""") 433 434 ## coordinate-dependent properties ##
435 - def _hit_span_get(self):
436 try: 437 return self.hit_end - self.hit_start 438 except TypeError: # triggered if any of the coordinates are None 439 return None
440 441 hit_span = property(fget=_hit_span_get, 442 doc="""The number of hit residues covered by the HSP.""") 443
444 - def _query_span_get(self):
445 try: 446 return self.query_end - self.query_start 447 except TypeError: # triggered if any of the coordinates are None 448 return None
449 450 query_span = property(fget=_query_span_get, 451 doc="""The number of query residues covered by the HSP.""") 452
453 - def _hit_range_get(self):
454 return (self.hit_start, self.hit_end)
455 456 hit_range = property(fget=_hit_range_get, 457 doc="""Tuple of HSP hit start and end coordinates.""") 458
459 - def _query_range_get(self):
460 return (self.query_start, self.query_end)
461 462 query_range = property(fget=_query_range_get, 463 doc="""Tuple of HSP query start and end coordinates.""") 464
465 - def _inter_ranges_get(self, seq_type):
466 # this property assumes that there are no mixed strands in a hit/query 467 assert seq_type in ('query', 'hit') 468 strand = getattr(self, '%s_strand_all' % seq_type)[0] 469 coords = getattr(self, '%s_range_all' % seq_type) 470 # determine function used to set inter range 471 # start and end coordinates, given two pairs 472 # of fragment start and end coordinates 473 if strand == -1: 474 startfunc, endfunc = min, max 475 else: 476 startfunc, endfunc = max, min 477 inter_coords = [] 478 for idx, coord in enumerate(coords[:-1]): 479 start = startfunc(coords[idx]) 480 end = endfunc(coords[idx+1]) 481 inter_coords.append((min(start, end), max(start, end))) 482 483 return inter_coords
484
485 - def _hit_inter_ranges_get(self):
486 return self._inter_ranges_get('hit')
487 488 hit_inter_ranges = property(fget=_hit_inter_ranges_get, 489 doc="""Hit sequence coordinates of the regions between fragments""") 490
491 - def _query_inter_ranges_get(self):
492 return self._inter_ranges_get('query')
493 494 query_inter_ranges = property(fget=_query_inter_ranges_get, 495 doc="""Query sequence coordinates of the regions between fragments""") 496
497 - def _inter_spans_get(self, seq_type):
498 assert seq_type in ('query', 'hit') 499 attr_name = '%s_inter_ranges' % seq_type 500 return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
501
502 - def _hit_inter_spans_get(self):
503 return self._inter_spans_get('hit')
504 505 hit_inter_spans = property(fget=_hit_inter_spans_get, 506 doc="""Lengths of regions between hit fragments""") 507
508 - def _query_inter_spans_get(self):
509 return self._inter_spans_get('query')
510 511 query_inter_spans = property(fget=_query_inter_spans_get, 512 doc="""Lengths of regions between query fragments""") 513 514 ## shortcuts for fragments' properties ## 515 516 # bool check if there's more than one fragments 517 is_fragmented = property(lambda self: len(self) > 1, 518 doc="""Whether the HSP has more than one HSPFragment objects""") 519 520 # first item properties with setters 521 hit_description = fullcascade('hit_description', 522 doc="""Description of the hit sequence""") 523 524 query_description = fullcascade('query_description', 525 doc="""Description of the query sequence""") 526 527 hit_id = fullcascade('hit_id', 528 doc="""ID of the hit sequence""") 529 530 query_id = fullcascade('query_id', 531 doc="""ID of the query sequence""") 532 533 alphabet = fullcascade('alphabet', 534 doc="""Alphabet used in hit and query SeqRecord objects""") 535 536 # properties for single-fragment HSPs 537 fragment = singleitem( 538 doc="""HSPFragment object, first fragment""") 539 540 hit = singleitem('hit', 541 doc="""Hit sequence as a SeqRecord object, first fragment""") 542 543 query = singleitem('query', 544 doc="""Query sequence as a SeqRecord object, first fragment""") 545 546 aln = singleitem('aln', 547 doc="""Alignment of the first fragment as a MultipleSeqAlignment object""") 548 549 aln_annotation = singleitem('aln_annotation', 550 doc="""Dictionary of annotation(s) of the first fragment's alignment""") 551 552 hit_features = singleitem('hit_features', 553 doc="""Hit sequence features, first fragment""") 554 555 query_features = singleitem('query_features', 556 doc="""Query sequence features, first fragment""") 557 558 hit_strand = singleitem('hit_strand', 559 doc="""Hit strand orientation, first fragment""") 560 561 query_strand = singleitem('query_strand', 562 doc="""Query strand orientation, first fragment""") 563 564 hit_frame = singleitem('hit_frame', 565 doc="""Hit sequence reading frame, first fragment""") 566 567 query_frame = singleitem('query_frame', 568 doc="""Query sequence reading frame, first fragment""") 569 570 # properties for multi-fragment HSPs 571 fragments = allitems(doc="""List of all HSPFragment objects""") 572 573 hit_all = allitems('hit', 574 doc="""List of all fragments' hit sequences as SeqRecord objects""") 575 576 query_all = allitems('query', 577 doc="""List of all fragments' query sequences as SeqRecord objects""") 578 579 aln_all = allitems('aln', 580 doc="""List of all fragments' alignments as MultipleSeqAlignment objects""") 581 582 aln_annotation_all = allitems('aln_annotation', 583 doc="""Dictionary of annotation(s) of all fragments' alignments""") 584 585 hit_features_all = allitems('hit_features', 586 doc="""List of all hit sequence features""") 587 588 query_features_all = allitems('query_features', 589 doc="""List of all query sequence features""") 590 591 hit_strand_all = allitems('hit_strand', 592 doc="""List of all fragments' hit sequence strands""") 593 594 query_strand_all = allitems('query_strand', 595 doc="""List of all fragments' query sequence strands""") 596 597 hit_frame_all = allitems('hit_frame', 598 doc="""List of all fragments' hit sequence reading frames""") 599 600 query_frame_all = allitems('query_frame', 601 doc="""List of all fragments' query sequence reading frames""") 602 603 hit_start_all = allitems('hit_start', 604 doc="""List of all fragments' hit start coordinates""") 605 606 query_start_all = allitems('query_starts', 607 doc="""List of all fragments' query start coordinates""") 608 609 hit_end_all = allitems('hit_ends', 610 doc="""List of all fragments' hit end coordinates""") 611 612 query_end_all = allitems('query_ends', 613 doc="""List of all fragments' query end coordinates""") 614 615 hit_span_all = allitems('hit_span', 616 doc="""List of all fragments' hit sequence size""") 617 618 query_span_all = allitems('query_span', 619 doc="""List of all fragments' query sequence size""") 620 621 hit_range_all = allitems('hit_range', 622 doc="""List of all fragments' hit start and end coordinates""") 623 624 query_range_all = allitems('query_range', 625 doc="""List of all fragments' query start and end coordinates""")
626 627
628 -class HSPFragment(_BaseHSP):
629 630 """Class representing a contiguous alignment of hit-query sequence. 631 632 HSPFragment forms the core of any parsed search output file. Depending on 633 the search output file format, it may contain the actual query and/or hit 634 sequences that produces the search hits. These sequences are stored as 635 SeqRecord objects (see SeqRecord): 636 637 >>> from Bio import SearchIO 638 >>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) 639 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 640 >>> print(fragment) 641 Query: 33211 mir_1 642 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 643 Query range: [0:61] (1) 644 Hit range: [0:61] (1) 645 Fragments: 1 (61 columns) 646 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 647 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 648 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 649 650 # the query sequence is a SeqRecord object 651 >>> fragment.query.__class__ 652 <class 'Bio.SeqRecord.SeqRecord'> 653 >>> print(fragment.query) 654 ID: 33211 655 Name: aligned query sequence 656 Description: mir_1 657 Number of features: 0 658 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 659 660 # the hit sequence is a SeqRecord object as well 661 >>> fragment.hit.__class__ 662 <class 'Bio.SeqRecord.SeqRecord'> 663 >>> print(fragment.hit) 664 ID: gi|262205317|ref|NR_030195.1| 665 Name: aligned hit sequence 666 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 667 Number of features: 0 668 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 669 670 # when both query and hit are present, we get a MultipleSeqAlignment object 671 >>> fragment.aln.__class__ 672 <class 'Bio.Align.MultipleSeqAlignment'> 673 >>> print(fragment.aln) 674 DNAAlphabet() alignment with 2 rows and 61 columns 675 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 676 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 677 678 """ 679
680 - def __init__(self, hit_id='<unknown id>', query_id='<unknown id>', 681 hit=None, query=None, alphabet=single_letter_alphabet):
682 683 self._alphabet = alphabet 684 self.aln_annotation = {} 685 686 self._hit_id = hit_id 687 self._query_id = query_id 688 689 for seq_type in ('query', 'hit'): 690 # query or hit attributes default attributes 691 setattr(self, '_%s_description' % seq_type, '<unknown description>') 692 setattr(self, '_%s_features' % seq_type, []) 693 # query or hit attributes whose default attribute is None 694 for attr in ('strand', 'frame', 'start', 'end'): 695 setattr(self, '%s_%s' % (seq_type, attr), None) 696 # self.query or self.hit 697 if eval(seq_type): 698 setattr(self, seq_type, eval(seq_type)) 699 else: 700 setattr(self, seq_type, None)
701
702 - def __repr__(self):
703 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 704 try: 705 info += ", %i columns" % len(self) 706 except AttributeError: 707 pass 708 return "%s(%s)" % (self.__class__.__name__, info)
709
710 - def __len__(self):
711 return self.aln_span
712
713 - def __str__(self):
714 return self._str_hsp_header() + '\n' + self._str_aln()
715
716 - def __getitem__(self, idx):
717 if self.aln is not None: 718 obj = self.__class__( 719 hit_id=self.hit_id, query_id=self.query_id, 720 alphabet=self.alphabet) 721 # transfer query and hit attributes 722 # let SeqRecord handle feature slicing, then retrieve the sliced 723 # features into the sliced HSPFragment 724 if self.query is not None: 725 obj.query = self.query[idx] 726 obj.query_features = obj.query.features 727 if self.hit is not None: 728 obj.hit = self.hit[idx] 729 obj.hit_features = obj.hit.features 730 # description, strand, frame 731 for attr in ('description', 'strand', 'frame'): 732 for seq_type in ('hit', 'query'): 733 attr_name = '%s_%s' % (seq_type, attr) 734 self_val = getattr(self, attr_name) 735 setattr(obj, attr_name, self_val) 736 # alignment annotation should be transferred, since we can compute 737 # the resulting annotation 738 obj.aln_annotation = {} 739 for key, value in self.aln_annotation.items(): 740 assert len(value[idx]) == len(obj) 741 obj.aln_annotation[key] = value[idx] 742 return obj 743 else: 744 raise TypeError("Slicing for HSP objects without " 745 "alignment is not supported.")
746
747 - def _str_aln(self):
748 lines = [] 749 # alignment length 750 aln_span = getattr_str(self, 'aln_span') 751 lines.append(' Fragments: 1 (%s columns)' % aln_span) 752 # sequences 753 if self.query is not None and self.hit is not None: 754 try: 755 qseq = str(self.query.seq) 756 except AttributeError: # query is None 757 qseq = '?' 758 try: 759 hseq = str(self.hit.seq) 760 except AttributeError: # hit is None 761 hseq = '?' 762 763 # homology line 764 homol = '' 765 if 'homology' in self.aln_annotation: 766 homol = self.aln_annotation['homology'] 767 768 if self.aln_span <= 67: 769 lines.append("%10s - %s" % ('Query', qseq)) 770 if homol: 771 lines.append(" %s" % homol) 772 lines.append("%10s - %s" % ('Hit', hseq)) 773 else: 774 # adjust continuation character length, so we don't display 775 # the same residues twice 776 if self.aln_span - 66 > 3: 777 cont = '~' * 3 778 else: 779 cont = '~' * (self.aln_span - 66) 780 lines.append("%10s - %s%s%s" % ('Query', 781 qseq[:59], cont, qseq[-5:])) 782 if homol: 783 lines.append(" %s%s%s" % 784 (homol[:59], cont, homol[-5:])) 785 lines.append("%10s - %s%s%s" % ('Hit', 786 hseq[:59], cont, hseq[-5:])) 787 788 return '\n'.join(lines)
789 790 ## sequence properties ##
791 - def _set_seq(self, seq, seq_type):
792 """Checks the given sequence for attribute setting 793 794 Arguments: 795 seq -- String or SeqRecord to check 796 seq_type -- String of sequence type, must be 'hit' or 'query' 797 798 """ 799 assert seq_type in ('hit', 'query') 800 if seq is None: 801 return seq # return immediately if seq is None 802 else: 803 if not isinstance(seq, (basestring, SeqRecord)): 804 raise TypeError("%s sequence must be a string or a SeqRecord" 805 " object." % seq_type) 806 # check length if the opposite sequence is not None 807 opp_type = 'hit' if seq_type == 'query' else 'query' 808 opp_seq = getattr(self, '_%s' % opp_type, None) 809 if opp_seq is not None: 810 if len(seq) != len(opp_seq): 811 raise ValueError("Sequence lengths do not match. Expected: " 812 "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type, 813 len(seq), seq_type)) 814 815 seq_id = getattr(self, '%s_id' % seq_type) 816 seq_desc = getattr(self, '%s_description' % seq_type) 817 seq_feats = getattr(self, '%s_features' % seq_type) 818 seq_name = 'aligned %s sequence' % seq_type 819 820 if isinstance(seq, SeqRecord): 821 seq.id = seq_id 822 seq.description = seq_desc 823 seq.name = seq_name 824 seq.features = seq_feats 825 seq.seq.alphabet = self.alphabet 826 elif isinstance(seq, basestring): 827 seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name, 828 description=seq_desc, features=seq_feats) 829 830 return seq
831
832 - def _hit_get(self):
833 return self._hit
834
835 - def _hit_set(self, value):
836 self._hit = self._set_seq(value, 'hit')
837 838 hit = property(fget=_hit_get, fset=_hit_set, 839 doc="""Hit sequence as a SeqRecord object, defaults to None""") 840
841 - def _query_get(self):
842 return self._query
843
844 - def _query_set(self, value):
845 self._query = self._set_seq(value, 'query')
846 847 query = property(fget=_query_get, fset=_query_set, 848 doc="""Query sequence as a SeqRecord object, defaults to None""") 849
850 - def _aln_get(self):
851 if self.query is None and self.hit is None: 852 return None 853 elif self.hit is None: 854 return MultipleSeqAlignment([self.query], self.alphabet) 855 elif self.query is None: 856 return MultipleSeqAlignment([self.hit], self.alphabet) 857 else: 858 return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
859 860 aln = property(fget=_aln_get, 861 doc="""Query-hit alignment as a MultipleSeqAlignment object, 862 defaults to None""") 863
864 - def _alphabet_get(self):
865 return self._alphabet
866
867 - def _alphabet_set(self, value):
868 self._alphabet = value 869 try: 870 self.query.seq.alphabet = value 871 except AttributeError: 872 pass 873 try: 874 self.hit.seq.alphabet = value 875 except AttributeError: 876 pass
877 878 alphabet = property(fget=_alphabet_get, fset=_alphabet_set, 879 doc="""Alphabet object used in the fragment's sequences and alignment, 880 defaults to single_letter_alphabet""") 881
882 - def _aln_span_get(self):
883 # length of alignment (gaps included) 884 # alignment span can be its own attribute, or computed from 885 # query / hit length 886 if not hasattr(self, '_aln_span'): 887 if self.query is not None: 888 self._aln_span = len(self.query) 889 elif self.hit is not None: 890 self._aln_span = len(self.hit) 891 892 return self._aln_span
893
894 - def _aln_span_set(self, value):
895 self._aln_span = value
896 897 aln_span = property(fget=_aln_span_get, fset=_aln_span_set, 898 doc="""The number of alignment columns covered by the fragment""") 899 900 ## id, description, and features properties ## 901 hit_description = fragcascade('description', 'hit', 902 doc="""Hit sequence description""") 903 904 query_description = fragcascade('description', 'query', 905 doc="""Query sequence description""") 906 907 hit_id = fragcascade('id', 'hit', 908 doc="""Hit sequence ID""") 909 910 query_id = fragcascade('id', 'query', 911 doc="""Query sequence ID""") 912 913 hit_features = fragcascade('features', 'hit', 914 doc="""Hit sequence features""") 915 916 query_features = fragcascade('features', 'query', 917 doc="""Query sequence features""") 918 919 ## strand properties ##
920 - def _prep_strand(self, strand):
921 # follow SeqFeature's convention 922 if not strand in (-1, 0, 1, None): 923 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % 924 strand) 925 return strand
926
927 - def _get_strand(self, seq_type):
928 assert seq_type in ('hit', 'query') 929 strand = getattr(self, '_%s_strand' % seq_type) 930 931 if strand is None: 932 # try to compute strand from frame 933 frame = getattr(self, '%s_frame' % seq_type) 934 if frame is not None: 935 try: 936 strand = frame // abs(frame) 937 except ZeroDivisionError: 938 strand = 0 939 setattr(self, '%s_strand' % seq_type, strand) 940 941 return strand
942
943 - def _hit_strand_get(self):
944 return self._get_strand('hit')
945
946 - def _hit_strand_set(self, value):
947 self._hit_strand = self._prep_strand(value)
948 949 hit_strand = property(fget=_hit_strand_get, fset=_hit_strand_set, 950 doc="""Hit sequence strand, defaults to None""") 951
952 - def _query_strand_get(self):
953 return self._get_strand('query')
954
955 - def _query_strand_set(self, value):
956 self._query_strand = self._prep_strand(value)
957 958 query_strand = property(fget=_query_strand_get, fset=_query_strand_set, 959 doc="""Query sequence strand, defaults to None""") 960 961 ## frame properties ##
962 - def _prep_frame(self, frame):
963 if not frame in (-3, -2, -1, 0, 1, 2, 3, None): 964 raise ValueError("Strand should be an integer between -3 and 3, " 965 "or None; not %r" % frame) 966 return frame
967
968 - def _hit_frame_get(self):
969 return self._hit_frame
970
971 - def _hit_frame_set(self, value):
972 self._hit_frame = self._prep_frame(value)
973 974 hit_frame = property(fget=_hit_frame_get, fset=_hit_frame_set, 975 doc="""Hit sequence reading frame, defaults to None""") 976
977 - def _query_frame_get(self):
978 return self._query_frame
979
980 - def _query_frame_set(self, value):
981 self._query_frame = self._prep_frame(value)
982 983 query_frame = property(fget=_query_frame_get, fset=_query_frame_set, 984 doc="""Query sequence reading frame, defaults to None""") 985 986 ## coordinate properties ##
987 - def _prep_coord(self, coord, opp_coord_name, op):
988 # coord must either be None or int 989 if coord is None: 990 return coord 991 assert isinstance(coord, int) 992 # try to get opposite coordinate, if it's not present, return 993 try: 994 opp_coord = getattr(self, opp_coord_name) 995 except AttributeError: 996 return coord 997 # if opposite coordinate is None, return 998 if opp_coord is None: 999 return coord 1000 # otherwise compare it to coord ('>=' or '<=') 1001 else: 1002 assert op(coord, opp_coord) 1003 return coord
1004
1005 - def _hit_start_get(self):
1006 return self._hit_start
1007
1008 - def _hit_start_set(self, value):
1009 self._hit_start = self._prep_coord(value, 'hit_end', le)
1010 1011 hit_start = property(fget=_hit_start_get, fset=_hit_start_set, 1012 doc="""Hit sequence start coordinate, defaults to None""") 1013
1014 - def _query_start_get(self):
1015 return self._query_start
1016
1017 - def _query_start_set(self, value):
1018 self._query_start = self._prep_coord(value, 'query_end', le)
1019 1020 query_start = property(fget=_query_start_get, fset=_query_start_set, 1021 doc="""Query sequence start coordinate, defaults to None""") 1022
1023 - def _hit_end_get(self):
1024 return self._hit_end
1025
1026 - def _hit_end_set(self, value):
1027 self._hit_end = self._prep_coord(value, 'hit_start', ge)
1028 1029 hit_end = property(fget=_hit_end_get, fset=_hit_end_set, 1030 doc="""Hit sequence start coordinate, defaults to None""") 1031
1032 - def _query_end_get(self):
1033 return self._query_end
1034
1035 - def _query_end_set(self, value):
1036 self._query_end = self._prep_coord(value, 'query_start', ge)
1037 1038 query_end = property(fget=_query_end_get, fset=_query_end_set, 1039 doc="""Query sequence end coordinate, defaults to None""") 1040 1041 ## coordinate-dependent properties ##
1042 - def _hit_span_get(self):
1043 try: 1044 return self.hit_end - self.hit_start 1045 except TypeError: # triggered if any of the coordinates are None 1046 return None
1047 1048 hit_span = property(fget=_hit_span_get, 1049 doc="""The number of residues covered by the hit sequence""") 1050
1051 - def _query_span_get(self):
1052 try: 1053 return self.query_end - self.query_start 1054 except TypeError: # triggered if any of the coordinates are None 1055 return None
1056 1057 query_span = property(fget=_query_span_get, 1058 doc="""The number of residues covered by the query sequence""") 1059
1060 - def _hit_range_get(self):
1061 return (self.hit_start, self.hit_end)
1062 1063 hit_range = property(fget=_hit_range_get, 1064 doc="""Tuple of hit start and end coordinates""") 1065
1066 - def _query_range_get(self):
1067 return (self.query_start, self.query_end)
1068 1069 query_range = property(fget=_query_range_get, 1070 doc="""Tuple of query start and end coordinates""")
1071 1072 1073 # if not used as a module, run the doctest 1074 if __name__ == "__main__": 1075 from Bio._utils import run_doctest 1076 run_doctest() 1077