Package Bio :: Package SearchIO :: Package _model :: Module hsp
[hide private]
[frames] | no frames]

Source Code for Module Bio.SearchIO._model.hsp

   1  # Copyright 2012 by Wibowo Arindrarto.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """Bio.SearchIO objects to model high scoring regions between query and hit.""" 
   7   
   8  import warnings 
   9  from operator import ge, le 
  10   
  11  from Bio import BiopythonWarning 
  12  from Bio.Align import MultipleSeqAlignment 
  13  from Bio.Alphabet import single_letter_alphabet 
  14  from Bio.Seq import Seq 
  15  from Bio.SeqRecord import SeqRecord 
  16   
  17  from Bio._utils import getattr_str, trim_str 
  18  from Bio.SearchIO._utils import singleitem, allitems, fullcascade, \ 
  19          fragcascade 
  20   
  21  from _base import _BaseHSP 
  22   
  23   
24 -class HSP(_BaseHSP):
25 26 """Class representing high-scoring region(s) between query and hit. 27 28 HSP (high-scoring pair) objects are contained by Hit objects (see Hit). 29 In most cases, HSP objects store the bulk of the statistics and results 30 (e.g. e-value, bitscores, query sequence, etc.) produced by a search 31 program. 32 33 Depending on the search output file format, a given HSP will contain one 34 or more HSPFragment object(s). Examples of search programs that produce HSP 35 with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as 36 BLAT or Exonerate may produce HSPs containing more than one HSPFragment. 37 However, their native terminologies may differ: in BLAT these fragments 38 are called 'blocks' while in Exonerate they are called exons or NER. 39 40 Here are examples from each type of HSP. The first one comes from a BLAST 41 search: 42 43 >>> from Bio import SearchIO 44 >>> blast_qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next() 45 >>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit 46 >>> blast_hsp 47 HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) 48 >>> print blast_hsp 49 Query: 33211 mir_1 50 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 51 Query range: [1:61] (1) 52 Hit range: [0:60] (1) 53 Quick stats: evalue 1.7e-22; bitscore 109.49 54 Fragments: 1 (60 columns) 55 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 56 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 57 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 58 59 For HSPs with a single HSPFragment, you can invoke `print` on it and see the 60 underlying sequence alignment, if it exists. This is not the case for HSPs 61 with more than one HSPFragment. Below is an example, using an HSP from a 62 BLAT search. Invoking `print` on these HSPs will instead show a table of the 63 HSPFragment objects it contains: 64 65 >>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) 66 >>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit 67 >>> blat_hsp 68 HSP(hit_id='chr11', query_id='blat_1', 2 fragments) 69 >>> print blat_hsp 70 Query: blat_1 <unknown description> 71 Hit: chr11 <unknown description> 72 Query range: [42:67] (-1) 73 Hit range: [59018929:59018955] (1) 74 Quick stats: evalue ?; bitscore ? 75 Fragments: --- -------------- ---------------------- ---------------------- 76 # Span Query range Hit range 77 --- -------------- ---------------------- ---------------------- 78 0 6 [61:67] [59018929:59018935] 79 1 16 [42:58] [59018939:59018955] 80 81 Notice that in HSPs with more than one HSPFragments, the HSP's `query_range` 82 `hit_range` properties encompasses all fragments it contains. 83 84 You can check whether an HSP has more than one HSPFragments or not using the 85 `is_fragmented` property: 86 87 >>> blast_hsp.is_fragmented 88 False 89 >>> blat_hsp.is_fragmented 90 True 91 92 Since HSP objects are also containers similar to Python lists, you can 93 access a single fragment in an HSP using its integer index: 94 95 >>> blat_fragment = blat_hsp[0] 96 >>> print blat_fragment 97 Query: blat_1 <unknown description> 98 Hit: chr11 <unknown description> 99 Query range: [61:67] (-1) 100 Hit range: [59018929:59018935] (1) 101 Fragments: 1 (6 columns) 102 Query - tatagt 103 Hit - tatagt 104 105 This applies to HSPs objects with a single fragment as well: 106 107 >>> blast_fragment = blast_hsp[0] 108 >>> print blast_fragment 109 Query: 33211 mir_1 110 Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... 111 Query range: [1:61] (1) 112 Hit range: [0:60] (1) 113 Fragments: 1 (60 columns) 114 Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 115 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 116 Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 117 118 Regardless of the search output file format, HSP objects provide the 119 properties listed below. These properties always return values in a list, 120 due to the HSP object itself being a list-like container. However, for 121 HSP objects with a single HSPFragment, shortcut properties that fetches 122 the item from the list are also provided. 123 124 +----------------------+---------------------+-----------------------------+ 125 | Property | Shortcut | Value | 126 +======================+=====================+=============================+ 127 | aln_all | aln | HSP alignments as | 128 | | | MultipleSeqAlignment object | 129 +----------------------+---------------------+-----------------------------+ 130 | aln_annotation_all | aln_annotation | dictionary of annotation(s) | 131 | | | of all fragments' alignments| 132 +----------------------+---------------------+-----------------------------+ 133 | fragments | fragment | HSPFragment objects | 134 +----------------------+---------------------+-----------------------------+ 135 | hit_all | hit | hit sequence as SeqRecord | 136 | | | objects | 137 +----------------------+---------------------+-----------------------------+ 138 | hit_features_all | hit_features | SeqFeatures of all hit | 139 | | | fragments | 140 +----------------------+---------------------+-----------------------------+ 141 | hit_start_all | hit_start* | start coordinates of the | 142 | | | hit fragments | 143 +----------------------+---------------------+-----------------------------+ 144 | hit_end_all | hit_end* | end coordinates of the hit | 145 | | | fragments | 146 +----------------------+---------------------+-----------------------------+ 147 | hit_span_all | hit_span* | sizes of each hit fragments | 148 +----------------------+---------------------+-----------------------------+ 149 | hit_strand_all | hit_strand | strand orientations of the | 150 | | | hit fragments | 151 +----------------------+---------------------+-----------------------------+ 152 | hit_frame_all | hit_frame | reading frames of the hit | 153 | | | fragments | 154 +----------------------+---------------------+-----------------------------+ 155 | hit_range_all | hit_range | tuples of start and end | 156 | | | coordinates of each hit | 157 | | | fragment | 158 +----------------------+---------------------+-----------------------------+ 159 | query_all | query | query sequence as SeqRecord | 160 | | | object | 161 +----------------------+---------------------+-----------------------------+ 162 | query_features_all | query_features | SeqFeatures of all query | 163 | | | fragments | 164 +----------------------+---------------------+-----------------------------+ 165 | query_start_all | query_start* | start coordinates of the | 166 | | | fragments | 167 +----------------------+---------------------+-----------------------------+ 168 | query_end_all | query_end* | end coordinates of the | 169 | | | query fragments | 170 +----------------------+---------------------+-----------------------------+ 171 | query_span_all | query_span* | sizes of each query | 172 | | | fragments | 173 +----------------------+---------------------+-----------------------------+ 174 | query_strand_all | query_strand | strand orientations of the | 175 | | | query fragments | 176 +----------------------+---------------------+-----------------------------+ 177 | query_frame_all | query_frame | reading frames of the query | 178 | | | fragments | 179 +----------------------+---------------------+-----------------------------+ 180 | query_range_all | query_range | tuples of start and end | 181 | | | coordinates of each query | 182 | | | fragment | 183 +----------------------+---------------------+-----------------------------+ 184 * may be used in HSPs with multiple fragments 185 186 For all types of HSP objects, the property will return the values in a list. 187 Shorcuts are only applicable for HSPs with one fragment. Except the ones 188 noted, if they are used on an HSP with more than one fragments, an exception 189 will be raised. 190 191 For properties that may be used in HSPs with multiple or single fragments 192 (`*_start`, `*_end`, and `*_span` properties), their interpretation depends 193 on how many fragment the HSP has: 194 195 +------------+---------------------------------------------------+ 196 | Property | Value | 197 +============+===================================================+ 198 | hit_start | smallest coordinate value of all hit fragments | 199 +------------+---------------------------------------------------+ 200 | hit_end | largest coordinate value of all hit fragments | 201 +------------+---------------------------------------------------+ 202 | hit_span | difference between `hit_start` and `hit_end` | 203 +------------+---------------------------------------------------+ 204 | query_start| smallest coordinate value of all query fragments | 205 +------------+---------------------------------------------------+ 206 | query_end | largest coordinate value of all query fragments | 207 +------------+---------------------------------------------------+ 208 | query_span | difference between `query_start` and `query_end` | 209 +------------+---------------------------------------------------+ 210 211 In addition to the objects listed above, HSP objects also provide the 212 following properties: 213 214 +--------------------+------------------------------------------------------+ 215 | Property | Value | 216 +====================+======================================================+ 217 | aln_span | total number of residues in all HSPFragment objects | 218 +--------------------+------------------------------------------------------+ 219 | alphabet | alphabet used in hit and query SeqRecord objects | 220 +--------------------+------------------------------------------------------+ 221 | is_fragmented | boolean, whether there are multiple fragments or not | 222 +--------------------+------------------------------------------------------+ 223 | hit_id | ID of the hit sequence | 224 +--------------------+------------------------------------------------------+ 225 | hit_description | description of the hit sequence | 226 +--------------------+------------------------------------------------------+ 227 | hit_inter_ranges | list of hit sequence coordinates of the regions | 228 | | between fragments | 229 +--------------------+------------------------------------------------------+ 230 | hit_inter_spans | list of lengths of the regions between hit fragments | 231 +--------------------+------------------------------------------------------+ 232 | query_id | ID of the query sequence | 233 +--------------------+------------------------------------------------------+ 234 | query_description | description of the query sequence | 235 +--------------------+------------------------------------------------------+ 236 | query_inter_ranges | list of query sequence coordinates of the regions | 237 | | between fragments | 238 +--------------------+------------------------------------------------------+ 239 | query_inter_spans | list of lengths of the regions between query | 240 | |fragments | 241 +--------------------+------------------------------------------------------+ 242 243 """ 244 # attributes we don't want to transfer when creating a new Hit class 245 # from this one 246 _NON_STICKY_ATTRS = ('_items', ) 247
248 - def __init__(self, fragments=[]):
249 """Initializes an HSP object. 250 251 Arguments: 252 fragments -- List of HSPFragment objects. 253 254 HSP objects must be initialized with a list containing at least one 255 HSPFragment object. If multiple HSPFragment objects are used for 256 initialization, they must all have the same `query_id`, 257 `query_description`, `hit_id`, `hit_description`, and alphabet 258 properties. 259 260 """ 261 if not fragments: 262 raise ValueError("HSP objects must have at least one HSPFragment " 263 "object.") 264 # check that all fragments contain the same IDs, descriptions, alphabet 265 for attr in ('query_id', 'query_description', 'hit_id', 266 'hit_description', 'alphabet'): 267 if len(set([getattr(frag, attr) for frag in fragments])) != 1: 268 raise ValueError("HSP object can not contain fragments with " 269 "more than one %s." % attr) 270 271 self._items = [] 272 for fragment in fragments: 273 self._validate_fragment(fragment) 274 self._items.append(fragment)
275
276 - def __repr__(self):
277 return "%s(hit_id=%r, query_id=%r, %r fragments)" % \ 278 (self.__class__.__name__, self.hit_id, self.query_id, len(self))
279
280 - def __iter__(self):
281 return iter(self._items)
282
283 - def __contains__(self, fragment):
284 return fragment in self._items
285
286 - def __len__(self):
287 return len(self._items)
288
289 - def __nonzero__(self):
290 return bool(self._items)
291
292 - def __str__(self):
293 294 lines = [] 295 # set hsp info line 296 statline = [] 297 # evalue 298 evalue = getattr_str(self, 'evalue', fmt='%.2g') 299 statline.append('evalue ' + evalue) 300 # bitscore 301 bitscore = getattr_str(self, 'bitscore', fmt='%.2f') 302 statline.append('bitscore ' + bitscore) 303 lines.append('Quick stats: ' + '; '.join(statline)) 304 305 if len(self.fragments) == 1: 306 return '\n'.join([self._str_hsp_header(), '\n'.join(lines), 307 self.fragments[0]._str_aln()]) 308 else: 309 lines.append(' Fragments: %s %s %s %s' % 310 ('-'*3, '-'*14, '-'*22, '-'*22)) 311 pattern = '%16s %14s %22s %22s' 312 lines.append(pattern % ('#', 'Span', 'Query range', 'Hit range')) 313 lines.append(pattern % ('-'*3, '-'*14, '-'*22, '-'*22)) 314 for idx, block in enumerate(self.fragments): 315 # set hsp line and table 316 # alignment span 317 aln_span = getattr_str(block, 'aln_span') 318 # query region 319 query_start = getattr_str(block, 'query_start') 320 query_end = getattr_str(block, 'query_end') 321 query_range = '[%s:%s]' % (query_start, query_end) 322 # max column length is 20 323 query_range = trim_str(query_range, 22, '~]') 324 # hit region 325 hit_start = getattr_str(block, 'hit_start') 326 hit_end = getattr_str(block, 'hit_end') 327 hit_range = '[%s:%s]' % (hit_start, hit_end) 328 hit_range = trim_str(hit_range, 22, '~]') 329 # append the hsp row 330 lines.append(pattern % (str(idx), aln_span, query_range, hit_range)) 331 332 return self._str_hsp_header() + '\n' + '\n'.join(lines)
333
334 - def __getitem__(self, idx):
335 # if key is slice, return a new HSP instance 336 if isinstance(idx, slice): 337 obj = self.__class__(self._items[idx]) 338 self._transfer_attrs(obj) 339 return obj 340 return self._items[idx]
341
342 - def __setitem__(self, idx, fragments):
343 # handle case if hsps is a list of hsp 344 if isinstance(fragments, (list, tuple)): 345 for fragment in fragments: 346 self._validate_fragment(fragment) 347 else: 348 self._validate_fragment(fragments) 349 350 self._items[idx] = fragments
351
352 - def __delitem__(self, idx):
353 # note that this may result in an empty HSP object, which should be 354 # invalid 355 del self._items[idx]
356
357 - def _validate_fragment(self, fragment):
358 if not isinstance(fragment, HSPFragment): 359 raise TypeError("HSP objects can only contain HSPFragment " 360 "objects.")
361
362 - def _aln_span_get(self):
363 # length of all alignments 364 # alignment span can be its own attribute, or computed from 365 # query / hit length 366 return sum([frg.aln_span for frg in self.fragments])
367 368 aln_span = property(fget=_aln_span_get, 369 doc="""Total number of columns in all HSPFragment objects.""") 370 371 ## coordinate properties ##
372 - def _get_coords(self, seq_type, coord_type):
373 assert seq_type in ('hit', 'query') 374 assert coord_type in ('start', 'end') 375 coord_name = '%s_%s' % (seq_type, coord_type) 376 coords = [getattr(frag, coord_name) for frag in self.fragments] 377 if None in coords: 378 warnings.warn("'None' exist in %s coordinates; ignored" % 379 (coord_name), BiopythonWarning) 380 return coords
381
382 - def _hit_start_get(self):
383 return min(self._get_coords('hit', 'start'))
384 385 hit_start = property(fget=_hit_start_get, 386 doc="""Smallest coordinate value of all hit fragments""") 387
388 - def _query_start_get(self):
389 return min(self._get_coords('query', 'start'))
390 391 query_start = property(fget=_query_start_get, 392 doc="""Smallest coordinate value of all query fragments""") 393
394 - def _hit_end_get(self):
395 return max(self._get_coords('hit', 'end'))
396 397 hit_end = property(fget=_hit_end_get, 398 doc="""Largest coordinate value of all hit fragments""") 399
400 - def _query_end_get(self):
401 return max(self._get_coords('query', 'end'))
402 403 query_end = property(fget=_query_end_get, 404 doc="""Largest coordinate value of all hit fragments""") 405 406 ## coordinate-dependent properties ##
407 - def _hit_span_get(self):
408 try: 409 return self.hit_end - self.hit_start 410 except TypeError: # triggered if any of the coordinates are None 411 return None
412 413 hit_span = property(fget=_hit_span_get, 414 doc="""The number of hit residues covered by the HSP.""") 415
416 - def _query_span_get(self):
417 try: 418 return self.query_end - self.query_start 419 except TypeError: # triggered if any of the coordinates are None 420 return None
421 422 query_span = property(fget=_query_span_get, 423 doc="""The number of query residues covered by the HSP.""") 424
425 - def _hit_range_get(self):
426 return (self.hit_start, self.hit_end)
427 428 hit_range = property(fget=_hit_range_get, 429 doc="""Tuple of HSP hit start and end coordinates.""") 430
431 - def _query_range_get(self):
432 return (self.query_start, self.query_end)
433 434 query_range = property(fget=_query_range_get, 435 doc="""Tuple of HSP query start and end coordinates.""") 436
437 - def _inter_ranges_get(self, seq_type):
438 # this property assumes that there are no mixed strands in a hit/query 439 assert seq_type in ('query', 'hit') 440 strand = getattr(self, '%s_strand_all' % seq_type)[0] 441 coords = getattr(self, '%s_range_all' % seq_type) 442 # determine function used to set inter range 443 # start and end coordinates, given two pairs 444 # of fragment start and end coordinates 445 if strand == -1: 446 startfunc, endfunc = min, max 447 else: 448 startfunc, endfunc = max, min 449 inter_coords = [] 450 for idx, coord in enumerate(coords[:-1]): 451 start = startfunc(coords[idx]) 452 end = endfunc(coords[idx+1]) 453 inter_coords.append((min(start, end), max(start, end))) 454 455 return inter_coords
456
457 - def _hit_inter_ranges_get(self):
458 return self._inter_ranges_get('hit')
459 460 hit_inter_ranges = property(fget=_hit_inter_ranges_get, 461 doc="""Hit sequence coordinates of the regions between fragments""") 462
463 - def _query_inter_ranges_get(self):
464 return self._inter_ranges_get('query')
465 466 query_inter_ranges = property(fget=_query_inter_ranges_get, 467 doc="""Query sequence coordinates of the regions between fragments""") 468
469 - def _inter_spans_get(self, seq_type):
470 assert seq_type in ('query', 'hit') 471 attr_name = '%s_inter_ranges' % seq_type 472 return [coord[1] - coord[0] for coord in getattr(self, attr_name)]
473
474 - def _hit_inter_spans_get(self):
475 return self._inter_spans_get('hit')
476 477 hit_inter_spans = property(fget=_hit_inter_spans_get, 478 doc="""Lengths of regions between hit fragments""") 479
480 - def _query_inter_spans_get(self):
481 return self._inter_spans_get('query')
482 483 query_inter_spans = property(fget=_query_inter_spans_get, 484 doc="""Lengths of regions between query fragments""") 485 486 ## shortcuts for fragments' properties ## 487 488 # bool check if there's more than one fragments 489 is_fragmented = property(lambda self: len(self) > 1, 490 doc="""Whether the HSP has more than one HSPFragment objects""") 491 492 # first item properties with setters 493 hit_description = fullcascade('hit_description', 494 doc="""Description of the hit sequence""") 495 496 query_description = fullcascade('query_description', 497 doc="""Description of the query sequence""") 498 499 hit_id = fullcascade('hit_id', 500 doc="""ID of the hit sequence""") 501 502 query_id = fullcascade('query_id', 503 doc="""ID of the query sequence""") 504 505 alphabet = fullcascade('alphabet', 506 doc="""Alphabet used in hit and query SeqRecord objects""") 507 508 # properties for single-fragment HSPs 509 fragment = singleitem( 510 doc="""HSPFragment object, first fragment""") 511 512 hit = singleitem('hit', 513 doc="""Hit sequence as a SeqRecord object, first fragment""") 514 515 query = singleitem('query', 516 doc="""Query sequence as a SeqRecord object, first fragment""") 517 518 aln = singleitem('aln', 519 doc="""Alignment of the first fragment as a MultipleSeqAlignment object""") 520 521 aln_annotation = singleitem('aln_annotation', 522 doc="""Dictionary of annotation(s) of the first fragment's alignment""") 523 524 hit_features = singleitem('hit_features', 525 doc="""Hit sequence features, first fragment""") 526 527 query_features = singleitem('query_features', 528 doc="""Query sequence features, first fragment""") 529 530 hit_strand = singleitem('hit_strand', 531 doc="""Hit strand orientation, first fragment""") 532 533 query_strand = singleitem('query_strand', 534 doc="""Query strand orientation, first fragment""") 535 536 hit_frame = singleitem('hit_frame', 537 doc="""Hit sequence reading frame, first fragment""") 538 539 query_frame = singleitem('query_frame', 540 doc="""Query sequence reading frame, first fragment""") 541 542 # properties for multi-fragment HSPs 543 fragments = allitems(doc="""List of all HSPFragment objects""") 544 545 hit_all = allitems('hit', 546 doc="""List of all fragments' hit sequences as SeqRecord objects""") 547 548 query_all = allitems('query', 549 doc="""List of all fragments' query sequences as SeqRecord objects""") 550 551 aln_all = allitems('aln', 552 doc="""List of all fragments' alignments as MultipleSeqAlignment objects""") 553 554 aln_annotation_all = allitems('aln_annotation', 555 doc="""Dictionary of annotation(s) of all fragments' alignments""") 556 557 hit_features_all = allitems('hit_features', 558 doc="""List of all hit sequence features""") 559 560 query_features_all = allitems('query_features', 561 doc="""List of all query sequence features""") 562 563 hit_strand_all = allitems('hit_strand', 564 doc="""List of all fragments' hit sequence strands""") 565 566 query_strand_all = allitems('query_strand', 567 doc="""List of all fragments' query sequence strands""") 568 569 hit_frame_all = allitems('hit_frame', 570 doc="""List of all fragments' hit sequence reading frames""") 571 572 query_frame_all = allitems('query_frame', 573 doc="""List of all fragments' query sequence reading frames""") 574 575 hit_start_all = allitems('hit_start', 576 doc="""List of all fragments' hit start coordinates""") 577 578 query_start_all = allitems('query_starts', 579 doc="""List of all fragments' query start coordinates""") 580 581 hit_end_all = allitems('hit_ends', 582 doc="""List of all fragments' hit end coordinates""") 583 584 query_end_all = allitems('query_ends', 585 doc="""List of all fragments' query end coordinates""") 586 587 hit_span_all = allitems('hit_span', 588 doc="""List of all fragments' hit sequence size""") 589 590 query_span_all = allitems('query_span', 591 doc="""List of all fragments' query sequence size""") 592 593 hit_range_all = allitems('hit_range', 594 doc="""List of all fragments' hit start and end coordinates""") 595 596 query_range_all = allitems('query_range', 597 doc="""List of all fragments' query start and end coordinates""")
598 599
600 -class HSPFragment(_BaseHSP):
601 602 """Class representing a contiguous alignment of hit-query sequence. 603 604 HSPFragment forms the core of any parsed search output file. Depending on 605 the search output file format, it may contain the actual query and/or hit 606 sequences that produces the search hits. These sequences are stored as 607 SeqRecord objects (see SeqRecord): 608 609 >>> from Bio import SearchIO 610 >>> qresult = SearchIO.parse('Blast/mirna.xml', 'blast-xml').next() 611 >>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment 612 >>> print fragment 613 Query: 33211 mir_1 614 Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... 615 Query range: [0:61] (1) 616 Hit range: [0:61] (1) 617 Fragments: 1 (61 columns) 618 Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 619 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 620 Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG 621 622 # the query sequence is a SeqRecord object 623 >>> fragment.query.__class__ 624 <class 'Bio.SeqRecord.SeqRecord'> 625 >>> print fragment.query 626 ID: 33211 627 Name: aligned query sequence 628 Description: mir_1 629 Number of features: 0 630 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 631 632 # the hit sequence is a SeqRecord object as well 633 >>> fragment.hit.__class__ 634 <class 'Bio.SeqRecord.SeqRecord'> 635 >>> print fragment.hit 636 ID: gi|262205317|ref|NR_030195.1| 637 Name: aligned hit sequence 638 Description: Homo sapiens microRNA 520b (MIR520B), microRNA 639 Number of features: 0 640 Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG', DNAAlphabet()) 641 642 # when both query and hit are present, we get a MultipleSeqAlignment object 643 >>> fragment.aln.__class__ 644 <class 'Bio.Align.MultipleSeqAlignment'> 645 >>> print fragment.aln 646 DNAAlphabet() alignment with 2 rows and 61 columns 647 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 648 CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| 649 650 """ 651
652 - def __init__(self, hit_id='<unknown id>', query_id='<unknown id>', 653 hit='', query='', alphabet=single_letter_alphabet):
654 655 self._alphabet = alphabet 656 self.aln_annotation = {} 657 658 self._hit_id = hit_id 659 self._query_id = query_id 660 661 for seq_type in ('query', 'hit'): 662 # query or hit attributes default attributes 663 setattr(self, '_%s_description' % seq_type, '<unknown description>') 664 setattr(self, '_%s_features' % seq_type, []) 665 # query or hit attributes whose default attribute is None 666 for attr in ('strand', 'frame', 'start', 'end'): 667 setattr(self, '%s_%s' % (seq_type, attr), None) 668 # self.query or self.hit 669 if eval(seq_type): 670 setattr(self, seq_type, eval(seq_type)) 671 else: 672 setattr(self, seq_type, None)
673
674 - def __repr__(self):
675 info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) 676 try: 677 info += ", %i columns" % len(self) 678 except AttributeError: 679 pass 680 return "%s(%s)" % (self.__class__.__name__, info)
681
682 - def __len__(self):
683 return self.aln_span
684
685 - def __str__(self):
686 return self._str_hsp_header() + '\n' + self._str_aln()
687
688 - def __getitem__(self, idx):
689 if self.aln is not None: 690 obj = self.__class__( 691 hit_id=self.hit_id, query_id=self.query_id, 692 alphabet=self.alphabet) 693 # transfer query and hit attributes 694 # let SeqRecord handle feature slicing, then retrieve the sliced 695 # features into the sliced HSPFragment 696 if self.query is not None: 697 obj.query = self.query[idx] 698 obj.query_features = obj.query.features 699 if self.hit is not None: 700 obj.hit = self.hit[idx] 701 obj.hit_features = obj.hit.features 702 # description, strand, frame 703 for attr in ('description', 'strand', 'frame'): 704 for seq_type in ('hit', 'query'): 705 attr_name = '%s_%s' % (seq_type, attr) 706 self_val = getattr(self, attr_name) 707 setattr(obj, attr_name, self_val) 708 # alignment annotation should be transferred, since we can compute 709 # the resulting annotation 710 obj.aln_annotation = {} 711 for key, value in self.aln_annotation.items(): 712 assert len(value[idx]) == len(obj) 713 obj.aln_annotation[key] = value[idx] 714 return obj 715 else: 716 raise TypeError("Slicing for HSP objects without " 717 "alignment is not supported.")
718
719 - def _str_aln(self):
720 lines = [] 721 # alignment length 722 aln_span = getattr_str(self, 'aln_span') 723 lines.append(' Fragments: 1 (%s columns)' % aln_span) 724 # sequences 725 if self.query is not None and self.hit is not None: 726 try: 727 qseq = str(self.query.seq) 728 except AttributeError: # query is None 729 qseq = '?' 730 try: 731 hseq = str(self.hit.seq) 732 except AttributeError: # hit is None 733 hseq = '?' 734 735 # homology line 736 homol = '' 737 if 'homology' in self.aln_annotation: 738 homol = self.aln_annotation['homology'] 739 740 if self.aln_span <= 67: 741 lines.append("%10s - %s" % ('Query', qseq)) 742 if homol: 743 lines.append(" %s" % homol) 744 lines.append("%10s - %s" % ('Hit', hseq)) 745 else: 746 # adjust continuation character length, so we don't display 747 # the same residues twice 748 if self.aln_span - 66 > 3: 749 cont = '~' * 3 750 else: 751 cont = '~' * (self.aln_span - 66) 752 lines.append("%10s - %s%s%s" % ('Query', 753 qseq[:59], cont, qseq[-5:])) 754 if homol: 755 lines.append(" %s%s%s" % 756 (homol[:59], cont, homol[-5:])) 757 lines.append("%10s - %s%s%s" % ('Hit', 758 hseq[:59], cont, hseq[-5:])) 759 760 return '\n'.join(lines)
761 762 ## sequence properties ##
763 - def _set_seq(self, seq, seq_type):
764 """Checks the given sequence for attribute setting 765 766 Arguments: 767 seq -- String or SeqRecord to check 768 seq_type -- String of sequence type, must be 'hit' or 'query' 769 770 """ 771 assert seq_type in ('hit', 'query') 772 if seq is None: 773 return seq # return immediately if seq is None 774 else: 775 if not isinstance(seq, (basestring, SeqRecord)): 776 raise TypeError("%s sequence must be a string or a SeqRecord" 777 " object." % seq_type) 778 # check length if the opposite sequence is not None 779 opp_type = 'hit' if seq_type == 'query' else 'query' 780 opp_seq = getattr(self, '_%s' % opp_type, None) 781 if opp_seq is not None: 782 if len(seq) != len(opp_seq): 783 raise ValueError("Sequence lengths do not match. Expected: " 784 "%r (%s); found: %r (%s)." % (len(opp_seq), opp_type, 785 len(seq), seq_type)) 786 787 seq_id = getattr(self, '%s_id' % seq_type) 788 seq_desc = getattr(self, '%s_description' % seq_type) 789 seq_feats = getattr(self, '%s_features' % seq_type) 790 seq_name = 'aligned %s sequence' % seq_type 791 792 if isinstance(seq, SeqRecord): 793 seq.id = seq_id 794 seq.description = seq_desc 795 seq.name = seq_name 796 seq.features = seq_feats 797 seq.seq.alphabet = self.alphabet 798 elif isinstance(seq, basestring): 799 seq = SeqRecord(Seq(seq, self.alphabet), id=seq_id, name=seq_name, 800 description=seq_desc, features=seq_feats) 801 802 return seq
803
804 - def _hit_get(self):
805 return self._hit
806
807 - def _hit_set(self, value):
808 self._hit = self._set_seq(value, 'hit')
809 810 hit = property(fget=_hit_get, fset=_hit_set, 811 doc="""Hit sequence as a SeqRecord object, defaults to None""") 812
813 - def _query_get(self):
814 return self._query
815
816 - def _query_set(self, value):
817 self._query = self._set_seq(value, 'query')
818 819 query = property(fget=_query_get, fset=_query_set, 820 doc="""Query sequence as a SeqRecord object, defaults to None""") 821
822 - def _aln_get(self):
823 if self.query is None and self.hit is None: 824 return None 825 elif self.hit is None: 826 return MultipleSeqAlignment([self.query], self.alphabet) 827 elif self.query is None: 828 return MultipleSeqAlignment([self.hit], self.alphabet) 829 else: 830 return MultipleSeqAlignment([self.query, self.hit], self.alphabet)
831 832 aln = property(fget=_aln_get, 833 doc="""Query-hit alignment as a MultipleSeqAlignment object, 834 defaults to None""") 835
836 - def _alphabet_get(self):
837 return self._alphabet
838
839 - def _alphabet_set(self, value):
840 self._alphabet = value 841 try: 842 self.query.seq.alphabet = value 843 except AttributeError: 844 pass 845 try: 846 self.hit.seq.alphabet = value 847 except AttributeError: 848 pass
849 850 alphabet = property(fget=_alphabet_get, fset=_alphabet_set, 851 doc="""Alphabet object used in the fragment's sequences and alignment, 852 defaults to single_letter_alphabet""") 853
854 - def _aln_span_get(self):
855 # length of alignment (gaps included) 856 # alignment span can be its own attribute, or computed from 857 # query / hit length 858 if not hasattr(self, '_aln_span'): 859 if self.query is not None: 860 self._aln_span = len(self.query) 861 elif self.hit is not None: 862 self._aln_span = len(self.hit) 863 864 return self._aln_span
865
866 - def _aln_span_set(self, value):
867 self._aln_span = value
868 869 aln_span = property(fget=_aln_span_get, fset=_aln_span_set, 870 doc="""The number of alignment columns covered by the fragment""") 871 872 ## id, description, and features properties ## 873 hit_description = fragcascade('description', 'hit', 874 doc="""Hit sequence description""") 875 876 query_description = fragcascade('description', 'query', 877 doc="""Query sequence description""") 878 879 hit_id = fragcascade('id', 'hit', 880 doc="""Hit sequence ID""") 881 882 query_id = fragcascade('id', 'query', 883 doc="""Query sequence ID""") 884 885 hit_features = fragcascade('features', 'hit', 886 doc="""Hit sequence features""") 887 888 query_features = fragcascade('features', 'query', 889 doc="""Query sequence features""") 890 891 ## strand properties ##
892 - def _prep_strand(self, strand):
893 # follow SeqFeature's convention 894 if not strand in (-1, 0, 1, None): 895 raise ValueError("Strand should be -1, 0, 1, or None; not %r" % 896 strand) 897 return strand
898
899 - def _get_strand(self, seq_type):
900 assert seq_type in ('hit', 'query') 901 strand = getattr(self, '_%s_strand' % seq_type) 902 903 if strand is None: 904 # try to compute strand from frame 905 frame = getattr(self, '%s_frame' % seq_type) 906 if frame is not None: 907 try: 908 strand = frame // abs(frame) 909 except ZeroDivisionError: 910 strand = 0 911 setattr(self, '%s_strand' % seq_type, strand) 912 913 return strand
914
915 - def _hit_strand_get(self):
916 return self._get_strand('hit')
917
918 - def _hit_strand_set(self, value):
919 self._hit_strand = self._prep_strand(value)
920 921 hit_strand = property(fget=_hit_strand_get, fset=_hit_strand_set, 922 doc="""Hit sequence strand, defaults to None""") 923
924 - def _query_strand_get(self):
925 return self._get_strand('query')
926
927 - def _query_strand_set(self, value):
928 self._query_strand = self._prep_strand(value)
929 930 query_strand = property(fget=_query_strand_get, fset=_query_strand_set, 931 doc="""Query sequence strand, defaults to None""") 932 933 ## frame properties ##
934 - def _prep_frame(self, frame):
935 if not frame in (-3, -2, -1, 0, 1, 2, 3, None): 936 raise ValueError("Strand should be an integer between -3 and 3, " 937 "or None; not %r" % frame) 938 return frame
939
940 - def _hit_frame_get(self):
941 return self._hit_frame
942
943 - def _hit_frame_set(self, value):
944 self._hit_frame = self._prep_frame(value)
945 946 hit_frame = property(fget=_hit_frame_get, fset=_hit_frame_set, 947 doc="""Hit sequence reading frame, defaults to None""") 948
949 - def _query_frame_get(self):
950 return self._query_frame
951
952 - def _query_frame_set(self, value):
953 self._query_frame = self._prep_frame(value)
954 955 query_frame = property(fget=_query_frame_get, fset=_query_frame_set, 956 doc="""Query sequence reading frame, defaults to None""") 957 958 ## coordinate properties ##
959 - def _prep_coord(self, coord, opp_coord_name, op):
960 # coord must either be None or int 961 if coord is None: 962 return coord 963 assert isinstance(coord, int) 964 # try to get opposite coordinate, if it's not present, return 965 try: 966 opp_coord = getattr(self, opp_coord_name) 967 except AttributeError: 968 return coord 969 # if opposite coordinate is None, return 970 if opp_coord is None: 971 return coord 972 # otherwise compare it to coord ('>=' or '<=') 973 else: 974 assert op(coord, opp_coord) 975 return coord
976
977 - def _hit_start_get(self):
978 return self._hit_start
979
980 - def _hit_start_set(self, value):
981 self._hit_start = self._prep_coord(value, 'hit_end', le)
982 983 hit_start = property(fget=_hit_start_get, fset=_hit_start_set, 984 doc="""Hit sequence start coordinate, defaults to None""") 985
986 - def _query_start_get(self):
987 return self._query_start
988
989 - def _query_start_set(self, value):
990 self._query_start = self._prep_coord(value, 'query_end', le)
991 992 query_start = property(fget=_query_start_get, fset=_query_start_set, 993 doc="""Query sequence start coordinate, defaults to None""") 994
995 - def _hit_end_get(self):
996 return self._hit_end
997
998 - def _hit_end_set(self, value):
999 self._hit_end = self._prep_coord(value, 'hit_start', ge)
1000 1001 hit_end = property(fget=_hit_end_get, fset=_hit_end_set, 1002 doc="""Hit sequence start coordinate, defaults to None""") 1003
1004 - def _query_end_get(self):
1005 return self._query_end
1006
1007 - def _query_end_set(self, value):
1008 self._query_end = self._prep_coord(value, 'query_start', ge)
1009 1010 query_end = property(fget=_query_end_get, fset=_query_end_set, 1011 doc="""Query sequence end coordinate, defaults to None""") 1012 1013 ## coordinate-dependent properties ##
1014 - def _hit_span_get(self):
1015 try: 1016 return self.hit_end - self.hit_start 1017 except TypeError: # triggered if any of the coordinates are None 1018 return None
1019 1020 hit_span = property(fget=_hit_span_get, 1021 doc="""The number of residues covered by the hit sequence""") 1022
1023 - def _query_span_get(self):
1024 try: 1025 return self.query_end - self.query_start 1026 except TypeError: # triggered if any of the coordinates are None 1027 return None
1028 1029 query_span = property(fget=_query_span_get, 1030 doc="""The number of residues covered by the query sequence""") 1031
1032 - def _hit_range_get(self):
1033 return (self.hit_start, self.hit_end)
1034 1035 hit_range = property(fget=_hit_range_get, 1036 doc="""Tuple of hit start and end coordinates""") 1037
1038 - def _query_range_get(self):
1039 return (self.query_start, self.query_end)
1040 1041 query_range = property(fget=_query_range_get, 1042 doc="""Tuple of query start and end coordinates""")
1043 1044 1045 # if not used as a module, run the doctest 1046 if __name__ == "__main__": 1047 from Bio._utils import run_doctest 1048 run_doctest() 1049