Package Bio :: Package SeqUtils :: Module lcc
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.lcc

  1  # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com 
  2  # All rights reserved.  This code is part of the Biopython 
  3  # distribution and governed by its license. 
  4  # Please see the LICENSE file that should have been included as part 
  5  # of this package. 
  6  """Local Composition Complexity.""" 
  7   
  8  import math 
  9   
 10  __docformat__ = "restructuredtext en" 
 11   
 12   
def lcc_mult(seq, wsize):
    """Local Composition Complexity (LCC) values over sliding window.

    Returns a list of floats, the LCC values for a sliding window over
    the sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)
    wsize - window size, integer (must be at least 1)

    The returned list starts with a placeholder 0 (kept for backward
    compatibility), followed by one LCC value per window position, i.e.
    ``1 + 1 + max(len(seq) - wsize, 0)`` elements in total.

    The sequence is compared case-insensitively. Characters other than
    A, C, G and T are not counted, but every window still contributes
    exactly one value to the result.

    Raises ValueError if wsize is smaller than 1.

    The per-window values match what lcc_simp would give for each window,
    but this version is optimized for speed: the base counts of the
    previous window are updated incrementally instead of being recounted.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")
    l2 = math.log(2)
    tamseq = len(seq)
    try:
        # Assume it's a string
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then
        upper = str(seq).upper()

    # compone[k] is the entropy term (k/wsize) * log2(k/wsize) for a base
    # seen k times in a window; compone[0] is 0 to avoid taking log(0).
    compone = [0]
    for i in range(wsize):
        freq = (i + 1) / float(wsize)
        compone.append(freq * (math.log(freq) / l2))

    # Count on the upper-cased copy so lower-case input is handled too.
    window = upper[0:wsize]
    counts = {}
    for base in 'ACTG':
        counts[base] = window.count(base)

    def current_lcc():
        # Same summation order (A, C, T, G) for every window, so equal
        # compositions always give bit-identical floats.
        return -(compone[counts['A']] + compone[counts['C']] +
                 compone[counts['T']] + compone[counts['G']])

    lccsal = [0]
    lccsal.append(current_lcc())
    for x in range(tamseq - wsize):
        outgoing = upper[x]          # base leaving the window
        incoming = upper[x + wsize]  # base entering the window
        if outgoing != incoming:
            if outgoing in counts:
                counts[outgoing] -= 1
            if incoming in counts:
                counts[incoming] += 1
        lccsal.append(current_lcc())
    return lccsal
123 124
def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence (as a float). The sequence is compared case-insensitively.
    Characters other than A, C, G and T are not counted but still
    contribute to the sequence length. An empty sequence gives 0.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    wsize = len(seq)
    try:
        # Assume it's a string
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then
        upper = str(seq).upper()
    l2 = math.log(2)
    total = 0
    # Sum the terms in A, C, T, G order. Presence is decided from the
    # upper-cased copy so lower-case bases are not silently ignored.
    for base in 'ACTG':
        count = upper.count(base)
        if count:
            # Only reached for count > 0, which avoids the log of 0.
            freq = count / float(wsize)
            total += freq * (math.log(freq) / l2)
    return -total
167