Package Bio :: Package SeqUtils :: Module lcc
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqUtils.lcc

  1  # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com 
  2  # All rights reserved.  This code is part of the Biopython 
  3  # distribution and governed by its license. 
  4  # Please see the LICENSE file that should have been included as part 
  5  # of this package. 
  6  """Local Composition Complexity.""" 
  7   
  8  import math 
  9   
 10   
def lcc_mult(seq, wsize):
    """Local Composition Complexity (LCC) values over sliding window.

    Returns a list of floats, the LCC values for a sliding window over
    the sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)
    wsize - window size, integer (must be at least 1)

    The result is the same as applying lcc_simp multiple times, but this
    version is optimized for speed. The optimization works by using the
    value of previous window as a base to compute the next one.

    The comparison is case-insensitive. Characters other than A, C, G
    and T still occupy a position in the window but contribute nothing
    to the complexity value.

    For backward compatibility the returned list starts with a
    placeholder 0, followed by one value per window position. If the
    sequence is shorter than wsize, a single value computed over the
    whole sequence (still normalized by wsize) follows the placeholder.

    Raises ValueError if wsize is less than 1.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")
    l2 = math.log(2)
    try:
        # Assume it's a string
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then
        upper = str(seq).upper()

    # compone[k] is the entropy term (k / wsize) * log2(k / wsize) for a
    # base occurring k times in a window; index 0 is 0 to avoid log(0).
    compone = [0]
    for i in range(wsize):
        fraction = (i + 1) / float(wsize)
        compone.append(fraction * (math.log(fraction) / l2))

    # Count on the upper-cased copy so that lower-case input is handled.
    window = upper[0:wsize]
    counts = dict((base, window.count(base)) for base in "ACTG")

    def window_lcc():
        # Fixed A, C, T, G summation order keeps float results stable.
        return -(compone[counts["A"]] + compone[counts["C"]] +
                 compone[counts["T"]] + compone[counts["G"]])

    # Leading 0 is a historical placeholder kept so existing callers
    # that index into the result keep working.
    lccsal = [0]
    lccsal.append(window_lcc())

    # Slide the window one position at a time: `outgoing` is the letter
    # leaving on the left, `incoming` the letter entering on the right.
    for outgoing, incoming in zip(upper, upper[wsize:]):
        if outgoing == incoming:
            # Composition unchanged, so the previous value still holds.
            lccsal.append(lccsal[-1])
            continue
        if outgoing in counts:
            counts[outgoing] -= 1
        if incoming in counts:
            counts[incoming] += 1
        lccsal.append(window_lcc())
    return lccsal
122 123
def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence (as a float).

    The comparison is case-insensitive. Characters other than A, C, G
    and T count towards the sequence length but contribute nothing to
    the result. An empty sequence gives 0.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    wsize = len(seq)
    try:
        # Assume it's a string
        upper = seq.upper()
    except AttributeError:
        # Should be a Seq object then
        upper = str(seq).upper()
    l2 = math.log(2)
    total = 0
    # Fixed A, C, T, G order keeps the floating point sum reproducible.
    for base in "ACTG":
        # Count on the upper-cased copy so lower-case input is handled.
        count = upper.count(base)
        if not count:
            # Absent base: skip it to avoid calculating the log of 0.
            continue
        fraction = count / float(wsize)
        total += fraction * (math.log(fraction) / l2)
    return -total
166