Package Bio :: Package SeqUtils :: Module lcc
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqUtils.lcc

  1  # Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com 
  2  # All rights reserved.  This code is part of the Biopython 
  3  # distribution and governed by its license. 
  4  # Please see the LICENSE file that should have been included as part 
  5  # of this package. 
  6  """Local Composition Complexity.""" 
  7   
  8  import math 
  9   
 10   
def lcc_mult(seq, wsize):
    """Local Composition Complexity (LCC) values over a sliding window.

    seq - an unambiguous DNA sequence (a string or Seq object); it is
          converted to an upper-case string, so case does not matter
    wsize - window size, a positive integer

    Returns a list.  For backward compatibility the first element is a
    placeholder 0; it is followed by one LCC value for each window
    position (window start 0, 1, ..., len(seq) - wsize).  If the sequence
    is shorter than wsize, a single value is computed from the bases that
    are available, still normalised by wsize.

    Each value is -sum(p * log2(p)) over the bases A, C, T and G, where
    p is the count of that base in the window divided by wsize.  Apart
    from the leading placeholder this matches calling lcc_simp on every
    window, but the counts are updated incrementally from the previous
    window instead of being recounted.

    Characters other than A, C, G and T contribute nothing to the sum;
    every window still yields exactly one value.

    Raises ValueError if wsize is smaller than 1.
    """
    if wsize < 1:
        raise ValueError("wsize must be a positive integer")

    # Normalise once; everything below works on this upper-case string.
    upper = str(seq).upper()
    log2 = math.log(2)

    # compone[k] is the entropy term (k / wsize) * log2(k / wsize) for a
    # base seen k times in the window.  compone[0] is 0 so that log(0)
    # is never evaluated.
    compone = [0]
    for k in range(1, wsize + 1):
        frac = k / float(wsize)
        compone.append(frac * (math.log(frac) / log2))

    first_window = upper[:wsize]
    counts = {base: first_window.count(base) for base in "ACTG"}

    def current_lcc():
        # Fixed summation order (A, C, T, G) keeps results reproducible.
        return -(compone[counts["A"]] + compone[counts["C"]] +
                 compone[counts["T"]] + compone[counts["G"]])

    lccsal = [0, current_lcc()]
    for start in range(len(upper) - wsize):
        leaving = upper[start]            # base dropping out on the left
        entering = upper[start + wsize]   # base coming in on the right
        if leaving == entering:
            # Composition is unchanged, so the value is unchanged too.
            lccsal.append(lccsal[-1])
            continue
        if leaving in counts:
            counts[leaving] -= 1
        if entering in counts:
            counts[entering] += 1
        lccsal.append(current_lcc())
    return lccsal
121 122
def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object); it is
          converted to an upper-case string, so case does not matter

    Returns the Local Composition Complexity (LCC) value for the entire
    sequence: -sum(p * log2(p)) over the bases A, C, T and G, where p is
    the count of that base divided by the sequence length.  Bases that
    do not occur contribute nothing, and an empty sequence gives 0.

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    upper = str(seq).upper()
    wsize = len(upper)
    log2 = math.log(2)

    total = 0
    for base in "ACTG":
        count = upper.count(base)
        if count:
            # Skipping absent bases avoids taking the log of 0.
            frac = count / float(wsize)
            total += frac * (math.log(frac) / log2)
    return -total
165