Package Bio :: Package PopGen :: Package FDist :: Module Controller
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.FDist.Controller

  1  # Copyright 2007 by Tiago Antao <tiagoantao@gmail.com>.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module allows to control fdist. 
  8   
  9  This will allow to call fdist and associated program (cplot, datacal, pv). 
 10   
 11  http://www.rubic.rdg.ac.uk/~mab/software.html 
 12  """ 
 13   
 14  import os 
 15  import sys 
 16  from random import randint 
 17  from time import strftime, clock 
 18  #from logging import debug 
 19   
 20  if sys.version_info[0] == 3: 
 21      maxint = sys.maxsize 
 22  else: 
 23      maxint = sys.maxint 
 24   
 25   
26 -def my_float(f):
27 #Because of Jython, mostly 28 if f=="-nan": 29 f="nan" 30 return float(f)
31 32
33 -class FDistController(object):
34 - def __init__(self, fdist_dir = '', ext = None):
35 """Initializes the controller. 36 37 fdist_dir is the directory where fdist2 is. 38 ext is the extension of binaries (.exe on windows, 39 none on Unix) 40 41 """ 42 self.tmp_idx = 0 43 self.fdist_dir = fdist_dir 44 self.os_name = os.name 45 if sys.platform=='win32': 46 py_ext = '.exe' 47 else: 48 py_ext = '' 49 if ext is None: 50 self.ext = py_ext 51 else: 52 self.ext = ext 53 exec_counts = 0
54
55 - def _get_path(self, app):
56 """Returns the path to an fdist application. 57 58 Includes Path where fdist can be found plus executable extension. 59 """ 60 if self.fdist_dir == '': 61 return app + self.ext 62 else: 63 return os.sep.join([self.fdist_dir, app]) + self.ext
64
65 - def _get_temp_file(self):
66 """Gets a temporary file name. 67 68 Returns a temporary file name, if executing inside jython 69 tries to replace unexisting tempfile.mkstemp(). 70 """ 71 self.tmp_idx += 1 72 return strftime("%H%M%S") + str(int(clock()*100)) + str(randint(0,1000)) + str(self.tmp_idx)
73
74 - def run_datacal(self, data_dir='.', version=1, 75 crit_freq = 0.99, p = 0.5, beta= (0.25, 0.25)):
76 """Executes datacal. 77 78 data_dir - Where the data is found. 79 """ 80 in_name = self._get_temp_file() 81 out_name = self._get_temp_file() 82 f = open(data_dir + os.sep + in_name, 'w') 83 if version==1: 84 f.write('a\n') 85 datacal_name = "datacal" 86 else: 87 f.write('%f\n%f\n%f %f\na\n' % (crit_freq, p, beta[0], beta[1])) 88 datacal_name = "Ddatacal" 89 f.close() 90 curr_dir = os.getcwd() 91 os.system('cd ' + data_dir + ' && ' + 92 self._get_path(datacal_name) + ' < ' + in_name + ' > ' + out_name) 93 f = open(data_dir + os.sep + out_name) 94 if version == 1: 95 fst_line = f.readline().rstrip().split(' ') 96 fst = my_float(fst_line[4]) 97 sample_line = f.readline().rstrip().split(' ') 98 sample = int(sample_line[9]) 99 else: 100 l = f.readline().rstrip().split(" ") 101 loci, pops = int(l[-5]), int(l[-2]) 102 fst_line = f.readline().rstrip().split(' ') 103 fst = my_float(fst_line[4]) 104 sample_line = f.readline().rstrip().split(' ') 105 sample = int(sample_line[9]) 106 F_line = f.readline().rstrip().split(' ') 107 F, obs = my_float(F_line[5]), int(F_line[8]) 108 f.close() 109 os.remove(data_dir + os.sep + in_name) 110 os.remove(data_dir + os.sep + out_name) 111 if version==1: 112 return fst, sample 113 else: 114 return fst, sample, loci, pops, F, obs
115
116 - def _generate_intfile(self, data_dir):
117 """Generates an INTFILE. 118 119 Parameter: 120 data_dir - data directory 121 """ 122 inf = open(data_dir + os.sep + 'INTFILE', 'w') 123 for i in range(98): 124 inf.write(str(randint(-maxint+1,maxint-1)) + '\n') 125 inf.write('8\n') 126 inf.close()
127
128 - def run_fdist(self, npops, nsamples, fst, sample_size, 129 mut = 0, num_sims = 50000, data_dir='.', 130 is_dominant = False, theta = 0.06, beta = (0.25, 0.25), 131 max_freq = 0.99):
132 """Executes (d)fdist. 133 134 Parameters: 135 npops - Number of populations 136 nsamples - Number of populations sampled 137 fst - expected Fst 138 sample_size - Sample size per population 139 For dfdist: if zero a sample size file has to be provided 140 mut - 1=Stepwise, 0=Infinite allele 141 num_sims - number of simulations 142 data_dir - Where the data is found 143 is_dominant - If true executes dfdist 144 theta - Theta (=2Nmu) 145 beta - Parameters for the beta prior 146 max_freq - Maximum allowed frequency of the commonest allele 147 148 Returns: 149 fst - Average Fst 150 151 Important Note: This can take quite a while to run! 152 """ 153 if fst >= 0.9: 154 #Lets not joke 155 fst = 0.899 156 if fst <= 0.0: 157 #0 will make fdist run forever 158 fst = 0.001 159 in_name = 'input.fd' 160 out_name = 'output.fd' 161 #print 'writing', data_dir + os.sep + in_name 162 f = open(data_dir + os.sep + in_name, 'w') 163 f.write('y\n\n') 164 f.close() 165 if is_dominant: 166 config_name = "Dfdist_params" 167 else: 168 config_name = "fdist_params2.dat" 169 170 f = open(data_dir + os.sep + config_name, 'w') 171 f.write(str(npops) + '\n') 172 f.write(str(nsamples) + '\n') 173 f.write(str(fst) + '\n') 174 f.write(str(sample_size) + '\n') 175 if is_dominant: 176 f.write(str(theta) + '\n') 177 else: 178 f.write(str(mut) + '\n') 179 f.write(str(num_sims) + '\n') 180 if is_dominant: 181 f.write("%f %f\n" % beta) 182 f.write("%f\n" % max_freq) 183 f.close() 184 self._generate_intfile(data_dir) 185 186 if is_dominant: 187 bin_name = "Dfdist" 188 else: 189 bin_name = "fdist2" 190 os.system('cd ' + data_dir + ' && ' + 191 self._get_path(bin_name) + ' < ' + in_name + ' > ' + out_name) 192 f = open(data_dir + os.sep + out_name) 193 lines = f.readlines() 194 f.close() 195 for line in lines: 196 if line.startswith('average Fst'): 197 fst = my_float(line.rstrip().split(' ')[-1]) 198 os.remove(data_dir + os.sep + in_name) 199 os.remove(data_dir + os.sep + out_name) 200 return fst
201
202 - def run_fdist_force_fst(self, npops, nsamples, fst, sample_size, 203 mut = 0, num_sims = 50000, data_dir='.', 204 try_runs = 5000, limit=0.001,is_dominant = False, 205 theta = 0.06, beta = (0.25, 0.25), 206 max_freq = 0.99):
207 """Executes fdist trying to force Fst. 208 209 Parameters: 210 try_runs - Number of simulations on the part trying to get 211 Fst correct 212 limit - Interval limit 213 Other parameters can be seen on run_fdist. 214 """ 215 max_run_fst = 1 216 min_run_fst = 0 217 current_run_fst = fst 218 old_fst = fst 219 while True: 220 #debug('testing fst ' + str(current_run_fst)) 221 real_fst = self.run_fdist(npops, nsamples, 222 current_run_fst, sample_size, 223 mut, try_runs, data_dir, 224 is_dominant, theta, beta, max_freq) 225 #debug('got real fst ' + str(real_fst)) 226 if abs(real_fst - fst) < limit: 227 #debug('We are OK') 228 return self.run_fdist(npops, nsamples, current_run_fst, 229 sample_size, 230 mut, num_sims, data_dir, 231 is_dominant, theta, beta, max_freq) 232 old_fst = current_run_fst 233 if real_fst > fst: 234 max_run_fst = current_run_fst 235 if current_run_fst < min_run_fst + limit: 236 #we can do no better 237 #debug('Lower limit is ' + str(min_run_fst)) 238 return self.run_fdist(npops, nsamples, current_run_fst, 239 sample_size, mut, num_sims, data_dir) 240 current_run_fst = (min_run_fst + current_run_fst)/2 241 else: 242 min_run_fst = current_run_fst 243 if current_run_fst > max_run_fst - limit: 244 #we can do no better 245 #debug('Upper limit is ' + str(max_run_fst)) 246 return self.run_fdist(npops, nsamples, current_run_fst, 247 sample_size, mut, num_sims, data_dir, 248 is_dominant, theta, beta, max_freq) 249 current_run_fst = (max_run_fst + current_run_fst)/2
250
251 - def run_cplot(self, ci= 0.95, data_dir='.', version = 1, smooth=0.04):
252 """Executes cplot. 253 254 ci - Confidence interval. 255 data_dir - Where the data is found. 256 """ 257 in_name = self._get_temp_file() 258 out_name = self._get_temp_file() 259 f = open(data_dir + os.sep + in_name, 'w') 260 if version == 1: 261 f.write('out.dat out.cpl\n' + str(ci) + '\n') 262 else: 263 f.write("\n".join([ 264 "data_fst_outfile out.cpl out.dat", 265 str(ci), str(smooth)])) 266 f.close() 267 curr_dir = os.getcwd() 268 self._generate_intfile(data_dir) 269 if version == 1: 270 cplot_name = "cplot" 271 else: 272 cplot_name = "cplot2" 273 os.system('cd ' + data_dir + ' && ' + 274 self._get_path(cplot_name) + ' < ' + in_name + ' > ' + out_name) 275 os.remove(data_dir + os.sep + in_name) 276 os.remove(data_dir + os.sep + out_name) 277 f = open(data_dir + os.sep + 'out.cpl') 278 conf_lines = [] 279 l = f.readline() 280 try: 281 while l!='': 282 conf_lines.append( 283 tuple(map(lambda x : my_float(x), l.rstrip().split(' '))) 284 ) 285 l = f.readline() 286 except ValueError: 287 f.close() 288 return [] 289 f.close() 290 return conf_lines
291
292 - def run_pv(self, out_file='probs.dat', data_dir='.', 293 version = 1, smooth=0.04):
294 """Executes pv. 295 296 out_file - Name of output file. 297 data_dir - Where the data is found. 298 """ 299 in_name = self._get_temp_file() 300 out_name = self._get_temp_file() 301 f = open(data_dir + os.sep + in_name, 'w') 302 f.write('data_fst_outfile ' + out_file + ' out.dat\n') 303 f.write(str(smooth) + '\n') 304 f.close() 305 self._generate_intfile(data_dir) 306 if version == 1: 307 pv_name = "pv" 308 else: 309 pv_name = "pv2" 310 os.system('cd ' + data_dir + ' && ' + 311 self._get_path(pv_name) + ' < ' + in_name + ' > ' + out_name) 312 pvf = open(data_dir + os.sep + out_file, 'r') 313 result = map(lambda x: tuple(map(lambda y: my_float(y), x.rstrip().split(' '))), 314 pvf.readlines()) 315 pvf.close() 316 os.remove(data_dir + os.sep + in_name) 317 os.remove(data_dir + os.sep + out_name) 318 return result
319