Package Bio :: Package Phylo :: Package PAML :: Module _parse_codeml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_codeml

  1  # Copyright (C) 2011, 2016 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8  line_floats_re = re.compile("-*\d+\.\d+") 
  9   
 10  try: 
 11      float("nan") 
 12      _nan_float = float 
 13  except ValueError: 
 14      # Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP 
15 - def _nan_float(text):
16 try: 17 return float(text) 18 except ValueError: 19 if text.lower() == "nan": 20 import struct 21 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0] 22 else: 23 raise
24 25
26 -def parse_basics(lines, results):
27 """Parse the basic information that should be present in most codeml output files. 28 """ 29 # multi_models is used to designate there being results for more than one 30 # model in the file 31 multi_models = False 32 multi_genes = False 33 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*") 34 model_re = re.compile("Model:\s+(.+)") 35 num_genes_re = re.compile("\(([0-9]+) genes: separate data\)") 36 # In codeml 4.1, the codon substitution model is headed by: 37 # "Codon frequencies:" 38 # In codeml 4.3+ it is headed by: 39 # "Codon frequency model:" 40 codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)") 41 siteclass_re = re.compile("Site-class models:\s*([^\s]*)") 42 for line in lines: 43 # Find all floating point numbers in this line 44 line_floats_res = line_floats_re.findall(line) 45 line_floats = [_nan_float(val) for val in line_floats_res] 46 # Get the program version number 47 version_res = version_re.match(line) 48 if version_res is not None: 49 results["version"] = version_res.group(1) 50 continue 51 # Find the model description at the beginning of the file. 52 model_res = model_re.match(line) 53 if model_res is not None: 54 results["model"] = model_res.group(1) 55 # Find out if more than one genes are analyzed 56 num_genes_res = num_genes_re.search(line) 57 if num_genes_res is not None: 58 results["genes"] = [] 59 num_genes = int(num_genes_res.group(1)) 60 for n in range(num_genes): 61 results["genes"].append({}) 62 multi_genes = True 63 continue 64 # Get the codon substitution model 65 codon_freq_res = codon_freq_re.match(line) 66 if codon_freq_res is not None: 67 results["codon model"] = codon_freq_res.group(1) 68 continue 69 # Find the site-class model name at the beginning of the file. 70 # This exists only if a NSsites class other than 0 is used. 71 # If there's no site class model listed, then there are multiple 72 # models in the results file 73 # Example match: "Site-class models: PositiveSelection" 74 siteclass_res = siteclass_re.match(line) 75 if siteclass_res is not None: 76 siteclass_model = siteclass_res.group(1) 77 if siteclass_model != "": 78 results["site-class model"] = siteclass_model 79 multi_models = False 80 else: 81 multi_models = True 82 # Get the maximum log-likelihood 83 if "ln Lmax" in line and line_floats: 84 results["lnL max"] = line_floats[0] 85 return (results, multi_models, multi_genes)
86 87
88 -def parse_nssites(lines, results, multi_models, multi_genes):
89 """Determine which NSsites models are present and parse them.""" 90 ns_sites = {} 91 model_re = re.compile("Model (\d+):\s+(.+)") 92 gene_re = re.compile("Gene\s+([0-9]+)\s+.+") 93 siteclass_model = results.get("site-class model") 94 if not multi_models: 95 # If there's only one model in the results, find out 96 # which one it is and then parse it. 97 if siteclass_model is None: 98 siteclass_model = "one-ratio" 99 current_model = {"one-ratio": 0, 100 "NearlyNeutral": 1, 101 "PositiveSelection": 2, 102 "discrete": 3, 103 "beta": 7, 104 "beta&w>1": 8, 105 "M2a_rel": 22}[siteclass_model] 106 if multi_genes: 107 genes = results["genes"] 108 current_gene = None 109 gene_start = None 110 for line_num, line in enumerate(lines): 111 gene_res = gene_re.match(line) 112 if gene_res: 113 if current_gene is not None: 114 parse_model(lines[gene_start:line_num], model_results) 115 genes[current_gene - 1] = model_results 116 gene_start = line_num 117 current_gene = int(gene_res.group(1)) 118 model_results = {"description": siteclass_model} 119 if len(genes[current_gene - 1]) == 0: 120 model_results = parse_model(lines[gene_start:], model_results) 121 genes[current_gene - 1] = model_results 122 else: 123 model_results = {"description": siteclass_model} 124 model_results = parse_model(lines, model_results) 125 ns_sites[current_model] = model_results 126 else: 127 # If there are multiple models in the results, scan through 128 # the file and send each model's text to be parsed individually. 129 current_model = None 130 model_start = None 131 for line_num, line in enumerate(lines): 132 # Find model names. If this is found on a line, 133 # all of the following lines until the next time this matches 134 # contain results for Model X. 135 # Example match: "Model 1: NearlyNeutral (2 categories)" 136 model_res = model_re.match(line) 137 if model_res: 138 if current_model is not None: 139 # We've already been tracking a model, so it's time 140 # to send those lines off for parsing before beginning 141 # a new one. 142 parse_model(lines[model_start:line_num], model_results) 143 ns_sites[current_model] = model_results 144 model_start = line_num 145 current_model = int(model_res.group(1)) 146 model_results = {"description": model_res.group(2)} 147 if ns_sites.get(current_model) is None: 148 # When we reach the end of the file, we'll still have one more 149 # model to parse. 150 model_results = parse_model(lines[model_start:], model_results) 151 ns_sites[current_model] = model_results 152 # Only add the ns_sites dict to the results if we really have results. 153 # Model M0 is added by default in some cases, so if it exists, make sure 154 # it's not empty 155 if len(ns_sites) == 1: 156 m0 = ns_sites.get(0) 157 if not m0 or len(m0) > 1: 158 results["NSsites"] = ns_sites 159 elif len(ns_sites) > 1: 160 results["NSsites"] = ns_sites 161 return results
162 163
164 -def parse_model(lines, results):
165 """Parse an individual NSsites model's results. 166 """ 167 parameters = {} 168 SEs_flag = False 169 dS_tree_flag = False 170 dN_tree_flag = False 171 w_tree_flag = False 172 num_params = None 173 tree_re = re.compile("^\([\w #:',.()]*\);\s*$") 174 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+") 175 model_params_re = re.compile("(?<!\S)([a-z]\d?)\s*=\s+(\d+\.\d+)") 176 for line in lines: 177 # Find all floating point numbers in this line 178 line_floats_res = line_floats_re.findall(line) 179 line_floats = [_nan_float(val) for val in line_floats_res] 180 # Check if branch-specific results are in the line 181 branch_res = branch_re.match(line) 182 # Check if additional model parameters are in the line 183 model_params = model_params_re.findall(line) 184 # Find lnL values. 185 # Example match (lnL = -2021.348300): 186 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 187 if "lnL(ntime:" in line and line_floats: 188 results["lnL"] = line_floats[0] 189 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)", line) 190 if np_res is not None: 191 num_params = int(np_res.group(1)) 192 # Get parameter list. This can be useful for specifying starting 193 # parameters in another run by copying the list of parameters 194 # to a file called in.codeml. Since the parameters must be in 195 # a fixed order and format, copying and pasting to the file is 196 # best. For this reason, they are grabbed here just as a long 197 # string and not as individual numbers. 198 elif len(line_floats) == num_params and not SEs_flag: 199 parameters["parameter list"] = line.strip() 200 # Find SEs. The same format as parameters above is maintained 201 # since there is a correspondence between the SE format and 202 # the parameter format. 203 # Example match: 204 # "SEs for parameters: 205 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 206 elif "SEs for parameters:" in line: 207 SEs_flag = True 208 elif SEs_flag and len(line_floats) == num_params: 209 parameters["SEs"] = line.strip() 210 SEs_flag = False 211 # Find tree lengths. 212 # Example match: "tree length = 1.71931" 213 elif "tree length =" in line and line_floats: 214 results["tree length"] = line_floats[0] 215 # Find the estimated trees only taking the tree if it has 216 # lengths or rate estimates on the branches 217 elif tree_re.match(line) is not None: 218 if ":" in line or "#" in line: 219 if dS_tree_flag: 220 results["dS tree"] = line.strip() 221 dS_tree_flag = False 222 elif dN_tree_flag: 223 results["dN tree"] = line.strip() 224 dN_tree_flag = False 225 elif w_tree_flag: 226 results["omega tree"] = line.strip() 227 w_tree_flag = False 228 else: 229 results["tree"] = line.strip() 230 elif "dS tree:" in line: 231 dS_tree_flag = True 232 elif "dN tree:" in line: 233 dN_tree_flag = True 234 elif "w ratios as labels for TreeView:" in line: 235 w_tree_flag = True 236 # Find rates for multiple genes 237 # Example match: "rates for 2 genes: 1 2.75551" 238 elif "rates for" in line and line_floats: 239 line_floats.insert(0, 1.0) 240 parameters["rates"] = line_floats 241 # Find kappa values. 242 # Example match: "kappa (ts/tv) = 2.77541" 243 elif "kappa (ts/tv)" in line and line_floats: 244 parameters["kappa"] = line_floats[0] 245 # Find omega values. 246 # Example match: "omega (dN/dS) = 0.25122" 247 elif "omega (dN/dS)" in line and line_floats: 248 parameters["omega"] = line_floats[0] 249 elif "w (dN/dS)" in line and line_floats: 250 parameters["omega"] = line_floats 251 # Find omega and kappa values for multi-gene files 252 # Example match: "gene # 1: kappa = 1.72615 omega = 0.39333" 253 elif "gene # " in line: 254 gene_num = int(re.match("gene # (\d+)", line).group(1)) 255 if parameters.get("genes") is None: 256 parameters["genes"] = {} 257 parameters["genes"][gene_num] = {"kappa": line_floats[0], 258 "omega": line_floats[1]} 259 # Find dN values. 260 # Example match: "tree length for dN: 0.2990" 261 elif "tree length for dN" in line and line_floats: 262 parameters["dN"] = line_floats[0] 263 # Find dS values 264 # Example match: "tree length for dS: 1.1901" 265 elif "tree length for dS" in line and line_floats: 266 parameters["dS"] = line_floats[0] 267 # Find site class distributions. 268 # Example match 1 (normal model, 2 site classes): 269 # "p: 0.77615 0.22385" 270 # Example match 2 (branch site A model, 4 site classes): 271 # "proportion 0.00000 0.00000 0.73921 0.26079" 272 elif line[0:2] == "p:" or line[0:10] == "proportion": 273 site_classes = parse_siteclass_proportions(line_floats) 274 parameters["site classes"] = site_classes 275 # Find the omega value corresponding to each site class 276 # Example match (2 site classes): "w: 0.10224 1.00000" 277 elif line[0:2] == "w:": 278 site_classes = parameters.get("site classes") 279 site_classes = parse_siteclass_omegas(line, site_classes) 280 parameters["site classes"] = site_classes 281 # Find the omega values corresponding to a branch type from 282 # the clade model C for each site class 283 # Example match: 284 # "branch type 0: 0.31022 1.00000 0.00000" 285 elif "branch type " in line: 286 branch_type = re.match("branch type (\d)", line) 287 if branch_type: 288 site_classes = parameters.get("site classes") 289 branch_type_no = int(branch_type.group(1)) 290 site_classes = parse_clademodelc(branch_type_no, line_floats, 291 site_classes) 292 parameters["site classes"] = site_classes 293 # Find the omega values of the foreground branch for each site 294 # class in the branch site A model 295 # Example match: 296 # "foreground w 0.07992 1.00000 134.54218 134.54218" 297 elif line[0:12] == "foreground w": 298 site_classes = parameters.get("site classes") 299 site_classes = parse_branch_site_a(True, line_floats, site_classes) 300 parameters["site classes"] = site_classes 301 # Find the omega values of the background for each site 302 # class in the branch site A model 303 # Example match: 304 # "background w 0.07992 1.00000 0.07992 1.00000" 305 elif line[0:12] == "background w": 306 site_classes = parameters.get("site classes") 307 site_classes = parse_branch_site_a(False, line_floats, site_classes) 308 parameters["site classes"] = site_classes 309 # Find dN & dS for each branch, which is organized in a table 310 # The possibility of NaNs forces me to not use the line_floats 311 # method. 312 # Example row (some spaces removed to make it smaller...). 313 # " 6..7 0.000 167.7 54.3 0.0000 0.0000 0.0000 0.0 0.0" 314 elif branch_res is not None and line_floats: 315 branch = branch_res.group(1) 316 if parameters.get("branches") is None: 317 parameters["branches"] = {} 318 # Hack for Jython http://bugs.jython.org/issue1762 float("-nan") 319 line = line.replace(" -nan", " nan") 320 params = line.strip().split()[1:] 321 parameters["branches"][branch] = { 322 "t": _nan_float(params[0].strip()), 323 "N": _nan_float(params[1].strip()), 324 "S": _nan_float(params[2].strip()), 325 "omega": _nan_float(params[3].strip()), 326 "dN": _nan_float(params[4].strip()), 327 "dS": _nan_float(params[5].strip()), 328 "N*dN": _nan_float(params[6].strip()), 329 "S*dS": _nan_float(params[7].strip())} 330 # Find model parameters, which can be spread across multiple 331 # lines. 332 # Example matches: 333 # " p0= 0.99043 p= 0.36657 q= 1.04445 334 # " (p1= 0.00957) w= 3.25530" 335 elif model_params: 336 float_model_params = [] 337 for param in model_params: 338 float_model_params.append((param[0], _nan_float(param[1]))) 339 parameters.update(dict(float_model_params)) 340 if parameters: 341 results["parameters"] = parameters 342 return results
343 344
345 -def parse_siteclass_proportions(line_floats):
346 """For models which have multiple site classes, find the proportion of the 347 alignment assigned to each class. 348 """ 349 site_classes = {} 350 if line_floats: 351 for n in range(len(line_floats)): 352 site_classes[n] = {"proportion": line_floats[n]} 353 return site_classes
354 355
356 -def parse_siteclass_omegas(line, site_classes):
357 """For models which have multiple site classes, find the omega estimated 358 for each class. 359 """ 360 # The omega results are tabular with strictly 9 characters per column 361 # (1 to 3 digits before the decimal point and 5 after). This causes 362 # numbers to sometimes run into each other, so we must use a different 363 # regular expression to account for this. i.e.: 364 # w: 0.00012 1.00000109.87121 365 line_floats = re.findall("\d{1,3}\.\d{5}", line) 366 if not site_classes or len(line_floats) == 0: 367 return 368 for n in range(len(line_floats)): 369 site_classes[n]["omega"] = line_floats[n] 370 return site_classes
371 372
373 -def parse_clademodelc(branch_type_no, line_floats, site_classes):
374 """Parse results specific to the clade model C. 375 """ 376 if not site_classes or len(line_floats) == 0: 377 return 378 for n in range(len(line_floats)): 379 if site_classes[n].get("branch types") is None: 380 site_classes[n]["branch types"] = {} 381 site_classes[n]["branch types"][branch_type_no] = line_floats[n] 382 return site_classes
383 384
385 -def parse_branch_site_a(foreground, line_floats, site_classes):
386 """Parse results specific to the branch site A model. 387 """ 388 if not site_classes or len(line_floats) == 0: 389 return 390 for n in range(len(line_floats)): 391 if site_classes[n].get("branch types") is None: 392 site_classes[n]["branch types"] = {} 393 if foreground: 394 site_classes[n]["branch types"]["foreground"] = line_floats[n] 395 else: 396 site_classes[n]["branch types"]["background"] = line_floats[n] 397 return site_classes
398 399
400 -def parse_pairwise(lines, results):
401 """Parse results from pairwise comparisons. 402 """ 403 # Find pairwise comparisons 404 # Example: 405 # 2 (Pan_troglo) ... 1 (Homo_sapie) 406 # lnL = -291.465693 407 # 0.01262 999.00000 0.00100 408 # 409 # t= 0.0126 S= 81.4 N= 140.6 dN/dS= 0.0010 dN= 0.0000 dS= 0.0115 410 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)") 411 pairwise = {} 412 seq1 = None 413 seq2 = None 414 for line in lines: 415 # Find all floating point numbers in this line 416 line_floats_res = line_floats_re.findall(line) 417 line_floats = [_nan_float(val) for val in line_floats_res] 418 pair_res = pair_re.match(line) 419 if pair_res: 420 seq1 = pair_res.group(1) 421 seq2 = pair_res.group(2) 422 if seq1 not in pairwise: 423 pairwise[seq1] = {} 424 if seq2 not in pairwise: 425 pairwise[seq2] = {} 426 if len(line_floats) == 1 and seq1 is not None and seq2 is not None: 427 pairwise[seq1][seq2] = {"lnL": line_floats[0]} 428 pairwise[seq2][seq1] = pairwise[seq1][seq2] 429 elif len(line_floats) == 6 and seq1 is not None and seq2 is not None: 430 pairwise[seq1][seq2].update({"t": line_floats[0], 431 "S": line_floats[1], 432 "N": line_floats[2], 433 "omega": line_floats[3], 434 "dN": line_floats[4], 435 "dS": line_floats[5]}) 436 pairwise[seq2][seq1] = pairwise[seq1][seq2] 437 if pairwise: 438