Package Bio :: Package Phylo :: Package PAML :: Module _parse_codeml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_codeml

  1  # Copyright (C) 2011, 2016 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8   
  9  line_floats_re = re.compile("-*\d+\.\d+") 
 10   
 11  try: 
 12      float("nan") 
 13      _nan_float = float 
 14  except ValueError: 
 15      # Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP 
16 - def _nan_float(text):
17 try: 18 return float(text) 19 except ValueError: 20 if text.lower() == "nan": 21 import struct 22 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0] 23 else: 24 raise
25 26
27 -def parse_basics(lines, results):
28 """Parse the basic information that should be present in most codeml output files. 29 """ 30 # multi_models is used to designate there being results for more than one 31 # model in the file 32 multi_models = False 33 multi_genes = False 34 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*") 35 model_re = re.compile("Model:\s+(.+)") 36 num_genes_re = re.compile("\(([0-9]+) genes: separate data\)") 37 # In codeml 4.1, the codon substitution model is headed by: 38 # "Codon frequencies:" 39 # In codeml 4.3+ it is headed by: 40 # "Codon frequency model:" 41 codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)") 42 siteclass_re = re.compile("Site-class models:\s*([^\s]*)") 43 for line in lines: 44 # Find all floating point numbers in this line 45 line_floats_res = line_floats_re.findall(line) 46 line_floats = [_nan_float(val) for val in line_floats_res] 47 # Get the program version number 48 version_res = version_re.match(line) 49 if version_res is not None: 50 results["version"] = version_res.group(1) 51 continue 52 # Find the model description at the beginning of the file. 53 model_res = model_re.match(line) 54 if model_res is not None: 55 results["model"] = model_res.group(1) 56 # Find out if more than one genes are analyzed 57 num_genes_res = num_genes_re.search(line) 58 if num_genes_res is not None: 59 results["genes"] = [] 60 num_genes = int(num_genes_res.group(1)) 61 for n in range(num_genes): 62 results["genes"].append({}) 63 multi_genes = True 64 continue 65 # Get the codon substitution model 66 codon_freq_res = codon_freq_re.match(line) 67 if codon_freq_res is not None: 68 results["codon model"] = codon_freq_res.group(1) 69 continue 70 # Find the site-class model name at the beginning of the file. 71 # This exists only if a NSsites class other than 0 is used. 72 # If there's no site class model listed, then there are multiple 73 # models in the results file 74 # Example match: "Site-class models: PositiveSelection" 75 siteclass_res = siteclass_re.match(line) 76 if siteclass_res is not None: 77 siteclass_model = siteclass_res.group(1) 78 if siteclass_model != "": 79 results["site-class model"] = siteclass_model 80 multi_models = False 81 else: 82 multi_models = True 83 # Get the maximum log-likelihood 84 if "ln Lmax" in line and len(line_floats) > 0: 85 results["lnL max"] = line_floats[0] 86 return (results, multi_models, multi_genes)
87 88
89 -def parse_nssites(lines, results, multi_models, multi_genes):
90 """Determine which NSsites models are present and parse them. 91 """ 92 93 ns_sites = {} 94 model_re = re.compile("Model (\d+):\s+(.+)") 95 gene_re = re.compile("Gene\s+([0-9]+)\s+.+") 96 siteclass_model = results.get("site-class model") 97 if not multi_models: 98 # If there's only one model in the results, find out 99 # which one it is and then parse it. 100 if siteclass_model is None: 101 siteclass_model = "one-ratio" 102 current_model = {"one-ratio": 0, 103 "NearlyNeutral": 1, 104 "PositiveSelection": 2, 105 "discrete": 3, 106 "beta": 7, 107 "beta&w>1": 8, 108 "M2a_rel": 22}[siteclass_model] 109 if multi_genes: 110 genes = results["genes"] 111 current_gene = None 112 gene_start = None 113 for line_num, line in enumerate(lines): 114 gene_res = gene_re.match(line) 115 if gene_res: 116 if current_gene is not None: 117 parse_model(lines[gene_start:line_num], model_results) 118 genes[current_gene - 1] = model_results 119 gene_start = line_num 120 current_gene = int(gene_res.group(1)) 121 model_results = {"description": siteclass_model} 122 if len(genes[current_gene - 1]) == 0: 123 model_results = parse_model(lines[gene_start:], model_results) 124 genes[current_gene - 1] = model_results 125 else: 126 model_results = {"description": siteclass_model} 127 model_results = parse_model(lines, model_results) 128 ns_sites[current_model] = model_results 129 else: 130 # If there are multiple models in the results, scan through 131 # the file and send each model's text to be parsed individually. 132 current_model = None 133 model_start = None 134 for line_num, line in enumerate(lines): 135 # Find model names. If this is found on a line, 136 # all of the following lines until the next time this matches 137 # contain results for Model X. 138 # Example match: "Model 1: NearlyNeutral (2 categories)" 139 model_res = model_re.match(line) 140 if model_res: 141 if current_model is not None: 142 # We've already been tracking a model, so it's time 143 # to send those lines off for parsing before beginning 144 # a new one. 145 parse_model(lines[model_start:line_num], model_results) 146 ns_sites[current_model] = model_results 147 model_start = line_num 148 current_model = int(model_res.group(1)) 149 model_results = {"description": model_res.group(2)} 150 if ns_sites.get(current_model) is None: 151 # When we reach the end of the file, we'll still have one more 152 # model to parse. 153 model_results = parse_model(lines[model_start:], model_results) 154 ns_sites[current_model] = model_results 155 # Only add the ns_sites dict to the results if we really have results. 156 # Model M0 is added by default in some cases, so if it exists, make sure 157 # it's not empty 158 if len(ns_sites) == 1: 159 m0 = ns_sites.get(0) 160 if not m0 or len(m0) > 1: 161 results["NSsites"] = ns_sites 162 elif len(ns_sites) > 1: 163 results["NSsites"] = ns_sites 164 return results
165 166
167 -def parse_model(lines, results):
168 """Parse an individual NSsites model's results. 169 """ 170 parameters = {} 171 SEs_flag = False 172 dS_tree_flag = False 173 dN_tree_flag = False 174 w_tree_flag = False 175 num_params = None 176 tree_re = re.compile("^\([\w #:',.()]*\);\s*$") 177 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+") 178 model_params_re = re.compile("(?<!\S)([a-z]\d?)\s*=\s+(\d+\.\d+)") 179 for line in lines: 180 # Find all floating point numbers in this line 181 line_floats_res = line_floats_re.findall(line) 182 line_floats = [_nan_float(val) for val in line_floats_res] 183 # Check if branch-specific results are in the line 184 branch_res = branch_re.match(line) 185 # Check if additional model parameters are in the line 186 model_params = model_params_re.findall(line) 187 # Find lnL values. 188 # Example match (lnL = -2021.348300): 189 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 190 if "lnL(ntime:" in line and len(line_floats) > 0: 191 results["lnL"] = line_floats[0] 192 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)", line) 193 if np_res is not None: 194 num_params = int(np_res.group(1)) 195 # Get parameter list. This can be useful for specifying starting 196 # parameters in another run by copying the list of parameters 197 # to a file called in.codeml. Since the parameters must be in 198 # a fixed order and format, copying and pasting to the file is 199 # best. For this reason, they are grabbed here just as a long 200 # string and not as individual numbers. 201 elif len(line_floats) == num_params and not SEs_flag: 202 parameters["parameter list"] = line.strip() 203 # Find SEs. The same format as parameters above is maintained 204 # since there is a correspondence between the SE format and 205 # the parameter format. 206 # Example match: 207 # "SEs for parameters: 208 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 209 elif "SEs for parameters:" in line: 210 SEs_flag = True 211 elif SEs_flag and len(line_floats) == num_params: 212 parameters["SEs"] = line.strip() 213 SEs_flag = False 214 # Find tree lengths. 215 # Example match: "tree length = 1.71931" 216 elif "tree length =" in line and len(line_floats) > 0: 217 results["tree length"] = line_floats[0] 218 # Find the estimated trees only taking the tree if it has 219 # lengths or rate estimates on the branches 220 elif tree_re.match(line) is not None: 221 if ":" in line or "#" in line: 222 if dS_tree_flag: 223 results["dS tree"] = line.strip() 224 dS_tree_flag = False 225 elif dN_tree_flag: 226 results["dN tree"] = line.strip() 227 dN_tree_flag = False 228 elif w_tree_flag: 229 results["omega tree"] = line.strip() 230 w_tree_flag = False 231 else: 232 results["tree"] = line.strip() 233 elif "dS tree:" in line: 234 dS_tree_flag = True 235 elif "dN tree:" in line: 236 dN_tree_flag = True 237 elif "w ratios as labels for TreeView:" in line: 238 w_tree_flag = True 239 # Find rates for multiple genes 240 # Example match: "rates for 2 genes: 1 2.75551" 241 elif "rates for" in line and len(line_floats) > 0: 242 line_floats.insert(0, 1.0) 243 parameters["rates"] = line_floats 244 # Find kappa values. 245 # Example match: "kappa (ts/tv) = 2.77541" 246 elif "kappa (ts/tv)" in line and len(line_floats) > 0: 247 parameters["kappa"] = line_floats[0] 248 # Find omega values. 249 # Example match: "omega (dN/dS) = 0.25122" 250 elif "omega (dN/dS)" in line and len(line_floats) > 0: 251 parameters["omega"] = line_floats[0] 252 elif "w (dN/dS)" in line and len(line_floats) > 0: 253 parameters["omega"] = line_floats 254 # Find omega and kappa values for multi-gene files 255 # Example match: "gene # 1: kappa = 1.72615 omega = 0.39333" 256 elif "gene # " in line: 257 gene_num = int(re.match("gene # (\d+)", line).group(1)) 258 if parameters.get("genes") is None: 259 parameters["genes"] = {} 260 parameters["genes"][gene_num] = {"kappa": line_floats[0], 261 "omega": line_floats[1]} 262 # Find dN values. 263 # Example match: "tree length for dN: 0.2990" 264 elif "tree length for dN" in line and len(line_floats) > 0: 265 parameters["dN"] = line_floats[0] 266 # Find dS values 267 # Example match: "tree length for dS: 1.1901" 268 elif "tree length for dS" in line and len(line_floats) > 0: 269 parameters["dS"] = line_floats[0] 270 # Find site class distributions. 271 # Example match 1 (normal model, 2 site classes): 272 # "p: 0.77615 0.22385" 273 # Example match 2 (branch site A model, 4 site classes): 274 # "proportion 0.00000 0.00000 0.73921 0.26079" 275 elif line[0:2] == "p:" or line[0:10] == "proportion": 276 site_classes = parse_siteclass_proportions(line_floats) 277 parameters["site classes"] = site_classes 278 # Find the omega value corresponding to each site class 279 # Example match (2 site classes): "w: 0.10224 1.00000" 280 elif line[0:2] == "w:": 281 site_classes = parameters.get("site classes") 282 site_classes = parse_siteclass_omegas(line, site_classes) 283 parameters["site classes"] = site_classes 284 # Find the omega values corresponding to a branch type from 285 # the clade model C for each site class 286 # Example match: 287 # "branch type 0: 0.31022 1.00000 0.00000" 288 elif "branch type " in line: 289 branch_type = re.match("branch type (\d)", line) 290 if branch_type: 291 site_classes = parameters.get("site classes") 292 branch_type_no = int(branch_type.group(1)) 293 site_classes = parse_clademodelc(branch_type_no, line_floats, 294 site_classes) 295 parameters["site classes"] = site_classes 296 # Find the omega values of the foreground branch for each site 297 # class in the branch site A model 298 # Example match: 299 # "foreground w 0.07992 1.00000 134.54218 134.54218" 300 elif line[0:12] == "foreground w": 301 site_classes = parameters.get("site classes") 302 site_classes = parse_branch_site_a(True, line_floats, site_classes) 303 parameters["site classes"] = site_classes 304 # Find the omega values of the background for each site 305 # class in the branch site A model 306 # Example match: 307 # "background w 0.07992 1.00000 0.07992 1.00000" 308 elif line[0:12] == "background w": 309 site_classes = parameters.get("site classes") 310 site_classes = parse_branch_site_a(False, line_floats, site_classes) 311 parameters["site classes"] = site_classes 312 # Find dN & dS for each branch, which is organized in a table 313 # The possibility of NaNs forces me to not use the line_floats 314 # method. 315 # Example row (some spaces removed to make it smaller...). 316 # " 6..7 0.000 167.7 54.3 0.0000 0.0000 0.0000 0.0 0.0" 317 elif branch_res is not None and len(line_floats) > 0: 318 branch = branch_res.group(1) 319 if parameters.get("branches") is None: 320 parameters["branches"] = {} 321 # Hack for Jython http://bugs.jython.org/issue1762 float("-nan") 322 line = line.replace(" -nan", " nan") 323 params = line.strip().split()[1:] 324 parameters["branches"][branch] = { 325 "t": _nan_float(params[0].strip()), 326 "N": _nan_float(params[1].strip()), 327 "S": _nan_float(params[2].strip()), 328 "omega": _nan_float(params[3].strip()), 329 "dN": _nan_float(params[4].strip()), 330 "dS": _nan_float(params[5].strip()), 331 "N*dN": _nan_float(params[6].strip()), 332 "S*dS": _nan_float(params[7].strip())} 333 # Find model parameters, which can be spread across multiple 334 # lines. 335 # Example matches: 336 # " p0= 0.99043 p= 0.36657 q= 1.04445 337 # " (p1= 0.00957) w= 3.25530" 338 elif len(model_params) > 0: 339 float_model_params = [] 340 for param in model_params: 341 float_model_params.append((param[0], _nan_float(param[1]))) 342 parameters.update(dict(float_model_params)) 343 if len(parameters) > 0: 344 results["parameters"] = parameters 345 return results
346 347
348 -def parse_siteclass_proportions(line_floats):
349 """For models which have multiple site classes, find the proportion of the 350 alignment assigned to each class. 351 """ 352 site_classes = {} 353 if len(line_floats) > 0: 354 for n in range(len(line_floats)): 355 site_classes[n] = {"proportion": line_floats[n]} 356 return site_classes
357 358
359 -def parse_siteclass_omegas(line, site_classes):
360 """For models which have multiple site classes, find the omega estimated 361 for each class. 362 """ 363 # The omega results are tabular with strictly 9 characters per column 364 # (1 to 3 digits before the decimal point and 5 after). This causes 365 # numbers to sometimes run into each other, so we must use a different 366 # regular expression to account for this. i.e.: 367 # w: 0.00012 1.00000109.87121 368 line_floats = re.findall("\d{1,3}\.\d{5}", line) 369 if not site_classes or len(line_floats) == 0: 370 return 371 for n in range(len(line_floats)): 372 site_classes[n]["omega"] = line_floats[n] 373 return site_classes
374 375
376 -def parse_clademodelc(branch_type_no, line_floats, site_classes):
377 """Parse results specific to the clade model C. 378 """ 379 if not site_classes or len(line_floats) == 0: 380 return 381 for n in range(len(line_floats)): 382 if site_classes[n].get("branch types") is None: 383 site_classes[n]["branch types"] = {} 384 site_classes[n]["branch types"][branch_type_no] = line_floats[n] 385 return site_classes
386 387
388 -def parse_branch_site_a(foreground, line_floats, site_classes):
389 """Parse results specific to the branch site A model. 390 """ 391 if not site_classes or len(line_floats) == 0: 392 return 393 for n in range(len(line_floats)): 394 if site_classes[n].get("branch types") is None: 395 site_classes[n]["branch types"] = {} 396 if foreground: 397 site_classes[n]["branch types"]["foreground"] = line_floats[n] 398 else: 399 site_classes[n]["branch types"]["background"] = line_floats[n] 400 return site_classes
401 402
403 -def parse_pairwise(lines, results):
404 """Parse results from pairwise comparisons. 405 """ 406 # Find pairwise comparisons 407 # Example: 408 # 2 (Pan_troglo) ... 1 (Homo_sapie) 409 # lnL = -291.465693 410 # 0.01262 999.00000 0.00100 411 # 412 # t= 0.0126 S= 81.4 N= 140.6 dN/dS= 0.0010 dN= 0.0000 dS= 0.0115 413 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)") 414 pairwise = {} 415 for line in lines: 416 # Find all floating point numbers in this line 417 line_floats_res = line_floats_re.findall(line) 418 line_floats = [_nan_float(val) for val in line_floats_res] 419 pair_res = pair_re.match(line) 420 if pair_res: 421 seq1 = pair_res.group(1) 422 seq2 = pair_res.group(2) 423 if pairwise.get(seq1) is None: 424 pairwise[seq1] = {} 425 if pairwise.get(seq2) is None: 426 pairwise[seq2] = {} 427 if len(line_floats) == 1: 428 pairwise[seq1][seq2] = {"lnL": line_floats[0]} 429 pairwise[seq2][seq1] = pairwise[seq1][seq2] 430 elif len(line_floats) == 6: 431 pairwise[seq1][seq2] = {"t": line_floats[0], 432 "S": line_floats[1], 433 "N": line_floats[2], 434 "omega": line_floats[3], 435 "dN": line_floats[4], 436 "dS": line_floats[5]} 437 pairwise[seq2][seq1] = pairwise[seq1][seq2] 438 if len(pairwise) > 0: 439