Package Bio :: Package Phylo :: Package PAML :: Module _parse_codeml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_codeml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8  line_floats_re = re.compile("-*\d+\.\d+") 
  9   
 10  try: 
 11      float("nan") 
 12      _nan_float = float 
 13  except ValueError: 
 14      # Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP 
15 - def _nan_float(text):
16 try: 17 return float(text) 18 except ValueError: 19 if text.lower()=="nan": 20 import struct 21 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0] 22 else: 23 raise
24 25
26 -def parse_basics(lines, results):
27 """Parse the basic information that should be present in most codeml output files. 28 """ 29 # multi_models is used to designate there being results for more than one 30 # model in the file 31 multi_models = False 32 multi_genes = False 33 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*") 34 model_re = re.compile("Model:\s+(.+)") 35 num_genes_re = re.compile("\(([0-9]+) genes: separate data\)") 36 # In codeml 4.1, the codon substitution model is headed by: 37 # "Codon frequencies:" 38 # In codeml 4.3+ it is headed by: 39 # "Codon frequency model:" 40 codon_freq_re = re.compile("Codon frequenc[a-z\s]{3,7}:\s+(.+)") 41 siteclass_re = re.compile("Site-class models:\s*([^\s]*)") 42 for line in lines: 43 # Find all floating point numbers in this line 44 line_floats_res = line_floats_re.findall(line) 45 line_floats = [_nan_float(val) for val in line_floats_res] 46 # Get the program version number 47 version_res = version_re.match(line) 48 if version_res is not None: 49 results["version"] = version_res.group(1) 50 continue 51 # Find the model description at the beginning of the file. 52 model_res = model_re.match(line) 53 if model_res is not None: 54 results["model"] = model_res.group(1) 55 # Find out if more than one genes are analyzed 56 num_genes_res = num_genes_re.search(line) 57 if num_genes_res is not None: 58 results["genes"] = [] 59 num_genes = int(num_genes_res.group(1)) 60 for n in range(num_genes): 61 results["genes"].append({}) 62 multi_genes = True 63 continue 64 # Get the codon substitution model 65 codon_freq_res = codon_freq_re.match(line) 66 if codon_freq_res is not None: 67 results["codon model"] = codon_freq_res.group(1) 68 continue 69 # Find the site-class model name at the beginning of the file. 70 # This exists only if a NSsites class other than 0 is used. 71 # If there's no site class model listed, then there are multiple 72 # models in the results file 73 # Example match: "Site-class models: PositiveSelection" 74 siteclass_res = siteclass_re.match(line) 75 if siteclass_res is not None: 76 siteclass_model = siteclass_res.group(1) 77 if siteclass_model != "": 78 results["site-class model"] = siteclass_model 79 multi_models = False 80 else: 81 multi_models = True 82 # Get the maximum log-likelihood 83 if "ln Lmax" in line and len(line_floats) > 0: 84 results["lnL max"] = line_floats[0] 85 return (results, multi_models, multi_genes)
86 87
88 -def parse_nssites(lines, results, multi_models, multi_genes):
89 """Determine which NSsites models are present and parse them. 90 """ 91 92 ns_sites = {} 93 model_re = re.compile("Model (\d+):\s+(.+)") 94 gene_re = re.compile("Gene\s+([0-9]+)\s+.+") 95 siteclass_model = results.get("site-class model") 96 if not multi_models: 97 # If there's only one model in the results, find out 98 # which one it is and then parse it. 99 if siteclass_model is None: 100 siteclass_model = "one-ratio" 101 current_model = {"one-ratio": 0, 102 "NearlyNeutral": 1, 103 "PositiveSelection": 2, 104 "discrete": 3, 105 "beta": 7, 106 "beta&w>1": 8}[siteclass_model] 107 if multi_genes: 108 genes = results["genes"] 109 current_gene = None 110 gene_start = None 111 for line_num, line in enumerate(lines): 112 gene_res = gene_re.match(line) 113 if gene_res: 114 if current_gene is not None: 115 parse_model(lines[gene_start:line_num], model_results) 116 genes[current_gene-1] = model_results 117 gene_start = line_num 118 current_gene = int(gene_res.group(1)) 119 model_results = {"description": siteclass_model} 120 if len(genes[current_gene-1]) == 0: 121 model_results = parse_model(lines[gene_start:], model_results) 122 genes[current_gene-1] = model_results 123 else: 124 model_results = {"description": siteclass_model} 125 model_results = parse_model(lines, model_results) 126 ns_sites[current_model] = model_results 127 else: 128 # If there are multiple models in the results, scan through 129 # the file and send each model's text to be parsed individually. 130 current_model = None 131 model_start = None 132 for line_num, line in enumerate(lines): 133 # Find model names. If this is found on a line, 134 # all of the following lines until the next time this matches 135 # contain results for Model X. 136 # Example match: "Model 1: NearlyNeutral (2 categories)" 137 model_res = model_re.match(line) 138 if model_res: 139 if current_model is not None: 140 # We've already been tracking a model, so it's time 141 # to send those lines off for parsing before beginning 142 # a new one. 143 parse_model(lines[model_start:line_num], model_results) 144 ns_sites[current_model] = model_results 145 model_start = line_num 146 current_model = int(model_res.group(1)) 147 model_results = {"description": model_res.group(2)} 148 if ns_sites.get(current_model) is None: 149 # When we reach the end of the file, we'll still have one more 150 # model to parse. 151 model_results = parse_model(lines[model_start:], model_results) 152 ns_sites[current_model] = model_results 153 # Only add the ns_sites dict to the results if we really have results. 154 # Model M0 is added by default in some cases, so if it exists, make sure 155 # it's not empty 156 if len(ns_sites) == 1: 157 m0 = ns_sites.get(0) 158 if not m0 or len(m0) > 1: 159 results["NSsites"] = ns_sites 160 elif len(ns_sites) > 1: 161 results["NSsites"] = ns_sites 162 return results
163 164
165 -def parse_model(lines, results):
166 """Parse an individual NSsites model's results. 167 """ 168 parameters = {} 169 SEs_flag = False 170 dS_tree_flag = False 171 dN_tree_flag = False 172 w_tree_flag = False 173 num_params = None 174 tree_re = re.compile("\(\(+") 175 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+") 176 model_params_re = re.compile("(?<!\S)([a-z]\d?)\s*=\s+(\d+\.\d+)") 177 for line in lines: 178 # Find all floating point numbers in this line 179 line_floats_res = line_floats_re.findall(line) 180 line_floats = [_nan_float(val) for val in line_floats_res] 181 # Check if branch-specific results are in the line 182 branch_res = branch_re.match(line) 183 # Check if additional model parameters are in the line 184 model_params = model_params_re.findall(line) 185 # Find lnL values. 186 # Example match (lnL = -2021.348300): 187 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 188 if "lnL(ntime:" in line and len(line_floats) > 0: 189 results["lnL"] = line_floats[0] 190 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)", line) 191 if np_res is not None: 192 num_params = int(np_res.group(1)) 193 # Get parameter list. This can be useful for specifying starting 194 # parameters in another run by copying the list of parameters 195 # to a file called in.codeml. Since the parameters must be in 196 # a fixed order and format, copying and pasting to the file is 197 # best. For this reason, they are grabbed here just as a long 198 # string and not as individual numbers. 199 elif len(line_floats) == num_params and not SEs_flag: 200 parameters["parameter list"] = line.strip() 201 # Find SEs. The same format as parameters above is maintained 202 # since there is a correspondance between the SE format and 203 # the parameter format. 204 # Example match: 205 # "SEs for parameters: 206 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 207 elif "SEs for parameters:" in line: 208 SEs_flag = True 209 elif SEs_flag and len(line_floats) == num_params: 210 parameters["SEs"] = line.strip() 211 SEs_flag = False 212 # Find tree lengths. 213 # Example match: "tree length = 1.71931" 214 elif "tree length =" in line and len(line_floats) > 0: 215 results["tree length"] = line_floats[0] 216 # Find the estimated trees only taking the tree if it has 217 # lengths or rate estimates on the branches 218 elif tree_re.match(line) is not None: 219 if ":" in line or "#" in line: 220 if dS_tree_flag: 221 results["dS tree"] = line.strip() 222 dS_tree_flag = False 223 elif dN_tree_flag: 224 results["dN tree"] = line.strip() 225 dN_tree_flag = False 226 elif w_tree_flag: 227 results["omega tree"] = line.strip() 228 w_tree_flag = False 229 else: 230 results["tree"] = line.strip() 231 elif "dS tree:" in line: 232 dS_tree_flag = True 233 elif "dN tree:" in line: 234 dN_tree_flag = True 235 elif "w ratios as labels for TreeView:" in line: 236 w_tree_flag = True 237 # Find rates for multiple genes 238 # Example match: "rates for 2 genes: 1 2.75551" 239 elif "rates for" in line and len(line_floats) > 0: 240 line_floats.insert(0, 1.0) 241 parameters["rates"] = line_floats 242 # Find kappa values. 243 # Example match: "kappa (ts/tv) = 2.77541" 244 elif "kappa (ts/tv)" in line and len(line_floats) > 0: 245 parameters["kappa"] = line_floats[0] 246 # Find omega values. 247 # Example match: "omega (dN/dS) = 0.25122" 248 elif "omega (dN/dS)" in line and len(line_floats) > 0: 249 parameters["omega"] = line_floats[0] 250 elif "w (dN/dS)" in line and len(line_floats) > 0: 251 parameters["omega"] = line_floats 252 # Find omega and kappa values for multi-gene files 253 # Example match: "gene # 1: kappa = 1.72615 omega = 0.39333" 254 elif "gene # " in line: 255 gene_num = int(re.match("gene # (\d+)", line).group(1)) 256 if parameters.get("genes") is None: 257 parameters["genes"] = {} 258 parameters["genes"][gene_num] = {"kappa": line_floats[0], 259 "omega": line_floats[1]} 260 # Find dN values. 261 # Example match: "tree length for dN: 0.2990" 262 elif "tree length for dN" in line and len(line_floats) > 0: 263 parameters["dN"] = line_floats[0] 264 # Find dS values 265 # Example match: "tree length for dS: 1.1901" 266 elif "tree length for dS" in line and len(line_floats) > 0: 267 parameters["dS"] = line_floats[0] 268 # Find site class distributions. 269 # Example match 1 (normal model, 2 site classes): 270 # "p: 0.77615 0.22385" 271 # Example match 2 (branch site A model, 4 site classes): 272 # "proportion 0.00000 0.00000 0.73921 0.26079" 273 elif line[0:2] == "p:" or line[0:10] == "proportion": 274 site_classes = parse_siteclass_proportions(line_floats) 275 parameters["site classes"] = site_classes 276 # Find the omega value corresponding to each site class 277 # Example match (2 site classes): "w: 0.10224 1.00000" 278 elif line[0:2] == "w:": 279 site_classes = parameters.get("site classes") 280 site_classes = parse_siteclass_omegas(line, site_classes) 281 parameters["site classes"] = site_classes 282 # Find the omega values corresponding to a branch type from 283 # the clade model C for each site class 284 # Example match: 285 # "branch type 0: 0.31022 1.00000 0.00000" 286 elif "branch type " in line: 287 branch_type = re.match("branch type (\d)", line) 288 if branch_type: 289 site_classes = parameters.get("site classes") 290 branch_type_no = int(branch_type.group(1)) 291 site_classes = parse_clademodelc(branch_type_no, line_floats, 292 site_classes) 293 parameters["site classes"] = site_classes 294 # Find the omega values of the foreground branch for each site 295 # class in the branch site A model 296 # Example match: 297 # "foreground w 0.07992 1.00000 134.54218 134.54218" 298 elif line[0:12] == "foreground w": 299 site_classes = parameters.get("site classes") 300 site_classes = parse_branch_site_a(True, line_floats, site_classes) 301 parameters["site classes"] = site_classes 302 # Find the omega values of the background for each site 303 # class in the branch site A model 304 # Example match: 305 # "background w 0.07992 1.00000 0.07992 1.00000" 306 elif line[0:12] == "background w": 307 site_classes = parameters.get("site classes") 308 site_classes = parse_branch_site_a(False, line_floats, site_classes) 309 parameters["site classes"] = site_classes 310 # Find dN & dS for each branch, which is organized in a table 311 # The possibility of NaNs forces me to not use the line_floats 312 # method. 313 # Example row (some spaces removed to make it smaller...). 314 # " 6..7 0.000 167.7 54.3 0.0000 0.0000 0.0000 0.0 0.0" 315 elif branch_res is not None and len(line_floats) > 0: 316 branch = branch_res.group(1) 317 if parameters.get("branches") is None: 318 parameters["branches"] = {} 319 # Hack for Jython http://bugs.jython.org/issue1762 float("-nan") 320 line = line.replace(" -nan", " nan") 321 params = line.strip().split()[1:] 322 parameters["branches"][branch]= { 323 "t": _nan_float(params[0].strip()), 324 "N": _nan_float(params[1].strip()), 325 "S": _nan_float(params[2].strip()), 326 "omega": _nan_float(params[3].strip()), 327 "dN": _nan_float(params[4].strip()), 328 "dS": _nan_float(params[5].strip()), 329 "N*dN": _nan_float(params[6].strip()), 330 "S*dS": _nan_float(params[7].strip())} 331 # Find model parameters, which can be spread across multiple 332 # lines. 333 # Example matches: 334 # " p0= 0.99043 p= 0.36657 q= 1.04445 335 # " (p1= 0.00957) w= 3.25530" 336 elif len(model_params) > 0: 337 float_model_params = [] 338 for param in model_params: 339 float_model_params.append((param[0], _nan_float(param[1]))) 340 parameters.update(dict(float_model_params)) 341 if len(parameters) > 0: 342 results["parameters"] = parameters 343 return results
344 345
346 -def parse_siteclass_proportions(line_floats):
347 """For models which have multiple site classes, find the proportion of the alignment assigned to each class. 348 """ 349 site_classes = {} 350 if len(line_floats) > 0: 351 for n in range(len(line_floats)): 352 site_classes[n] = {"proportion": line_floats[n]} 353 return site_classes
354 355
356 -def parse_siteclass_omegas(line, site_classes):
357 """For models which have multiple site classes, find the omega estimated for each class. 358 """ 359 # The omega results are tabular with strictly 9 characters per column 360 # (1 to 3 digits before the decimal point and 5 after). This causes 361 # numbers to sometimes run into each other, so we must use a different 362 # regular expression to account for this. i.e.: 363 # w: 0.00012 1.00000109.87121 364 line_floats = re.findall("\d{1,3}\.\d{5}", line) 365 if not site_classes or len(line_floats) == 0: 366 return 367 for n in range(len(line_floats)): 368 site_classes[n]["omega"] = line_floats[n] 369 return site_classes
370 371
372 -def parse_clademodelc(branch_type_no, line_floats, site_classes):
373 """Parse results specific to the clade model C. 374 """ 375 if not site_classes or len(line_floats) == 0: 376 return 377 for n in range(len(line_floats)): 378 if site_classes[n].get("branch types") is None: 379 site_classes[n]["branch types"] = {} 380 site_classes[n]["branch types"][branch_type_no] = line_floats[n] 381 return site_classes
382 383
384 -def parse_branch_site_a(foreground, line_floats, site_classes):
385 """Parse results specific to the branch site A model. 386 """ 387 if not site_classes or len(line_floats) == 0: 388 return 389 for n in range(len(line_floats)): 390 if site_classes[n].get("branch types") is None: 391 site_classes[n]["branch types"] = {} 392 if foreground: 393 site_classes[n]["branch types"]["foreground"] =\ 394 line_floats[n] 395 else: 396 site_classes[n]["branch types"]["background"] =\ 397 line_floats[n] 398 return site_classes
399 400
401 -def parse_pairwise(lines, results):
402 """Parse results from pairwise comparisons. 403 """ 404 # Find pairwise comparisons 405 # Example: 406 # 2 (Pan_troglo) ... 1 (Homo_sapie) 407 # lnL = -291.465693 408 # 0.01262 999.00000 0.00100 409 # 410 # t= 0.0126 S= 81.4 N= 140.6 dN/dS= 0.0010 dN= 0.0000 dS= 0.0115 411 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)") 412 pairwise = {} 413 for line in lines: 414 # Find all floating point numbers in this line 415 line_floats_res = line_floats_re.findall(line) 416 line_floats = [_nan_float(val) for val in line_floats_res] 417 pair_res = pair_re.match(line) 418 if pair_res: 419 seq1 = pair_res.group(1) 420 seq2 = pair_res.group(2) 421 if pairwise.get(seq1) is None: 422 pairwise[seq1] = {} 423 if pairwise.get(seq2) is None: 424 pairwise[seq2] = {} 425 if len(line_floats) == 1: 426 pairwise[seq1][seq2] = {"lnL": line_floats[0]} 427 pairwise[seq2][seq1] = pairwise[seq1][seq2] 428 elif len(line_floats) == 6: 429 pairwise[seq1][seq2] = {"t": line_floats[0], 430 "S": line_floats[1], 431 "N": line_floats[2], 432 "omega": line_floats[3], 433 "dN": line_floats[4], 434 "dS": line_floats[5]} 435 pairwise[seq2][seq1] = pairwise[seq1][seq2] 436 if len(pairwise) > 0: 437 results["pairwise"] = pairwise 438 return results
439 440
441 -def parse_distances(lines, results):
442 """Parse amino acid sequence distance results. 443 """ 444 distances = {} 445 sequences = [] 446 raw_aa_distances_flag = False 447 ml_aa_distances_flag = False 448 matrix_row_re = re.compile("(.+)\s{5,15}") 449 for line in lines: 450 # Find all floating point numbers in this line 451 line_floats_res = line_floats_re.findall(line) 452 line_floats = [_nan_float(val) for val in line_floats_res] 453 if "AA distances" in line: 454 raw_aa_distances_flag = True 455 # In current versions, the raw distances always come 456 # first but I don't trust this to always be true 457 ml_aa_distances_flag = False 458 elif "ML distances of aa seqs." in line: 459 ml_aa_distances_flag = True 460 raw_aa_distances_flag = False 461 # Parse AA distances (raw or ML), in a lower diagonal matrix 462 matrix_row_res = matrix_row_re.match(line) 463 if matrix_row_res and (raw_aa_distances_flag or 464 ml_aa_distances_flag): 465 seq_name = matrix_row_res.group(1).strip() 466 if seq_name not in sequences: 467 sequences.append(seq_name) 468 if raw_aa_distances_flag: 469 if distances.get("raw") is None: 470 distances["raw"] = {} 471 distances["raw"][seq_name] = {} 472 for i in range(0, len(line_floats)): 473 distances["raw"][seq_name][sequences[i]] = line_floats[i] 474 distances["raw"][sequences[i]][seq_name] = line_floats[i] 475 else: 476 if distances.get("ml") is None: 477 distances["ml"] = {} 478 distances["ml"][seq_name] = {} 479 for i in range(0, len(line_floats)): 480 distances["ml"][seq_name][sequences[i]] = line_floats[i] 481 distances["ml"][sequences[i]][seq_name] = line_floats[i] 482 if len(distances) > 0: 483 results["distances"] = distances 484 return