Package Bio :: Package Phylo :: Package PAML :: Module _parse_baseml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_baseml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8  line_floats_re = re.compile("-*\d+\.\d+") 
  9   
 10   
11 -def parse_basics(lines, results):
12 """Parse the basics that should be present in most baseml results files. 13 """ 14 version_re = re.compile("BASEML \(in paml version (\d+\.\d+[a-z]*).*") 15 np_re = re.compile("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)") 16 num_params = -1 17 for line in lines: 18 # Find all floating point numbers in this line 19 line_floats_res = line_floats_re.findall(line) 20 line_floats = [float(val) for val in line_floats_res] 21 # Find the version number 22 # Example match: 23 # "BASEML (in paml version 4.3, August 2009) alignment.phylip" 24 version_res = version_re.match(line) 25 if version_res is not None: 26 results["version"] = version_res.group(1) 27 # Find max lnL 28 # Example match: 29 # ln Lmax (unconstrained) = -316.049385 30 if "ln Lmax" in line and len(line_floats) == 1: 31 results["lnL max"] = line_floats[0] 32 # Find lnL values. 33 # Example match (lnL = -2021.348300): 34 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 35 elif "lnL(ntime:" in line and len(line_floats) > 0: 36 results["lnL"] = line_floats[0] 37 np_res = np_re.match(line) 38 if np_res is not None: 39 num_params = int(np_res.group(1)) 40 # Find tree lengths. 41 # Example match: "tree length = 1.71931" 42 elif "tree length" in line and len(line_floats) == 1: 43 results["tree length"] = line_floats[0] 44 # Find the estimated tree, only taking the tree if it has 45 # branch lengths 46 elif re.match("\(+", line) is not None: 47 if ":" in line: 48 results["tree"] = line.strip() 49 return (results, num_params)
50 51
52 -def parse_parameters(lines, results, num_params):
53 """Parse the various parameters from the file. 54 """ 55 parameters = {} 56 parameters = parse_parameter_list(lines, parameters, num_params) 57 parameters = parse_kappas(lines, parameters) 58 parameters = parse_rates(lines, parameters) 59 parameters = parse_freqs(lines, parameters) 60 results["parameters"] = parameters 61 return results
62 63
64 -def parse_parameter_list(lines, parameters, num_params):
65 """ Parse the parameters list, which is just an unlabeled list of numeric values. 66 """ 67 for line_num in range(len(lines)): 68 line = lines[line_num] 69 # Find all floating point numbers in this line 70 line_floats_res = line_floats_re.findall(line) 71 line_floats = [float(val) for val in line_floats_res] 72 # Get parameter list. This can be useful for specifying starting 73 # parameters in another run by copying the list of parameters 74 # to a file called in.baseml. Since the parameters must be in 75 # a fixed order and format, copying and pasting to the file is 76 # best. For this reason, they are grabbed here just as a long 77 # string and not as individual numbers. 78 if len(line_floats) == num_params: 79 parameters["parameter list"] = line.strip() 80 # Find SEs. The same format as parameters above is maintained 81 # since there is a correspondance between the SE format and 82 # the parameter format. 83 # Example match: 84 # "SEs for parameters: 85 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 86 if "SEs for parameters:" in lines[line_num + 1]: 87 SEs_line = lines[line_num + 2] 88 parameters["SEs"] = SEs_line.strip() 89 break 90 return parameters
91 92
93 -def parse_kappas(lines, parameters):
94 """Parse out the kappa parameters. 95 """ 96 kappa_found = False 97 for line in lines: 98 # Find all floating point numbers in this line 99 line_floats_res = line_floats_re.findall(line) 100 line_floats = [float(val) for val in line_floats_res] 101 # Find kappa parameter (F84, HKY85, T92 model) 102 # Example match: 103 # "Parameters (kappa) in the rate matrix (F84) (Yang 1994 J Mol Evol 39:105-111): 104 # 3.00749" 105 if "Parameters (kappa)" in line: 106 kappa_found = True 107 elif kappa_found and len(line_floats) > 0: 108 branch_res = re.match("\s(\d+\.\.\d+)", line) 109 if branch_res is None: 110 if len(line_floats) == 1: 111 parameters["kappa"] = line_floats[0] 112 else: 113 parameters["kappa"] = line_floats 114 kappa_found = False 115 else: 116 if parameters.get("branches") is None: 117 parameters["branches"] = {} 118 branch = branch_res.group(1) 119 if len(line_floats) > 0: 120 parameters["branches"][branch] = \ 121 {"t": line_floats[0], "kappa": line_floats[1], 122 "TS": line_floats[2], "TV": line_floats[3]} 123 # Find kappa under REV 124 # Example match: 125 # kappa under REV: 999.00000 145.76453 0.00001 0.00001 0.00001 126 elif "kappa under" in line and len(line_floats) > 0: 127 if len(line_floats) == 1: 128 parameters["kappa"] = line_floats[0] 129 else: 130 parameters["kappa"] = line_floats 131 return parameters
132 133
134 -def parse_rates(lines, parameters):
135 """Parse the rate parameters. 136 """ 137 Q_mat_found = False 138 trans_probs_found = False 139 for line in lines: 140 # Find all floating point numbers in this line 141 line_floats_res = line_floats_re.findall(line) 142 line_floats = [float(val) for val in line_floats_res] 143 # Find rate parameters 144 # Example match: 145 # "Rate parameters: 999.00000 145.59775 0.00001 0.00001 0.00001" 146 if "Rate parameters:" in line and len(line_floats) > 0: 147 parameters["rate parameters"] = line_floats 148 # Find rates 149 # Example match: 150 # "rate: 0.90121 0.96051 0.99831 1.03711 1.10287" 151 elif "rate: " in line and len(line_floats) > 0: 152 parameters["rates"] = line_floats 153 # Find Rate matrix Q & average kappa (REV model) 154 # Example match: 155 # Rate matrix Q, Average Ts/Tv = 3.0308 156 # -2.483179 1.865730 0.617449 0.000000 157 # 2.298662 -2.298662 0.000000 0.000000 158 # 0.335015 0.000000 -0.338059 0.003044 159 # 0.000000 0.000000 0.004241 -0.004241 160 elif "matrix Q" in line: 161 parameters["Q matrix"] = {"matrix": []} 162 if len(line_floats) > 0: 163 parameters["Q matrix"]["average Ts/Tv"] = \ 164 line_floats[0] 165 Q_mat_found = True 166 elif Q_mat_found and len(line_floats) > 0: 167 parameters["Q matrix"]["matrix"].append(line_floats) 168 if len(parameters["Q matrix"]["matrix"]) == 4: 169 Q_mat_found = False 170 # Find alpha (gamma shape parameter for variable rates) 171 # Example match: "alpha (gamma, K=5) = 192.47918" 172 elif "alpha" in line and len(line_floats) > 0: 173 parameters["alpha"] = line_floats[0] 174 # Find rho for auto-discrete-gamma model 175 elif "rho" in line and len(line_floats) > 0: 176 parameters["rho"] = line_floats[0] 177 elif "transition probabilities" in line: 178 parameters["transition probs."] = [] 179 trans_probs_found = True 180 elif trans_probs_found and len(line_floats) > 0: 181 parameters["transition probs."].append(line_floats) 182 if len(parameters["transition probs."]) == len(parameters["rates"]): 183 trans_probs_found = False 184 return parameters
185 186
187 -def parse_freqs(lines, parameters):
188 """Parse the basepair frequencies. 189 """ 190 root_re = re.compile("Note: node (\d+) is root.") 191 branch_freqs_found = False 192 base_freqs_found = False 193 for line in lines: 194 # Find all floating point numbers in this line 195 line_floats_res = line_floats_re.findall(line) 196 line_floats = [float(val) for val in line_floats_res] 197 # Find base frequencies from baseml 4.3 198 # Example match: 199 # "Base frequencies: 0.20090 0.16306 0.37027 0.26577" 200 if "Base frequencies" in line and len(line_floats) > 0: 201 base_frequencies = {} 202 base_frequencies["T"] = line_floats[0] 203 base_frequencies["C"] = line_floats[1] 204 base_frequencies["A"] = line_floats[2] 205 base_frequencies["G"] = line_floats[3] 206 parameters["base frequencies"] = base_frequencies 207 # Find base frequencies from baseml 4.1: 208 # Example match: 209 # "base frequency parameters 210 # " 0.20317 0.16768 0.36813 0.26102" 211 elif "base frequency parameters" in line: 212 base_freqs_found = True 213 # baseml 4.4 returns to having the base frequencies on the next line 214 # but the heading changed 215 elif "Base frequencies" in line and len(line_floats) == 0: 216 base_freqs_found = True 217 elif base_freqs_found and len(line_floats) > 0: 218 base_frequencies = {} 219 base_frequencies["T"] = line_floats[0] 220 base_frequencies["C"] = line_floats[1] 221 base_frequencies["A"] = line_floats[2] 222 base_frequencies["G"] = line_floats[3] 223 parameters["base frequencies"] = base_frequencies 224 base_freqs_found = False 225 # Find frequencies 226 # Example match: 227 # "freq: 0.90121 0.96051 0.99831 1.03711 1.10287" 228 elif "freq: " in line and len(line_floats) > 0: 229 parameters["rate frequencies"] = line_floats 230 # Find branch-specific frequency parameters 231 # Example match (note: I think it's possible to have 4 more 232 # values per line, enclosed in brackets, so I'll account for 233 # this): 234 # (frequency parameters for branches) [frequencies at nodes] (see Yang & Roberts 1995 fig 1) 235 # 236 # Node #1 ( 0.25824 0.24176 0.25824 0.24176 ) 237 # Node #2 ( 0.00000 0.50000 0.00000 0.50000 ) 238 elif "(frequency parameters for branches)" in line: 239 parameters["nodes"] = {} 240 branch_freqs_found = True 241 elif branch_freqs_found is True: 242 if len(line_floats) > 0: 243 node_res = re.match("Node \#(\d+)", line) 244 node_num = int(node_res.group(1)) 245 node = {"root": False} 246 node["frequency parameters"] = line_floats[:4] 247 if len(line_floats) > 4: 248 node["base frequencies"] = {"T": line_floats[4], 249 "C": line_floats[5], 250 "A": line_floats[6], 251 "G": line_floats[7]} 252 parameters["nodes"][node_num] = node 253 else: 254 root_res = root_re.match(line) 255 if root_res is not None: 256 root_node = int(root_res.group(1)) 257 parameters["nodes"][root_node]["root"] =\ 258 True 259 branch_freqs_found = False 260 return parameters
261