Package Bio :: Package Phylo :: Package PAML :: Module _parse_baseml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_baseml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8   
  9  line_floats_re = re.compile("-*\d+\.\d+") 
 10   
 11   
12 -def parse_basics(lines, results):
13 """Parse the basics that should be present in most baseml results files. 14 """ 15 version_re = re.compile("BASEML \(in paml version (\d+\.\d+[a-z]*).*") 16 np_re = re.compile("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)") 17 num_params = -1 18 for line in lines: 19 # Find all floating point numbers in this line 20 line_floats_res = line_floats_re.findall(line) 21 line_floats = [float(val) for val in line_floats_res] 22 # Find the version number 23 # Example match: 24 # "BASEML (in paml version 4.3, August 2009) alignment.phylip" 25 version_res = version_re.match(line) 26 if version_res is not None: 27 results["version"] = version_res.group(1) 28 # Find max lnL 29 # Example match: 30 # ln Lmax (unconstrained) = -316.049385 31 if "ln Lmax" in line and len(line_floats) == 1: 32 results["lnL max"] = line_floats[0] 33 # Find lnL values. 34 # Example match (lnL = -2021.348300): 35 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 36 elif "lnL(ntime:" in line and line_floats: 37 results["lnL"] = line_floats[0] 38 np_res = np_re.match(line) 39 if np_res is not None: 40 num_params = int(np_res.group(1)) 41 # Find tree lengths. 42 # Example match: "tree length = 1.71931" 43 elif "tree length" in line and len(line_floats) == 1: 44 results["tree length"] = line_floats[0] 45 # Find the estimated tree, only taking the tree if it has 46 # branch lengths 47 elif re.match("\(+", line) is not None: 48 if ":" in line: 49 results["tree"] = line.strip() 50 return (results, num_params)
51 52
53 -def parse_parameters(lines, results, num_params):
54 """Parse the various parameters from the file. 55 """ 56 parameters = {} 57 parameters = parse_parameter_list(lines, parameters, num_params) 58 parameters = parse_kappas(lines, parameters) 59 parameters = parse_rates(lines, parameters) 60 parameters = parse_freqs(lines, parameters) 61 results["parameters"] = parameters 62 return results
63 64
65 -def parse_parameter_list(lines, parameters, num_params):
66 """Parse the parameters list, which is just an unlabeled list of numeric values. 67 """ 68 for line_num in range(len(lines)): 69 line = lines[line_num] 70 # Find all floating point numbers in this line 71 line_floats_res = line_floats_re.findall(line) 72 line_floats = [float(val) for val in line_floats_res] 73 # Get parameter list. This can be useful for specifying starting 74 # parameters in another run by copying the list of parameters 75 # to a file called in.baseml. Since the parameters must be in 76 # a fixed order and format, copying and pasting to the file is 77 # best. For this reason, they are grabbed here just as a long 78 # string and not as individual numbers. 79 if len(line_floats) == num_params: 80 parameters["parameter list"] = line.strip() 81 # Find SEs. The same format as parameters above is maintained 82 # since there is a correspondence between the SE format and 83 # the parameter format. 84 # Example match: 85 # "SEs for parameters: 86 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 87 if "SEs for parameters:" in lines[line_num + 1]: 88 SEs_line = lines[line_num + 2] 89 parameters["SEs"] = SEs_line.strip() 90 break 91 return parameters
92 93
94 -def parse_kappas(lines, parameters):
95 """Parse out the kappa parameters. 96 """ 97 kappa_found = False 98 for line in lines: 99 # Find all floating point numbers in this line 100 line_floats_res = line_floats_re.findall(line) 101 line_floats = [float(val) for val in line_floats_res] 102 # Find kappa parameter (F84, HKY85, T92 model) 103 # Example match: 104 # "Parameters (kappa) in the rate matrix (F84) (Yang 1994 J Mol Evol 39:105-111): 105 # 3.00749" 106 if "Parameters (kappa)" in line: 107 kappa_found = True 108 elif kappa_found and line_floats: 109 branch_res = re.match("\s(\d+\.\.\d+)", line) 110 if branch_res is None: 111 if len(line_floats) == 1: 112 parameters["kappa"] = line_floats[0] 113 else: 114 parameters["kappa"] = line_floats 115 kappa_found = False 116 else: 117 if parameters.get("branches") is None: 118 parameters["branches"] = {} 119 branch = branch_res.group(1) 120 if line_floats: 121 parameters["branches"][branch] = \ 122 {"t": line_floats[0], "kappa": line_floats[1], 123 "TS": line_floats[2], "TV": line_floats[3]} 124 # Find kappa under REV 125 # Example match: 126 # kappa under REV: 999.00000 145.76453 0.00001 0.00001 0.00001 127 elif "kappa under" in line and line_floats: 128 if len(line_floats) == 1: 129 parameters["kappa"] = line_floats[0] 130 else: 131 parameters["kappa"] = line_floats 132 return parameters
133 134
135 -def parse_rates(lines, parameters):
136 """Parse the rate parameters. 137 """ 138 Q_mat_found = False 139 trans_probs_found = False 140 for line in lines: 141 # Find all floating point numbers in this line 142 line_floats_res = line_floats_re.findall(line) 143 line_floats = [float(val) for val in line_floats_res] 144 # Find rate parameters 145 # Example match: 146 # "Rate parameters: 999.00000 145.59775 0.00001 0.00001 0.00001" 147 if "Rate parameters:" in line and line_floats: 148 parameters["rate parameters"] = line_floats 149 # Find rates 150 # Example match: 151 # "rate: 0.90121 0.96051 0.99831 1.03711 1.10287" 152 elif "rate: " in line and line_floats: 153 parameters["rates"] = line_floats 154 # Find Rate matrix Q & average kappa (REV model) 155 # Example match: 156 # Rate matrix Q, Average Ts/Tv = 3.0308 157 # -2.483179 1.865730 0.617449 0.000000 158 # 2.298662 -2.298662 0.000000 0.000000 159 # 0.335015 0.000000 -0.338059 0.003044 160 # 0.000000 0.000000 0.004241 -0.004241 161 elif "matrix Q" in line: 162 parameters["Q matrix"] = {"matrix": []} 163 if line_floats: 164 parameters["Q matrix"]["average Ts/Tv"] = \ 165 line_floats[0] 166 Q_mat_found = True 167 elif Q_mat_found and line_floats: 168 parameters["Q matrix"]["matrix"].append(line_floats) 169 if len(parameters["Q matrix"]["matrix"]) == 4: 170 Q_mat_found = False 171 # Find alpha (gamma shape parameter for variable rates) 172 # Example match: "alpha (gamma, K=5) = 192.47918" 173 elif "alpha" in line and line_floats: 174 parameters["alpha"] = line_floats[0] 175 # Find rho for auto-discrete-gamma model 176 elif "rho" in line and line_floats: 177 parameters["rho"] = line_floats[0] 178 elif "transition probabilities" in line: 179 parameters["transition probs."] = [] 180 trans_probs_found = True 181 elif trans_probs_found and line_floats: 182 parameters["transition probs."].append(line_floats) 183 if len(parameters["transition probs."]) == len(parameters["rates"]): 184 trans_probs_found = False 185 return parameters
186 187
188 -def parse_freqs(lines, parameters):
189 """Parse the basepair frequencies. 190 """ 191 root_re = re.compile("Note: node (\d+) is root.") 192 branch_freqs_found = False 193 base_freqs_found = False 194 for line in lines: 195 # Find all floating point numbers in this line 196 line_floats_res = line_floats_re.findall(line) 197 line_floats = [float(val) for val in line_floats_res] 198 # Find base frequencies from baseml 4.3 199 # Example match: 200 # "Base frequencies: 0.20090 0.16306 0.37027 0.26577" 201 if "Base frequencies" in line and line_floats: 202 base_frequencies = {} 203 base_frequencies["T"] = line_floats[0] 204 base_frequencies["C"] = line_floats[1] 205 base_frequencies["A"] = line_floats[2] 206 base_frequencies["G"] = line_floats[3] 207 parameters["base frequencies"] = base_frequencies 208 # Find base frequencies from baseml 4.1: 209 # Example match: 210 # "base frequency parameters 211 # " 0.20317 0.16768 0.36813 0.26102" 212 elif "base frequency parameters" in line: 213 base_freqs_found = True 214 # baseml 4.4 returns to having the base frequencies on the next line 215 # but the heading changed 216 elif "Base frequencies" in line and not line_floats: 217 base_freqs_found = True 218 elif base_freqs_found and line_floats: 219 base_frequencies = {} 220 base_frequencies["T"] = line_floats[0] 221 base_frequencies["C"] = line_floats[1] 222 base_frequencies["A"] = line_floats[2] 223 base_frequencies["G"] = line_floats[3] 224 parameters["base frequencies"] = base_frequencies 225 base_freqs_found = False 226 # Find frequencies 227 # Example match: 228 # "freq: 0.90121 0.96051 0.99831 1.03711 1.10287" 229 elif "freq: " in line and line_floats: 230 parameters["rate frequencies"] = line_floats 231 # Find branch-specific frequency parameters 232 # Example match (note: I think it's possible to have 4 more 233 # values per line, enclosed in brackets, so I'll account for 234 # this): 235 # (frequency parameters for branches) [frequencies at nodes] (see Yang & Roberts 1995 fig 1) 236 # 237 # Node #1 ( 0.25824 0.24176 0.25824 0.24176 ) 238 # Node #2 ( 0.00000 0.50000 0.00000 0.50000 ) 239 elif "(frequency parameters for branches)" in line: 240 parameters["nodes"] = {} 241 branch_freqs_found = True 242 elif branch_freqs_found: 243 if line_floats: 244 node_res = re.match("Node \#(\d+)", line) 245 node_num = int(node_res.group(1)) 246 node = {"root": False} 247 node["frequency parameters"] = line_floats[:4] 248 if len(line_floats) > 4: 249 node["base frequencies"] = {"T": line_floats[4], 250 "C": line_floats[5], 251 "A": line_floats[6], 252 "G": line_floats[7]} 253 parameters["nodes"][node_num] = node 254 else: 255 root_res = root_re.match(line) 256 if root_res is not None: 257 root_node = int(root_res.group(1)) 258 parameters["nodes"][root_node]["root"] =\ 259 True 260 branch_freqs_found = False 261 return parameters
262