Package Bio :: Package Phylo :: Package PAML :: Module _parse_baseml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_baseml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8   
  9  line_floats_re = re.compile("-*\d+\.\d+") 
 10   
 11   
12 -def parse_basics(lines, results):
13 """Parse the basics that should be present in most baseml results files.""" 14 version_re = re.compile("BASEML \(in paml version (\d+\.\d+[a-z]*).*") 15 np_re = re.compile("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)") 16 num_params = -1 17 for line in lines: 18 # Find all floating point numbers in this line 19 line_floats_res = line_floats_re.findall(line) 20 line_floats = [float(val) for val in line_floats_res] 21 # Find the version number 22 # Example match: 23 # "BASEML (in paml version 4.3, August 2009) alignment.phylip" 24 version_res = version_re.match(line) 25 if version_res is not None: 26 results["version"] = version_res.group(1) 27 # Find max lnL 28 # Example match: 29 # ln Lmax (unconstrained) = -316.049385 30 if "ln Lmax" in line and len(line_floats) == 1: 31 results["lnL max"] = line_floats[0] 32 # Find lnL values. 33 # Example match (lnL = -2021.348300): 34 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 35 elif "lnL(ntime:" in line and line_floats: 36 results["lnL"] = line_floats[0] 37 np_res = np_re.match(line) 38 if np_res is not None: 39 num_params = int(np_res.group(1)) 40 # Find tree lengths. 41 # Example match: "tree length = 1.71931" 42 elif "tree length" in line and len(line_floats) == 1: 43 results["tree length"] = line_floats[0] 44 # Find the estimated tree, only taking the tree if it has 45 # branch lengths 46 elif re.match("\(+", line) is not None: 47 if ":" in line: 48 results["tree"] = line.strip() 49 return (results, num_params)
50 51
52 -def parse_parameters(lines, results, num_params):
53 """Parse the various parameters from the file.""" 54 parameters = {} 55 parameters = parse_parameter_list(lines, parameters, num_params) 56 parameters = parse_kappas(lines, parameters) 57 parameters = parse_rates(lines, parameters) 58 parameters = parse_freqs(lines, parameters) 59 results["parameters"] = parameters 60 return results
61 62
63 -def parse_parameter_list(lines, parameters, num_params):
64 """Parse the parameters list, which is just an unlabeled list of numeric values.""" 65 for line_num in range(len(lines)): 66 line = lines[line_num] 67 # Find all floating point numbers in this line 68 line_floats_res = line_floats_re.findall(line) 69 line_floats = [float(val) for val in line_floats_res] 70 # Get parameter list. This can be useful for specifying starting 71 # parameters in another run by copying the list of parameters 72 # to a file called in.baseml. Since the parameters must be in 73 # a fixed order and format, copying and pasting to the file is 74 # best. For this reason, they are grabbed here just as a long 75 # string and not as individual numbers. 76 if len(line_floats) == num_params: 77 parameters["parameter list"] = line.strip() 78 # Find SEs. The same format as parameters above is maintained 79 # since there is a correspondence between the SE format and 80 # the parameter format. 81 # Example match: 82 # "SEs for parameters: 83 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 84 if "SEs for parameters:" in lines[line_num + 1]: 85 SEs_line = lines[line_num + 2] 86 parameters["SEs"] = SEs_line.strip() 87 break 88 return parameters
89 90
91 -def parse_kappas(lines, parameters):
92 """Parse out the kappa parameters.""" 93 kappa_found = False 94 for line in lines: 95 # Find all floating point numbers in this line 96 line_floats_res = line_floats_re.findall(line) 97 line_floats = [float(val) for val in line_floats_res] 98 # Find kappa parameter (F84, HKY85, T92 model) 99 # Example match: 100 # "Parameters (kappa) in the rate matrix (F84) (Yang 1994 J Mol Evol 39:105-111): 101 # 3.00749" 102 if "Parameters (kappa)" in line: 103 kappa_found = True 104 elif kappa_found and line_floats: 105 branch_res = re.match("\s(\d+\.\.\d+)", line) 106 if branch_res is None: 107 if len(line_floats) == 1: 108 parameters["kappa"] = line_floats[0] 109 else: 110 parameters["kappa"] = line_floats 111 kappa_found = False 112 else: 113 if parameters.get("branches") is None: 114 parameters["branches"] = {} 115 branch = branch_res.group(1) 116 if line_floats: 117 parameters["branches"][branch] = \ 118 {"t": line_floats[0], "kappa": line_floats[1], 119 "TS": line_floats[2], "TV": line_floats[3]} 120 # Find kappa under REV 121 # Example match: 122 # kappa under REV: 999.00000 145.76453 0.00001 0.00001 0.00001 123 elif "kappa under" in line and line_floats: 124 if len(line_floats) == 1: 125 parameters["kappa"] = line_floats[0] 126 else: 127 parameters["kappa"] = line_floats 128 return parameters
129 130
131 -def parse_rates(lines, parameters):
132 """Parse the rate parameters.""" 133 Q_mat_found = False 134 trans_probs_found = False 135 for line in lines: 136 # Find all floating point numbers in this line 137 line_floats_res = line_floats_re.findall(line) 138 line_floats = [float(val) for val in line_floats_res] 139 # Find rate parameters 140 # Example match: 141 # "Rate parameters: 999.00000 145.59775 0.00001 0.00001 0.00001" 142 if "Rate parameters:" in line and line_floats: 143 parameters["rate parameters"] = line_floats 144 # Find rates 145 # Example match: 146 # "rate: 0.90121 0.96051 0.99831 1.03711 1.10287" 147 elif "rate: " in line and line_floats: 148 parameters["rates"] = line_floats 149 # Find Rate matrix Q & average kappa (REV model) 150 # Example match: 151 # Rate matrix Q, Average Ts/Tv = 3.0308 152 # -2.483179 1.865730 0.617449 0.000000 153 # 2.298662 -2.298662 0.000000 0.000000 154 # 0.335015 0.000000 -0.338059 0.003044 155 # 0.000000 0.000000 0.004241 -0.004241 156 elif "matrix Q" in line: 157 parameters["Q matrix"] = {"matrix": []} 158 if line_floats: 159 parameters["Q matrix"]["average Ts/Tv"] = \ 160 line_floats[0] 161 Q_mat_found = True 162 elif Q_mat_found and line_floats: 163 parameters["Q matrix"]["matrix"].append(line_floats) 164 if len(parameters["Q matrix"]["matrix"]) == 4: 165 Q_mat_found = False 166 # Find alpha (gamma shape parameter for variable rates) 167 # Example match: "alpha (gamma, K=5) = 192.47918" 168 elif "alpha" in line and line_floats: 169 parameters["alpha"] = line_floats[0] 170 # Find rho for auto-discrete-gamma model 171 elif "rho" in line and line_floats: 172 parameters["rho"] = line_floats[0] 173 elif "transition probabilities" in line: 174 parameters["transition probs."] = [] 175 trans_probs_found = True 176 elif trans_probs_found and line_floats: 177 parameters["transition probs."].append(line_floats) 178 if len(parameters["transition probs."]) == len(parameters["rates"]): 179 trans_probs_found = False 180 return parameters
181 182
183 -def parse_freqs(lines, parameters):
184 """Parse the basepair frequencies.""" 185 root_re = re.compile("Note: node (\d+) is root.") 186 branch_freqs_found = False 187 base_freqs_found = False 188 for line in lines: 189 # Find all floating point numbers in this line 190 line_floats_res = line_floats_re.findall(line) 191 line_floats = [float(val) for val in line_floats_res] 192 # Find base frequencies from baseml 4.3 193 # Example match: 194 # "Base frequencies: 0.20090 0.16306 0.37027 0.26577" 195 if "Base frequencies" in line and line_floats: 196 base_frequencies = {} 197 base_frequencies["T"] = line_floats[0] 198 base_frequencies["C"] = line_floats[1] 199 base_frequencies["A"] = line_floats[2] 200 base_frequencies["G"] = line_floats[3] 201 parameters["base frequencies"] = base_frequencies 202 # Find base frequencies from baseml 4.1: 203 # Example match: 204 # "base frequency parameters 205 # " 0.20317 0.16768 0.36813 0.26102" 206 elif "base frequency parameters" in line: 207 base_freqs_found = True 208 # baseml 4.4 returns to having the base frequencies on the next line 209 # but the heading changed 210 elif "Base frequencies" in line and not line_floats: 211 base_freqs_found = True 212 elif base_freqs_found and line_floats: 213 base_frequencies = {} 214 base_frequencies["T"] = line_floats[0] 215 base_frequencies["C"] = line_floats[1] 216 base_frequencies["A"] = line_floats[2] 217 base_frequencies["G"] = line_floats[3] 218 parameters["base frequencies"] = base_frequencies 219 base_freqs_found = False 220 # Find frequencies 221 # Example match: 222 # "freq: 0.90121 0.96051 0.99831 1.03711 1.10287" 223 elif "freq: " in line and line_floats: 224 parameters["rate frequencies"] = line_floats 225 # Find branch-specific frequency parameters 226 # Example match (note: I think it's possible to have 4 more 227 # values per line, enclosed in brackets, so I'll account for 228 # this): 229 # (frequency parameters for branches) [frequencies at nodes] (see Yang & Roberts 1995 fig 1) 230 # 231 # Node #1 ( 0.25824 0.24176 0.25824 0.24176 ) 232 # Node #2 ( 0.00000 0.50000 0.00000 0.50000 ) 233 elif "(frequency parameters for branches)" in line: 234 parameters["nodes"] = {} 235 branch_freqs_found = True 236 elif branch_freqs_found: 237 if line_floats: 238 node_res = re.match("Node \#(\d+)", line) 239 node_num = int(node_res.group(1)) 240 node = {"root": False} 241 node["frequency parameters"] = line_floats[:4] 242 if len(line_floats) > 4: 243 node["base frequencies"] = {"T": line_floats[4], 244 "C": line_floats[5], 245 "A": line_floats[6], 246 "G": line_floats[7]} 247 parameters["nodes"][node_num] = node 248 else: 249 root_res = root_re.match(line) 250 if root_res is not None: 251 root_node = int(root_res.group(1)) 252 parameters["nodes"][root_node]["root"] =\ 253 True 254 branch_freqs_found = False 255 return parameters
256