Package Bio :: Package Phylo :: Package PAML :: Module _parse_yn00
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_yn00

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8   
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei_Gojobori results are organized in a lower
    triangular matrix, with the sequence names labeling
    the rows and statistics in the format:
    w (dN dS) per column
    Example row (2 columns):
    0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    ``results`` is updated in place: every matrix row (re)creates
    ``results[name]`` and each triple of values is stored under the
    "NG86" key for both orderings of the sequence pair.

    Returns the tuple ``(results, sequences)``, where ``sequences`` lists
    the taxon names in the order their rows were encountered.

    Raises IndexError if a row's value count is not a multiple of three
    or if it has more columns than there are sequences seen so far.
    """
    # The purpose of this complex regex is to parse the NG86 section for
    # valid lines of data that are mixed in with citations and comments.
    # The data lines begin with a taxon name, followed by zero or more
    # fields containing numeric values, sometimes enclosed in parens.
    # Taxon names are from 1-30 characters and are usually separated from
    # the numeric portion of the line by space(s). Long taxon names are
    # truncated to 30 characters, and may run into the data fields without
    # any separator, e.g. some_long_name-1.0000
    # This regex is an attempt to cover more pathological cases while also
    # parsing all existing versions of yn00 output with shorter names.
    row_pattern = re.compile(
        r"^([^\s]+?)(?:\s+-?\d+\.\d+|\s*$|-1\.0000\s*\()"
    )
    float_pattern = re.compile(r"-?\d+\.\d+")

    sequences = []
    for line in lines:
        matrix_row_res = row_pattern.match(line)
        if matrix_row_res is None:
            continue
        seq_name = matrix_row_res.group(1).strip()
        # Only look for numbers *after* the taxon name; otherwise digits
        # that are part of the name (e.g. "taxon1.1") would be mistaken
        # for matrix values and shift every omega/dN/dS triple.
        data_part = line[matrix_row_res.end(1):]
        line_floats = [float(val) for val in float_pattern.findall(data_part)]
        sequences.append(seq_name)
        results[seq_name] = {}
        # Column k of the row compares this sequence with the k-th
        # sequence read so far; each column holds three numbers.
        for column, start in enumerate(range(0, len(line_floats), 3)):
            NG86 = {
                "omega": line_floats[start],
                "dN": line_floats[start + 1],
                "dS": line_floats[start + 2],
            }
            other_name = sequences[column]
            results[seq_name][other_name] = {"NG86": NG86}
            results[other_name][seq_name] = {"NG86": NG86}
    return (results, sequences)
49 50
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with
    each row comprising one pairwise species comparison.
    Rows are labeled by sequence number rather than by
    sequence name.

    ``sequences`` maps the 1-based sequence numbers in the table to names.
    ``results`` must already contain ``results[name1][name2]`` for every
    pair in the table; the parsed statistics are stored there, in place,
    under the "YN00" key for both orderings of the pair.

    Returns ``results``.

    Raises IndexError if a row refers to a sequence number beyond
    ``sequences`` or holds fewer than nine values, and KeyError if the
    pair is missing from ``results``.
    """
    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    #    2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150 +- 0.0151
    row_pattern = re.compile(r"\s+(\d+)\s+(\d+)")
    # Besides ordinary decimals, accept nan/inf tokens so that a row with
    # a non-finite value keeps its columns aligned instead of coming up
    # short and raising IndexError.
    # NOTE(review): assumes yn00 spells such values "nan"/"inf" (optionally
    # signed) in this table -- confirm against real output.
    value_pattern = re.compile(r"-?(?:\d+\.\d+|nan|inf)", re.IGNORECASE)
    # Column order of the nine values that follow the two sequence numbers.
    stat_names = (
        "S", "N", "t", "kappa", "omega", "dN", "dN SE", "dS", "dS SE",
    )

    for line in lines:
        row_res = row_pattern.match(line)
        if row_res is None:
            # Header, blank or comment line.
            continue
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        # The two leading sequence numbers have no decimal point, so they
        # are never picked up by value_pattern.
        values = [float(val) for val in value_pattern.findall(line)]
        # Index explicitly so a short row still raises IndexError.
        YN00 = {name: values[i] for i, name in enumerate(stat_names)}
        results[seq_name1][seq_name2]["YN00"] = YN00
        results[seq_name2][seq_name1]["YN00"] = YN00
    return results
88 89
def parse_others(lines, results, sequences):
    """Parse the results from the other methods.

    The remaining methods are grouped together. Statistics
    for all three are listed for each of the pairwise
    species comparisons, with each method's results on its
    own line.
    The stats in this section must be handled differently
    due to the possible presence of NaN values, which are not
    matched by a plain decimal-number pattern.

    ``results`` must already contain ``results[name1][name2]`` for every
    compared pair; each method's statistics are stored there, in place,
    under the keys "LWL85", "LWL85m" and "LPB93" for both orderings of
    the pair. A value that cannot be converted to float is stored as None.
    ``sequences`` is accepted for signature parity with the other section
    parsers and is not used here.

    Returns ``results``.
    """
    # Example:
    # 2 (Pan_troglo) vs. 1 (Homo_sapie)
    #
    # L(i):      143.0      51.0      28.0  sum=    222.0
    # Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    # Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    # A(i):     0.0000    0.0200    0.0000
    # B(i):    -0.0000   -0.0000   -0.0000
    # LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    # LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho =   -nan)
    # LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000
    comparison_pattern = re.compile(r"\d+ \((.+)\) vs\. \d+ \((.+)\)")
    # Capture "label = value" pairs. The value is taken as the run of
    # characters up to the next whitespace or parenthesis rather than a
    # fixed 7-character window: fields differ in width, so a fixed window
    # drops the last digit of the wider dS/dN fields and swallows the ")"
    # after rho. Odd NaN spellings such as the Windows "-1.#IND" are still
    # captured whole and end up as None below.
    # NOTE(review): assumes every value is followed by whitespace, ")" or
    # end of line, as in the example above.
    stat_pattern = re.compile(r"([dSNwrho]{1,3}) =\s*([^\s()]+)")
    # (marker searched for in the line, key used in results); order matters
    # only in that the first matching marker wins.
    methods = (("LWL85:", "LWL85"), ("LWL85m", "LWL85m"), ("LPB93", "LPB93"))

    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = comparison_pattern.match(line)
        if comp_res is not None:
            # Start of a new pairwise comparison.
            seq_name1, seq_name2 = comp_res.groups()
            continue
        if seq_name1 is None or seq_name2 is None:
            # Statistics seen before any comparison header are ignored.
            continue
        if "dS =" not in line:
            continue
        # Drop the "METHOD:" prefix; partition() tolerates a missing colon.
        line_stats = line.partition(":")[2]
        stats = {}
        for stat, value in stat_pattern.findall(line_stats):
            try:
                stats[stat] = float(value)
            except ValueError:
                stats[stat] = None
        for marker, method in methods:
            if marker in line:
                results[seq_name1][seq_name2][method] = stats
                results[seq_name2][seq_name1][method] = stats
                break
    return results
148