Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12  __docformat__ = "restructuredtext en" 
 13   
 14  import re 
 15  from Bio._py3k import StringIO 
 16   
 17  from Bio.Phylo import Newick 
class NewickError(Exception):
    """Exception raised when Newick object construction cannot continue."""
# Token patterns recognized by the Newick tokenizer, as (regex, name) pairs.
# They are joined below into a single alternation, so at any position the
# alternatives are tried in the order listed here.
tokens = [
    (r"\(", 'open parens'),
    (r"\)", 'close parens'),
    (r"[^\s\(\)\[\]\'\:\;\,]+", 'unquoted node label'),
    # The sign is optional: without it, an input such as ':-0.1' would not
    # match here at all; the ':' would be skipped and '-0.1' would be picked
    # up as an unquoted node label instead of a (negative) edge length.
    (r"\:[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", 'edge length'),
    (r"\,", 'comma'),
    (r"\[(\\.|[^\]])*\]", 'comment'),
    (r"\'(\\.|[^\'])*\'", 'quoted node label'),
    (r"\;", 'semicolon'),
    (r"\n", 'newline'),
]
# One compiled pattern that matches any single token.
tokenizer = re.compile('(%s)' % '|'.join(token[0] for token in tokens))
# Individual compiled patterns, looked up by token name.
token_dict = dict((name, re.compile(token)) for (token, name) in tokens)
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    Any keyword arguments are forwarded unchanged to ``Parser.parse``.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
49
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    ``plain`` and any further keyword arguments are forwarded to
    ``Writer.write``.

    :returns: number of trees written.
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
57
58 59 # --------------------------------------------------------- 60 # Input 61 62 -def _parse_confidence(text):
63 if text.isdigit(): 64 return int(text) 65 # NB: Could make this more consistent by treating as a percentage 66 # return int(text) / 100. 67 try: 68 return float(text) 69 # NB: This should be in [0.0, 1.0], but who knows what people will do 70 # assert 0 <= current_clade.confidence <= 1 71 except ValueError: 72 return None
73
74 75 -def _format_comment(text):
76 return '[%s]' % (text.replace('[', '\\[').replace(']', '\\]'))
77
78 -def _get_comment(clade):
79 if hasattr(clade, 'comment') and clade.comment: 80 return _format_comment(str(clade.comment)) 81 else: 82 return ''
83
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Remember the handle (an iterable of text lines) to read from."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Alternate constructor: build a parser over a string of Newick text."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False, comments_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with.

        Lines are accumulated (with trailing whitespace stripped) until the
        buffer ends with ';', at which point one tree is parsed and yielded.

        :Parameters:
            values_are_confidence : bool
                store the number following ':' as the clade's confidence
                instead of its branch length.
            comments_are_confidence : bool
                try to read each bracketed comment as a numeric confidence.
            rooted : bool
                passed through as the ``rooted`` flag of every tree created.

        :returns: generator of Newick.Tree objects.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ''
        for line in self.handle:
            buf += line.rstrip()
            if buf.endswith(';'):
                yield self._parse_tree(buf)
                buf = ''
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation of one tree into a Tree object.

        :raises NewickError: on an unmatched ')', on unequal numbers of
            '(' and ')', or if any token follows the terminating ';'.
        """
        token_iter = tokenizer.finditer(text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade

        lp_count = 0
        rp_count = 0
        for match in token_iter:
            token = match.group()

            if token.startswith("'"):
                # quoted label; add characters to clade name
                current_clade.name = token[1:-1]

            elif token.startswith('['):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == '(':
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                lp_count += 1

            elif token == ',':
                # if the current clade is the root, then the external parentheses are missing
                # and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)

            elif token == ')':
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                # process_clade returns None when the clade had no parent,
                # i.e. this ')' has no matching '('.
                if parent is None:
                    raise NewickError('Parenthesis mismatch.')
                current_clade = parent
                rp_count += 1

            elif token == ';':
                break

            elif token.startswith(':'):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == '\n':
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if lp_count != rp_count:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        leftover = next(token_iter, None)
        if leftover is not None:
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % leftover.group())

        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return a new Newick.Clade, optionally with a temporary reference
        to its parent clade."""
        clade = Newick.Clade()
        # Identity check, so the outcome does not depend on how a clade
        # object evaluates in a boolean context.
        if parent is not None:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Final processing of a parsed clade. Removes the node's parent and
        returns it.

        If the clade has child clades and a label that reads as a number
        (and neither ``values_are_confidence`` nor
        ``comments_are_confidence`` is set, and no confidence was assigned
        yet), the label is moved into ``clade.confidence``. Clades without
        children keep their label as a name, so numeric taxon names such as
        the leaves of ``(1,2,(3,4));`` are not erased.

        :returns: the parent clade, or None if the clade had no temporary
            ``parent`` attribute.
        """
        if (clade.name and not (self.values_are_confidence or
                                self.comments_are_confidence)
                and clade.confidence is None
                # Children have already been appended by the time a clade is
                # processed, so an empty list here means a terminal node.
                and clade.clades):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if hasattr(clade, 'parent'):
            parent = clade.parent
            parent.clades.append(clade)
            del clade.parent
            return parent
222
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Remember the iterable of trees to serialize."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle.

        Keyword arguments are forwarded to ``to_strings``. Each tree is
        written on its own line.

        :returns: number of trees written.
        """
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + '\n')
            count += 1
        return count

    def to_strings(self, confidence_as_branch_length=False,
                   branch_length_only=False, plain=False,
                   plain_newick=True, ladderize=None, max_confidence=1.0,
                   format_confidence='%1.2f', format_branch_length='%1.5f'):
        """Return an iterable of PAUP-compatible tree lines.

        :Parameters:
            confidence_as_branch_length : bool
                write confidence values after ':' in place of branch
                lengths; terminal nodes get ``max_confidence``.
            branch_length_only : bool
                write branch lengths and omit confidence values.
            plain : bool
                write only labels and comments. Ignored if either of the
                two options above is set.
            plain_newick : bool
                if true, yield just the Newick string; otherwise prefix it
                with ``tree <name> =`` and optional ``[&W...]`` / ``[&R]``
                tags.
            ladderize : str
                one of 'left', 'LEFT', 'right', 'RIGHT' to call
                ``tree.ladderize`` on each tree before it is written; any
                other value leaves the trees untouched.
            max_confidence : number
                value written for terminal nodes when
                ``confidence_as_branch_length`` is set.
            format_confidence, format_branch_length : str
                %-style format strings applied to the numbers written.
        """
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(plain,
                confidence_as_branch_length, branch_length_only, max_confidence,
                format_confidence, format_branch_length)

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            label = clade.name or ''
            if label:
                unquoted_label = re.match(token_dict['unquoted node label'], label)
                # Quote the label unless the whole of it is a valid
                # unquoted label.
                if (not unquoted_label) or (unquoted_label.end() < len(label)):
                    label = "'%s'" % label.replace('\\', '\\\\').replace("'", "\\'")

            if clade.is_terminal():    # terminal
                return (label
                        + make_info_string(clade, terminal=True))
            else:
                subtrees = (newickize(sub) for sub in clade)
                return '(%s)%s' % (','.join(subtrees),
                        label + make_info_string(clade))

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ('left', 'LEFT', 'right', 'RIGHT'):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ('right', 'RIGHT')))
            rawtree = newickize(tree.root) + ';'
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ['tree', (tree.name or 'a_tree'), '=']
            if tree.weight != 1:
                treeline.append('[&W%s]' % round(float(tree.weight), 3))
            if tree.rooted:
                treeline.append('[&R]')
            treeline.append(rawtree)
            yield ' '.join(treeline)

    def _info_factory(self, plain, confidence_as_branch_length,
            branch_length_only, max_confidence, format_confidence,
            format_branch_length):
        """Return a function that creates a nicely formatted node tag.

        The returned function takes ``(clade, terminal=False)`` and returns
        the text that follows the node label. In every mode, a missing
        (None) confidence or branch length is written as 0.0 rather than
        raising TypeError from the %-format.
        """
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return _get_comment(clade)

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (':' + format_confidence % max_confidence) + _get_comment(clade)
                else:
                    # Clades without a confidence value (typically the
                    # root) are written as 0.0, matching the fallback the
                    # default mode below uses for branch lengths.
                    confidence = getattr(clade, 'confidence', None) or 0.0
                    return (':' + format_confidence % confidence) + _get_comment(clade)

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                return (':' + format_branch_length % (clade.branch_length or 0.0)
                        ) + _get_comment(clade)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if (terminal or
                        not hasattr(clade, 'confidence') or
                        clade.confidence is None):
                    return (':' + format_branch_length
                            ) % (clade.branch_length or 0.0) + _get_comment(clade)
                else:
                    return (format_confidence + ':' + format_branch_length
                            ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade)

        return make_info_string
295 296 elif confidence_as_branch_length: 297 # Support as branchlengths (eg. PAUP), ignore actual branchlengths 298 def make_info_string(clade, terminal=False): 299 if terminal: 300 # terminal branches have 100% support 301 return (':' + format_confidence % max_confidence) + _get_comment(clade) 302 else: 303 return (':' + format_confidence % clade.confidence) + _get_comment(clade) 304 305 elif branch_length_only: 306 # write only branchlengths, ignore support 307 def make_info_string(clade, terminal=False): 308 return (':' + format_branch_length % clade.branch_length) + _get_comment(clade) 309 310 else: 311 # write support and branchlengths (e.g. .con tree of mrbayes) 312 def make_info_string(clade, terminal=False): 313 if (terminal or 314 not hasattr(clade, 'confidence') or 315 clade.confidence is None): 316 return (':' + format_branch_length 317 ) % (clade.branch_length or 0.0) + _get_comment(clade) 318 else: 319 return (format_confidence + ':' + format_branch_length 320 ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade) 321 322 return make_info_string 323