Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12   
 13  import re 
 14  from Bio._py3k import StringIO 
 15   
 16  from Bio.Phylo import Newick 
class NewickError(Exception):
    """Signal that building a Newick object from its text had to be aborted."""
# Each entry pairs a regular expression with a human-readable token name.
# The order matters: the combined tokenizer tries the alternatives from
# first to last.
tokens = [
    (r"\(", 'open parens'),
    (r"\)", 'close parens'),
    (r"[^\s\(\)\[\]\'\:\;\,]+", 'unquoted node label'),
    (r"\:[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", 'edge length'),
    (r"\,", 'comma'),
    (r"\[(\\.|[^\]])*\]", 'comment'),
    (r"\'(\\.|[^\'])*\'", 'quoted node label'),
    (r"\;", 'semicolon'),
    (r"\n", 'newline'),
]

# One pattern matching any single token: all alternatives joined with '|'
# inside one enclosing group.
_alternatives = '|'.join(pattern for (pattern, _name) in tokens)
tokenizer = re.compile('(%s)' % _alternatives)

# Token name -> compiled pattern for that token alone.
token_dict = dict((_name, re.compile(pattern)) for (pattern, _name) in tokens)
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    Any keyword arguments are passed through unchanged to `Parser.parse`.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    reader = Parser(handle)
    return reader.parse(**kwargs)
48
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    `plain` and any extra keyword arguments are passed through to
    `Writer.write`.

    :returns: number of trees written.
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
56
57 58 # --------------------------------------------------------- 59 # Input 60 61 -def _parse_confidence(text):
62 if text.isdigit(): 63 return int(text) 64 # NB: Could make this more consistent by treating as a percentage 65 # return int(text) / 100. 66 try: 67 return float(text) 68 # NB: This should be in [0.0, 1.0], but who knows what people will do 69 # assert 0 <= current_clade.confidence <= 1 70 except ValueError: 71 return None
72
73 74 -def _format_comment(text):
75 return '[%s]' % (text.replace('[', '\\[').replace(']', '\\]'))
76
77 78 -def _get_comment(clade):
79 if hasattr(clade, 'comment') and clade.comment: 80 return _format_comment(str(clade.comment)) 81 else: 82 return ''
83
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Remember the line-iterable `handle` to read tree text from."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Build a parser that reads from the string `treetext`."""
        return cls(StringIO(treetext))

    def parse(self, values_are_confidence=False,
              comments_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with.

        Lines are accumulated (with trailing whitespace removed) until the
        accumulated text ends with ';', at which point one tree is yielded.

        :param values_are_confidence: store ':<number>' values as clade
            confidence instead of branch length.
        :param comments_are_confidence: try to read each bracketed comment
            as a numeric confidence value.
        :param rooted: passed on as the ``rooted`` flag of every tree.
        :raises NewickError: if the first line starts with a byte order mark
            character, or if a tree's text is malformed.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        bom_prefixes = ("\xef", "\xff", "\xfe", "\x00")
        pending = ''
        for line_index, line in enumerate(self.handle):
            # check for unicode byte order marks on first line only,
            # these lead to parsing errors (on Python 2)
            if line_index == 0 and line.startswith(bom_prefixes):
                raise NewickError("The file or stream you attempted to parse includes "
                                  "unicode byte order marks. You must convert it to "
                                  "ASCII before it can be parsed.")
            pending += line.rstrip()
            if pending.endswith(';'):
                yield self._parse_tree(pending)
                pending = ''
        if pending:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(pending)

    def _parse_tree(self, text):
        """Parse the text representation of one tree into a Tree object.

        :raises NewickError: on mismatched parentheses or on any token that
            follows the terminating semicolon.
        """
        token_stream = re.finditer(tokenizer, text.strip())

        make_clade = self.new_clade
        root = make_clade()
        node = root

        opened = 0
        closed = 0
        for found in token_stream:
            tok = found.group()

            if tok.startswith("'"):
                # quoted label: strip the surrounding quotes
                node.name = tok[1:-1]

            elif tok.startswith('['):
                # comment: strip the surrounding brackets
                node.comment = tok[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    node.confidence = _parse_confidence(node.comment)

            elif tok == '(':
                # descend: start a new clade as a child of the current one
                node = make_clade(node)
                opened += 1

            elif tok == ',':
                if node is root:
                    # the external parentheses are missing, so a new root
                    # is created above the current clade
                    root = make_clade()
                    node.parent = root
                # finish this clade and start a sibling under the same parent
                node = make_clade(self.process_clade(node))

            elif tok == ')':
                # done adding children; climb back to the parent clade
                enclosing = self.process_clade(node)
                if not enclosing:
                    raise NewickError('Parenthesis mismatch.')
                node = enclosing
                closed += 1

            elif tok == ';':
                break

            elif tok.startswith(':'):
                # branch length or confidence
                number = float(tok[1:])
                if self.values_are_confidence:
                    node.confidence = number
                else:
                    node.branch_length = number

            elif tok == '\n':
                pass

            else:
                # unquoted node label
                node.name = tok

        if opened != closed:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        leftover = next(token_stream, None)
        if leftover is not None:
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % leftover.group())

        self.process_clade(node)
        self.process_clade(root)
        return Newick.Tree(root=root, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return a new Newick.Clade, optionally carrying a temporary
        ``parent`` reference to the given clade."""
        fresh = Newick.Clade()
        if parent:
            fresh.parent = parent
        return fresh

    def process_clade(self, clade):
        """Finish a parsed clade and return its parent.

        If neither confidence option is active and the clade has children,
        a name and no confidence yet, the name is tried as a numeric
        confidence value and cleared when that succeeds. The temporary
        ``parent`` reference is then removed and the clade is appended to the
        parent's ``clades``.

        :returns: the parent clade, or ``None`` if the clade had no
            ``parent`` attribute.
        """
        if (clade.name
                and not self.values_are_confidence
                and not self.comments_are_confidence
                and clade.confidence is None
                and clade.clades):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if not hasattr(clade, 'parent'):
            return None
        owner = clade.parent
        owner.clades.append(clade)
        del clade.parent
        return owner
233
234 235 # --------------------------------------------------------- 236 # Output 237 238 -class Writer(object):
239 """Based on the writer in Bio.Nexus.Trees (str, to_string).""" 240
241 - def __init__(self, trees):
242 self.trees = trees
243
244 - def write(self, handle, **kwargs):
245 """Write this instance's trees to a file handle.""" 246 count = 0 247 for treestr in self.to_strings(**kwargs): 248 handle.write(treestr + '\n') 249 count += 1 250 return count
251
252 - def to_strings(self, confidence_as_branch_length=False, 253 branch_length_only=False, plain=False, 254 plain_newick=True, ladderize=None, max_confidence=1.0, 255 format_confidence='%1.2f', format_branch_length='%1.5f'):
256 """Return an iterable of PAUP-compatible tree lines.""" 257 # If there's a conflict in the arguments, we override plain=True 258 if confidence_as_branch_length or branch_length_only: 259 plain = False 260 make_info_string = self._info_factory(plain, 261 confidence_as_branch_length, branch_length_only, max_confidence, 262 format_confidence, format_branch_length) 263 264 def newickize(clade): 265 """Convert a node tree to a Newick tree string, recursively.""" 266 label = clade.name or '' 267 if label: 268 unquoted_label = re.match(token_dict['unquoted node label'], label) 269 if (not unquoted_label) or (unquoted_label.end() < len(label)): 270 label = "'%s'" % label.replace( 271 '\\', '\\\\').replace("'", "\\'") 272 273 if clade.is_terminal(): # terminal 274 return (label + make_info_string(clade, terminal=True)) 275 else: 276 subtrees = (newickize(sub) for sub in clade) 277 return '(%s)%s' % (','.join(subtrees), 278 label + make_info_string(clade))
279 280 # Convert each tree to a string 281 for tree in self.trees: 282 if ladderize in ('left', 'LEFT', 'right', 'RIGHT'): 283 # Nexus compatibility shim, kind of 284 tree.ladderize(reverse=(ladderize in ('right', 'RIGHT'))) 285 rawtree = newickize(tree.root) + ';' 286 if plain_newick: 287 yield rawtree 288 continue 289 # Nexus-style (?) notation before the raw Newick tree 290 treeline = ['tree', (tree.name or 'a_tree'), '='] 291 if tree.weight != 1: 292 treeline.append('[&W%s]' % round(float(tree.weight), 3)) 293 if tree.rooted: 294 treeline.append('[&R]') 295 treeline.append(rawtree) 296 yield ' '.join(treeline)
297
298 - def _info_factory(self, plain, confidence_as_branch_length, 299 branch_length_only, max_confidence, format_confidence, 300 format_branch_length):
301 """Return a function that creates a nicely formatted node tag.""" 302 if plain: 303 # Plain tree only. That's easy. 304 def make_info_string(clade, terminal=False): 305 return _get_comment(clade)
306 307 elif confidence_as_branch_length: 308 # Support as branchlengths (eg. PAUP), ignore actual branchlengths 309 def make_info_string(clade, terminal=False): 310 if terminal: 311 # terminal branches have 100% support 312 return (':' + format_confidence % max_confidence) + _get_comment(clade) 313 else: 314 return (':' + format_confidence % clade.confidence) + _get_comment(clade) 315 316 elif branch_length_only: 317 # write only branchlengths, ignore support 318 def make_info_string(clade, terminal=False): 319 return (':' + format_branch_length % clade.branch_length) + _get_comment(clade) 320 321 else: 322 # write support and branchlengths (e.g. .con tree of mrbayes) 323 def make_info_string(clade, terminal=False): 324 if (terminal or 325 not hasattr(clade, 'confidence') or 326 clade.confidence is None): 327 return (':' + format_branch_length 328 ) % (clade.branch_length or 0.0) + _get_comment(clade) 329 else: 330 return (':' + format_confidence + ':' + format_branch_length 331 ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade) 332 333 return make_info_string 334