Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12   
 13  import re 
 14  from Bio._py3k import StringIO 
 15   
 16  from Bio.Phylo import Newick 
class NewickError(Exception):
    """Exception raised when Newick object construction cannot continue.

    Raised by the parser in this module for malformed input, e.g.
    mismatched parentheses or text following the terminating semicolon.
    """
# Newick token grammar as (regular expression, token name) pairs.
# Order matters: the combined tokenizer below tries the alternatives
# from left to right.
tokens = [
    (r"\(", 'open parens'),
    (r"\)", 'close parens'),
    (r"[^\s\(\)\[\]\'\:\;\,]+", 'unquoted node label'),
    (r"\:[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", 'edge length'),
    (r"\,", 'comma'),
    (r"\[(\\.|[^\]])*\]", 'comment'),
    (r"\'(\\.|[^\'])*\'", 'quoted node label'),
    (r"\;", 'semicolon'),
    (r"\n", 'newline'),
]

# A single pattern that matches any one token.
tokenizer = re.compile(
    '(%s)' % '|'.join(pattern for pattern, _name in tokens))

# Maps each token name to the compiled pattern for that token alone.
token_dict = {name: re.compile(pattern) for pattern, name in tokens}
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    Any keyword arguments are forwarded unchanged to `Parser.parse`.

    :returns: generator of Bio.Phylo.Newick.Tree objects.

    """
    newick_parser = Parser(handle)
    return newick_parser.parse(**kwargs)
50
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    `plain` and any further keyword arguments are forwarded to
    `Writer.write`.

    :returns: number of trees written.

    """
    newick_writer = Writer(trees)
    return newick_writer.write(handle, plain=plain, **kwargs)
59
60 61 # --------------------------------------------------------- 62 # Input 63 64 -def _parse_confidence(text):
65 if text.isdigit(): 66 return int(text) 67 # NB: Could make this more consistent by treating as a percentage 68 # return int(text) / 100. 69 try: 70 return float(text) 71 # NB: This should be in [0.0, 1.0], but who knows what people will do 72 # assert 0 <= current_clade.confidence <= 1 73 except ValueError: 74 return None
75
76 77 -def _format_comment(text):
78 return '[%s]' % (text.replace('[', '\\[').replace(']', '\\]'))
79
80 81 -def _get_comment(clade):
82 if hasattr(clade, 'comment') and clade.comment: 83 return _format_comment(str(clade.comment)) 84 else: 85 return ''
86
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Initialize file handle for the Newick Tree."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Instantiate the Newick Tree class from the given string."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False,
              comments_are_confidence=False, rooted=False):
        """Parse the text stream this object was initialized with.

        Lines are right-stripped and accumulated until the buffer ends with
        ';', at which point the buffer is parsed as one tree. Any text left
        over at the end of the stream is parsed as a final tree.

        :param values_are_confidence: store the number after ':' as the
            clade's confidence instead of its branch length.
        :param comments_are_confidence: try to interpret bracketed comments
            as numeric confidence values.
        :param rooted: passed through as the `rooted` flag of each tree.
        :returns: generator of Newick.Tree objects.
        :raises NewickError: if the first line starts with a byte order
            mark character, or if a tree is malformed.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ''
        unicodeChecked = False
        unicodeLines = ("\xef", "\xff", "\xfe", "\x00")
        for line in self.handle:
            if not unicodeChecked:
                # check for unicode byte order marks on first line only,
                # these lead to parsing errors (on Python 2)
                if line.startswith(unicodeLines):
                    raise NewickError("The file or stream you attempted to parse includes "
                                      "unicode byte order marks. You must convert it to "
                                      "ASCII before it can be parsed.")
                unicodeChecked = True
            buf += line.rstrip()
            if buf.endswith(';'):
                yield self._parse_tree(buf)
                buf = ''
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation into a Tree object.

        :param text: the Newick string for a single tree.
        :returns: a Newick.Tree built from `text`.
        :raises NewickError: on mismatched parentheses or if any token
            follows the terminating semicolon.
        """
        tokens = re.finditer(tokenizer, text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade

        lp_count = 0
        rp_count = 0
        for match in tokens:
            token = match.group()

            if token.startswith("'"):
                # quoted label; add characters to clade name
                current_clade.name = token[1:-1]

            elif token.startswith('['):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == '(':
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                lp_count += 1

            elif token == ',':
                # if the current clade is the root, then the external parentheses
                # are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)

            elif token == ')':
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                if not parent:
                    raise NewickError('Parenthesis mismatch.')
                current_clade = parent
                rp_count += 1

            elif token == ';':
                break

            elif token.startswith(':'):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == '\n':
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if lp_count != rp_count:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        next_token = next(tokens, None)
        if next_token is not None:
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % next_token.group())

        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return a new Newick.Clade, optionally with a temporary reference
        to its parent clade.
        """
        clade = Newick.Clade()
        if parent:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Final processing of a parsed clade.

        If neither confidence option is active and the clade has a name,
        child clades and no confidence yet, the name is tried as a numeric
        confidence value and cleared when that succeeds. The clade is then
        appended to its parent's `clades` and the temporary `parent`
        reference is removed.

        :returns: the parent clade, or None if `clade` had no `parent`
            attribute.
        """
        if ((clade.name) and not
                (self.values_are_confidence or self.comments_are_confidence) and
                (clade.confidence is None) and
                (clade.clades)):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if hasattr(clade, 'parent'):
            parent = clade.parent
            parent.clades.append(clade)
            del clade.parent
            return parent
        return None
240
241 242 # --------------------------------------------------------- 243 # Output 244 245 -class Writer(object):
246 """Based on the writer in Bio.Nexus.Trees (str, to_string).""" 247
248 - def __init__(self, trees):
249 """Initialize parameter for Tree Writer object.""" 250 self.trees = trees
251
252 - def write(self, handle, **kwargs):
253 """Write this instance's trees to a file handle.""" 254 count = 0 255 for treestr in self.to_strings(**kwargs): 256 handle.write(treestr + '\n') 257 count += 1 258 return count
259
260 - def to_strings(self, confidence_as_branch_length=False, 261 branch_length_only=False, plain=False, 262 plain_newick=True, ladderize=None, max_confidence=1.0, 263 format_confidence='%1.2f', format_branch_length='%1.5f'):
264 """Return an iterable of PAUP-compatible tree lines.""" 265 # If there's a conflict in the arguments, we override plain=True 266 if confidence_as_branch_length or branch_length_only: 267 plain = False 268 make_info_string = self._info_factory(plain, 269 confidence_as_branch_length, branch_length_only, max_confidence, 270 format_confidence, format_branch_length) 271 272 def newickize(clade): 273 """Convert a node tree to a Newick tree string, recursively.""" 274 label = clade.name or '' 275 if label: 276 unquoted_label = re.match(token_dict['unquoted node label'], label) 277 if (not unquoted_label) or (unquoted_label.end() < len(label)): 278 label = "'%s'" % label.replace( 279 '\\', '\\\\').replace("'", "\\'") 280 281 if clade.is_terminal(): # terminal 282 return (label + make_info_string(clade, terminal=True)) 283 else: 284 subtrees = (newickize(sub) for sub in clade) 285 return '(%s)%s' % (','.join(subtrees), 286 label + make_info_string(clade))
287 288 # Convert each tree to a string 289 for tree in self.trees: 290 if ladderize in ('left', 'LEFT', 'right', 'RIGHT'): 291 # Nexus compatibility shim, kind of 292 tree.ladderize(reverse=(ladderize in ('right', 'RIGHT'))) 293 rawtree = newickize(tree.root) + ';' 294 if plain_newick: 295 yield rawtree 296 continue 297 # Nexus-style (?) notation before the raw Newick tree 298 treeline = ['tree', (tree.name or 'a_tree'), '='] 299 if tree.weight != 1: 300 treeline.append('[&W%s]' % round(float(tree.weight), 3)) 301 if tree.rooted: 302 treeline.append('[&R]') 303 treeline.append(rawtree) 304 yield ' '.join(treeline)
305
306 - def _info_factory(self, plain, confidence_as_branch_length, 307 branch_length_only, max_confidence, format_confidence, 308 format_branch_length):
309 """Return a function that creates a nicely formatted node tag.""" 310 if plain: 311 # Plain tree only. That's easy. 312 def make_info_string(clade, terminal=False): 313 return _get_comment(clade)
314 315 elif confidence_as_branch_length: 316 # Support as branchlengths (eg. PAUP), ignore actual branchlengths 317 def make_info_string(clade, terminal=False): 318 if terminal: 319 # terminal branches have 100% support 320 return (':' + format_confidence % max_confidence) + _get_comment(clade) 321 else: 322 return (':' + format_confidence % clade.confidence) + _get_comment(clade) 323 324 elif branch_length_only: 325 # write only branchlengths, ignore support 326 def make_info_string(clade, terminal=False): 327 return (':' + format_branch_length % clade.branch_length) + _get_comment(clade) 328 329 else: 330 # write support and branchlengths (e.g. .con tree of mrbayes) 331 def make_info_string(clade, terminal=False): 332 if (terminal or 333 not hasattr(clade, 'confidence') or 334 clade.confidence is None): 335 return (':' + format_branch_length 336 ) % (clade.branch_length or 0.0) + _get_comment(clade) 337 else: 338 return (':' + format_confidence + ':' + format_branch_length 339 ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade) 340 341 return make_info_string 342