Package Bio :: Package Phylo :: Module NewickIO
[hide private]
[frames | no frames]

Source Code for Module Bio.Phylo.NewickIO

  1  # Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com) 
  2  # Based on Bio.Nexus, copyright 2005-2008 by Frank Kauff & Cymon J. Cox. 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license. Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """I/O function wrappers for the Newick file format. 
  9   
 10  See: http://evolution.genetics.washington.edu/phylip/newick_doc.html 
 11  """ 
 12   
 13  __docformat__ = "restructuredtext en" 
 14   
 15  import re 
 16  from Bio._py3k import StringIO 
 17   
 18  from Bio.Phylo import Newick 
class NewickError(Exception):
    """Raised when a Newick tree cannot be built from the given input."""
# Lexical grammar of the Newick format: (regex, token name) pairs.
# The order matters: `tokenizer` joins the patterns into one alternation,
# and the regex engine tries the alternatives from left to right.
tokens = [
    (r"\(", 'open parens'),
    (r"\)", 'close parens'),
    (r"[^\s\(\)\[\]\'\:\;\,]+", 'unquoted node label'),
    # An optional sign is accepted so that a negative branch length such as
    # ":-0.1" is lexed as one edge-length token instead of a stray ':'
    # followed by the unquoted label "-0.1".
    (r"\:[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?", 'edge length'),
    (r"\,", 'comma'),
    (r"\[(\\.|[^\]])*\]", 'comment'),
    (r"\'(\\.|[^\'])*\'", 'quoted node label'),
    (r"\;", 'semicolon'),
    (r"\n", 'newline'),
]
# Single compiled pattern that matches any one token.
tokenizer = re.compile('(%s)' % '|'.join(pattern for pattern, _ in tokens))
# Token name -> compiled pattern for that token alone.
token_dict = {name: re.compile(pattern) for pattern, name in tokens}
# ---------------------------------------------------------
# Public API

def parse(handle, **kwargs):
    """Iterate over the trees in a Newick file handle.

    Any keyword arguments are forwarded unchanged to `Parser.parse`.

    :returns: generator of Bio.Phylo.Newick.Tree objects.
    """
    parser = Parser(handle)
    return parser.parse(**kwargs)
50
def write(trees, handle, plain=False, **kwargs):
    """Write trees in Newick format to the given file handle.

    `plain` and any extra keyword arguments are forwarded to `Writer.write`.

    :returns: number of trees written.
    """
    writer = Writer(trees)
    return writer.write(handle, plain=plain, **kwargs)
58
59 60 # --------------------------------------------------------- 61 # Input 62 63 -def _parse_confidence(text):
64 if text.isdigit(): 65 return int(text) 66 # NB: Could make this more consistent by treating as a percentage 67 # return int(text) / 100. 68 try: 69 return float(text) 70 # NB: This should be in [0.0, 1.0], but who knows what people will do 71 # assert 0 <= current_clade.confidence <= 1 72 except ValueError: 73 return None
74
75 76 -def _format_comment(text):
77 return '[%s]' % (text.replace('[', '\\[').replace(']', '\\]'))
78
79 80 -def _get_comment(clade):
81 if hasattr(clade, 'comment') and clade.comment: 82 return _format_comment(str(clade.comment)) 83 else: 84 return ''
85
class Parser(object):
    """Parse a Newick tree given a file handle.

    Based on the parser in `Bio.Nexus.Trees`.
    """

    def __init__(self, handle):
        """Store the iterable of text lines that `parse` will read."""
        self.handle = handle

    @classmethod
    def from_string(cls, treetext):
        """Build a parser that reads from the given string of Newick text."""
        handle = StringIO(treetext)
        return cls(handle)

    def parse(self, values_are_confidence=False, comments_are_confidence=False,
              rooted=False):
        """Parse the text stream this object was initialized with.

        This is a generator: lines are accumulated (right-stripped) until
        the buffer ends with ';', and one tree is yielded per such buffer.
        Leftover text with no terminating ';' is parsed as a final tree.

        :Parameters:
            values_are_confidence : bool
                store the number after ':' as the clade's confidence
                instead of its branch length.
            comments_are_confidence : bool
                try to interpret each bracketed comment as a confidence
                value.
            rooted : bool
                passed through as the `rooted` flag of each tree built.

        :raises NewickError: if the first line starts with a byte order
            mark character, or if a tree is malformed.
        """
        self.values_are_confidence = values_are_confidence
        self.comments_are_confidence = comments_are_confidence
        self.rooted = rooted
        buf = ''
        unicodeChecked = False
        unicodeLines = ("\xef", "\xff", "\xfe", "\x00")
        for line in self.handle:
            if not unicodeChecked:
                # check for unicode byte order marks on first line only,
                # these lead to parsing errors (on Python 2)
                if line.startswith(unicodeLines):
                    raise NewickError("The file or stream you attempted to parse includes "
                                      "unicode byte order marks. You must convert it to "
                                      "ASCII before it can be parsed.")
                unicodeChecked = True
            buf += line.rstrip()
            if buf.endswith(';'):
                yield self._parse_tree(buf)
                buf = ''
        if buf:
            # Last tree is missing a terminal ';' character -- that's OK
            yield self._parse_tree(buf)

    def _parse_tree(self, text):
        """Parse the text representation of one tree into a Tree object.

        :raises NewickError: on a close parenthesis with no matching open
            one, on unequal counts of open and close parentheses, or when
            any token follows the terminating semicolon.
        """
        tokens = re.finditer(tokenizer, text.strip())

        new_clade = self.new_clade
        root_clade = new_clade()

        current_clade = root_clade

        lp_count = 0
        rp_count = 0
        for match in tokens:
            token = match.group()

            if token.startswith("'"):
                # quoted label; add characters to clade name
                current_clade.name = token[1:-1]

            elif token.startswith('['):
                # comment
                current_clade.comment = token[1:-1]
                if self.comments_are_confidence:
                    # Try to use this comment as a numeric support value
                    current_clade.confidence = _parse_confidence(current_clade.comment)

            elif token == '(':
                # start a new clade, which is a child of the current clade
                current_clade = new_clade(current_clade)
                lp_count += 1

            elif token == ',':
                # if the current clade is the root, then the external parentheses
                # are missing and a new root should be created
                if current_clade is root_clade:
                    root_clade = new_clade()
                    current_clade.parent = root_clade
                # start a new child clade at the same level as the current clade
                parent = self.process_clade(current_clade)
                current_clade = new_clade(parent)

            elif token == ')':
                # done adding children for this parent clade
                parent = self.process_clade(current_clade)
                # Compare against None explicitly so the check does not
                # depend on how the clade class defines truthiness.
                if parent is None:
                    raise NewickError('Parenthesis mismatch.')
                current_clade = parent
                rp_count += 1

            elif token == ';':
                break

            elif token.startswith(':'):
                # branch length or confidence
                value = float(token[1:])
                if self.values_are_confidence:
                    current_clade.confidence = value
                else:
                    current_clade.branch_length = value

            elif token == '\n':
                pass

            else:
                # unquoted node label
                current_clade.name = token

        if lp_count != rp_count:
            raise NewickError('Number of open/close parentheses do not match.')

        # if ; token broke out of for loop, there should be no remaining tokens
        next_token = next(tokens, None)
        if next_token is not None:
            raise NewickError('Text after semicolon in Newick tree: %s'
                              % next_token.group())

        self.process_clade(current_clade)
        self.process_clade(root_clade)
        return Newick.Tree(root=root_clade, rooted=self.rooted)

    def new_clade(self, parent=None):
        """Return a new Newick.Clade, optionally with a temporary reference
        to its parent clade."""
        clade = Newick.Clade()
        if parent is not None:
            clade.parent = parent
        return clade

    def process_clade(self, clade):
        """Final processing of a parsed clade.

        If the clade has children and a name, has no confidence yet, and
        neither confidence option is enabled, the name is tried as a
        numeric support value; on success it becomes the confidence and
        the name is cleared.

        If the clade carries a temporary `parent` reference, the clade is
        appended to that parent's children and the reference is removed.

        :returns: the parent clade, or None if the clade had no parent.
        """
        if (clade.name and
                not (self.values_are_confidence or self.comments_are_confidence) and
                clade.confidence is None and
                clade.clades):
            clade.confidence = _parse_confidence(clade.name)
            if clade.confidence is not None:
                clade.name = None

        if hasattr(clade, 'parent'):
            parent = clade.parent
            parent.clades.append(clade)
            del clade.parent
            return parent
        return None
235
# ---------------------------------------------------------
# Output

class Writer(object):
    """Based on the writer in Bio.Nexus.Trees (str, to_string)."""

    def __init__(self, trees):
        """Store the iterable of trees to be written."""
        self.trees = trees

    def write(self, handle, **kwargs):
        """Write this instance's trees to a file handle.

        Keyword arguments are forwarded to `to_strings`. Each tree is
        written on its own line.

        :returns: number of trees written.
        """
        count = 0
        for treestr in self.to_strings(**kwargs):
            handle.write(treestr + '\n')
            count += 1
        return count

    def to_strings(self, confidence_as_branch_length=False,
                   branch_length_only=False, plain=False,
                   plain_newick=True, ladderize=None, max_confidence=1.0,
                   format_confidence='%1.2f', format_branch_length='%1.5f'):
        """Return an iterable of PAUP-compatible tree lines.

        :Parameters:
            confidence_as_branch_length : bool
                write each clade's confidence after ':' in place of its
                branch length; terminal clades get `max_confidence`.
            branch_length_only : bool
                write branch lengths and omit confidence values.
            plain : bool
                write neither branch lengths nor confidence values.
                Ignored if either of the two options above is set.
            plain_newick : bool
                if true, yield only the Newick string; otherwise prefix it
                with a ``tree <name> =`` header plus weight/rooted tags.
            ladderize : str
                one of 'left', 'LEFT', 'right', 'RIGHT' to ladderize each
                tree (in place) before writing; other values are ignored.
            format_confidence, format_branch_length : str
                %-style format strings for the respective numbers.
        """
        # If there's a conflict in the arguments, we override plain=True
        if confidence_as_branch_length or branch_length_only:
            plain = False
        make_info_string = self._info_factory(
            plain, confidence_as_branch_length, branch_length_only,
            max_confidence, format_confidence, format_branch_length)

        def newickize(clade):
            """Convert a node tree to a Newick tree string, recursively."""
            label = clade.name or ''
            if label:
                unquoted_label = re.match(token_dict['unquoted node label'], label)
                # Quote the label unless the unquoted-label pattern
                # matches it in full.
                if (not unquoted_label) or (unquoted_label.end() < len(label)):
                    label = "'%s'" % label.replace(
                        '\\', '\\\\').replace("'", "\\'")

            if clade.is_terminal():
                return label + make_info_string(clade, terminal=True)
            subtrees = (newickize(sub) for sub in clade)
            return '(%s)%s' % (','.join(subtrees),
                               label + make_info_string(clade))

        # Convert each tree to a string
        for tree in self.trees:
            if ladderize in ('left', 'LEFT', 'right', 'RIGHT'):
                # Nexus compatibility shim, kind of
                tree.ladderize(reverse=(ladderize in ('right', 'RIGHT')))
            rawtree = newickize(tree.root) + ';'
            if plain_newick:
                yield rawtree
                continue
            # Nexus-style (?) notation before the raw Newick tree
            treeline = ['tree', (tree.name or 'a_tree'), '=']
            if tree.weight != 1:
                treeline.append('[&W%s]' % round(float(tree.weight), 3))
            if tree.rooted:
                treeline.append('[&R]')
            treeline.append(rawtree)
            yield ' '.join(treeline)

    def _info_factory(self, plain, confidence_as_branch_length,
                      branch_length_only, max_confidence, format_confidence,
                      format_branch_length):
        """Return a function that creates a nicely formatted node tag.

        The returned function takes ``(clade, terminal=False)`` and returns
        the text that follows the node label: an optional confidence and/or
        ':'-prefixed number, then the clade's bracketed comment, if any.
        """
        if plain:
            # Plain tree only. That's easy.
            def make_info_string(clade, terminal=False):
                return _get_comment(clade)

        elif confidence_as_branch_length:
            # Support as branchlengths (eg. PAUP), ignore actual branchlengths
            # NOTE(review): a non-terminal clade whose confidence is None
            # makes the %-format below raise TypeError; no fallback value
            # is established in this file, so that is left unchanged.
            def make_info_string(clade, terminal=False):
                if terminal:
                    # terminal branches have 100% support
                    return (':' + format_confidence % max_confidence) + _get_comment(clade)
                else:
                    return (':' + format_confidence % clade.confidence) + _get_comment(clade)

        elif branch_length_only:
            # write only branchlengths, ignore support
            def make_info_string(clade, terminal=False):
                # A missing branch length is written as 0.0, as in the
                # default mode below, instead of raising TypeError when
                # None is %-formatted.
                return (':' + format_branch_length % (clade.branch_length or 0.0)
                        ) + _get_comment(clade)

        else:
            # write support and branchlengths (e.g. .con tree of mrbayes)
            def make_info_string(clade, terminal=False):
                if (terminal or
                        not hasattr(clade, 'confidence') or
                        clade.confidence is None):
                    return (':' + format_branch_length
                            ) % (clade.branch_length or 0.0) + _get_comment(clade)
                else:
                    return (format_confidence + ':' + format_branch_length
                            ) % (clade.confidence, clade.branch_length or 0.0) + _get_comment(clade)

        return make_info_string