Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | [no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  #           2014-2015 by Joe Cora (standard data) 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license. Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  # 
   7  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   8  """Nexus class. Parse the contents of a NEXUS file. 
   9   
  10  Based upon 'NEXUS: An extensible file format for systematic information' 
  11  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  12  """ 
  13  from __future__ import print_function 
  14   
  15  from Bio._py3k import zip 
  16  from Bio._py3k import range 
  17  from Bio._py3k import basestring 
  18   
  19  from functools import reduce 
  20  import copy 
  21  import math 
  22  import random 
  23  import sys 
  24   
  25  from Bio import File 
  26  from Bio.Alphabet import IUPAC 
  27  from Bio.Data import IUPACData 
  28  from Bio.Seq import Seq 
  29   
  30  from Bio.Nexus.StandardData import StandardData 
  31  from Bio.Nexus.Trees import Tree 
  32   
  33   
  34  INTERLEAVE = 70 
  35  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  36                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  37                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  38  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  39  PUNCTUATION = '()[]{}\,;:=*\'"`+-<>' 
  40  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  41  WHITESPACE = ' \t\n' 
  42  # SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 
  43  SPECIALCOMMENTS = ['&']  # supported special comment ('tree' command), all others are ignored 
  44  CHARSET = 'chars' 
  45  TAXSET = 'taxa' 
  46  CODONPOSITIONS = 'codonpositions' 
  47  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  48   
  49   
class NexusError(Exception):
    """Exception raised by this module for malformed or unsupported NEXUS input."""
52 53
class CharBuffer(object):
    """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE).

    This class is not intended for public use (any more).

    The text is stored in ``self.buffer`` as a list of single characters;
    the reading methods consume characters from the front of that list.
    """

    def __init__(self, string):
        """Fill the buffer with the characters of string (falsy input gives an empty buffer)."""
        if string:
            self.buffer = list(string)
        else:
            self.buffer = []

    def peek(self):
        """Return the next character without consuming it, or None if the buffer is empty."""
        if self.buffer:
            return self.buffer[0]
        else:
            return None

    def peek_nonwhitespace(self):
        """Return the first character left after str.strip(), without consuming anything.

        Returns None if only whitespace (or nothing) is left.
        """
        b = ''.join(self.buffer).strip()
        if b:
            return b[0]
        else:
            return None

    def __next__(self):
        """Remove and return the next character, or None when the buffer is exhausted.

        Note that exhaustion is signalled by None, not by StopIteration.
        """
        if self.buffer:
            return self.buffer.pop(0)
        else:
            return None

    if sys.version_info[0] < 3:
        def next(self):
            """Deprecated Python 2 style alias for Python 3 style __next__ method."""
            return self.__next__()

    def next_nonwhitespace(self):
        """Consume characters up to and including the next non-whitespace one.

        Returns that character, or None if the buffer runs out first.
        """
        while True:
            p = next(self)
            if p is None:
                break
            if p not in WHITESPACE:
                return p
        return None

    def skip_whitespace(self):
        """Drop leading whitespace characters from the buffer.

        Safe on an empty or all-whitespace buffer (previously this raised
        IndexError once the buffer ran empty), and the buffer is re-sliced
        once instead of once per skipped character.
        """
        buf = self.buffer
        pos = 0
        while pos < len(buf) and buf[pos] in WHITESPACE:
            pos += 1
        if pos:
            self.buffer = buf[pos:]

    def next_until(self, target):
        """Consume and return the text in front of a target character.

        target is an iterable of characters. They are tried in the order
        given, and the first one that occurs anywhere in the buffer wins
        (this is not necessarily the one occurring earliest in the buffer).
        The text before it is removed from the buffer and returned; the
        target character itself stays in the buffer. Returns None, leaving
        the buffer untouched, if none of the characters is present.
        """
        for t in target:
            try:
                pos = self.buffer.index(t)
            except ValueError:
                pass
            else:
                found = ''.join(self.buffer[:pos])
                self.buffer = self.buffer[pos:]
                return found
        else:
            return None

    def peek_word(self, word):
        """Return True if the buffer starts with word (nothing is consumed)."""
        return ''.join(self.buffer[:len(word)]) == word

    def next_word(self):
        """Return the next NEXUS word from a string.

        This deals with single and double quotes, whitespace and punctuation.
        Returns None if only whitespace is left. A quoted word is returned
        including its enclosing quotes; a doubled quote inside it is reduced
        to a single quote character.
        """
        word = []
        quoted = False
        # get first character
        first = self.next_nonwhitespace()
        if not first:
            # return empty if only whitespace left
            return None
        word.append(first)
        if first == "'":
            quoted = "'"
        elif first == '"':
            quoted = '"'
        elif first in PUNCTUATION:
            # if it's non-quote punctuation, return immediately
            return first
        while True:
            c = self.peek()
            if c == quoted:  # a quote?
                word.append(next(self))  # store quote
                if self.peek() == quoted:  # double quote
                    next(self)  # skip second quote
                elif quoted:  # second single quote ends word
                    break
            elif quoted:
                # if quoted, then add anything
                word.append(next(self))
            elif not c or c in PUNCTUATION or c in WHITESPACE:
                # if not quoted and special character, stop
                break
            else:
                word.append(next(self))  # standard character
        return ''.join(word)

    def rest(self):
        """Return the rest of the string without parsing."""
        return ''.join(self.buffer)
168 169
class StepMatrix(object):
    """Calculate a stepmatrix for weighted parsimony.

    See Wheeler (1990), Cladistics 6:269-275.

    Values are stored in ``self.data`` under a two-character key built from
    an unordered pair of symbols, with the smaller symbol first.
    """

    def __init__(self, symbols, gap):
        """Create a matrix with a zero entry for every pair of distinct symbols.

        symbols are sorted; gap, if truthy, is appended after them.
        """
        self.data = {}
        self.symbols = sorted(symbols)
        if gap:
            self.symbols.append(gap)
        for x in self.symbols:
            for y in [s for s in self.symbols if s != x]:
                self.set(x, y, 0)

    def set(self, x, y, value):
        """Store value for the unordered symbol pair (x, y)."""
        if x > y:
            x, y = y, x
        self.data[x + y] = value

    def add(self, x, y, value):
        """Add value to the entry of the unordered symbol pair (x, y)."""
        if x > y:
            x, y = y, x
        self.data[x + y] += value

    def sum(self):
        """Return the sum of all entries (0 for a matrix without entries).

        The builtin sum() is used rather than reduce() without an initial
        value, which raised TypeError when the matrix had no entries.
        """
        return sum(self.data.values())

    def transformation(self):
        """Divide every entry by the total of all entries, in place; return self.

        Nothing is changed if the total is zero.
        """
        total = self.sum()
        if total != 0:
            for k in self.data:
                self.data[k] = self.data[k] / float(total)
        return self

    def weighting(self):
        """Replace every non-zero entry by its negative natural log, in place; return self."""
        for k in self.data:
            if self.data[k] != 0:
                self.data[k] = -math.log(self.data[k])
        return self

    def smprint(self, name='your_name_here'):
        """Return the stepmatrix as a 'usertype' definition string.

        Despite the name nothing is printed. Zero entries are written as
        'inf.' and the diagonal as '.'.
        """
        matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols))
        matrix += ' %s\n' % ' '.join(self.symbols)
        for x in self.symbols:
            matrix += '[%s]'.ljust(8) % x
            for y in self.symbols:
                if x == y:
                    matrix += ' . '
                else:
                    if x > y:
                        x1, y1 = y, x
                    else:
                        x1, y1 = x, y
                    if self.data[x1 + y1] == 0:
                        matrix += 'inf. '
                    else:
                        matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1])
            matrix += '\n'
        matrix += ';\n'
        return matrix
233 234
def safename(name, mrbayes=False):
    """Return a taxon identifier according to NEXUS standard.

    By default single quotes in the name are doubled, and the result is
    wrapped in single quotes if it contains whitespace or punctuation.

    mrbayes=True: spaces become underscores and every character not listed
    in MRBAYESSAFE is dropped, giving a name without quotes, whitespace or
    punctuation for the mrbayes software package.
    """
    if not mrbayes:
        escaped = name.replace("'", "''")
        special = WHITESPACE + PUNCTUATION
        if any(ch in special for ch in escaped):
            return "'" + escaped + "'"
        return escaped
    underscored = name.replace(' ', '_')
    return ''.join(ch for ch in underscored if ch in MRBAYESSAFE)
252 253
def quotestrip(word):
    """Remove quotes and/or double quotes around identifiers.

    Matching pairs of enclosing single or double quotes are peeled off
    repeatedly. Returns None for an empty or None word.
    """
    if not word:
        return None
    stripped = word
    while True:
        for quote in ("'", '"'):
            if stripped.startswith(quote) and stripped.endswith(quote):
                stripped = stripped[1:-1]
                break
        else:
            # neither kind of quote encloses the word any more
            return stripped
261 262
def get_start_end(sequence, skiplist=('-', '?')):
    """Return position of first and last character which is not in skiplist.

    Skiplist defaults to ('-', '?'). Returns (None, None) for an empty
    sequence and (-1, -1) if every character is in skiplist.
    """
    length = len(sequence)
    if length == 0:
        return None, None
    start = next((pos for pos in range(length)
                  if sequence[pos] not in skiplist), length)
    end = next((pos for pos in range(length - 1, -1, -1)
                if sequence[pos] not in skiplist), -1)
    if start == length and end == -1:  # nothing but skipped characters
        return -1, -1
    return start, end
281 282
283 -def _sort_keys_by_values(p):
284 """Returns a sorted list of keys of p sorted by values of p.""" 285 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
286 287
288 -def _make_unique(l):
289 """Check that all values in list are unique and return a pruned and sorted list.""" 290 return sorted(set(l))
291 292
293 -def _unique_label(previous_labels, label):
294 """Returns a unique name if label is already in previous_labels.""" 295 while label in previous_labels: 296 label_split = label.split('.') 297 if label_split[-1].startswith('copy'): 298 copy_num = 1 299 if label_split[-1] != "copy": 300 copy_num = int(label_split[-1][4:]) + 1 301 new_label = "%s.copy%s" % ('.'.join(label_split[:-1]), copy_num) 302 label = new_label 303 else: 304 label += '.copy' 305 return label
306 307
308 -def _seqmatrix2strmatrix(matrix):
309 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 310 return dict((t, str(matrix[t])) for t in matrix)
311 312
313 -def _compact4nexus(orig_list):
314 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 315 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 316 """ 317 if not orig_list: 318 return '' 319 orig_list = sorted(set(orig_list)) 320 shortlist = [] 321 clist = orig_list[:] 322 clist.append(clist[-1] + .5) # dummy value makes it easier 323 while len(clist) > 1: 324 step = 1 325 for i, x in enumerate(clist): 326 if x == clist[0] + i * step: # are we still in the right step? 327 continue 328 elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]: 329 # second element, and possibly at least 3 elements to link, 330 # and the next one is in the right step 331 step = x - clist[0] 332 else: # pattern broke, add all values before current position to new list 333 sub = clist[:i] 334 if len(sub) == 1: 335 shortlist.append(str(sub[0] + 1)) 336 else: 337 if step == 1: 338 shortlist.append('%d-%d' % (sub[0] + 1, sub[-1] + 1)) 339 else: 340 shortlist.append('%d-%d\\%d' % (sub[0] + 1, sub[-1] + 1, step)) 341 clist = clist[i:] 342 break 343 return ' '.join(shortlist)
344 345
def combine(matrices):
    """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.

    combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
    Character sets, character partitions and taxon sets are prefixed, readjusted
    and present in the combined matrix.

    The result starts as a deep copy of the first instance; the sequences of
    the other instances are appended to it. Taxa missing from one of the
    matrices are padded with the missing-data symbol. Returns None for an
    empty list.
    """
    if not matrices:
        return None
    name = matrices[0][0]
    combined = copy.deepcopy(matrices[0][1])  # initiate with copy of first matrix
    mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1)
    if mixed_datatypes:
        # dealing with mixed matrices is application specific.
        # You take care of that yourself!
        combined.datatype = 'None'
        # raise NexusError('Matrices must be of same datatype')
    combined.charlabels = None
    combined.statelabels = None
    combined.interleave = False
    combined.translate = None

    # Rename taxon sets and character sets and name them with prefix.
    # The renamed dictionaries are built separately: inserting into and
    # deleting from a dict while iterating over its items() is unsafe on
    # Python 3 and could prefix or skip keys repeatedly.
    combined.charsets = dict(('%s.%s' % (name, cn), cs)
                             for cn, cs in combined.charsets.items())
    combined.taxsets = dict(('%s.%s' % (name, tn), ts)
                            for tn, ts in combined.taxsets.items())
    # previous partitions usually don't make much sense in combined matrix
    # just initiate one new partition parted by single matrices
    combined.charpartitions = {'combined': {name: list(range(combined.nchar))}}
    for n, m in matrices[1:]:  # add all other matrices
        both = [t for t in combined.taxlabels if t in m.taxlabels]
        combined_only = [t for t in combined.taxlabels if t not in both]
        m_only = [t for t in m.taxlabels if t not in both]
        for t in both:
            # concatenate sequences and unify gap and missing character symbols
            combined.matrix[t] += Seq(str(m.matrix[t])
                                      .replace(m.gap, combined.gap)
                                      .replace(m.missing, combined.missing),
                                      combined.alphabet)
        # replace data of missing taxa with symbol for missing data
        for t in combined_only:
            combined.matrix[t] += Seq(combined.missing * m.nchar,
                                      combined.alphabet)
        for t in m_only:
            combined.matrix[t] = Seq(combined.missing * combined.nchar,
                                     combined.alphabet) + \
                Seq(str(m.matrix[t])
                    .replace(m.gap, combined.gap)
                    .replace(m.missing, combined.missing),
                    combined.alphabet)
        combined.taxlabels.extend(m_only)  # new taxon list
        for cn, cs in m.charsets.items():  # adjust character sets for new matrix
            combined.charsets['%s.%s' % (n, cn)] = [x + combined.nchar for x in cs]
        if m.taxsets:
            if not combined.taxsets:
                combined.taxsets = {}
            # update taxon sets
            combined.taxsets.update(dict(('%s.%s' % (n, tn), ts)
                                         for tn, ts in m.taxsets.items()))
        # update new charpartition
        combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar + m.nchar))
        # update charlabels
        if m.charlabels:
            if not combined.charlabels:
                combined.charlabels = {}
            combined.charlabels.update(dict((combined.nchar + i, label)
                                            for (i, label) in m.charlabels.items()))
        combined.nchar += m.nchar  # update nchar and ntax
        combined.ntax += len(m_only)

    # some prefer partitions, some charsets:
    # make separate charset for each initial dataset
    for c in combined.charpartitions['combined']:
        combined.charsets[c] = combined.charpartitions['combined'][c]

    return combined
425 426
427 -def _kill_comments_and_break_lines(text):
428 """Delete []-delimited comments out of a file and break into lines separated by ';'. 429 430 stripped_text=_kill_comments_and_break_lines(text): 431 Nested and multiline comments are allowed. [ and ] symbols within single 432 or double quotes are ignored, newline ends a quote, all symbols with quotes are 433 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 434 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 435 Quotes inside special [& and [\ are treated as normal characters, 436 but no nesting inside these special comments allowed (like [& [\ ]]). 437 ';' ist deleted from end of line. 438 439 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 440 """ 441 contents = iter(text) 442 newtext = [] 443 newline = [] 444 quotelevel = '' 445 speciallevel = False 446 commlevel = 0 447 # Parse with one character look ahead (for special comments) 448 t2 = next(contents) 449 while True: 450 t = t2 451 try: 452 t2 = next(contents) 453 except StopIteration: 454 t2 = None 455 if t is None: 456 break 457 if t == quotelevel and not (commlevel or speciallevel): 458 # matching quote ends quotation 459 quotelevel = '' 460 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 461 # single or double quote starts quotation 462 quotelevel = t 463 elif not quotelevel and t == '[': 464 # opening bracket outside a quote 465 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 466 speciallevel = True 467 else: 468 commlevel += 1 469 elif not quotelevel and t == ']': 470 # closing bracket ioutside a quote 471 if speciallevel: 472 speciallevel = False 473 else: 474 commlevel -= 1 475 if commlevel < 0: 476 raise NexusError('Nexus formatting error: unmatched ]') 477 continue 478 if commlevel == 0: 479 # copy if we're not in comment 480 if t == ';' and not quotelevel: 481 newtext.append(''.join(newline)) 482 newline = [] 483 else: 
484 newline.append(t) 485 # level of comments should be 0 at the end of the file 486 if newline: 487 newtext.append('\n'.join(newline)) 488 if commlevel > 0: 489 raise NexusError('Nexus formatting error: unmatched [') 490 return newtext
491 492
493 -def _adjust_lines(lines):
494 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 495 496 list_of_commandlines=_adjust_lines(input_text) 497 Lines are adjusted so that no linebreaks occur within a commandline 498 (except matrix command line) 499 """ 500 formatted_lines = [] 501 for l in lines: 502 # Convert line endings 503 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 504 if l.lower().startswith('matrix'): 505 formatted_lines.append(l) 506 else: 507 l = l.replace('\n', ' ') 508 if l: 509 formatted_lines.append(l) 510 return formatted_lines
511 512
513 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
514 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 515 opening = seq.find('(') 516 while opening > -1: 517 closing = seq.find(')') 518 if closing < 0: 519 raise NexusError('Missing closing parenthesis in: ' + seq) 520 elif closing < opening: 521 raise NexusError('Missing opening parenthesis in: ' + seq) 522 ambig = ''.join(sorted(seq[opening + 1:closing])) 523 ambig_code = rev_ambig_values[ambig.upper()] 524 if ambig != ambig.upper(): 525 ambig_code = ambig_code.lower() 526 seq = seq[:opening] + ambig_code + seq[closing + 1:] 527 opening = seq.find('(') 528 return seq
529 530
class Commandline(object):
    """Represent a commandline as command and options.

    ``command`` is the lower-cased first word of the line. For commands
    listed in SPECIAL_COMMANDS ``options`` is the remaining text as one
    stripped string; for all other commands it is a dict mapping lower-cased
    option names to their value, or to None for options without '='.
    """

    def __init__(self, line, title):
        """Split line into command and options.

        The title argument is accepted but not used here.
        Raises NexusError if the options cannot be parsed.
        """
        self.options = {}
        options = []
        self.command = None
        try:
            # Assume matrix (all other command lines have been stripped of \n)
            self.command, options = line.strip().split('\n', 1)
        except ValueError:  # Not matrix
            # self.command,options=line.split(' ',1)  # no: could be tab or spaces (translate...)
            self.command = line.split()[0]
            options = ' '.join(line.split()[1:])
        self.command = self.command.strip().lower()
        if self.command in SPECIAL_COMMANDS:
            # special command that need newlines and order of options preserved
            self.options = options.strip()
        else:
            if len(options) > 0:
                try:
                    options = options.replace('=', ' = ').split()
                    last = len(options) - 1
                    # An '=' needs a token on both sides to form a key/value
                    # pair. The right-hand bound used to be len(options),
                    # which is never reached, so a trailing '=' raised
                    # IndexError below. A leading or trailing '=' is now
                    # treated as a plain token in both cases.
                    valued_indices = [(n - 1, n, n + 1) for n in range(len(options))
                                      if options[n] == '=' and n != 0 and n != last]
                    indices = []
                    for sl in valued_indices:
                        indices.extend(sl)
                    token_indices = [n for n in range(len(options)) if n not in indices]
                    for opt in valued_indices:
                        # option names are lower-cased, values keep their case
                        self.options[options[opt[0]].lower()] = options[opt[2]]
                    for token in token_indices:
                        self.options[options[token].lower()] = None
                except ValueError:
                    raise NexusError('Incorrect formatting in line: %s' % line)
566 567
class Block(object):
    """Represent a NEXUS block with block name and list of commandlines."""

    def __init__(self, title=None):
        """Create an empty block, optionally with a title."""
        # command lines are added by the code that builds the block
        self.commandlines = []
        self.title = title
574 575
576 -class Nexus(object):
577
578 - def __init__(self, input=None):
579 self.ntax = 0 # number of taxa 580 self.nchar = 0 # number of characters 581 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 582 self.taxlabels = [] # labels for taxa, ordered by their id 583 self.charlabels = None # ... and for characters 584 self.statelabels = None # ... and for states 585 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 586 self.respectcase = False # case sensitivity 587 self.missing = '?' # symbol for missing characters 588 self.gap = '-' # symbol for gap 589 self.symbols = None # set of symbols 590 self.equate = None # set of symbol synonyms 591 self.matchchar = None # matching char for matrix representation 592 self.labels = None # left, right, no 593 self.transpose = False # whether matrix is transposed 594 self.interleave = False # whether matrix is interleaved 595 self.tokens = False # unsupported 596 self.eliminate = None # unsupported 597 self.matrix = None # ... 598 self.unknown_blocks = [] # blocks we don't care about 599 self.taxsets = {} 600 self.charsets = {} 601 self.charpartitions = {} 602 self.taxpartitions = {} 603 self.trees = [] # list of Trees (instances of Tree class) 604 self.translate = None # Dict to translate taxon <-> taxon numbers 605 self.structured = [] # structured input representation 606 self.set = {} # dict of the set command to set various options 607 self.options = {} # dict of the options command in the data block 608 self.codonposset = None # name of the charpartition that defines codon positions 609 610 # some defaults 611 self.options['gapmode'] = 'missing' 612 613 if input: 614 self.read(input) 615 else: 616 self.read(DEFAULTNEXUS)
617
618 - def get_original_taxon_order(self):
619 """Included for backwards compatibility (DEPRECATED).""" 620 return self.taxlabels
621
622 - def set_original_taxon_order(self, value):
623 """Included for backwards compatibility (DEPRECATED).""" 624 self.taxlabels = value
    # Deprecated alias: reading or assigning original_taxon_order goes through
    # the getter/setter above and therefore reads or replaces self.taxlabels.
    original_taxon_order = property(get_original_taxon_order, set_original_taxon_order)
628 - def read(self, input):
629 """Read and parse NEXUS input (a filename, file-handle, or string).""" 630 # 1. Assume we have the name of a file in the execution dir or a 631 # file-like object. 632 # Note we need to add parsing of the path to dir/filename 633 try: 634 with File.as_handle(input, 'rU') as fp: 635 file_contents = fp.read() 636 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 637 except (TypeError, IOError, AttributeError): 638 # 2. Assume we have a string from a fh.read() 639 if isinstance(input, basestring): 640 file_contents = input 641 self.filename = 'input_string' 642 else: 643 print(input.strip()[:50]) 644 raise NexusError('Unrecognized input: %s ...' % input[:100]) 645 file_contents = file_contents.strip() 646 if file_contents.startswith('#NEXUS'): 647 file_contents = file_contents[6:] 648 commandlines = _get_command_lines(file_contents) 649 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 650 for i, cl in enumerate(commandlines): 651 try: 652 if cl[:6].upper() == '#NEXUS': 653 commandlines[i] = cl[6:].strip() 654 except IndexError: 655 pass 656 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 657 nexus_block_gen = self._get_nexus_block(commandlines) 658 while True: 659 try: 660 title, contents = next(nexus_block_gen) 661 except StopIteration: 662 break 663 if title in KNOWN_NEXUS_BLOCKS: 664 self._parse_nexus_block(title, contents) 665 else: 666 self._unknown_nexus_block(title, contents)
667
668 - def _get_nexus_block(self, file_contents):
669 """Generator for looping through Nexus blocks.""" 670 inblock = False 671 blocklines = [] 672 while file_contents: 673 cl = file_contents.pop(0) 674 if cl.lower().startswith('begin'): 675 if not inblock: 676 inblock = True 677 title = cl.split()[1].lower() 678 else: 679 raise NexusError('Illegal block nesting in block %s' % title) 680 elif cl.lower().startswith('end'): 681 if inblock: 682 inblock = False 683 yield title, blocklines 684 blocklines = [] 685 else: 686 raise NexusError('Unmatched \'end\'.') 687 elif inblock: 688 blocklines.append(cl)
689
690 - def _unknown_nexus_block(self, title, contents):
691 block = Block() 692 block.commandlines.append(contents) 693 block.title = title 694 self.unknown_blocks.append(block)
695
696 - def _parse_nexus_block(self, title, contents):
697 """Parse a known Nexus Block (PRIVATE).""" 698 # attached the structured block representation 699 self._apply_block_structure(title, contents) 700 # now check for taxa,characters,data blocks. If this stuff is defined more than once 701 # the later occurrences will override the previous ones. 702 block = self.structured[-1] 703 for line in block.commandlines: 704 try: 705 getattr(self, '_' + line.command)(line.options) 706 except AttributeError: 707 raise NexusError('Unknown command: %s ' % line.command)
708
709 - def _title(self, options):
710 pass
711 714
715 - def _dimensions(self, options):
716 if 'ntax' in options: 717 self.ntax = eval(options['ntax']) 718 if 'nchar' in options: 719 self.nchar = eval(options['nchar'])
720
721 - def _format(self, options):
722 # print options 723 # we first need to test respectcase, then symbols (which depends on respectcase) 724 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 725 # dicts for ambiguous values and alphabet 726 if 'respectcase' in options: 727 self.respectcase = True 728 # adjust symbols to for respectcase 729 if 'symbols' in options: 730 self.symbols = ''.join(options['symbols'].split()) 731 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or \ 732 (self.symbols.startswith("'") and self.symbols.endswith("'")): 733 self.symbols = self.symbols[1:-1] 734 if not self.respectcase: 735 self.symbols = list(self.symbols.upper()) 736 # self.symbols = self.symbols.lower() + self.symbols.upper() 737 # self.symbols = list(set(self.symbols)) 738 if 'datatype' in options: 739 self.datatype = options['datatype'].lower() 740 741 if self.datatype == 'dna' or self.datatype == 'nucleotide': 742 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 743 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 744 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 745 elif self.datatype == 'rna': 746 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 747 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 748 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 749 elif self.datatype == 'protein': 750 # TODO - Should this not be ExtendedIUPACProtein? 751 self.alphabet = IUPAC.IUPACProtein() # fresh instance 752 self.ambiguous_values = {'B': 'DN', 753 'Z': 'EQ', 754 'X': IUPACData.protein_letters} 755 # that's how PAUP handles it 756 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 757 elif self.datatype == 'standard': 758 self.alphabet = None 759 self.ambiguous_values = {} 760 if not self.symbols: 761 # PARSER BUG ## 762 # This error arises when symbols are absent or when 763 # whitespace is located within the SYMBOLS command values. 
764 # The Nexus parser quits reading the SYMBOLS line upon 765 # finding a whitespace character. 766 raise NexusError( 767 "Symbols must be defined when using standard datatype. " 768 "Please remove any whitespace (spaces, tabs, etc.) " 769 "between values for symbols as this confuses the Nexus " 770 "parser.") 771 772 self.unambiguous_letters = ''.join(self.symbols) 773 if not self.respectcase: 774 self.unambiguous_letters += self.unambiguous_letters.lower() 775 else: 776 raise NexusError('Unsupported datatype: ' + self.datatype) 777 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 778 if not self.respectcase: 779 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 780 # we have to sort the reverse ambig coding dict key characters: 781 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 782 rev = dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0] != 'X') 783 self.rev_ambiguous_values = {} 784 for (k, v) in rev.items(): 785 key = sorted(c for c in k) 786 self.rev_ambiguous_values[''.join(key)] = v 787 # overwrite symbols for datype rna,dna,nucleotide 788 if self.datatype in ['dna', 'rna', 'nucleotide']: 789 self.symbols = self.alphabet.letters 790 if self.missing not in self.ambiguous_values: 791 self.ambiguous_values[self.missing] = self.unambiguous_letters + self.gap 792 self.ambiguous_values[self.gap] = self.gap 793 # elif self.datatype == 'standard': 794 # if not self.symbols: 795 # self.symbols = ['0', '1'] 796 if 'missing' in options: 797 self.missing = options['missing'][0] 798 if 'gap' in options: 799 self.gap = options['gap'][0] 800 if 'equate' in options: 801 self.equate = options['equate'] 802 if 'matchchar' in options: 803 self.matchchar = options['matchchar'][0] 804 if 'labels' in options: 805 self.labels = options['labels'] 806 if 'transpose' in options: 807 raise NexusError('TRANSPOSE is not supported!') 808 self.transpose = True 809 if 'interleave' in options: 810 if 
options['interleave'] is None or options['interleave'].lower() == 'yes': 811 self.interleave = True 812 if 'tokens' in options: 813 self.tokens = True 814 if 'notokens' in options: 815 self.tokens = False
816
817 - def _set(self, options):
818 self.set = options
819
820 - def _options(self, options):
821 self.options = options
822
823 - def _eliminate(self, options):
824 self.eliminate = options
825
826 - def _taxlabels(self, options):
827 """Get taxon labels (PRIVATE). 828 829 As the taxon names are already in the matrix, this is superfluous 830 except for transpose matrices, which are currently unsupported anyway. 831 Thus, we ignore the taxlabels command to make handling of duplicate 832 taxon names easier. 833 """ 834 pass
835 # self.taxlabels = [] 836 # opts = CharBuffer(options) 837 # while True: 838 # taxon = quotestrip(opts.next_word()) 839 # if not taxon: 840 # break 841 # self.taxlabels.append(taxon) 842
843 - def _check_taxlabels(self, taxon):
844 """Check for presence of taxon in self.taxlabels.""" 845 # According to NEXUS standard, underscores shall be treated as spaces..., 846 # so checking for identity is more difficult 847 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 848 nexid = taxon.replace(' ', '_') 849 return nextaxa.get(nexid)
850
851 - def _charlabels(self, options):
852 """Get labels for characters.""" 853 self.charlabels = {} 854 opts = CharBuffer(options) 855 while True: 856 # get id and state 857 w = opts.next_word() 858 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 859 break 860 identifier = self._resolve(w, set_type=CHARSET) 861 state = quotestrip(opts.next_word()) 862 self.charlabels[identifier] = state 863 # check for comma or end of command 864 c = opts.next_nonwhitespace() 865 if c is None: 866 break 867 elif c != ',': 868 raise NexusError('Missing \',\' in line %s.' % options)
869
870 - def _charstatelabels(self, options):
871 self.charlabels = {} 872 self.statelabels = {} 873 opts = CharBuffer(options) 874 875 # Make sure symbols are defined 876 if not self.symbols: 877 raise NexusError( 878 'Symbols must be defined when using character states') 879 880 while True: 881 # get id and character name 882 w = opts.next_word() 883 884 # McClade saves and reads charlabel-lists with terminal comma?! 885 if w is None: 886 break 887 888 identifier = self._resolve(w, set_type=CHARSET) 889 character = quotestrip(opts.next_word()) 890 891 self.charlabels[identifier] = character 892 self.statelabels[identifier] = [] 893 894 # check for comma, slash or end of command 895 c = opts.next_nonwhitespace() 896 897 if c is None: 898 break 899 elif c != ',': 900 # Check if states are defined, otherwise report error 901 if c != '/': 902 raise NexusError('Missing \',\' in line %s.' % options) 903 904 # Get the first state 905 state = quotestrip(opts.next_word()) 906 907 if state is None: 908 raise NexusError( 909 'Missing character state in line %s.' % options) 910 911 while True: 912 # Make sure current state does not exceed number of 913 # available symbols 914 if len(self.statelabels[identifier]) > len(self.symbols): 915 raise NexusError( 916 'Character states exceed number of available symbols in line %s.' % options) 917 918 # Add the character state to the statelabels 919 self.statelabels[identifier].append(state) 920 921 # Check for another state or comma to end states (last 922 # character should not have comma at end of states - but 923 # we'll ignore) 924 state = quotestrip(opts.next_word()) 925 926 if state is None: 927 return 928 elif state is ',': 929 break
930
931 - def _statelabels(self, options):
932 # self.charlabels = options 933 # print 'Command statelabels is not supported and will be ignored.' 934 pass
935
def _matrix(self, options):
    """Build self.matrix from the body of a NEXUS 'matrix' command (PRIVATE).

    Arguments:
     - options - text of the matrix command; rows are separated by
       newlines, each row starting with the taxon name.

    For an interleaved matrix the characters of a row are taken from the
    rest of the name line or, if that is empty, from the following line;
    rows of later blocks are appended to the sequences read in the first
    block. For a non-interleaved matrix further lines are consumed until
    at least nchar characters have been collected for the taxon.

    Fills self.matrix, self.taxlabels and self.unaltered_taxlabels.

    Raises NexusError if dimensions were not set, if the number of taxa
    or characters does not match them, if the data ends in the middle of
    a taxon, or if a sequence contains a character that is neither valid
    nor the gap/missing symbol.
    """
    if not self.ntax or not self.nchar:
        raise NexusError('Dimensions must be specified before matrix!')
    self.matrix = {}
    taxcount = 0
    first_matrix_block = True

    # eliminate empty lines and leading/trailing whitespace
    lines = [row.strip() for row in options.split('\n') if row.strip() != '']
    lineiter = iter(lines)
    while True:
        try:
            line = next(lineiter)
        except StopIteration:
            if taxcount < self.ntax:
                raise NexusError('Not enough taxa in matrix.')
            elif taxcount > self.ntax:
                raise NexusError('Too many taxa in matrix.')
            else:
                break
        # count the taxa and check for interleaved matrix
        taxcount += 1
        if taxcount > self.ntax:
            if not self.interleave:
                raise NexusError('Too many taxa in matrix - should matrix be interleaved?')
            else:
                # a new interleaved block starts: count from the first taxon again
                taxcount = 1
                first_matrix_block = False
        # get taxon name and sequence
        linechars = CharBuffer(line)
        taxon_id = quotestrip(linechars.next_word())
        line = linechars.rest().strip()
        chars = ''
        if self.interleave:
            # interleaved matrix
            if line:
                chars = ''.join(line.split())
            else:
                # the sequence is expected on the line after the name; running
                # out of lines here must be reported as a format error and
                # not leak a bare StopIteration to the caller
                try:
                    chars = ''.join(next(lineiter).split())
                except StopIteration:
                    raise NexusError("Taxon %s: No sequence data found "
                                     "(check dimensions/interleaving)" % taxon_id)
        else:
            # non-interleaved matrix
            chars = ''.join(line.split())
            while len(chars) < self.nchar:
                try:
                    line = next(lineiter)
                except StopIteration:
                    raise NexusError("Taxon %s: Sequence has %d characters, nchar is %d "
                                     "(check dimensions/interleaving)"
                                     % (taxon_id, len(chars), self.nchar))
                chars += ''.join(line.split())

        # Reformat sequence for non-standard datatypes
        if self.datatype != 'standard':
            iupac_seq = Seq(_replace_parenthesized_ambigs(
                chars, self.rev_ambiguous_values), self.alphabet)
            # first taxon has the reference sequence if matchchar is used
            if taxcount == 1:
                refseq = iupac_seq
            else:
                if self.matchchar:
                    # replace every matchchar by the reference character
                    # found at the same position
                    while True:
                        p = str(iupac_seq).find(self.matchchar)
                        if p == -1:
                            break
                        iupac_seq = Seq(str(iupac_seq)[:p] + refseq[p] +
                                        str(iupac_seq)[p + 1:], self.alphabet)

            # Check for invalid characters
            for c in str(iupac_seq):
                if c not in self.valid_characters and c != self.gap and c != self.missing:
                    raise NexusError("Taxon %s: Illegal character %s in sequence %s "
                                     "(check dimensions/interleaving)" % (taxon_id, c, iupac_seq))
        else:
            iupac_seq = StandardData(chars)

            # Check for invalid characters
            for c in iupac_seq:
                # Go through each coding for each character
                for coding in c['d']:
                    if coding not in self.valid_characters:
                        if coding != self.gap and coding != self.missing:
                            raise NexusError("Taxon %s: Illegal character %s "
                                             "in sequence %s "
                                             "(check dimensions/interleaving)"
                                             % (taxon_id, coding, iupac_seq))

        # add sequence to matrix
        if first_matrix_block:
            self.unaltered_taxlabels.append(taxon_id)
            taxon_id = _unique_label(list(self.matrix.keys()), taxon_id)
            self.matrix[taxon_id] = iupac_seq
            self.taxlabels.append(taxon_id)
        else:
            # taxon names need to be in the same order in each interleaved block
            taxon_id = _unique_label(self.taxlabels[:taxcount - 1], taxon_id)
            taxon_present = self._check_taxlabels(taxon_id)
            if taxon_present:
                self.matrix[taxon_present] += iupac_seq
            else:
                raise NexusError("Taxon %s not in first block of interleaved "
                                 "matrix. Check matrix dimensions and interleave." % taxon_id)
    # check all sequences for length according to nchar
    for taxon in self.matrix:
        if len(self.matrix[taxon]) != self.nchar:
            raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s'
                             % (self.nchar, len(self.matrix[taxon]), taxon))
    # check that taxlabels is identical with matrix.keys. If not, it's a problem
    # (internal consistency check, hence an assert and not a NexusError)
    matrixkeys = sorted(self.matrix)
    taxlabelssort = sorted(self.taxlabels[:])
    assert matrixkeys == taxlabelssort, \
        "ERROR: TAXLABELS must be identical with MATRIX. " + \
        "Please Report this as a bug, and send in data file."
1044
def _translate(self, options):
    """Parse a 'translate' command into self.translate (PRIVATE).

    The command is read as comma separated 'number label' pairs; the
    numbers become integer keys of self.translate and the labels (with
    quotes stripped) the values.

    Raises NexusError if a comma is missing between two pairs, or if
    any other error occurs while the pairs are read.
    """
    self.translate = {}
    buf = CharBuffer(options)
    try:
        while True:
            # get id and state
            key = int(buf.next_word())
            self.translate[key] = quotestrip(buf.next_word())
            # a comma introduces the next pair, nothing left ends the command
            delimiter = buf.next_nonwhitespace()
            if delimiter is None:
                return
            if delimiter != ',':
                raise NexusError('Missing \',\' in line %s.' % options)
    except NexusError:
        raise
    except Exception:  # TODO: ValueError?
        raise NexusError('Format error in line %s.' % options)
1065
def _utree(self, options):
    """Handle a 'utree' command by parsing it exactly like 'tree' (PRIVATE).

    Some software (clustalx) uses 'utree' to denote an unrooted tree.
    The options are passed unchanged to self._tree.
    """
    self._tree(options)
1069
def _tree(self, options):
    """Parse a 'tree' command and append the tree to self.trees (PRIVATE).

    Expected layout: an optional '*', the tree name, '=', any number of
    special comments of the form [&R], [&U] or [&W value], and then the
    tree description itself. If self.translate is set, the taxon of
    every terminal node is replaced by its entry in that table.

    Raises NexusError if '=' is missing, if a bracketed comment does not
    start with '&', or if a terminal taxon cannot be translated.
    """
    buf = CharBuffer(options)
    if buf.peek_nonwhitespace() == '*':
        # a star can be used to make it the default tree in some software packages
        buf.next_nonwhitespace()
    tree_name = buf.next_word()
    if buf.next_nonwhitespace() != '=':
        raise NexusError('Syntax error in tree description: %s'
                         % options[:50])
    is_rooted = False
    tree_weight = 1.0
    while buf.peek_nonwhitespace() == '[':
        buf.next_nonwhitespace()  # discard opening bracket
        marker = next(buf)
        if marker != '&':
            raise NexusError('Illegal special comment [%s...] in tree description: %s'
                             % (marker, options[:50]))
        flag = next(buf)
        payload = buf.next_until(']')
        next(buf)  # discard closing bracket
        if flag in ('R', 'U'):
            is_rooted = flag == 'R'
        elif flag == 'W':
            tree_weight = float(payload)
    parsed = Tree(name=tree_name, weight=tree_weight, rooted=is_rooted,
                  tree=buf.rest().strip())
    # if there's an active translation table, translate
    if self.translate:
        for terminal in parsed.get_terminals():
            node_data = parsed.node(terminal).data
            try:
                node_data.taxon = safename(self.translate[int(node_data.taxon)])
            except (ValueError, KeyError):
                raise NexusError('Unable to substitute %s using \'translate\' data.'
                                 % node_data.taxon)
    self.trees.append(parsed)
1107
def _apply_block_structure(self, title, lines):
    """Wrap lines in a titled Block and append it to self.structured (PRIVATE).

    Arguments:
     - title - title assigned to the new block and passed to every
       Commandline.
     - lines - iterable of command lines; one Commandline is created
       per element, in order.
    """
    structured_block = Block('')
    structured_block.title = title
    for text in lines:
        command = Commandline(text, title)
        structured_block.commandlines.append(command)
    self.structured.append(structured_block)
1115
def _taxset(self, options):
    """Parse a taxset command and store it in self.taxsets (PRIVATE).

    The set is stored under its name after the parsed members have been
    passed through _make_unique.
    """
    set_name, members = self._get_indices(options, set_type=TAXSET)
    unique_members = _make_unique(members)
    self.taxsets[set_name] = unique_members
1120
def _charset(self, options):
    """Parse a charset command and store it in self.charsets (PRIVATE).

    The set is stored under its name after the parsed sites have been
    passed through _make_unique.
    """
    set_name, positions = self._get_indices(options, set_type=CHARSET)
    unique_positions = _make_unique(positions)
    self.charsets[set_name] = unique_positions
1125
def _taxpartition(self, options):
    """Parse a taxpartition command into self.taxpartitions (PRIVATE).

    The partition name is read first. The remaining text is cut into
    subpartitions at every comma that is not inside single quotes, and
    each piece ('subname: taxa') is parsed with _get_indices using ':'
    as separator. The result is stored as a dict mapping subpartition
    name to its taxa.

    Raises NexusError if no partition name can be read.
    """
    subsets = {}
    in_quotes = False
    buf = CharBuffer(options)
    partition_name = self._name_n_vector(buf)
    if not partition_name:
        raise NexusError('Formatting error in taxpartition: %s ' % options)
    # The text is split by hand: a comma could be part of a quoted
    # identifier, and each piece must be handed on unparsed to avoid
    # double-parsing and changing special nexus-words.
    pending = ''
    while True:
        char = next(buf)
        finished = char is None  # the loop relies on None marking the end
        if finished or (char == ',' and not in_quotes):
            sub_name, sub_members = self._get_indices(pending, set_type=TAXSET, separator=':')
            subsets[sub_name] = _make_unique(sub_members)
            pending = ''
            if finished:
                break
            continue
        if char == "'":
            in_quotes = not in_quotes
        pending += char
    self.taxpartitions[partition_name] = subsets
1151
1152 - def _codonposset(self, options):
1153 """Read codon positions from a codons block as written from McClade. 1154 1155 Here codonposset is just a fancy name for a character partition with 1156 the name CodonPositions and the partitions N,1,2,3 1157 """ 1158 prev_partitions = list(self.charpartitions.keys()) 1159 self._charpartition(options) 1160 # mcclade calls it CodonPositions, but you never know... 1161 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1162 if codonname == [] or len(codonname) > 1: 1163 raise NexusError('Formatting Error in codonposset: %s ' % options) 1164 else: 1165 self.codonposset = codonname[0]
1166
1167 - def _codeset(self, options):
1168 pass
1169
def _charpartition(self, options):
    """Parse a charpartition command into self.charpartitions (PRIVATE).

    The partition name is read first. The remaining text is cut into
    subpartitions at every comma that is not inside single quotes, and
    each piece ('subname: characters') is parsed with _get_indices
    using ':' as separator. The result is stored as a dict mapping
    subpartition name to its characters.

    Raises NexusError if no partition name can be read.
    """
    subsets = {}
    in_quotes = False
    buf = CharBuffer(options)
    partition_name = self._name_n_vector(buf)
    if not partition_name:
        raise NexusError('Formatting error in charpartition: %s ' % options)
    # The text is split by hand because a comma could be part of a
    # quoted identifier.
    pending = ''
    while True:
        char = next(buf)
        finished = char is None  # the loop relies on None marking the end
        if finished or (char == ',' and not in_quotes):
            sub_name, sub_members = self._get_indices(pending, set_type=CHARSET, separator=':')
            subsets[sub_name] = _make_unique(sub_members)
            pending = ''
            if finished:
                break
            continue
        if char == "'":
            in_quotes = not in_quotes
        pending += char
    self.charpartitions[partition_name] = subsets
1195
def _get_indices(self, options, set_type=CHARSET, separator='='):
    """Split a set definition into its name and its members (PRIVATE).

    e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3'
    --> [0,1,2,3,4,'dog','cat',9,12,15,18]

    Arguments:
     - options - text of the form 'name <separator> list'.
     - set_type - passed on to _parse_list to select how list elements
       are resolved.
     - separator - character expected between name and list.

    Returns a tuple (name, members). Raises NexusError if _parse_list
    reports a malformed list by returning None.
    """
    buf = CharBuffer(options)
    set_name = self._name_n_vector(buf, separator=separator)
    members = self._parse_list(buf, set_type=set_type)
    if members is not None:
        return set_name, members
    raise NexusError('Formatting error in line: %s ' % options)
1208
def _name_n_vector(self, opts, separator='='):
    """Extract name and check that it's not in vector format (PRIVATE).

    Arguments:
     - opts - character buffer positioned at the start of a set or
       partition definition; it is consumed up to and including the
       separator.
     - separator - character expected after the name (and after the
       optional parenthesized qualifier).

    A leading '*' before the name is ignored. A qualifier in
    parentheses may follow the name: '(standard)' is accepted and
    skipped, '(vector)' is rejected as unsupported, anything else is
    rejected as unknown.

    Returns the name with quotes stripped. Raises NexusError if the
    name, the closing parenthesis or the separator is missing, or if
    the qualifier is not 'standard'.
    """
    # keep the unparsed text for error messages
    rest = opts.rest()
    name = opts.next_word()
    # we ignore * before names
    if name == '*':
        name = opts.next_word()
    if not name:
        raise NexusError('Formatting error in line: %s ' % rest)
    name = quotestrip(name)
    # peek_nonwhitespace must be *called*; comparing the bound method itself
    # to '(' is always false and silently disabled the qualifier handling
    if opts.peek_nonwhitespace() == '(':
        opts.next_nonwhitespace()  # discard opening parenthesis
        qualifier = opts.next_word()  # the word comes from the buffer
        closing = opts.next_nonwhitespace()
        if not qualifier or closing != ')':
            raise NexusError('Formatting error in line: %s ' % rest)
        if qualifier.lower() == 'vector':
            raise NexusError('Unsupported VECTOR format in line %s'
                             % rest)
        elif qualifier.lower() != 'standard':
            raise NexusError('Unknown qualifier %s in line %s'
                             % (qualifier, rest))
    if opts.next_nonwhitespace() != separator:
        raise NexusError('Formatting error in line: %s ' % rest)
    return name
1232
1233 - def _parse_list(self, options_buffer, set_type):
1234 """Parse a NEXUS list (PRIVATE). 1235 1236 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1237 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1238 """ 1239 plain_list = [] 1240 if options_buffer.peek_nonwhitespace(): 1241 try: 1242 # capture all possible exceptions and treat them as formatting 1243 # errors, if they are not NexusError 1244 while True: 1245 identifier = options_buffer.next_word() # next list element 1246 if not identifier: # end of list? 1247 break 1248 start = self._resolve(identifier, set_type=set_type) 1249 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1250 end = start 1251 step = 1 1252 # get hyphen and end of range 1253 hyphen = options_buffer.next_nonwhitespace() 1254 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1255 if set_type == CHARSET: 1256 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1257 backslash = options_buffer.next_nonwhitespace() 1258 step = int(options_buffer.next_word()) # get backslash and step 1259 plain_list.extend(range(start, end + 1, step)) 1260 else: 1261 if isinstance(start, list) or isinstance(end, list): 1262 raise NexusError('Name if character sets not allowed in range definition: %s' 1263 % identifier) 1264 start = self.taxlabels.index(start) 1265 end = self.taxlabels.index(end) 1266 taxrange = self.taxlabels[start:end + 1] 1267 plain_list.extend(taxrange) 1268 else: 1269 if isinstance(start, list): 1270 # start was the name of charset or taxset 1271 plain_list.extend(start) 1272 else: 1273 # start was an ordinary identifier 1274 plain_list.append(start) 1275 except NexusError: 1276 raise 1277 except Exception: # FIXME - this seems unwise 1278 return None 1279 return plain_list
1280
def _resolve(self, identifier, set_type=None):
    """Translate identifier in list into character/taxon index (PRIVATE).

    Characters (which are referred to by their index in Nexus.py):
        Plain numbers are returned minus 1 (Nexus indices to python indices)
        Text identifiers are translated into their indices (if plain character identifiers),
        the first hit in charlabels is returned (charlabels don't need to be unique)
        or the range of indices is returned (if names of character sets).
    Taxa (which are referred to by their unique name in Nexus.py):
        Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
        Names are returned unchanged (if plain taxon identifiers), or the names in
        the corresponding taxon set is returned.

    Arguments:
     - identifier - a single list element (quotes are stripped).
     - set_type - CHARSET or TAXSET, selects which of the two
       translations above is applied.

    Raises NexusError if set_type is missing or unknown, if a number is
    outside 1..nchar (characters) or 1..ntax (taxa), or if a name is
    neither a known label nor the name of a set.
    """
    identifier = quotestrip(identifier)
    if not set_type:
        raise NexusError('INTERNAL ERROR: Need type to resolve identifier.')
    if set_type == CHARSET:
        try:
            n = int(identifier)
        except ValueError:
            if self.charlabels and identifier in self.charlabels.values():
                for k in self.charlabels:
                    if self.charlabels[k] == identifier:
                        return k
            elif self.charsets and identifier in self.charsets:
                return self.charsets[identifier]
            else:
                raise NexusError('Unknown character identifier: %s'
                                 % identifier)
        else:
            # NEXUS numbers characters from 1; the lower bound mirrors the
            # taxon branch and keeps 0 from turning into python index -1
            if 0 < n <= self.nchar:
                return n - 1
            else:
                # format the parsed number: identifier is a string and
                # would make '%d' raise TypeError instead of NexusError
                raise NexusError('Illegal character identifier: %d>nchar (=%d).'
                                 % (n, self.nchar))
    elif set_type == TAXSET:
        try:
            n = int(identifier)
        except ValueError:
            taxlabels_id = self._check_taxlabels(identifier)
            if taxlabels_id:
                return taxlabels_id
            elif self.taxsets and identifier in self.taxsets:
                return self.taxsets[identifier]
            else:
                raise NexusError('Unknown taxon identifier: %s'
                                 % identifier)
        else:
            if n > 0 and n <= self.ntax:
                return self.taxlabels[n - 1]
            else:
                # see above: '%d' needs the integer, not the original string
                raise NexusError('Illegal taxon identifier: %d>ntax (=%d).'
                                 % (n, self.ntax))
    else:
        raise NexusError('Unknown set specification: %s.' % set_type)
1337
1338 - def _stateset(self, options):
1339 # Not implemented 1340 pass
1341
1342 - def _changeset(self, options):
1343 # Not implemented 1344 pass
1345
1346 - def _treeset(self, options):
1347 # Not implemented 1348 pass
1349
1350 - def _treepartition(self, options):
1351 # Not implemented 1352 pass
1353
def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None,
                                interleave=False, exclude=(), delete=(),
                                charpartition=None, comment='', mrbayes=False):
    """Writes a nexus file for each partition in charpartition.

    Only non-excluded characters and non-deleted taxa are included,
    just the data block is written.

    Arguments:
     - matrix - matrix to write, defaults to self.matrix.
     - filename - base name of the output file(s), defaults to
       self.filename.
     - charpartition - dict mapping partition names to the characters
       they contain. One file is written per partition, named after
       the base name and the partition. Without a partition a single
       file '<filename>.data' is written.
     - blocksize, interleave, exclude, delete, comment, mrbayes - passed
       on to write_nexus_data.

    Returns a dict mapping partition names to file names if a
    charpartition was given, otherwise the name of the single file.
    Returns None without writing anything if there is no matrix.
    """
    if not matrix:
        matrix = self.matrix
    if not matrix:
        return
    if not filename:
        filename = self.filename
    if charpartition:
        pfilenames = {}
        # position of the extension is the same for every partition
        dot = filename.rfind('.')
        for p in charpartition:
            # everything outside this partition is excluded as well
            total_exclude = list(exclude)
            total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p])
            total_exclude = _make_unique(total_exclude)
            pcomment = comment + '\nPartition: ' + p + '\n'
            if dot > 0:
                pfilename = filename[:dot] + '_' + p + '.data'
            else:
                pfilename = filename + '_' + p
            pfilenames[p] = pfilename
            self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize,
                                  interleave=interleave, exclude=total_exclude, delete=delete,
                                  comment=pcomment, append_sets=False, mrbayes=mrbayes)
        return pfilenames
    else:
        # honour the filename argument; it already defaults to self.filename
        fn = filename + '.data'
        self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize,
                              interleave=interleave, exclude=exclude, delete=delete,
                              comment=comment, append_sets=False, mrbayes=mrbayes)
        return fn
1391
def write_nexus_data(self, filename=None, matrix=None, exclude=(), delete=(),
                     blocksize=None, interleave=False, interleave_by_partition=False,
                     comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False,
                     codons_block=True):
    """Writes a nexus file with data and sets block to a file or handle.

    Character sets and partitions are appended by default, and are
    adjusted according to excluded characters (i.e. character sets
    still point to the same sites (not necessarily same positions),
    without including the deleted characters.

     - filename - Either a filename as a string (which will be opened,
       written to and closed), or a handle object (which will
       be written to but NOT closed).
     - matrix - matrix to write, defaults to self.matrix.
     - exclude - characters left out of the output.
     - delete - taxa left out of the output.
     - blocksize - characters per line; defaults to 70 for interleaved
       output and to self.nchar otherwise.
     - interleave - write the matrix in interleaved blocks.
     - interleave_by_partition - Optional name of partition (string)
     - comment - text written in brackets before the data block.
     - omit_NEXUS - Boolean.  If true, the '#NEXUS' line normally at the
       start of the file is omitted.
     - append_sets - append the sets block(s) after the data block.
     - mrbayes - passed on to safename and append_sets.
     - codons_block - write the codon positions in a block of their
       own instead of inside the sets block.

    Returns the filename/handle used to write the data, or None (and
    writes nothing) if there is no matrix or no taxon is left to write.
    Raises NexusError for unknown taxa in delete or an unknown
    partition name.
    """
    if not matrix:
        matrix = self.matrix
    if not matrix:
        return
    if not filename:
        filename = self.filename
    if [t for t in delete if not self._check_taxlabels(t)]:
        raise NexusError('Unknown taxa: %s'
                         % ', '.join(set(delete).difference(set(self.taxlabels))))
    if interleave_by_partition:
        if interleave_by_partition not in self.charpartitions:
            raise NexusError('Unknown partition: %r' % interleave_by_partition)
        else:
            partition = self.charpartitions[interleave_by_partition]
            # we need to sort the partition names by starting position
            # before we exclude characters
            names = _sort_keys_by_values(partition)
            newpartition = {}
            for p in partition:
                newpartition[p] = [c for c in partition[p] if c not in exclude]
    # how many taxa and how many characters are left?
    undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete]
    # this guard has to come before undelete[0] is used below, otherwise
    # deleting every taxon ends in an IndexError instead of a quiet return
    if not undelete or undelete[0] == '':
        return
    cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete))
    ntax_adjusted = len(undelete)
    nchar_adjusted = len(cropped_matrix[undelete[0]])

    with File.as_handle(filename, mode='w') as fh:
        if not omit_NEXUS:
            fh.write('#NEXUS\n')
        if comment:
            fh.write('[' + comment + ']\n')
        fh.write('begin data;\n')
        fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted))
        fh.write('\tformat datatype=' + self.datatype)
        if self.respectcase:
            fh.write(' respectcase')
        if self.missing:
            fh.write(' missing=' + self.missing)
        if self.gap:
            fh.write(' gap=' + self.gap)
        if self.matchchar:
            fh.write(' matchchar=' + self.matchchar)
        if self.labels:
            fh.write(' labels=' + self.labels)
        if self.equate:
            fh.write(' equate=' + self.equate)
        if interleave or interleave_by_partition:
            fh.write(' interleave')
        fh.write(';\n')
        # if self.taxlabels:
        #    fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
        if self.charlabels:
            newcharlabels = self._adjust_charlabels(exclude=exclude)
            clkeys = sorted(newcharlabels)
            fh.write('charlabels ' +
                     ', '.join("%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys) +
                     ';\n')
        fh.write('matrix\n')
        if not blocksize:
            if interleave:
                blocksize = 70
            else:
                blocksize = self.nchar
        # delete deleted taxa and exclude excluded characters...
        namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete)
        if interleave_by_partition:
            # interleave by partitions, but adjust partitions with regard
            # to excluded characters
            seek = 0
            for p in names:
                fh.write('[%s: %s]\n' % (interleave_by_partition, p))
                if len(newpartition[p]) > 0:
                    for taxon in undelete:
                        fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1))
                        fh.write(cropped_matrix[taxon][seek:seek + len(newpartition[p])] + '\n')
                    fh.write('\n')
                else:
                    fh.write('[empty]\n\n')
                seek += len(newpartition[p])
        elif interleave:
            for seek in range(0, nchar_adjusted, blocksize):
                for taxon in undelete:
                    fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1))
                    fh.write(cropped_matrix[taxon][seek:seek + blocksize] + '\n')
                fh.write('\n')
        else:
            for taxon in undelete:
                # a sequence spanning several lines starts below its name,
                # a single-line sequence follows the padded name
                if blocksize < nchar_adjusted:
                    fh.write(safename(taxon, mrbayes=mrbayes) + '\n')
                else:
                    fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1))
                taxon_seq = cropped_matrix[taxon]
                for seek in range(0, nchar_adjusted, blocksize):
                    fh.write(taxon_seq[seek:seek + blocksize] + '\n')
                del taxon_seq
        fh.write(';\nend;\n')
        if append_sets:
            if codons_block:
                fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False))
                fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True))
            else:
                fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes))
    return filename
1517
def append_sets(self, exclude=(), delete=(), mrbayes=False, include_codons=True, codons_only=False):
    """Returns a sets block.

    Arguments:
     - exclude - excluded characters; they are dropped from character
       sets and partitions and the remaining positions are renumbered.
     - delete - deleted taxa; they are dropped from taxon sets and
       partitions.
     - mrbayes - passed on to safename for the taxa of taxon sets.
     - include_codons - if False, the partition named CODONPOSITIONS
       is skipped.
     - codons_only - if True, the block is opened with 'begin codons',
       character and taxon sets are left out, and CODONPOSITIONS is the
       only character partition written.

    Returns the block as a string, or '' if there are no sets or
    partitions at all or if nothing is left to write.
    """
    if not self.charsets and not self.taxsets and not self.charpartitions:
        return ''
    lines = ['\nbegin codons' if codons_only else '\nbegin sets']
    # - now if characters have been excluded, the character sets need to be adjusted,
    #   so that they still point to the right character positions
    # new_position[c] is the position of character c in the new file: every
    # excluded character shifts all following characters by -1
    new_position = []
    removed = 0
    for pos in range(self.nchar):
        if pos in exclude:
            removed += 1
            new_position.append(-1)  # dummy value as these character positions are excluded
        else:
            new_position.append(pos - removed)
    # now adjust each of the character sets
    if not codons_only:
        for name, sites in self.charsets.items():
            kept_sites = [new_position[c] for c in sites if c not in exclude]
            if kept_sites:
                lines.append('charset %s = %s' % (safename(name), _compact4nexus(kept_sites)))
        for name, taxa in self.taxsets.items():
            kept_taxa = [safename(t, mrbayes=mrbayes) for t in taxa if t not in delete]
            if kept_taxa:
                lines.append('taxset %s = %s' % (safename(name), ' '.join(kept_taxa)))
    for name, partition in self.charpartitions.items():
        is_codon_partition = name == CODONPOSITIONS
        if (is_codon_partition and not include_codons) or \
                (codons_only and not is_codon_partition):
            continue
        # as characters have been excluded, the partitions must be adjusted
        # if a partition is empty, it will be omitted from the charpartition command
        # (although paup allows charpartition part=t1:,t2:,t3:1-100)
        ordered = _sort_keys_by_values(partition)
        kept = {}
        for sub in ordered:
            positions = [new_position[c] for c in partition[sub] if c not in exclude]
            if positions:
                kept[sub] = positions
        if kept:
            if include_codons and is_codon_partition:
                command = 'codonposset'
            else:
                command = 'charpartition'
            lines.append('%s %s = %s' % (command, safename(name),
                                         ', '.join('%s: %s' % (sub, _compact4nexus(kept[sub]))
                                                   for sub in ordered if sub in kept)))
    # now write taxpartitions, much easier than charpartitions
    for name, partition in self.taxpartitions.items():
        ordered = _sort_keys_by_values(partition)
        kept = {}
        for sub in ordered:
            members = [t for t in partition[sub] if t not in delete]
            if members:
                kept[sub] = members
        if kept:
            lines.append('taxpartition %s = %s' % (safename(name),
                                                   ', '.join('%s: %s' % (safename(sub),
                                                                         ' '.join(safename(x) for x in kept[sub]))
                                                             for sub in ordered if sub in kept)))
    # add 'end' and return everything
    lines.append('end;\n')
    if len(lines) == 2:  # begin and end only
        return ''
    return ';\n'.join(lines)
1589
def export_fasta(self, filename=None, width=70):
    """Writes matrix into a fasta file.

    Arguments:
     - filename - name of the output file. If not given it is derived
       from self.filename: a trailing 'paup', 'nexus', 'nex' or 'dat'
       extension (any case) is replaced by '.fas', otherwise '.fas' is
       appended.
     - width - number of sequence characters per line.

    Returns the name of the file written.
    """
    if not filename:
        stem, dot, extension = self.filename.rpartition('.')
        if dot and extension.lower() in ['paup', 'nexus', 'nex', 'dat']:
            filename = stem + '.fas'
        else:
            filename = self.filename + '.fas'
    with open(filename, 'w') as fh:
        for taxon in self.taxlabels:
            fh.write('>' + safename(taxon) + '\n')
            sequence = str(self.matrix[taxon])
            for start in range(0, len(sequence), width):
                fh.write(sequence[start:start + width] + '\n')
    return filename
1603
def export_phylip(self, filename=None):
    """Writes matrix into a PHYLIP file.

    Note that this writes a relaxed PHYLIP format file, where the names
    are not truncated, nor checked for invalid characters.

    Arguments:
     - filename - name of the output file. If not given it is derived
       from self.filename: a trailing 'paup', 'nexus', 'nex' or 'dat'
       extension (any case) is replaced by '.phy', otherwise '.phy' is
       appended.

    The first line holds self.ntax and self.nchar, followed by one
    'name sequence' line per taxon. Returns the name of the file written.
    """
    if not filename:
        stem, dot, extension = self.filename.rpartition('.')
        if dot and extension.lower() in ['paup', 'nexus', 'nex', 'dat']:
            filename = stem + '.phy'
        else:
            filename = self.filename + '.phy'
    with open(filename, 'w') as fh:
        header = '%d %d\n' % (self.ntax, self.nchar)
        fh.write(header)
        for taxon in self.taxlabels:
            row = '%s %s\n' % (safename(taxon), str(self.matrix[taxon]))
            fh.write(row)
    return filename
1620
1621 - def constant(self, matrix=None, delete=(), exclude=()):
1622 """Return a list with all constant characters.""" 1623 if not matrix: 1624 matrix = self.matrix 1625 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1626 if not undelete: 1627 return None 1628 elif len(undelete) == 1: 1629 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1630 # get the first sequence and expand all ambiguous values 1631 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1632 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1633 1634 for taxon in undelete[1:]: 1635 newconstant = [] 1636 for site in constant: 1637 # print '%d (paup=%d)' % (site[0],site[0]+1), 1638 seqsite = matrix[taxon][site[0]].upper() 1639 # print seqsite,'checked against',site[1],'\t', 1640 if seqsite == self.missing or \ 1641 (seqsite == self.gap and self.options['gapmode'].lower() == 'missing') or \ 1642 seqsite == site[1]: 1643 # missing or same as before -> ok 1644 newconstant.append(site) 1645 elif (seqsite in site[1] or 1646 site[1] == self.missing or 1647 (self.options['gapmode'].lower() == 'missing' and 1648 site[1] == self.gap)): 1649 # subset of an ambig or only missing in previous -> take subset 1650 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1651