Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  #           2014-2015 by Joe Cora (standard data) 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license. Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  # 
   7  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   8  """Nexus class. Parse the contents of a NEXUS file. 
   9   
  10  Based upon 'NEXUS: An extensible file format for systematic information' 
  11  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  12  """ 
  13  from __future__ import print_function 
  14   
  15  from Bio._py3k import zip 
  16  from Bio._py3k import range 
  17  from Bio._py3k import basestring 
  18   
  19  from functools import reduce 
  20  import copy 
  21  import math 
  22  import random 
  23  import sys 
  24   
  25  from Bio import File 
  26  from Bio.Alphabet import IUPAC 
  27  from Bio.Data import IUPACData 
  28  from Bio.Seq import Seq 
  29   
  30  from Bio.Nexus.StandardData import StandardData 
  31  from Bio.Nexus.Trees import Tree 
  32   
  33   
  34  INTERLEAVE = 70 
  35  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  36                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  37                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  38  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  39  PUNCTUATION = '()[]{}/\,;:=*\'"`+-<>' 
  40  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  41  WHITESPACE = ' \t\n' 
  42  # SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 
  43  SPECIALCOMMENTS = ['&']  # supported special comment ('tree' command), all others are ignored 
  44  CHARSET = 'chars' 
  45  TAXSET = 'taxa' 
  46  CODONPOSITIONS = 'codonpositions' 
  47  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  48   
  49   
50 -class NexusError(Exception):
51 pass
52 53
54 -class CharBuffer(object):
55 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 56 57 This class is not intended for public use (any more). 58 """
59 - def __init__(self, string):
60 if string: 61 self.buffer = list(string) 62 else: 63 self.buffer = []
64
65 - def peek(self):
66 if self.buffer: 67 return self.buffer[0] 68 else: 69 return None
70
71 - def peek_nonwhitespace(self):
72 b = ''.join(self.buffer).strip() 73 if b: 74 return b[0] 75 else: 76 return None
77
78 - def __next__(self):
79 if self.buffer: 80 return self.buffer.pop(0) 81 else: 82 return None
83 84 if sys.version_info[0] < 3:
85 - def next(self):
86 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 87 return self.__next__()
88
89 - def next_nonwhitespace(self):
90 while True: 91 p = next(self) 92 if p is None: 93 break 94 if p not in WHITESPACE: 95 return p 96 return None
97
98 - def skip_whitespace(self):
99 while self.buffer[0] in WHITESPACE: 100 self.buffer = self.buffer[1:]
101
102 - def next_until(self, target):
103 for t in target: 104 try: 105 pos = self.buffer.index(t) 106 except ValueError: 107 pass 108 else: 109 found = ''.join(self.buffer[:pos]) 110 self.buffer = self.buffer[pos:] 111 return found 112 else: 113 return None
114
115 - def peek_word(self, word):
116 return ''.join(self.buffer[:len(word)]) == word
117
118 - def next_word(self):
119 """Return the next NEXUS word from a string. 120 121 This deals with single and double quotes, whitespace and punctuation. 122 """ 123 word = [] 124 quoted = False 125 # get first character 126 first = self.next_nonwhitespace() 127 if not first: 128 # return empty if only whitespace left 129 return None 130 word.append(first) 131 if first == "'": 132 quoted = "'" 133 elif first == '"': 134 quoted = '"' 135 elif first in PUNCTUATION: 136 # if it's non-quote punctuation, return immediately 137 return first 138 while True: 139 c = self.peek() 140 if c == quoted: # a quote? 141 word.append(next(self)) # store quote 142 if self.peek() == quoted: # double quote 143 next(self) # skip second quote 144 elif quoted: # second single quote ends word 145 break 146 elif quoted: 147 # if quoted, then add anything 148 word.append(next(self)) 149 elif not c or c in PUNCTUATION or c in WHITESPACE: 150 # if not quoted and special character, stop 151 break 152 else: 153 word.append(next(self)) # standard character 154 return ''.join(word)
155
156 - def rest(self):
157 """Return the rest of the string without parsing.""" 158 return ''.join(self.buffer)
159 160
161 -class StepMatrix(object):
162 """Calculate a stepmatrix for weighted parsimony. 163 164 See Wheeler (1990), Cladistics 6:269-275. 165 """
166 - def __init__(self, symbols, gap):
167 self.data = {} 168 self.symbols = sorted(symbols) 169 if gap: 170 self.symbols.append(gap) 171 for x in self.symbols: 172 for y in [s for s in self.symbols if s != x]: 173 self.set(x, y, 0)
174
175 - def set(self, x, y, value):
176 if x > y: 177 x, y = y, x 178 self.data[x + y] = value
179
180 - def add(self, x, y, value):
181 if x > y: 182 x, y = y, x 183 self.data[x + y] += value
184
185 - def sum(self):
186 return reduce(lambda x, y: x + y, self.data.values())
187
188 - def transformation(self):
189 total = self.sum() 190 if total != 0: 191 for k in self.data: 192 self.data[k] = self.data[k] / float(total) 193 return self
194
195 - def weighting(self):
196 for k in self.data: 197 if self.data[k] != 0: 198 self.data[k] = -math.log(self.data[k]) 199 return self
200
201 - def smprint(self, name='your_name_here'):
202 matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols)) 203 matrix += ' %s\n' % ' '.join(self.symbols) 204 for x in self.symbols: 205 matrix += '[%s]'.ljust(8) % x 206 for y in self.symbols: 207 if x == y: 208 matrix += ' . ' 209 else: 210 if x > y: 211 x1, y1 = y, x 212 else: 213 x1, y1 = x, y 214 if self.data[x1 + y1] == 0: 215 matrix += 'inf. ' 216 else: 217 matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1]) 218 matrix += '\n' 219 matrix += ';\n' 220 return matrix
221 222
223 -def safename(name, mrbayes=False):
224 """Return a taxon identifier according to NEXUS standard. 225 226 Wrap quotes around names with punctuation or whitespace, and double 227 single quotes. 228 229 mrbayes=True: write names without quotes, whitespace or punctuation 230 for the mrbayes software package. 231 """ 232 if mrbayes: 233 safe = name.replace(' ', '_') 234 safe = ''.join(c for c in safe if c in MRBAYESSAFE) 235 else: 236 safe = name.replace("'", "''") 237 if set(safe).intersection(set(WHITESPACE + PUNCTUATION)): 238 safe = "'" + safe + "'" 239 return safe
240 241
242 -def quotestrip(word):
243 """Remove quotes and/or double quotes around identifiers.""" 244 if not word: 245 return None 246 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 247 word = word[1:-1] 248 return word
249 250
251 -def get_start_end(sequence, skiplist=('-', '?')):
252 """Return position of first and last character which is not in skiplist. 253 254 Skiplist defaults to ['-','?']. 255 """ 256 length = len(sequence) 257 if length == 0: 258 return None, None 259 end = length - 1 260 while end >= 0 and (sequence[end] in skiplist): 261 end -= 1 262 start = 0 263 while start < length and (sequence[start] in skiplist): 264 start += 1 265 if start == length and end == -1: # empty sequence 266 return -1, -1 267 else: 268 return start, end
269 270
271 -def _sort_keys_by_values(p):
272 """Returns a sorted list of keys of p sorted by values of p.""" 273 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
274 275
276 -def _make_unique(l):
277 """Check that all values in list are unique and return a pruned and sorted list.""" 278 return sorted(set(l))
279 280
281 -def _unique_label(previous_labels, label):
282 """Returns a unique name if label is already in previous_labels.""" 283 while label in previous_labels: 284 label_split = label.split('.') 285 if label_split[-1].startswith('copy'): 286 copy_num = 1 287 if label_split[-1] != "copy": 288 copy_num = int(label_split[-1][4:]) + 1 289 new_label = "%s.copy%s" % ('.'.join(label_split[:-1]), copy_num) 290 label = new_label 291 else: 292 label += '.copy' 293 return label
294 295
296 -def _seqmatrix2strmatrix(matrix):
297 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 298 return dict((t, str(matrix[t])) for t in matrix)
299 300
301 -def _compact4nexus(orig_list):
302 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 303 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 304 """ 305 306 if not orig_list: 307 return '' 308 orig_list = sorted(set(orig_list)) 309 shortlist = [] 310 clist = orig_list[:] 311 clist.append(clist[-1] + .5) # dummy value makes it easier 312 while len(clist) > 1: 313 step = 1 314 for i, x in enumerate(clist): 315 if x == clist[0] + i * step: # are we still in the right step? 316 continue 317 elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]: 318 # second element, and possibly at least 3 elements to link, 319 # and the next one is in the right step 320 step = x - clist[0] 321 else: # pattern broke, add all values before current position to new list 322 sub = clist[:i] 323 if len(sub) == 1: 324 shortlist.append(str(sub[0] + 1)) 325 else: 326 if step == 1: 327 shortlist.append('%d-%d' % (sub[0] + 1, sub[-1] + 1)) 328 else: 329 shortlist.append('%d-%d\\%d' % (sub[0] + 1, sub[-1] + 1, step)) 330 clist = clist[i:] 331 break 332 return ' '.join(shortlist)
333 334
335 -def combine(matrices):
336 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 337 338 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 339 Character sets, character partitions and taxon sets are prefixed, readjusted 340 and present in the combined matrix. 341 """ 342 343 if not matrices: 344 return None 345 name = matrices[0][0] 346 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 347 mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1) 348 if mixed_datatypes: 349 # dealing with mixed matrices is application specific. 350 # You take care of that yourself! 351 combined.datatype = 'None' 352 # raise NexusError('Matrices must be of same datatype') 353 combined.charlabels = None 354 combined.statelabels = None 355 combined.interleave = False 356 combined.translate = None 357 358 # rename taxon sets and character sets and name them with prefix 359 for cn, cs in combined.charsets.items(): 360 combined.charsets['%s.%s' % (name, cn)] = cs 361 del combined.charsets[cn] 362 for tn, ts in combined.taxsets.items(): 363 combined.taxsets['%s.%s' % (name, tn)] = ts 364 del combined.taxsets[tn] 365 # previous partitions usually don't make much sense in combined matrix 366 # just initiate one new partition parted by single matrices 367 combined.charpartitions = {'combined': {name: list(range(combined.nchar))}} 368 for n, m in matrices[1:]: # add all other matrices 369 both = [t for t in combined.taxlabels if t in m.taxlabels] 370 combined_only = [t for t in combined.taxlabels if t not in both] 371 m_only = [t for t in m.taxlabels if t not in both] 372 for t in both: 373 # concatenate sequences and unify gap and missing character symbols 374 combined.matrix[t] += Seq(str(m.matrix[t]) 375 .replace(m.gap, combined.gap) 376 .replace(m.missing, combined.missing), 377 combined.alphabet) 378 # replace date of missing taxa with symbol for missing data 379 for t in combined_only: 380 combined.matrix[t] += Seq(combined.missing * m.nchar, 381 combined.alphabet) 382 for t in m_only: 383 combined.matrix[t] = Seq(combined.missing * combined.nchar, 384 combined.alphabet) + \ 385 Seq(str(m.matrix[t]) 386 .replace(m.gap, combined.gap) 387 .replace(m.missing, combined.missing), 388 combined.alphabet) 389 combined.taxlabels.extend(m_only) # new taxon list 390 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 391 combined.charsets['%s.%s' % (n, cn)] = [x + combined.nchar for x in cs] 392 if m.taxsets: 393 if not combined.taxsets: 394 combined.taxsets = {} 395 # update taxon sets 396 combined.taxsets.update(dict(('%s.%s' % (n, tn), ts) 397 for tn, ts in m.taxsets.items())) 398 # update new charpartition 399 combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar + m.nchar)) 400 # update charlabels 401 if m.charlabels: 402 if not combined.charlabels: 403 combined.charlabels = {} 404 combined.charlabels.update(dict((combined.nchar + i, label) 405 for (i, label) in m.charlabels.items())) 406 combined.nchar += m.nchar # update nchar and ntax 407 combined.ntax += len(m_only) 408 409 # some prefer partitions, some charsets: 410 # make separate charset for ecah initial dataset 411 for c in combined.charpartitions['combined']: 412 combined.charsets[c] = combined.charpartitions['combined'][c] 413 414 return combined
415 416
417 -def _kill_comments_and_break_lines(text):
418 """Delete []-delimited comments out of a file and break into lines separated by ';'. 419 420 stripped_text=_kill_comments_and_break_lines(text): 421 Nested and multiline comments are allowed. [ and ] symbols within single 422 or double quotes are ignored, newline ends a quote, all symbols with quotes are 423 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 424 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 425 Quotes inside special [& and [\ are treated as normal characters, 426 but no nesting inside these special comments allowed (like [& [\ ]]). 427 ';' ist deleted from end of line. 428 429 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 430 """ 431 contents = iter(text) 432 newtext = [] 433 newline = [] 434 quotelevel = '' 435 speciallevel = False 436 commlevel = 0 437 # Parse with one character look ahead (for special comments) 438 t2 = next(contents) 439 while True: 440 t = t2 441 try: 442 t2 = next(contents) 443 except StopIteration: 444 t2 = None 445 if t is None: 446 break 447 if t == quotelevel and not (commlevel or speciallevel): 448 # matching quote ends quotation 449 quotelevel = '' 450 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 451 # single or double quote starts quotation 452 quotelevel = t 453 elif not quotelevel and t == '[': 454 # opening bracket outside a quote 455 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 456 speciallevel = True 457 else: 458 commlevel += 1 459 elif not quotelevel and t == ']': 460 # closing bracket ioutside a quote 461 if speciallevel: 462 speciallevel = False 463 else: 464 commlevel -= 1 465 if commlevel < 0: 466 raise NexusError('Nexus formatting error: unmatched ]') 467 continue 468 if commlevel == 0: 469 # copy if we're not in comment 470 if t == ';' and not quotelevel: 471 newtext.append(''.join(newline)) 472 newline = [] 473 else: 474 newline.append(t) 475 # level of comments should be 0 at the end of the file 476 if newline: 477 newtext.append('\n'.join(newline)) 478 if commlevel > 0: 479 raise NexusError('Nexus formatting error: unmatched [') 480 return newtext
481 482
483 -def _adjust_lines(lines):
484 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 485 486 list_of_commandlines=_adjust_lines(input_text) 487 Lines are adjusted so that no linebreaks occur within a commandline 488 (except matrix command line) 489 """ 490 formatted_lines = [] 491 for l in lines: 492 # Convert line endings 493 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 494 if l.lower().startswith('matrix'): 495 formatted_lines.append(l) 496 else: 497 l = l.replace('\n', ' ') 498 if l: 499 formatted_lines.append(l) 500 return formatted_lines
501 502
503 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
504 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 505 506 opening = seq.find('(') 507 while opening > -1: 508 closing = seq.find(')') 509 if closing < 0: 510 raise NexusError('Missing closing parenthesis in: ' + seq) 511 elif closing < opening: 512 raise NexusError('Missing opening parenthesis in: ' + seq) 513 ambig = ''.join(sorted(seq[opening + 1:closing])) 514 ambig_code = rev_ambig_values[ambig.upper()] 515 if ambig != ambig.upper(): 516 ambig_code = ambig_code.lower() 517 seq = seq[:opening] + ambig_code + seq[closing + 1:] 518 opening = seq.find('(') 519 return seq
520 521
522 -class Commandline(object):
523 """Represent a commandline as command and options.""" 524
525 - def __init__(self, line, title):
526 self.options = {} 527 options = [] 528 self.command = None 529 try: 530 # Assume matrix (all other command lines have been stripped of \n) 531 self.command, options = line.strip().split('\n', 1) 532 except ValueError: # Not matrix 533 # self.command,options=line.split(' ',1) # no: could be tab or spaces (translate...) 534 self.command = line.split()[0] 535 options = ' '.join(line.split()[1:]) 536 self.command = self.command.strip().lower() 537 if self.command in SPECIAL_COMMANDS: 538 # special command that need newlines and order of options preserved 539 self.options = options.strip() 540 else: 541 if len(options) > 0: 542 try: 543 options = options.replace('=', ' = ').split() 544 valued_indices = [(n - 1, n, n + 1) for n in range(len(options)) 545 if options[n] == '=' and n != 0 and n != len((options))] 546 indices = [] 547 for sl in valued_indices: 548 indices.extend(sl) 549 token_indices = [n for n in range(len(options)) if n not in indices] 550 for opt in valued_indices: 551 # self.options[options[opt[0]].lower()] = options[opt[2]].lower() 552 self.options[options[opt[0]].lower()] = options[opt[2]] 553 for token in token_indices: 554 self.options[options[token].lower()] = None 555 except ValueError: 556 raise NexusError('Incorrect formatting in line: %s' % line)
557 558
559 -class Block(object):
560 """Represent a NEXUS block with block name and list of commandlines."""
561 - def __init__(self, title=None):
562 self.title = title 563 self.commandlines = []
564 565
566 -class Nexus(object):
567
568 - def __init__(self, input=None):
569 self.ntax = 0 # number of taxa 570 self.nchar = 0 # number of characters 571 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 572 self.taxlabels = [] # labels for taxa, ordered by their id 573 self.charlabels = None # ... and for characters 574 self.statelabels = None # ... and for states 575 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 576 self.respectcase = False # case sensitivity 577 self.missing = '?' # symbol for missing characters 578 self.gap = '-' # symbol for gap 579 self.symbols = None # set of symbols 580 self.equate = None # set of symbol synonyms 581 self.matchchar = None # matching char for matrix representation 582 self.labels = None # left, right, no 583 self.transpose = False # whether matrix is transposed 584 self.interleave = False # whether matrix is interleaved 585 self.tokens = False # unsupported 586 self.eliminate = None # unsupported 587 self.matrix = None # ... 588 self.unknown_blocks = [] # blocks we don't care about 589 self.taxsets = {} 590 self.charsets = {} 591 self.charpartitions = {} 592 self.taxpartitions = {} 593 self.trees = [] # list of Trees (instances of Tree class) 594 self.translate = None # Dict to translate taxon <-> taxon numbers 595 self.structured = [] # structured input representation 596 self.set = {} # dict of the set command to set various options 597 self.options = {} # dict of the options command in the data block 598 self.codonposset = None # name of the charpartition that defines codon positions 599 600 # some defaults 601 self.options['gapmode'] = 'missing' 602 603 if input: 604 self.read(input) 605 else: 606 self.read(DEFAULTNEXUS)
607
608 - def get_original_taxon_order(self):
609 """Included for backwards compatibility (DEPRECATED).""" 610 return self.taxlabels
611
612 - def set_original_taxon_order(self, value):
613 """Included for backwards compatibility (DEPRECATED).""" 614 self.taxlabels = value
615 616 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 617
618 - def read(self, input):
619 """Read and parse NEXUS input (a filename, file-handle, or string).""" 620 621 # 1. Assume we have the name of a file in the execution dir or a 622 # file-like object. 623 # Note we need to add parsing of the path to dir/filename 624 try: 625 with File.as_handle(input, 'rU') as fp: 626 file_contents = fp.read() 627 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 628 except (TypeError, IOError, AttributeError): 629 # 2. Assume we have a string from a fh.read() 630 if isinstance(input, basestring): 631 file_contents = input 632 self.filename = 'input_string' 633 else: 634 print(input.strip()[:50]) 635 raise NexusError('Unrecognized input: %s ...' % input[:100]) 636 file_contents = file_contents.strip() 637 if file_contents.startswith('#NEXUS'): 638 file_contents = file_contents[6:] 639 commandlines = _get_command_lines(file_contents) 640 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 641 for i, cl in enumerate(commandlines): 642 try: 643 if cl[:6].upper() == '#NEXUS': 644 commandlines[i] = cl[6:].strip() 645 except IndexError: 646 pass 647 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 648 nexus_block_gen = self._get_nexus_block(commandlines) 649 while True: 650 try: 651 title, contents = next(nexus_block_gen) 652 except StopIteration: 653 break 654 if title in KNOWN_NEXUS_BLOCKS: 655 self._parse_nexus_block(title, contents) 656 else: 657 self._unknown_nexus_block(title, contents)
658
659 - def _get_nexus_block(self, file_contents):
660 """Generator for looping through Nexus blocks.""" 661 inblock = False 662 blocklines = [] 663 while file_contents: 664 cl = file_contents.pop(0) 665 if cl.lower().startswith('begin'): 666 if not inblock: 667 inblock = True 668 title = cl.split()[1].lower() 669 else: 670 raise NexusError('Illegal block nesting in block %s' % title) 671 elif cl.lower().startswith('end'): 672 if inblock: 673 inblock = False 674 yield title, blocklines 675 blocklines = [] 676 else: 677 raise NexusError('Unmatched \'end\'.') 678 elif inblock: 679 blocklines.append(cl)
680
681 - def _unknown_nexus_block(self, title, contents):
682 block = Block() 683 block.commandlines.append(contents) 684 block.title = title 685 self.unknown_blocks.append(block)
686
687 - def _parse_nexus_block(self, title, contents):
688 """Parse a known Nexus Block (PRIVATE).""" 689 # attached the structured block representation 690 self._apply_block_structure(title, contents) 691 # now check for taxa,characters,data blocks. If this stuff is defined more than once 692 # the later occurrences will override the previous ones. 693 block = self.structured[-1] 694 for line in block.commandlines: 695 try: 696 getattr(self, '_' + line.command)(line.options) 697 except AttributeError: 698 raise NexusError('Unknown command: %s ' % line.command)
699
700 - def _title(self, options):
701 pass
702 705
706 - def _dimensions(self, options):
707 if 'ntax' in options: 708 self.ntax = eval(options['ntax']) 709 if 'nchar' in options: 710 self.nchar = eval(options['nchar'])
711
712 - def _format(self, options):
713 # print options 714 # we first need to test respectcase, then symbols (which depends on respectcase) 715 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 716 # dicts for ambiguous values and alphabet 717 if 'respectcase' in options: 718 self.respectcase = True 719 # adjust symbols to for respectcase 720 if 'symbols' in options: 721 self.symbols = ''.join(options['symbols'].split()) 722 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 723 (self.symbols.startswith("'") and self.symbols.endswith("'")): 724 self.symbols = self.symbols[1:-1] 725 if not self.respectcase: 726 self.symbols = list(self.symbols.upper()) 727 # self.symbols = self.symbols.lower() + self.symbols.upper() 728 # self.symbols = list(set(self.symbols)) 729 if 'datatype' in options: 730 self.datatype = options['datatype'].lower() 731 732 if self.datatype == 'dna' or self.datatype == 'nucleotide': 733 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 734 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 735 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 736 elif self.datatype == 'rna': 737 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 738 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 739 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 740 elif self.datatype == 'protein': 741 # TODO - Should this not be ExtendedIUPACProtein? 742 self.alphabet = IUPAC.IUPACProtein() # fresh instance 743 self.ambiguous_values = {'B': 'DN', 744 'Z': 'EQ', 745 'X': IUPACData.protein_letters} 746 # that's how PAUP handles it 747 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 748 elif self.datatype == 'standard': 749 self.alphabet = None 750 self.ambiguous_values = {} 751 if not self.symbols: 752 # PARSER BUG ## 753 # This error arises when symbols are absent or when 754 # whitespace is located within the SYMBOLS command values. 755 # The Nexus parser quits reading the SYMBOLS line upon 756 # finding a whitespace character. 757 raise NexusError( 758 "Symbols must be defined when using standard datatype. " 759 "Please remove any whitespace (spaces, tabs, etc.) " 760 "between values for symbols as this confuses the Nexus " 761 "parser.") 762 763 self.unambiguous_letters = ''.join(self.symbols) 764 if not self.respectcase: 765 self.unambiguous_letters += self.unambiguous_letters.lower() 766 else: 767 raise NexusError('Unsupported datatype: ' + self.datatype) 768 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 769 if not self.respectcase: 770 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 771 # we have to sort the reverse ambig coding dict key characters: 772 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 773 rev = dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0] != 'X') 774 self.rev_ambiguous_values = {} 775 for (k, v) in rev.items(): 776 key = sorted(c for c in k) 777 self.rev_ambiguous_values[''.join(key)] = v 778 # overwrite symbols for datype rna,dna,nucleotide 779 if self.datatype in ['dna', 'rna', 'nucleotide']: 780 self.symbols = self.alphabet.letters 781 if self.missing not in self.ambiguous_values: 782 self.ambiguous_values[self.missing] = self.unambiguous_letters + self.gap 783 self.ambiguous_values[self.gap] = self.gap 784 # elif self.datatype == 'standard': 785 # if not self.symbols: 786 # self.symbols = ['0', '1'] 787 if 'missing' in options: 788 self.missing = options['missing'][0] 789 if 'gap' in options: 790 self.gap = options['gap'][0] 791 if 'equate' in options: 792 self.equate = options['equate'] 793 if 'matchchar' in options: 794 self.matchchar = options['matchchar'][0] 795 if 'labels' in options: 796 self.labels = options['labels'] 797 if 'transpose' in options: 798 raise NexusError('TRANSPOSE is not supported!') 799 self.transpose = True 800 if 'interleave' in options: 801 if options['interleave'] is None or options['interleave'].lower() == 'yes': 802 self.interleave = True 803 if 'tokens' in options: 804 self.tokens = True 805 if 'notokens' in options: 806 self.tokens = False
807
808 - def _set(self, options):
809 self.set = options
810
811 - def _options(self, options):
812 self.options = options
813
814 - def _eliminate(self, options):
815 self.eliminate = options
816
817 - def _taxlabels(self, options):
818 """Get taxon labels (PRIVATE). 819 820 As the taxon names are already in the matrix, this is superfluous 821 except for transpose matrices, which are currently unsupported anyway. 822 Thus, we ignore the taxlabels command to make handling of duplicate 823 taxon names easier. 824 """ 825 pass
826 # self.taxlabels = [] 827 # opts = CharBuffer(options) 828 # while True: 829 # taxon = quotestrip(opts.next_word()) 830 # if not taxon: 831 # break 832 # self.taxlabels.append(taxon) 833
834 - def _check_taxlabels(self, taxon):
835 """Check for presence of taxon in self.taxlabels.""" 836 # According to NEXUS standard, underscores shall be treated as spaces..., 837 # so checking for identity is more difficult 838 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 839 nexid = taxon.replace(' ', '_') 840 return nextaxa.get(nexid)
841
842 - def _charlabels(self, options):
843 self.charlabels = {} 844 opts = CharBuffer(options) 845 while True: 846 # get id and state 847 w = opts.next_word() 848 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 849 break 850 identifier = self._resolve(w, set_type=CHARSET) 851 state = quotestrip(opts.next_word()) 852 self.charlabels[identifier] = state 853 # check for comma or end of command 854 c = opts.next_nonwhitespace() 855 if c is None: 856 break 857 elif c != ',': 858 raise NexusError('Missing \',\' in line %s.' % options)
859
860 - def _charstatelabels(self, options):
861 self.charlabels = {} 862 self.statelabels = {} 863 opts = CharBuffer(options) 864 865 # Make sure symbols are defined 866 if not self.symbols: 867 raise NexusError( 868 'Symbols must be defined when using character states') 869 870 while True: 871 # get id and character name 872 w = opts.next_word() 873 874 # McClade saves and reads charlabel-lists with terminal comma?! 875 if w is None: 876 break 877 878 identifier = self._resolve(w, set_type=CHARSET) 879 character = quotestrip(opts.next_word()) 880 881 self.charlabels[identifier] = character 882 self.statelabels[identifier] = [] 883 884 # check for comma, slash or end of command 885 c = opts.next_nonwhitespace() 886 887 if c is None: 888 break 889 elif c != ',': 890 # Check if states are defined, otherwise report error 891 if c != '/': 892 raise NexusError('Missing \',\' in line %s.' % options) 893 894 # Get the first state 895 state = quotestrip(opts.next_word()) 896 897 if state is None: 898 raise NexusError( 899 'Missing character state in line %s.' % options) 900 901 while True: 902 # Make sure current state does not exceed number of 903 # available symbols 904 if len(self.statelabels[identifier]) > len(self.symbols): 905 raise NexusError( 906 'Character states exceed number of available symbols in line %s.' % options) 907 908 # Add the character state to the statelabels 909 self.statelabels[identifier].append(state) 910 911 # Check for another state or comma to end states (last 912 # character should not have comma at end of states - but 913 # we'll ignore) 914 state = quotestrip(opts.next_word()) 915 916 if state is None: 917 return 918 elif state is ',': 919 break
920
921 - def _statelabels(self, options):
922 # self.charlabels = options 923 # print 'Command statelabels is not supported and will be ignored.' 924 pass
925
926 - def _matrix(self, options):
927 if not self.ntax or not self.nchar: 928 raise NexusError('Dimensions must be specified before matrix!') 929 self.matrix = {} 930 taxcount = 0 931 first_matrix_block = True 932 933 # eliminate empty lines and leading/trailing whitespace 934 lines = [l.strip() for l in options.split('\n') if l.strip() != ''] 935 lineiter = iter(lines) 936 while True: 937 try: 938 l = next(lineiter) 939 except StopIteration: 940 if taxcount < self.ntax: 941 raise NexusError('Not enough taxa in matrix.') 942 elif taxcount > self.ntax: 943 raise NexusError('Too many taxa in matrix.') 944 else: 945 break 946 # count the taxa and check for interleaved matrix 947 taxcount += 1 948 if taxcount > self.ntax: 949 if not self.interleave: 950 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 951 else: 952 taxcount = 1 953 first_matrix_block = False 954 # get taxon name and sequence 955 linechars = CharBuffer(l) 956 id = quotestrip(linechars.next_word()) 957 l = linechars.rest().strip() 958 chars = '' 959 if self.interleave: 960 # interleaved matrix 961 if l: 962 chars = ''.join(l.split()) 963 else: 964 chars = ''.join(next(lineiter).split()) 965 else: 966 # non-interleaved matrix 967 chars = ''.join(l.split()) 968 while len(chars) < self.nchar: 969 l = next(lineiter) 970 chars += ''.join(l.split()) 971 972 # Reformat sequence for non-standard datatypes 973 if self.datatype != 'standard': 974 iupac_seq = Seq(_replace_parenthesized_ambigs( 975 chars, self.rev_ambiguous_values), self.alphabet) 976 # first taxon has the reference sequence if matchhar is used 977 if taxcount == 1: 978 refseq = iupac_seq 979 else: 980 if self.matchchar: 981 while True: 982 p = str(iupac_seq).find(self.matchchar) 983 if p == -1: 984 break 985 iupac_seq = Seq(str(iupac_seq)[:p] + refseq[ 986 p] + str(iupac_seq)[p + 1:], self.alphabet) 987 988 # Check for invalid characters 989 for i, c in enumerate(str(iupac_seq)): 990 if c not in self.valid_characters and c != self.gap and c != self.missing: 991 raise NexusError("Taxon %s: Illegal character %s in sequence %s " 992 "(check dimensions/interleaving)" % (id, c, iupac_seq)) 993 else: 994 iupac_seq = StandardData(chars) 995 996 # Check for invalid characters 997 for i, c in enumerate(iupac_seq): 998 # Go through each coding for each character 999 for coding in c['d']: 1000 if (coding not in self.valid_characters and 1001 coding != self.gap and coding != self.missing): 1002 raise NexusError("Taxon %s: Illegal character %s " 1003 "in sequence %s " 1004 "(check dimensions/interleaving)" 1005 % (id, coding, iupac_seq)) 1006 1007 # add sequence to matrix 1008 if first_matrix_block: 1009 self.unaltered_taxlabels.append(id) 1010 id = _unique_label(list(self.matrix.keys()), id) 1011 self.matrix[id] = iupac_seq 1012 self.taxlabels.append(id) 1013 else: 1014 # taxon names need to be in the same order in each interleaved block 1015 id = _unique_label(self.taxlabels[:taxcount - 1], id) 1016 taxon_present = self._check_taxlabels(id) 1017 if taxon_present: 1018 self.matrix[taxon_present] += iupac_seq 1019 else: 1020 raise NexusError("Taxon %s not in first block of interleaved " 1021 "matrix. Check matrix dimensions and interleave." % id) 1022 # check all sequences for length according to nchar 1023 for taxon in self.matrix: 1024 if len(self.matrix[taxon]) != self.nchar: 1025 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 1026 % (self.nchar, len(self.matrix[taxon]), taxon)) 1027 # check that taxlabels is identical with matrix.keys. If not, it's a problem 1028 matrixkeys = sorted(self.matrix) 1029 taxlabelssort = sorted(self.taxlabels[:]) 1030 assert matrixkeys == taxlabelssort, \ 1031 "ERROR: TAXLABELS must be identical with MATRIX. " + \ 1032 "Please Report this as a bug, and send in data file."
1033
1034 - def _translate(self, options):
1035 self.translate = {} 1036 opts = CharBuffer(options) 1037 while True: 1038 try: 1039 # get id and state 1040 identifier = int(opts.next_word()) 1041 label = quotestrip(opts.next_word()) 1042 self.translate[identifier] = label 1043 # check for comma or end of command 1044 c = opts.next_nonwhitespace() 1045 if c is None: 1046 break 1047 elif c != ',': 1048 raise NexusError('Missing \',\' in line %s.' % options) 1049 except NexusError: 1050 raise 1051 except Exception: # TODO: ValueError? 1052 raise NexusError('Format error in line %s.' % options)
1053
1054 - def _utree(self, options):
1055 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 1056 self._tree(options)
1057
1058 - def _tree(self, options):
1059 opts = CharBuffer(options) 1060 if opts.peek_nonwhitespace() == '*': 1061 # a star can be used to make it the default tree in some software packages 1062 dummy = opts.next_nonwhitespace() 1063 name = opts.next_word() 1064 if opts.next_nonwhitespace() != '=': 1065 raise NexusError('Syntax error in tree description: %s' 1066 % options[:50]) 1067 rooted = False 1068 weight = 1.0 1069 while opts.peek_nonwhitespace() == '[': 1070 opts.next_nonwhitespace() # discard opening bracket 1071 symbol = next(opts) 1072 if symbol != '&': 1073 raise NexusError('Illegal special comment [%s...] in tree description: %s' 1074 % (symbol, options[:50])) 1075 special = next(opts) 1076 value = opts.next_until(']') 1077 next(opts) # discard closing bracket 1078 if special == 'R': 1079 rooted = True 1080 elif special == 'U': 1081 rooted = False 1082 elif special == 'W': 1083 weight = float(value) 1084 tree = Tree(name=name, weight=weight, rooted=rooted, 1085 tree=opts.rest().strip()) 1086 # if there's an active translation table, translate 1087 if self.translate: 1088 for n in tree.get_terminals(): 1089 try: 1090 tree.node(n).data.taxon = safename(self.translate[int(tree.node(n).data.taxon)]) 1091 except (ValueError, KeyError): 1092 raise NexusError('Unable to substitute %s using \'translate\' data.' 1093 % tree.node(n).data.taxon) 1094 self.trees.append(tree)
1095
1096 - def _apply_block_structure(self, title, lines):
1097 block = Block('') 1098 block.title = title 1099 for line in lines: 1100 block.commandlines.append(Commandline(line, title)) 1101 self.structured.append(block)
1102
1103 - def _taxset(self, options):
1104 name, taxa = self._get_indices(options, set_type=TAXSET) 1105 self.taxsets[name] = _make_unique(taxa)
1106
1107 - def _charset(self, options):
1108 name, sites = self._get_indices(options, set_type=CHARSET) 1109 self.charsets[name] = _make_unique(sites)
1110
1111 - def _taxpartition(self, options):
1112 taxpartition = {} 1113 quotelevel = False 1114 opts = CharBuffer(options) 1115 name = self._name_n_vector(opts) 1116 if not name: 1117 raise NexusError('Formatting error in taxpartition: %s ' % options) 1118 # now collect thesubbpartitions and parse them 1119 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1120 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1121 sub = '' 1122 while True: 1123 w = next(opts) 1124 if w is None or (w == ',' and not quotelevel): 1125 subname, subindices = self._get_indices(sub, set_type=TAXSET, separator=':') 1126 taxpartition[subname] = _make_unique(subindices) 1127 sub = '' 1128 if w is None: 1129 break 1130 else: 1131 if w == "'": 1132 quotelevel = not quotelevel 1133 sub += w 1134 self.taxpartitions[name] = taxpartition
1135
1136 - def _codonposset(self, options):
1137 """Read codon positions from a codons block as written from McClade. 1138 1139 Here codonposset is just a fancy name for a character partition with 1140 the name CodonPositions and the partitions N,1,2,3 1141 """ 1142 1143 prev_partitions = list(self.charpartitions.keys()) 1144 self._charpartition(options) 1145 # mcclade calls it CodonPositions, but you never know... 1146 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1147 if codonname == [] or len(codonname) > 1: 1148 raise NexusError('Formatting Error in codonposset: %s ' % options) 1149 else: 1150 self.codonposset = codonname[0]
1151
1152 - def _codeset(self, options):
1153 pass
1154
1155 - def _charpartition(self, options):
1156 charpartition = {} 1157 quotelevel = False 1158 opts = CharBuffer(options) 1159 name = self._name_n_vector(opts) 1160 if not name: 1161 raise NexusError('Formatting error in charpartition: %s ' % options) 1162 # now collect the subpartitions and parse them 1163 # subpartitions separated by commas - which unfortunately could be part 1164 # of a quoted identifier... 1165 sub = '' 1166 while True: 1167 w = next(opts) 1168 if w is None or (w == ',' and not quotelevel): 1169 subname, subindices = self._get_indices(sub, set_type=CHARSET, separator=':') 1170 charpartition[subname] = _make_unique(subindices) 1171 sub = '' 1172 if w is None: 1173 break 1174 else: 1175 if w == "'": 1176 quotelevel = not quotelevel 1177 sub += w 1178 self.charpartitions[name] = charpartition
1179
1180 - def _get_indices(self, options, set_type=CHARSET, separator='='):
1181 """Parse the taxset/charset specification (PRIVATE). 1182 1183 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1184 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1185 """ 1186 opts = CharBuffer(options) 1187 name = self._name_n_vector(opts, separator=separator) 1188 indices = self._parse_list(opts, set_type=set_type) 1189 if indices is None: 1190 raise NexusError('Formatting error in line: %s ' % options) 1191 return name, indices
1192
1193 - def _name_n_vector(self, opts, separator='='):
1194 """Extract name and check that it's not in vector format.""" 1195 rest = opts.rest() 1196 name = opts.next_word() 1197 # we ignore * before names 1198 if name == '*': 1199 name = opts.next_word() 1200 if not name: 1201 raise NexusError('Formatting error in line: %s ' % rest) 1202 name = quotestrip(name) 1203 if opts.peek_nonwhitespace == '(': 1204 open = opts.next_nonwhitespace() 1205 qualifier = open.next_word() 1206 close = opts.next_nonwhitespace() 1207 if qualifier.lower() == 'vector': 1208 raise NexusError('Unsupported VECTOR format in line %s' 1209 % (opts)) 1210 elif qualifier.lower() != 'standard': 1211 raise NexusError('Unknown qualifier %s in line %s' 1212 % (qualifier, opts)) 1213 if opts.next_nonwhitespace() != separator: 1214 raise NexusError('Formatting error in line: %s ' % rest) 1215 return name
1216
1217 - def _parse_list(self, options_buffer, set_type):
1218 """Parse a NEXUS list (PRIVATE). 1219 1220 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1221 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1222 """ 1223 plain_list = [] 1224 if options_buffer.peek_nonwhitespace(): 1225 try: 1226 # capture all possible exceptions and treat them as formatting 1227 # errors, if they are not NexusError 1228 while True: 1229 identifier = options_buffer.next_word() # next list element 1230 if not identifier: # end of list? 1231 break 1232 start = self._resolve(identifier, set_type=set_type) 1233 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1234 end = start 1235 step = 1 1236 # get hyphen and end of range 1237 hyphen = options_buffer.next_nonwhitespace() 1238 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1239 if set_type == CHARSET: 1240 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1241 backslash = options_buffer.next_nonwhitespace() 1242 step = int(options_buffer.next_word()) # get backslash and step 1243 plain_list.extend(range(start, end + 1, step)) 1244 else: 1245 if isinstance(start, list) or isinstance(end, list): 1246 raise NexusError('Name if character sets not allowed in range definition: %s' 1247 % identifier) 1248 start = self.taxlabels.index(start) 1249 end = self.taxlabels.index(end) 1250 taxrange = self.taxlabels[start:end + 1] 1251 plain_list.extend(taxrange) 1252 else: 1253 if isinstance(start, list): 1254 # start was the name of charset or taxset 1255 plain_list.extend(start) 1256 else: 1257 # start was an ordinary identifier 1258 plain_list.append(start) 1259 except NexusError: 1260 raise 1261 except: 1262 return None 1263 return plain_list
1264
1265 - def _resolve(self, identifier, set_type=None):
1266 """Translate identifier in list into character/taxon index. 1267 1268 Characters (which are referred to by their index in Nexus.py): 1269 Plain numbers are returned minus 1 (Nexus indices to python indices) 1270 Text identifiers are translated into their indices (if plain character identifiers), 1271 the first hit in charlabels is returned (charlabels don't need to be unique) 1272 or the range of indices is returned (if names of character sets). 1273 Taxa (which are referred to by their unique name in Nexus.py): 1274 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1275 Names are returned unchanged (if plain taxon identifiers), or the names in 1276 the corresponding taxon set is returned. 1277 """ 1278 identifier = quotestrip(identifier) 1279 if not set_type: 1280 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1281 if set_type == CHARSET: 1282 try: 1283 n = int(identifier) 1284 except ValueError: 1285 if self.charlabels and identifier in self.charlabels.values(): 1286 for k in self.charlabels: 1287 if self.charlabels[k] == identifier: 1288 return k 1289 elif self.charsets and identifier in self.charsets: 1290 return self.charsets[identifier] 1291 else: 1292 raise NexusError('Unknown character identifier: %s' 1293 % identifier) 1294 else: 1295 if n <= self.nchar: 1296 return n - 1 1297 else: 1298 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1299 % (identifier, self.nchar)) 1300 elif set_type == TAXSET: 1301 try: 1302 n = int(identifier) 1303 except ValueError: 1304 taxlabels_id = self._check_taxlabels(identifier) 1305 if taxlabels_id: 1306 return taxlabels_id 1307 elif self.taxsets and identifier in self.taxsets: 1308 return self.taxsets[identifier] 1309 else: 1310 raise NexusError('Unknown taxon identifier: %s' 1311 % identifier) 1312 else: 1313 if n > 0 and n <= self.ntax: 1314 return self.taxlabels[n - 1] 1315 else: 1316 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1317 % (identifier, self.ntax)) 1318 else: 1319 raise NexusError('Unknown set specification: %s.' % set_type)
1320
1321 - def _stateset(self, options):
1322 # Not implemented 1323 pass
1324
1325 - def _changeset(self, options):
1326 # Not implemented 1327 pass
1328
1329 - def _treeset(self, options):
1330 # Not implemented 1331 pass
1332
1333 - def _treepartition(self, options):
1334 # Not implemented 1335 pass
1336
1337 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, 1338 interleave=False, exclude=(), delete=(), 1339 charpartition=None, comment='', mrbayes=False):
1340 """Writes a nexus file for each partition in charpartition. 1341 1342 Only non-excluded characters and non-deleted taxa are included, 1343 just the data block is written. 1344 """ 1345 1346 if not matrix: 1347 matrix = self.matrix 1348 if not matrix: 1349 return 1350 if not filename: 1351 filename = self.filename 1352 if charpartition: 1353 pfilenames = {} 1354 for p in charpartition: 1355 total_exclude = list(exclude) 1356 total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p]) 1357 total_exclude = _make_unique(total_exclude) 1358 pcomment = comment + '\nPartition: ' + p + '\n' 1359 dot = filename.rfind('.') 1360 if dot > 0: 1361 pfilename = filename[:dot] + '_' + p + '.data' 1362 else: 1363 pfilename = filename + '_' + p 1364 pfilenames[p] = pfilename 1365 self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize, 1366 interleave=interleave, exclude=total_exclude, delete=delete, 1367 comment=pcomment, append_sets=False, mrbayes=mrbayes) 1368 return pfilenames 1369 else: 1370 fn = self.filename + '.data' 1371 self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize, 1372 interleave=interleave, exclude=exclude, delete=delete, 1373 comment=comment, append_sets=False, mrbayes=mrbayes) 1374 return fn
1375
1376 - def write_nexus_data(self, filename=None, matrix=None, exclude=(), delete=(), 1377 blocksize=None, interleave=False, interleave_by_partition=False, 1378 comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False, 1379 codons_block=True):
1380 """Writes a nexus file with data and sets block to a file or handle. 1381 1382 Character sets and partitions are appended by default, and are 1383 adjusted according to excluded characters (i.e. character sets 1384 still point to the same sites (not necessarily same positions), 1385 without including the deleted characters. 1386 1387 - filename - Either a filename as a string (which will be opened, 1388 written to and closed), or a handle object (which will 1389 be written to but NOT closed). 1390 - interleave_by_partition - Optional name of partition (string) 1391 - omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1392 start of the file is omitted. 1393 1394 Returns the filename/handle used to write the data. 1395 """ 1396 if not matrix: 1397 matrix = self.matrix 1398 if not matrix: 1399 return 1400 if not filename: 1401 filename = self.filename 1402 if [t for t in delete if not self._check_taxlabels(t)]: 1403 raise NexusError('Unknown taxa: %s' 1404 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1405 if interleave_by_partition: 1406 if interleave_by_partition not in self.charpartitions: 1407 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1408 else: 1409 partition = self.charpartitions[interleave_by_partition] 1410 # we need to sort the partition names by starting position 1411 # before we exclude characters 1412 names = _sort_keys_by_values(partition) 1413 newpartition = {} 1414 for p in partition: 1415 newpartition[p] = [c for c in partition[p] if c not in exclude] 1416 # how many taxa and how many characters are left? 1417 undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1418 cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete)) 1419 ntax_adjusted = len(undelete) 1420 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1421 if not undelete or (undelete and undelete[0] == ''): 1422 return 1423 1424 with File.as_handle(filename, mode='w') as fh: 1425 if not omit_NEXUS: 1426 fh.write('#NEXUS\n') 1427 if comment: 1428 fh.write('[' + comment + ']\n') 1429 fh.write('begin data;\n') 1430 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1431 fh.write('\tformat datatype=' + self.datatype) 1432 if self.respectcase: 1433 fh.write(' respectcase') 1434 if self.missing: 1435 fh.write(' missing=' + self.missing) 1436 if self.gap: 1437 fh.write(' gap=' + self.gap) 1438 if self.matchchar: 1439 fh.write(' matchchar=' + self.matchchar) 1440 if self.labels: 1441 fh.write(' labels=' + self.labels) 1442 if self.equate: 1443 fh.write(' equate=' + self.equate) 1444 if interleave or interleave_by_partition: 1445 fh.write(' interleave') 1446 fh.write(';\n') 1447 # if self.taxlabels: 1448 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1449 if self.charlabels: 1450 newcharlabels = self._adjust_charlabels(exclude=exclude) 1451 clkeys = sorted(newcharlabels) 1452 fh.write('charlabels ' + 1453 ', '.join("%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys) + 1454 ';\n') 1455 fh.write('matrix\n') 1456 if not blocksize: 1457 if interleave: 1458 blocksize = 70 1459 else: 1460 blocksize = self.nchar 1461 # delete deleted taxa and ecxclude excluded characters... 1462 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1463 if interleave_by_partition: 1464 # interleave by partitions, but adjust partitions with regard 1465 # to excluded characters 1466 seek = 0 1467 for p in names: 1468 fh.write('[%s: %s]\n' % (interleave_by_partition, p)) 1469 if len(newpartition[p]) > 0: 1470 for taxon in undelete: 1471 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1472 fh.write(cropped_matrix[taxon][seek:seek + len(newpartition[p])] + '\n') 1473 fh.write('\n') 1474 else: 1475 fh.write('[empty]\n\n') 1476 seek += len(newpartition[p]) 1477 elif interleave: 1478 for seek in range(0, nchar_adjusted, blocksize): 1479 for taxon in undelete: 1480 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1481 fh.write(cropped_matrix[taxon][seek:seek + blocksize] + '\n') 1482 fh.write('\n') 1483 else: 1484 for taxon in undelete: 1485 if blocksize < nchar_adjusted: 1486 fh.write(safename(taxon, mrbayes=mrbayes) + '\n') 1487 else: 1488 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1489 taxon_seq = cropped_matrix[taxon] 1490 for seek in range(0, nchar_adjusted, blocksize): 1491 fh.write(taxon_seq[seek:seek + blocksize] + '\n') 1492 del taxon_seq 1493 fh.write(';\nend;\n') 1494 if append_sets: 1495 if codons_block: 1496 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False)) 1497 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True)) 1498 else: 1499 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes)) 1500 return filename
1501
1502 - def append_sets(self, exclude=(), delete=(), mrbayes=False, include_codons=True, codons_only=False):
1503 """Returns a sets block.""" 1504 if not self.charsets and not self.taxsets and not self.charpartitions: 1505 return '' 1506 if codons_only: 1507 setsb = ['\nbegin codons'] 1508 else: 1509 setsb = ['\nbegin sets'] 1510 # - now if characters have been excluded, the character sets need to be adjusted, 1511 # so that they still point to the right character positions 1512 # calculate a list of offsets: for each deleted character, the following character position 1513 # in the new file will have an additional offset of -1 1514 offset = 0 1515 offlist = [] 1516 for c in range(self.nchar): 1517 if c in exclude: 1518 offset += 1 1519 offlist.append(-1) # dummy value as these character positions are excluded 1520 else: 1521 offlist.append(c - offset) 1522 # now adjust each of the character sets 1523 if not codons_only: 1524 for n, ns in self.charsets.items(): 1525 cset = [offlist[c] for c in ns if c not in exclude] 1526 if cset: 1527 setsb.append('charset %s = %s' % (safename(n), _compact4nexus(cset))) 1528 for n, s in self.taxsets.items(): 1529 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1530 if tset: 1531 setsb.append('taxset %s = %s' % (safename(n), ' '.join(tset))) 1532 for n, p in self.charpartitions.items(): 1533 if not include_codons and n == CODONPOSITIONS: 1534 continue 1535 elif codons_only and n != CODONPOSITIONS: 1536 continue 1537 # as characters have been excluded, the partitions must be adjusted 1538 # if a partition is empty, it will be omitted from the charpartition command 1539 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1540 names = _sort_keys_by_values(p) 1541 newpartition = {} 1542 for sn in names: 1543 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1544 if nsp: 1545 newpartition[sn] = nsp 1546 if newpartition: 1547 if include_codons and n == CODONPOSITIONS: 1548 command = 'codonposset' 1549 else: 1550 command = 'charpartition' 1551 setsb.append('%s %s = %s' % (command, safename(n), 1552 ', '.join('%s: %s' % (sn, _compact4nexus(newpartition[sn])) 1553 for sn in names if sn in newpartition))) 1554 # now write charpartititions, much easier than charpartitions 1555 for n, p in self.taxpartitions.items(): 1556 names = _sort_keys_by_values(p) 1557 newpartition = {} 1558 for sn in names: 1559 nsp = [t for t in p[sn] if t not in delete] 1560 if nsp: 1561 newpartition[sn] = nsp 1562 if newpartition: 1563 setsb.append('taxpartition %s = %s' % (safename(n), 1564 ', '.join('%s: %s' % (safename(sn), 1565 ' '.join(safename(x) for x in newpartition[sn])) 1566 for sn in names if sn in newpartition))) 1567 # add 'end' and return everything 1568 setsb.append('end;\n') 1569 if len(setsb) == 2: # begin and end only 1570 return '' 1571 else: 1572 return ';\n'.join(setsb)
1573
1574 - def export_fasta(self, filename=None, width=70):
1575 """Writes matrix into a fasta file.""" 1576 if not filename: 1577 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1578 filename = '.'.join(self.filename.split('.')[:-1]) + '.fas' 1579 else: 1580 filename = self.filename + '.fas' 1581 with open(filename, 'w') as fh: 1582 for taxon in self.taxlabels: 1583 fh.write('>' + safename(taxon) + '\n') 1584 for i in range(0, len(str(self.matrix[taxon])), width): 1585 fh.write(str(self.matrix[taxon])[i:i + width] + '\n') 1586 return filename
1587
1588 - def export_phylip(self, filename=None):
1589 """Writes matrix into a PHYLIP file. 1590 1591 Note that this writes a relaxed PHYLIP format file, where the names 1592 are not truncated, nor checked for invalid characters.""" 1593 if not filename: 1594 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1595 filename = '.'.join(self.filename.split('.')[:-1]) + '.phy' 1596 else: 1597 filename = self.filename + '.phy' 1598 with open(filename, 'w') as fh: 1599 fh.write('%d %d\n' % (self.ntax, self.nchar)) 1600 for taxon in self.taxlabels: 1601 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1602 return filename
1603
1604 - def constant(self, matrix=None, delete=(), exclude=()):
1605 """Return a list with all constant characters.""" 1606 if not matrix: 1607 matrix = self.matrix 1608 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1609 if not undelete: 1610 return None 1611 elif len(undelete) == 1: 1612 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1613 # get the first sequence and expand all ambiguous values 1614 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1615 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1616 1617 for taxon in undelete[1:]: 1618 newconstant = [] 1619 for site in constant: 1620 # print '%d (paup=%d)' % (site[0],site[0]+1), 1621 seqsite = matrix[taxon][site[0]].upper() 1622 # print seqsite,'checked against',site[1],'\t', 1623 if (seqsite == self.missing or 1624 (seqsite == self.gap and 1625 self.options['gapmode'].lower() == 'missing') or 1626 seqsite == site[1]): 1627 # missing or same as before -> ok 1628 newconstant.append(site) 1629 elif (seqsite in site[1] or 1630 site[1] == self.missing or 1631 (self.options['gapmode'].lower() == 'missing' and 1632 site[1] == self.gap)): 1633 # subset of an ambig or only missing in previous -> take subset 1634 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1635 elif seqsite in self.ambiguous_values: 1636 # is it an ambig: check the intersection with prev. values 1637 intersect = set(self.ambiguous_values[seqsite]).intersection(set(