Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  #           2014-2015 by Joe Cora (standard data) 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license. Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6  # 
   7  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   8  """Nexus class. Parse the contents of a NEXUS file. 
   9   
  10  Based upon 'NEXUS: An extensible file format for systematic information' 
  11  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  12  """ 
  13  from __future__ import print_function 
  14   
  15  from Bio._py3k import zip 
  16  from Bio._py3k import range 
  17  from Bio._py3k import basestring 
  18   
  19  from functools import reduce 
  20  import copy 
  21  import math 
  22  import random 
  23  import sys 
  24   
  25  from Bio import File 
  26  from Bio.Alphabet import IUPAC 
  27  from Bio.Data import IUPACData 
  28  from Bio.Seq import Seq 
  29   
  30  from Bio.Nexus.StandardData import StandardData 
  31  from Bio.Nexus.Trees import Tree 
  32   
  33   
  34  INTERLEAVE = 70 
  35  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  36                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  37                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  38  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  39  PUNCTUATION = '()[]{}\,;:=*\'"`+-<>' 
  40  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  41  WHITESPACE = ' \t\n' 
  42  # SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 
  43  SPECIALCOMMENTS = ['&']  # supported special comment ('tree' command), all others are ignored 
  44  CHARSET = 'chars' 
  45  TAXSET = 'taxa' 
  46  CODONPOSITIONS = 'codonpositions' 
  47  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  48   
  49   
50 -class NexusError(Exception):
51 pass
52 53
54 -class CharBuffer(object):
55 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 56 57 This class is not intended for public use (any more). 58 """
59 - def __init__(self, string):
60 if string: 61 self.buffer = list(string) 62 else: 63 self.buffer = []
64
65 - def peek(self):
66 if self.buffer: 67 return self.buffer[0] 68 else: 69 return None
70
71 - def peek_nonwhitespace(self):
72 b = ''.join(self.buffer).strip() 73 if b: 74 return b[0] 75 else: 76 return None
77
78 - def __next__(self):
79 if self.buffer: 80 return self.buffer.pop(0) 81 else: 82 return None
83 84 if sys.version_info[0] < 3:
85 - def next(self):
86 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 87 return self.__next__()
88
89 - def next_nonwhitespace(self):
90 while True: 91 p = next(self) 92 if p is None: 93 break 94 if p not in WHITESPACE: 95 return p 96 return None
97
98 - def skip_whitespace(self):
99 while self.buffer[0] in WHITESPACE: 100 self.buffer = self.buffer[1:]
101
102 - def next_until(self, target):
103 for t in target: 104 try: 105 pos = self.buffer.index(t) 106 except ValueError: 107 pass 108 else: 109 found = ''.join(self.buffer[:pos]) 110 self.buffer = self.buffer[pos:] 111 return found 112 else: 113 return None
114
115 - def peek_word(self, word):
116 return ''.join(self.buffer[:len(word)]) == word
117
118 - def next_word(self):
119 """Return the next NEXUS word from a string. 120 121 This deals with single and double quotes, whitespace and punctuation. 122 """ 123 word = [] 124 quoted = False 125 # get first character 126 first = self.next_nonwhitespace() 127 if not first: 128 # return empty if only whitespace left 129 return None 130 word.append(first) 131 if first == "'": 132 quoted = "'" 133 elif first == '"': 134 quoted = '"' 135 elif first in PUNCTUATION: 136 # if it's non-quote punctuation, return immediately 137 return first 138 while True: 139 c = self.peek() 140 if c == quoted: # a quote? 141 word.append(next(self)) # store quote 142 if self.peek() == quoted: # double quote 143 next(self) # skip second quote 144 elif quoted: # second single quote ends word 145 break 146 elif quoted: 147 # if quoted, then add anything 148 word.append(next(self)) 149 elif not c or c in PUNCTUATION or c in WHITESPACE: 150 # if not quoted and special character, stop 151 break 152 else: 153 word.append(next(self)) # standard character 154 return ''.join(word)
155
156 - def rest(self):
157 """Return the rest of the string without parsing.""" 158 return ''.join(self.buffer)
159 160
161 -class StepMatrix(object):
162 """Calculate a stepmatrix for weighted parsimony. 163 164 See Wheeler (1990), Cladistics 6:269-275. 165 """
166 - def __init__(self, symbols, gap):
167 self.data = {} 168 self.symbols = sorted(symbols) 169 if gap: 170 self.symbols.append(gap) 171 for x in self.symbols: 172 for y in [s for s in self.symbols if s != x]: 173 self.set(x, y, 0)
174
175 - def set(self, x, y, value):
176 if x > y: 177 x, y = y, x 178 self.data[x + y] = value
179
180 - def add(self, x, y, value):
181 if x > y: 182 x, y = y, x 183 self.data[x + y] += value
184
185 - def sum(self):
186 return reduce(lambda x, y: x + y, self.data.values())
187
188 - def transformation(self):
189 total = self.sum() 190 if total != 0: 191 for k in self.data: 192 self.data[k] = self.data[k] / float(total) 193 return self
194
195 - def weighting(self):
196 for k in self.data: 197 if self.data[k] != 0: 198 self.data[k] = -math.log(self.data[k]) 199 return self
200
201 - def smprint(self, name='your_name_here'):
202 matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols)) 203 matrix += ' %s\n' % ' '.join(self.symbols) 204 for x in self.symbols: 205 matrix += '[%s]'.ljust(8) % x 206 for y in self.symbols: 207 if x == y: 208 matrix += ' . ' 209 else: 210 if x > y: 211 x1, y1 = y, x 212 else: 213 x1, y1 = x, y 214 if self.data[x1 + y1] == 0: 215 matrix += 'inf. ' 216 else: 217 matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1]) 218 matrix += '\n' 219 matrix += ';\n' 220 return matrix
221 222
223 -def safename(name, mrbayes=False):
224 """Return a taxon identifier according to NEXUS standard. 225 226 Wrap quotes around names with punctuation or whitespace, and double 227 single quotes. 228 229 mrbayes=True: write names without quotes, whitespace or punctuation 230 for the mrbayes software package. 231 """ 232 if mrbayes: 233 safe = name.replace(' ', '_') 234 safe = ''.join(c for c in safe if c in MRBAYESSAFE) 235 else: 236 safe = name.replace("'", "''") 237 if set(safe).intersection(set(WHITESPACE + PUNCTUATION)): 238 safe = "'" + safe + "'" 239 return safe
240 241
242 -def quotestrip(word):
243 """Remove quotes and/or double quotes around identifiers.""" 244 if not word: 245 return None 246 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 247 word = word[1:-1] 248 return word
249 250
251 -def get_start_end(sequence, skiplist=('-', '?')):
252 """Return position of first and last character which is not in skiplist. 253 254 Skiplist defaults to ['-','?']. 255 """ 256 length = len(sequence) 257 if length == 0: 258 return None, None 259 end = length - 1 260 while end >= 0 and (sequence[end] in skiplist): 261 end -= 1 262 start = 0 263 while start < length and (sequence[start] in skiplist): 264 start += 1 265 if start == length and end == -1: # empty sequence 266 return -1, -1 267 else: 268 return start, end
269 270
271 -def _sort_keys_by_values(p):
272 """Returns a sorted list of keys of p sorted by values of p.""" 273 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
274 275
276 -def _make_unique(l):
277 """Check that all values in list are unique and return a pruned and sorted list.""" 278 return sorted(set(l))
279 280
281 -def _unique_label(previous_labels, label):
282 """Returns a unique name if label is already in previous_labels.""" 283 while label in previous_labels: 284 label_split = label.split('.') 285 if label_split[-1].startswith('copy'): 286 copy_num = 1 287 if label_split[-1] != "copy": 288 copy_num = int(label_split[-1][4:]) + 1 289 new_label = "%s.copy%s" % ('.'.join(label_split[:-1]), copy_num) 290 label = new_label 291 else: 292 label += '.copy' 293 return label
294 295
296 -def _seqmatrix2strmatrix(matrix):
297 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 298 return dict((t, str(matrix[t])) for t in matrix)
299 300
301 -def _compact4nexus(orig_list):
302 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 303 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 304 """ 305 if not orig_list: 306 return '' 307 orig_list = sorted(set(orig_list)) 308 shortlist = [] 309 clist = orig_list[:] 310 clist.append(clist[-1] + .5) # dummy value makes it easier 311 while len(clist) > 1: 312 step = 1 313 for i, x in enumerate(clist): 314 if x == clist[0] + i * step: # are we still in the right step? 315 continue 316 elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]: 317 # second element, and possibly at least 3 elements to link, 318 # and the next one is in the right step 319 step = x - clist[0] 320 else: # pattern broke, add all values before current position to new list 321 sub = clist[:i] 322 if len(sub) == 1: 323 shortlist.append(str(sub[0] + 1)) 324 else: 325 if step == 1: 326 shortlist.append('%d-%d' % (sub[0] + 1, sub[-1] + 1)) 327 else: 328 shortlist.append('%d-%d\\%d' % (sub[0] + 1, sub[-1] + 1, step)) 329 clist = clist[i:] 330 break 331 return ' '.join(shortlist)
332 333
334 -def combine(matrices):
335 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 336 337 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 338 Character sets, character partitions and taxon sets are prefixed, readjusted 339 and present in the combined matrix. 340 """ 341 if not matrices: 342 return None 343 name = matrices[0][0] 344 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 345 mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1) 346 if mixed_datatypes: 347 # dealing with mixed matrices is application specific. 348 # You take care of that yourself! 349 combined.datatype = 'None' 350 # raise NexusError('Matrices must be of same datatype') 351 combined.charlabels = None 352 combined.statelabels = None 353 combined.interleave = False 354 combined.translate = None 355 356 # rename taxon sets and character sets and name them with prefix 357 for cn, cs in combined.charsets.items(): 358 combined.charsets['%s.%s' % (name, cn)] = cs 359 del combined.charsets[cn] 360 for tn, ts in combined.taxsets.items(): 361 combined.taxsets['%s.%s' % (name, tn)] = ts 362 del combined.taxsets[tn] 363 # previous partitions usually don't make much sense in combined matrix 364 # just initiate one new partition parted by single matrices 365 combined.charpartitions = {'combined': {name: list(range(combined.nchar))}} 366 for n, m in matrices[1:]: # add all other matrices 367 both = [t for t in combined.taxlabels if t in m.taxlabels] 368 combined_only = [t for t in combined.taxlabels if t not in both] 369 m_only = [t for t in m.taxlabels if t not in both] 370 for t in both: 371 # concatenate sequences and unify gap and missing character symbols 372 combined.matrix[t] += Seq(str(m.matrix[t]) 373 .replace(m.gap, combined.gap) 374 .replace(m.missing, combined.missing), 375 combined.alphabet) 376 # replace date of missing taxa with symbol for missing data 377 for t in combined_only: 378 combined.matrix[t] += Seq(combined.missing * m.nchar, 379 combined.alphabet) 380 for t in m_only: 381 combined.matrix[t] = Seq(combined.missing * combined.nchar, 382 combined.alphabet) + \ 383 Seq(str(m.matrix[t]) 384 .replace(m.gap, combined.gap) 385 .replace(m.missing, combined.missing), 386 combined.alphabet) 387 combined.taxlabels.extend(m_only) # new taxon list 388 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 389 combined.charsets['%s.%s' % (n, cn)] = [x + combined.nchar for x in cs] 390 if m.taxsets: 391 if not combined.taxsets: 392 combined.taxsets = {} 393 # update taxon sets 394 combined.taxsets.update(dict(('%s.%s' % (n, tn), ts) 395 for tn, ts in m.taxsets.items())) 396 # update new charpartition 397 combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar + m.nchar)) 398 # update charlabels 399 if m.charlabels: 400 if not combined.charlabels: 401 combined.charlabels = {} 402 combined.charlabels.update(dict((combined.nchar + i, label) 403 for (i, label) in m.charlabels.items())) 404 combined.nchar += m.nchar # update nchar and ntax 405 combined.ntax += len(m_only) 406 407 # some prefer partitions, some charsets: 408 # make separate charset for ecah initial dataset 409 for c in combined.charpartitions['combined']: 410 combined.charsets[c] = combined.charpartitions['combined'][c] 411 412 return combined
413 414
415 -def _kill_comments_and_break_lines(text):
416 """Delete []-delimited comments out of a file and break into lines separated by ';'. 417 418 stripped_text=_kill_comments_and_break_lines(text): 419 Nested and multiline comments are allowed. [ and ] symbols within single 420 or double quotes are ignored, newline ends a quote, all symbols with quotes are 421 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 422 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 423 Quotes inside special [& and [\ are treated as normal characters, 424 but no nesting inside these special comments allowed (like [& [\ ]]). 425 ';' ist deleted from end of line. 426 427 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 428 """ 429 contents = iter(text) 430 newtext = [] 431 newline = [] 432 quotelevel = '' 433 speciallevel = False 434 commlevel = 0 435 # Parse with one character look ahead (for special comments) 436 t2 = next(contents) 437 while True: 438 t = t2 439 try: 440 t2 = next(contents) 441 except StopIteration: 442 t2 = None 443 if t is None: 444 break 445 if t == quotelevel and not (commlevel or speciallevel): 446 # matching quote ends quotation 447 quotelevel = '' 448 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 449 # single or double quote starts quotation 450 quotelevel = t 451 elif not quotelevel and t == '[': 452 # opening bracket outside a quote 453 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 454 speciallevel = True 455 else: 456 commlevel += 1 457 elif not quotelevel and t == ']': 458 # closing bracket ioutside a quote 459 if speciallevel: 460 speciallevel = False 461 else: 462 commlevel -= 1 463 if commlevel < 0: 464 raise NexusError('Nexus formatting error: unmatched ]') 465 continue 466 if commlevel == 0: 467 # copy if we're not in comment 468 if t == ';' and not quotelevel: 469 newtext.append(''.join(newline)) 470 newline = [] 471 else: 472 newline.append(t) 473 # level of comments should be 0 at the end of the file 474 if newline: 475 newtext.append('\n'.join(newline)) 476 if commlevel > 0: 477 raise NexusError('Nexus formatting error: unmatched [') 478 return newtext
479 480
481 -def _adjust_lines(lines):
482 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 483 484 list_of_commandlines=_adjust_lines(input_text) 485 Lines are adjusted so that no linebreaks occur within a commandline 486 (except matrix command line) 487 """ 488 formatted_lines = [] 489 for l in lines: 490 # Convert line endings 491 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 492 if l.lower().startswith('matrix'): 493 formatted_lines.append(l) 494 else: 495 l = l.replace('\n', ' ') 496 if l: 497 formatted_lines.append(l) 498 return formatted_lines
499 500
501 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
502 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 503 opening = seq.find('(') 504 while opening > -1: 505 closing = seq.find(')') 506 if closing < 0: 507 raise NexusError('Missing closing parenthesis in: ' + seq) 508 elif closing < opening: 509 raise NexusError('Missing opening parenthesis in: ' + seq) 510 ambig = ''.join(sorted(seq[opening + 1:closing])) 511 ambig_code = rev_ambig_values[ambig.upper()] 512 if ambig != ambig.upper(): 513 ambig_code = ambig_code.lower() 514 seq = seq[:opening] + ambig_code + seq[closing + 1:] 515 opening = seq.find('(') 516 return seq
517 518
519 -class Commandline(object):
520 """Represent a commandline as command and options.""" 521
522 - def __init__(self, line, title):
523 self.options = {} 524 options = [] 525 self.command = None 526 try: 527 # Assume matrix (all other command lines have been stripped of \n) 528 self.command, options = line.strip().split('\n', 1) 529 except ValueError: # Not matrix 530 # self.command,options=line.split(' ',1) # no: could be tab or spaces (translate...) 531 self.command = line.split()[0] 532 options = ' '.join(line.split()[1:]) 533 self.command = self.command.strip().lower() 534 if self.command in SPECIAL_COMMANDS: 535 # special command that need newlines and order of options preserved 536 self.options = options.strip() 537 else: 538 if len(options) > 0: 539 try: 540 options = options.replace('=', ' = ').split() 541 valued_indices = [(n - 1, n, n + 1) for n in range(len(options)) 542 if options[n] == '=' and n != 0 and n != len((options))] 543 indices = [] 544 for sl in valued_indices: 545 indices.extend(sl) 546 token_indices = [n for n in range(len(options)) if n not in indices] 547 for opt in valued_indices: 548 # self.options[options[opt[0]].lower()] = options[opt[2]].lower() 549 self.options[options[opt[0]].lower()] = options[opt[2]] 550 for token in token_indices: 551 self.options[options[token].lower()] = None 552 except ValueError: 553 raise NexusError('Incorrect formatting in line: %s' % line)
554 555
556 -class Block(object):
557 """Represent a NEXUS block with block name and list of commandlines."""
558 - def __init__(self, title=None):
559 self.title = title 560 self.commandlines = []
561 562
563 -class Nexus(object):
564
565 - def __init__(self, input=None):
566 self.ntax = 0 # number of taxa 567 self.nchar = 0 # number of characters 568 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 569 self.taxlabels = [] # labels for taxa, ordered by their id 570 self.charlabels = None # ... and for characters 571 self.statelabels = None # ... and for states 572 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 573 self.respectcase = False # case sensitivity 574 self.missing = '?' # symbol for missing characters 575 self.gap = '-' # symbol for gap 576 self.symbols = None # set of symbols 577 self.equate = None # set of symbol synonyms 578 self.matchchar = None # matching char for matrix representation 579 self.labels = None # left, right, no 580 self.transpose = False # whether matrix is transposed 581 self.interleave = False # whether matrix is interleaved 582 self.tokens = False # unsupported 583 self.eliminate = None # unsupported 584 self.matrix = None # ... 585 self.unknown_blocks = [] # blocks we don't care about 586 self.taxsets = {} 587 self.charsets = {} 588 self.charpartitions = {} 589 self.taxpartitions = {} 590 self.trees = [] # list of Trees (instances of Tree class) 591 self.translate = None # Dict to translate taxon <-> taxon numbers 592 self.structured = [] # structured input representation 593 self.set = {} # dict of the set command to set various options 594 self.options = {} # dict of the options command in the data block 595 self.codonposset = None # name of the charpartition that defines codon positions 596 597 # some defaults 598 self.options['gapmode'] = 'missing' 599 600 if input: 601 self.read(input) 602 else: 603 self.read(DEFAULTNEXUS)
604
605 - def get_original_taxon_order(self):
606 """Included for backwards compatibility (DEPRECATED).""" 607 return self.taxlabels
608
609 - def set_original_taxon_order(self, value):
610 """Included for backwards compatibility (DEPRECATED).""" 611 self.taxlabels = value
612 613 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 614
615 - def read(self, input):
616 """Read and parse NEXUS input (a filename, file-handle, or string).""" 617 # 1. Assume we have the name of a file in the execution dir or a 618 # file-like object. 619 # Note we need to add parsing of the path to dir/filename 620 try: 621 with File.as_handle(input, 'rU') as fp: 622 file_contents = fp.read() 623 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 624 except (TypeError, IOError, AttributeError): 625 # 2. Assume we have a string from a fh.read() 626 if isinstance(input, basestring): 627 file_contents = input 628 self.filename = 'input_string' 629 else: 630 print(input.strip()[:50]) 631 raise NexusError('Unrecognized input: %s ...' % input[:100]) 632 file_contents = file_contents.strip() 633 if file_contents.startswith('#NEXUS'): 634 file_contents = file_contents[6:] 635 commandlines = _get_command_lines(file_contents) 636 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 637 for i, cl in enumerate(commandlines): 638 try: 639 if cl[:6].upper() == '#NEXUS': 640 commandlines[i] = cl[6:].strip() 641 except IndexError: 642 pass 643 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 644 nexus_block_gen = self._get_nexus_block(commandlines) 645 while True: 646 try: 647 title, contents = next(nexus_block_gen) 648 except StopIteration: 649 break 650 if title in KNOWN_NEXUS_BLOCKS: 651 self._parse_nexus_block(title, contents) 652 else: 653 self._unknown_nexus_block(title, contents)
654
655 - def _get_nexus_block(self, file_contents):
656 """Generator for looping through Nexus blocks.""" 657 inblock = False 658 blocklines = [] 659 while file_contents: 660 cl = file_contents.pop(0) 661 if cl.lower().startswith('begin'): 662 if not inblock: 663 inblock = True 664 title = cl.split()[1].lower() 665 else: 666 raise NexusError('Illegal block nesting in block %s' % title) 667 elif cl.lower().startswith('end'): 668 if inblock: 669 inblock = False 670 yield title, blocklines 671 blocklines = [] 672 else: 673 raise NexusError('Unmatched \'end\'.') 674 elif inblock: 675 blocklines.append(cl)
676
677 - def _unknown_nexus_block(self, title, contents):
678 block = Block() 679 block.commandlines.append(contents) 680 block.title = title 681 self.unknown_blocks.append(block)
682
683 - def _parse_nexus_block(self, title, contents):
684 """Parse a known Nexus Block (PRIVATE).""" 685 # attached the structured block representation 686 self._apply_block_structure(title, contents) 687 # now check for taxa,characters,data blocks. If this stuff is defined more than once 688 # the later occurrences will override the previous ones. 689 block = self.structured[-1] 690 for line in block.commandlines: 691 try: 692 getattr(self, '_' + line.command)(line.options) 693 except AttributeError: 694 raise NexusError('Unknown command: %s ' % line.command)
695
696 - def _title(self, options):
697 pass
698 701
702 - def _dimensions(self, options):
703 if 'ntax' in options: 704 self.ntax = eval(options['ntax']) 705 if 'nchar' in options: 706 self.nchar = eval(options['nchar'])
707
708 - def _format(self, options):
709 # print options 710 # we first need to test respectcase, then symbols (which depends on respectcase) 711 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 712 # dicts for ambiguous values and alphabet 713 if 'respectcase' in options: 714 self.respectcase = True 715 # adjust symbols to for respectcase 716 if 'symbols' in options: 717 self.symbols = ''.join(options['symbols'].split()) 718 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or \ 719 (self.symbols.startswith("'") and self.symbols.endswith("'")): 720 self.symbols = self.symbols[1:-1] 721 if not self.respectcase: 722 self.symbols = list(self.symbols.upper()) 723 # self.symbols = self.symbols.lower() + self.symbols.upper() 724 # self.symbols = list(set(self.symbols)) 725 if 'datatype' in options: 726 self.datatype = options['datatype'].lower() 727 728 if self.datatype == 'dna' or self.datatype == 'nucleotide': 729 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 730 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 731 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 732 elif self.datatype == 'rna': 733 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 734 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 735 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 736 elif self.datatype == 'protein': 737 # TODO - Should this not be ExtendedIUPACProtein? 738 self.alphabet = IUPAC.IUPACProtein() # fresh instance 739 self.ambiguous_values = {'B': 'DN', 740 'Z': 'EQ', 741 'X': IUPACData.protein_letters} 742 # that's how PAUP handles it 743 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 744 elif self.datatype == 'standard': 745 self.alphabet = None 746 self.ambiguous_values = {} 747 if not self.symbols: 748 # PARSER BUG ## 749 # This error arises when symbols are absent or when 750 # whitespace is located within the SYMBOLS command values. 751 # The Nexus parser quits reading the SYMBOLS line upon 752 # finding a whitespace character. 753 raise NexusError( 754 "Symbols must be defined when using standard datatype. " 755 "Please remove any whitespace (spaces, tabs, etc.) " 756 "between values for symbols as this confuses the Nexus " 757 "parser.") 758 759 self.unambiguous_letters = ''.join(self.symbols) 760 if not self.respectcase: 761 self.unambiguous_letters += self.unambiguous_letters.lower() 762 else: 763 raise NexusError('Unsupported datatype: ' + self.datatype) 764 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 765 if not self.respectcase: 766 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 767 # we have to sort the reverse ambig coding dict key characters: 768 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 769 rev = dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0] != 'X') 770 self.rev_ambiguous_values = {} 771 for (k, v) in rev.items(): 772 key = sorted(c for c in k) 773 self.rev_ambiguous_values[''.join(key)] = v 774 # overwrite symbols for datype rna,dna,nucleotide 775 if self.datatype in ['dna', 'rna', 'nucleotide']: 776 self.symbols = self.alphabet.letters 777 if self.missing not in self.ambiguous_values: 778 self.ambiguous_values[self.missing] = self.unambiguous_letters + self.gap 779 self.ambiguous_values[self.gap] = self.gap 780 # elif self.datatype == 'standard': 781 # if not self.symbols: 782 # self.symbols = ['0', '1'] 783 if 'missing' in options: 784 self.missing = options['missing'][0] 785 if 'gap' in options: 786 self.gap = options['gap'][0] 787 if 'equate' in options: 788 self.equate = options['equate'] 789 if 'matchchar' in options: 790 self.matchchar = options['matchchar'][0] 791 if 'labels' in options: 792 self.labels = options['labels'] 793 if 'transpose' in options: 794 raise NexusError('TRANSPOSE is not supported!') 795 self.transpose = True 796 if 'interleave' in options: 797 if options['interleave'] is None or options['interleave'].lower() == 'yes': 798 self.interleave = True 799 if 'tokens' in options: 800 self.tokens = True 801 if 'notokens' in options: 802 self.tokens = False
803
804 - def _set(self, options):
805 self.set = options
806
807 - def _options(self, options):
808 self.options = options
809
810 - def _eliminate(self, options):
811 self.eliminate = options
812
813 - def _taxlabels(self, options):
814 """Get taxon labels (PRIVATE). 815 816 As the taxon names are already in the matrix, this is superfluous 817 except for transpose matrices, which are currently unsupported anyway. 818 Thus, we ignore the taxlabels command to make handling of duplicate 819 taxon names easier. 820 """ 821 pass
822 # self.taxlabels = [] 823 # opts = CharBuffer(options) 824 # while True: 825 # taxon = quotestrip(opts.next_word()) 826 # if not taxon: 827 # break 828 # self.taxlabels.append(taxon) 829
830 - def _check_taxlabels(self, taxon):
831 """Check for presence of taxon in self.taxlabels.""" 832 # According to NEXUS standard, underscores shall be treated as spaces..., 833 # so checking for identity is more difficult 834 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 835 nexid = taxon.replace(' ', '_') 836 return nextaxa.get(nexid)
837
838 - def _charlabels(self, options):
839 self.charlabels = {} 840 opts = CharBuffer(options) 841 while True: 842 # get id and state 843 w = opts.next_word() 844 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 845 break 846 identifier = self._resolve(w, set_type=CHARSET) 847 state = quotestrip(opts.next_word()) 848 self.charlabels[identifier] = state 849 # check for comma or end of command 850 c = opts.next_nonwhitespace() 851 if c is None: 852 break 853 elif c != ',': 854 raise NexusError('Missing \',\' in line %s.' % options)
855
856 - def _charstatelabels(self, options):
857 self.charlabels = {} 858 self.statelabels = {} 859 opts = CharBuffer(options) 860 861 # Make sure symbols are defined 862 if not self.symbols: 863 raise NexusError( 864 'Symbols must be defined when using character states') 865 866 while True: 867 # get id and character name 868 w = opts.next_word() 869 870 # McClade saves and reads charlabel-lists with terminal comma?! 871 if w is None: 872 break 873 874 identifier = self._resolve(w, set_type=CHARSET) 875 character = quotestrip(opts.next_word()) 876 877 self.charlabels[identifier] = character 878 self.statelabels[identifier] = [] 879 880 # check for comma, slash or end of command 881 c = opts.next_nonwhitespace() 882 883 if c is None: 884 break 885 elif c != ',': 886 # Check if states are defined, otherwise report error 887 if c != '/': 888 raise NexusError('Missing \',\' in line %s.' % options) 889 890 # Get the first state 891 state = quotestrip(opts.next_word()) 892 893 if state is None: 894 raise NexusError( 895 'Missing character state in line %s.' % options) 896 897 while True: 898 # Make sure current state does not exceed number of 899 # available symbols 900 if len(self.statelabels[identifier]) > len(self.symbols): 901 raise NexusError( 902 'Character states exceed number of available symbols in line %s.' % options) 903 904 # Add the character state to the statelabels 905 self.statelabels[identifier].append(state) 906 907 # Check for another state or comma to end states (last 908 # character should not have comma at end of states - but 909 # we'll ignore) 910 state = quotestrip(opts.next_word()) 911 912 if state is None: 913 return 914 elif state is ',': 915 break
916
917 - def _statelabels(self, options):
918 # self.charlabels = options 919 # print 'Command statelabels is not supported and will be ignored.' 920 pass
921
922 - def _matrix(self, options):
923 if not self.ntax or not self.nchar: 924 raise NexusError('Dimensions must be specified before matrix!') 925 self.matrix = {} 926 taxcount = 0 927 first_matrix_block = True 928 929 # eliminate empty lines and leading/trailing whitespace 930 lines = [l.strip() for l in options.split('\n') if l.strip() != ''] 931 lineiter = iter(lines) 932 while True: 933 try: 934 l = next(lineiter) 935 except StopIteration: 936 if taxcount < self.ntax: 937 raise NexusError('Not enough taxa in matrix.') 938 elif taxcount > self.ntax: 939 raise NexusError('Too many taxa in matrix.') 940 else: 941 break 942 # count the taxa and check for interleaved matrix 943 taxcount += 1 944 if taxcount > self.ntax: 945 if not self.interleave: 946 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 947 else: 948 taxcount = 1 949 first_matrix_block = False 950 # get taxon name and sequence 951 linechars = CharBuffer(l) 952 id = quotestrip(linechars.next_word()) 953 l = linechars.rest().strip() 954 chars = '' 955 if self.interleave: 956 # interleaved matrix 957 if l: 958 chars = ''.join(l.split()) 959 else: 960 chars = ''.join(next(lineiter).split()) 961 else: 962 # non-interleaved matrix 963 chars = ''.join(l.split()) 964 while len(chars) < self.nchar: 965 l = next(lineiter) 966 chars += ''.join(l.split()) 967 968 # Reformat sequence for non-standard datatypes 969 if self.datatype != 'standard': 970 iupac_seq = Seq(_replace_parenthesized_ambigs( 971 chars, self.rev_ambiguous_values), self.alphabet) 972 # first taxon has the reference sequence if matchhar is used 973 if taxcount == 1: 974 refseq = iupac_seq 975 else: 976 if self.matchchar: 977 while True: 978 p = str(iupac_seq).find(self.matchchar) 979 if p == -1: 980 break 981 iupac_seq = Seq(str(iupac_seq)[:p] + refseq[ 982 p] + str(iupac_seq)[p + 1:], self.alphabet) 983 984 # Check for invalid characters 985 for i, c in enumerate(str(iupac_seq)): 986 if c not in self.valid_characters and c != self.gap and c != self.missing: 987 raise NexusError("Taxon %s: Illegal character %s in sequence %s " 988 "(check dimensions/interleaving)" % (id, c, iupac_seq)) 989 else: 990 iupac_seq = StandardData(chars) 991 992 # Check for invalid characters 993 for i, c in enumerate(iupac_seq): 994 # Go through each coding for each character 995 for coding in c['d']: 996 if coding not in self.valid_characters: 997 if coding != self.gap and coding != self.missing: 998 raise NexusError("Taxon %s: Illegal character %s " 999 "in sequence %s " 1000 "(check dimensions/interleaving)" 1001 % (id, coding, iupac_seq)) 1002 1003 # add sequence to matrix 1004 if first_matrix_block: 1005 self.unaltered_taxlabels.append(id) 1006 id = _unique_label(list(self.matrix.keys()), id) 1007 self.matrix[id] = iupac_seq 1008 self.taxlabels.append(id) 1009 else: 1010 # taxon names need to be in the same order in each interleaved block 1011 id = _unique_label(self.taxlabels[:taxcount - 1], id) 1012 taxon_present = self._check_taxlabels(id) 1013 if taxon_present: 1014 self.matrix[taxon_present] += iupac_seq 1015 else: 1016 raise NexusError("Taxon %s not in first block of interleaved " 1017 "matrix. Check matrix dimensions and interleave." % id) 1018 # check all sequences for length according to nchar 1019 for taxon in self.matrix: 1020 if len(self.matrix[taxon]) != self.nchar: 1021 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 1022 % (self.nchar, len(self.matrix[taxon]), taxon)) 1023 # check that taxlabels is identical with matrix.keys. If not, it's a problem 1024 matrixkeys = sorted(self.matrix) 1025 taxlabelssort = sorted(self.taxlabels[:]) 1026 assert matrixkeys == taxlabelssort, \ 1027 "ERROR: TAXLABELS must be identical with MATRIX. " + \ 1028 "Please Report this as a bug, and send in data file."
1029
1030 - def _translate(self, options):
1031 self.translate = {} 1032 opts = CharBuffer(options) 1033 while True: 1034 try: 1035 # get id and state 1036 identifier = int(opts.next_word()) 1037 label = quotestrip(opts.next_word()) 1038 self.translate[identifier] = label 1039 # check for comma or end of command 1040 c = opts.next_nonwhitespace() 1041 if c is None: 1042 break 1043 elif c != ',': 1044 raise NexusError('Missing \',\' in line %s.' % options) 1045 except NexusError: 1046 raise 1047 except Exception: # TODO: ValueError? 1048 raise NexusError('Format error in line %s.' % options)
1049
1050 - def _utree(self, options):
1051 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 1052 self._tree(options)
1053
1054 - def _tree(self, options):
1055 opts = CharBuffer(options) 1056 if opts.peek_nonwhitespace() == '*': 1057 # a star can be used to make it the default tree in some software packages 1058 dummy = opts.next_nonwhitespace() 1059 name = opts.next_word() 1060 if opts.next_nonwhitespace() != '=': 1061 raise NexusError('Syntax error in tree description: %s' 1062 % options[:50]) 1063 rooted = False 1064 weight = 1.0 1065 while opts.peek_nonwhitespace() == '[': 1066 opts.next_nonwhitespace() # discard opening bracket 1067 symbol = next(opts) 1068 if symbol != '&': 1069 raise NexusError('Illegal special comment [%s...] in tree description: %s' 1070 % (symbol, options[:50])) 1071 special = next(opts) 1072 value = opts.next_until(']') 1073 next(opts) # discard closing bracket 1074 if special == 'R': 1075 rooted = True 1076 elif special == 'U': 1077 rooted = False 1078 elif special == 'W': 1079 weight = float(value) 1080 tree = Tree(name=name, weight=weight, rooted=rooted, 1081 tree=opts.rest().strip()) 1082 # if there's an active translation table, translate 1083 if self.translate: 1084 for n in tree.get_terminals(): 1085 try: 1086 tree.node(n).data.taxon = safename(self.translate[int(tree.node(n).data.taxon)]) 1087 except (ValueError, KeyError): 1088 raise NexusError('Unable to substitute %s using \'translate\' data.' 1089 % tree.node(n).data.taxon) 1090 self.trees.append(tree)
1091
1092 - def _apply_block_structure(self, title, lines):
1093 block = Block('') 1094 block.title = title 1095 for line in lines: 1096 block.commandlines.append(Commandline(line, title)) 1097 self.structured.append(block)
1098
1099 - def _taxset(self, options):
1100 name, taxa = self._get_indices(options, set_type=TAXSET) 1101 self.taxsets[name] = _make_unique(taxa)
1102
1103 - def _charset(self, options):
1104 name, sites = self._get_indices(options, set_type=CHARSET) 1105 self.charsets[name] = _make_unique(sites)
1106
1107 - def _taxpartition(self, options):
1108 taxpartition = {} 1109 quotelevel = False 1110 opts = CharBuffer(options) 1111 name = self._name_n_vector(opts) 1112 if not name: 1113 raise NexusError('Formatting error in taxpartition: %s ' % options) 1114 # now collect thesubbpartitions and parse them 1115 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1116 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1117 sub = '' 1118 while True: 1119 w = next(opts) 1120 if w is None or (w == ',' and not quotelevel): 1121 subname, subindices = self._get_indices(sub, set_type=TAXSET, separator=':') 1122 taxpartition[subname] = _make_unique(subindices) 1123 sub = '' 1124 if w is None: 1125 break 1126 else: 1127 if w == "'": 1128 quotelevel = not quotelevel 1129 sub += w 1130 self.taxpartitions[name] = taxpartition
1131
1132 - def _codonposset(self, options):
1133 """Read codon positions from a codons block as written from McClade. 1134 1135 Here codonposset is just a fancy name for a character partition with 1136 the name CodonPositions and the partitions N,1,2,3 1137 """ 1138 prev_partitions = list(self.charpartitions.keys()) 1139 self._charpartition(options) 1140 # mcclade calls it CodonPositions, but you never know... 1141 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1142 if codonname == [] or len(codonname) > 1: 1143 raise NexusError('Formatting Error in codonposset: %s ' % options) 1144 else: 1145 self.codonposset = codonname[0]
1146
1147 - def _codeset(self, options):
1148 pass
1149
1150 - def _charpartition(self, options):
1151 charpartition = {} 1152 quotelevel = False 1153 opts = CharBuffer(options) 1154 name = self._name_n_vector(opts) 1155 if not name: 1156 raise NexusError('Formatting error in charpartition: %s ' % options) 1157 # now collect the subpartitions and parse them 1158 # subpartitions separated by commas - which unfortunately could be part 1159 # of a quoted identifier... 1160 sub = '' 1161 while True: 1162 w = next(opts) 1163 if w is None or (w == ',' and not quotelevel): 1164 subname, subindices = self._get_indices(sub, set_type=CHARSET, separator=':') 1165 charpartition[subname] = _make_unique(subindices) 1166 sub = '' 1167 if w is None: 1168 break 1169 else: 1170 if w == "'": 1171 quotelevel = not quotelevel 1172 sub += w 1173 self.charpartitions[name] = charpartition
1174
1175 - def _get_indices(self, options, set_type=CHARSET, separator='='):
1176 """Parse the taxset/charset specification (PRIVATE). 1177 1178 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1179 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1180 """ 1181 opts = CharBuffer(options) 1182 name = self._name_n_vector(opts, separator=separator) 1183 indices = self._parse_list(opts, set_type=set_type) 1184 if indices is None: 1185 raise NexusError('Formatting error in line: %s ' % options) 1186 return name, indices
1187
1188 - def _name_n_vector(self, opts, separator='='):
1189 """Extract name and check that it's not in vector format.""" 1190 rest = opts.rest() 1191 name = opts.next_word() 1192 # we ignore * before names 1193 if name == '*': 1194 name = opts.next_word() 1195 if not name: 1196 raise NexusError('Formatting error in line: %s ' % rest) 1197 name = quotestrip(name) 1198 if opts.peek_nonwhitespace == '(': 1199 open = opts.next_nonwhitespace() 1200 qualifier = open.next_word() 1201 close = opts.next_nonwhitespace() 1202 if qualifier.lower() == 'vector': 1203 raise NexusError('Unsupported VECTOR format in line %s' 1204 % (opts)) 1205 elif qualifier.lower() != 'standard': 1206 raise NexusError('Unknown qualifier %s in line %s' 1207 % (qualifier, opts)) 1208 if opts.next_nonwhitespace() != separator: 1209 raise NexusError('Formatting error in line: %s ' % rest) 1210 return name
1211
1212 - def _parse_list(self, options_buffer, set_type):
1213 """Parse a NEXUS list (PRIVATE). 1214 1215 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1216 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1217 """ 1218 plain_list = [] 1219 if options_buffer.peek_nonwhitespace(): 1220 try: 1221 # capture all possible exceptions and treat them as formatting 1222 # errors, if they are not NexusError 1223 while True: 1224 identifier = options_buffer.next_word() # next list element 1225 if not identifier: # end of list? 1226 break 1227 start = self._resolve(identifier, set_type=set_type) 1228 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1229 end = start 1230 step = 1 1231 # get hyphen and end of range 1232 hyphen = options_buffer.next_nonwhitespace() 1233 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1234 if set_type == CHARSET: 1235 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1236 backslash = options_buffer.next_nonwhitespace() 1237 step = int(options_buffer.next_word()) # get backslash and step 1238 plain_list.extend(range(start, end + 1, step)) 1239 else: 1240 if isinstance(start, list) or isinstance(end, list): 1241 raise NexusError('Name if character sets not allowed in range definition: %s' 1242 % identifier) 1243 start = self.taxlabels.index(start) 1244 end = self.taxlabels.index(end) 1245 taxrange = self.taxlabels[start:end + 1] 1246 plain_list.extend(taxrange) 1247 else: 1248 if isinstance(start, list): 1249 # start was the name of charset or taxset 1250 plain_list.extend(start) 1251 else: 1252 # start was an ordinary identifier 1253 plain_list.append(start) 1254 except NexusError: 1255 raise 1256 except: 1257 return None 1258 return plain_list
1259
1260 - def _resolve(self, identifier, set_type=None):
1261 """Translate identifier in list into character/taxon index. 1262 1263 Characters (which are referred to by their index in Nexus.py): 1264 Plain numbers are returned minus 1 (Nexus indices to python indices) 1265 Text identifiers are translated into their indices (if plain character identifiers), 1266 the first hit in charlabels is returned (charlabels don't need to be unique) 1267 or the range of indices is returned (if names of character sets). 1268 Taxa (which are referred to by their unique name in Nexus.py): 1269 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1270 Names are returned unchanged (if plain taxon identifiers), or the names in 1271 the corresponding taxon set is returned. 1272 """ 1273 identifier = quotestrip(identifier) 1274 if not set_type: 1275 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1276 if set_type == CHARSET: 1277 try: 1278 n = int(identifier) 1279 except ValueError: 1280 if self.charlabels and identifier in self.charlabels.values(): 1281 for k in self.charlabels: 1282 if self.charlabels[k] == identifier: 1283 return k 1284 elif self.charsets and identifier in self.charsets: 1285 return self.charsets[identifier] 1286 else: 1287 raise NexusError('Unknown character identifier: %s' 1288 % identifier) 1289 else: 1290 if n <= self.nchar: 1291 return n - 1 1292 else: 1293 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1294 % (identifier, self.nchar)) 1295 elif set_type == TAXSET: 1296 try: 1297 n = int(identifier) 1298 except ValueError: 1299 taxlabels_id = self._check_taxlabels(identifier) 1300 if taxlabels_id: 1301 return taxlabels_id 1302 elif self.taxsets and identifier in self.taxsets: 1303 return self.taxsets[identifier] 1304 else: 1305 raise NexusError('Unknown taxon identifier: %s' 1306 % identifier) 1307 else: 1308 if n > 0 and n <= self.ntax: 1309 return self.taxlabels[n - 1] 1310 else: 1311 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1312 % (identifier, self.ntax)) 1313 else: 1314 raise NexusError('Unknown set specification: %s.' % set_type)
1315
1316 - def _stateset(self, options):
1317 # Not implemented 1318 pass
1319
1320 - def _changeset(self, options):
1321 # Not implemented 1322 pass
1323
1324 - def _treeset(self, options):
1325 # Not implemented 1326 pass
1327
1328 - def _treepartition(self, options):
1329 # Not implemented 1330 pass
1331
1332 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, 1333 interleave=False, exclude=(), delete=(), 1334 charpartition=None, comment='', mrbayes=False):
1335 """Writes a nexus file for each partition in charpartition. 1336 1337 Only non-excluded characters and non-deleted taxa are included, 1338 just the data block is written. 1339 """ 1340 if not matrix: 1341 matrix = self.matrix 1342 if not matrix: 1343 return 1344 if not filename: 1345 filename = self.filename 1346 if charpartition: 1347 pfilenames = {} 1348 for p in charpartition: 1349 total_exclude = list(exclude) 1350 total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p]) 1351 total_exclude = _make_unique(total_exclude) 1352 pcomment = comment + '\nPartition: ' + p + '\n' 1353 dot = filename.rfind('.') 1354 if dot > 0: 1355 pfilename = filename[:dot] + '_' + p + '.data' 1356 else: 1357 pfilename = filename + '_' + p 1358 pfilenames[p] = pfilename 1359 self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize, 1360 interleave=interleave, exclude=total_exclude, delete=delete, 1361 comment=pcomment, append_sets=False, mrbayes=mrbayes) 1362 return pfilenames 1363 else: 1364 fn = self.filename + '.data' 1365 self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize, 1366 interleave=interleave, exclude=exclude, delete=delete, 1367 comment=comment, append_sets=False, mrbayes=mrbayes) 1368 return fn
1369
1370 - def write_nexus_data(self, filename=None, matrix=None, exclude=(), delete=(), 1371 blocksize=None, interleave=False, interleave_by_partition=False, 1372 comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False, 1373 codons_block=True):
1374 """Writes a nexus file with data and sets block to a file or handle. 1375 1376 Character sets and partitions are appended by default, and are 1377 adjusted according to excluded characters (i.e. character sets 1378 still point to the same sites (not necessarily same positions), 1379 without including the deleted characters. 1380 1381 - filename - Either a filename as a string (which will be opened, 1382 written to and closed), or a handle object (which will 1383 be written to but NOT closed). 1384 - interleave_by_partition - Optional name of partition (string) 1385 - omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1386 start of the file is omitted. 1387 1388 Returns the filename/handle used to write the data. 1389 """ 1390 if not matrix: 1391 matrix = self.matrix 1392 if not matrix: 1393 return 1394 if not filename: 1395 filename = self.filename 1396 if [t for t in delete if not self._check_taxlabels(t)]: 1397 raise NexusError('Unknown taxa: %s' 1398 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1399 if interleave_by_partition: 1400 if interleave_by_partition not in self.charpartitions: 1401 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1402 else: 1403 partition = self.charpartitions[interleave_by_partition] 1404 # we need to sort the partition names by starting position 1405 # before we exclude characters 1406 names = _sort_keys_by_values(partition) 1407 newpartition = {} 1408 for p in partition: 1409 newpartition[p] = [c for c in partition[p] if c not in exclude] 1410 # how many taxa and how many characters are left? 1411 undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1412 cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete)) 1413 ntax_adjusted = len(undelete) 1414 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1415 if not undelete or (undelete and undelete[0] == ''): 1416 return 1417 1418 with File.as_handle(filename, mode='w') as fh: 1419 if not omit_NEXUS: 1420 fh.write('#NEXUS\n') 1421 if comment: 1422 fh.write('[' + comment + ']\n') 1423 fh.write('begin data;\n') 1424 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1425 fh.write('\tformat datatype=' + self.datatype) 1426 if self.respectcase: 1427 fh.write(' respectcase') 1428 if self.missing: 1429 fh.write(' missing=' + self.missing) 1430 if self.gap: 1431 fh.write(' gap=' + self.gap) 1432 if self.matchchar: 1433 fh.write(' matchchar=' + self.matchchar) 1434 if self.labels: 1435 fh.write(' labels=' + self.labels) 1436 if self.equate: 1437 fh.write(' equate=' + self.equate) 1438 if interleave or interleave_by_partition: 1439 fh.write(' interleave') 1440 fh.write(';\n') 1441 # if self.taxlabels: 1442 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1443 if self.charlabels: 1444 newcharlabels = self._adjust_charlabels(exclude=exclude) 1445 clkeys = sorted(newcharlabels) 1446 fh.write('charlabels ' + 1447 ', '.join("%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys) + 1448 ';\n') 1449 fh.write('matrix\n') 1450 if not blocksize: 1451 if interleave: 1452 blocksize = 70 1453 else: 1454 blocksize = self.nchar 1455 # delete deleted taxa and ecxclude excluded characters... 1456 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1457 if interleave_by_partition: 1458 # interleave by partitions, but adjust partitions with regard 1459 # to excluded characters 1460 seek = 0 1461 for p in names: 1462 fh.write('[%s: %s]\n' % (interleave_by_partition, p)) 1463 if len(newpartition[p]) > 0: 1464 for taxon in undelete: 1465 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1466 fh.write(cropped_matrix[taxon][seek:seek + len(newpartition[p])] + '\n') 1467 fh.write('\n') 1468 else: 1469 fh.write('[empty]\n\n') 1470 seek += len(newpartition[p]) 1471 elif interleave: 1472 for seek in range(0, nchar_adjusted, blocksize): 1473 for taxon in undelete: 1474 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1475 fh.write(cropped_matrix[taxon][seek:seek + blocksize] + '\n') 1476 fh.write('\n') 1477 else: 1478 for taxon in undelete: 1479 if blocksize < nchar_adjusted: 1480 fh.write(safename(taxon, mrbayes=mrbayes) + '\n') 1481 else: 1482 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1483 taxon_seq = cropped_matrix[taxon] 1484 for seek in range(0, nchar_adjusted, blocksize): 1485 fh.write(taxon_seq[seek:seek + blocksize] + '\n') 1486 del taxon_seq 1487 fh.write(';\nend;\n') 1488 if append_sets: 1489 if codons_block: 1490 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False)) 1491 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True)) 1492 else: 1493 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes)) 1494 return filename
1495
1496 - def append_sets(self, exclude=(), delete=(), mrbayes=False, include_codons=True, codons_only=False):
1497 """Returns a sets block.""" 1498 if not self.charsets and not self.taxsets and not self.charpartitions: 1499 return '' 1500 if codons_only: 1501 setsb = ['\nbegin codons'] 1502 else: 1503 setsb = ['\nbegin sets'] 1504 # - now if characters have been excluded, the character sets need to be adjusted, 1505 # so that they still point to the right character positions 1506 # calculate a list of offsets: for each deleted character, the following character position 1507 # in the new file will have an additional offset of -1 1508 offset = 0 1509 offlist = [] 1510 for c in range(self.nchar): 1511 if c in exclude: 1512 offset += 1 1513 offlist.append(-1) # dummy value as these character positions are excluded 1514 else: 1515 offlist.append(c - offset) 1516 # now adjust each of the character sets 1517 if not codons_only: 1518 for n, ns in self.charsets.items(): 1519 cset = [offlist[c] for c in ns if c not in exclude] 1520 if cset: 1521 setsb.append('charset %s = %s' % (safename(n), _compact4nexus(cset))) 1522 for n, s in self.taxsets.items(): 1523 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1524 if tset: 1525 setsb.append('taxset %s = %s' % (safename(n), ' '.join(tset))) 1526 for n, p in self.charpartitions.items(): 1527 if not include_codons and n == CODONPOSITIONS: 1528 continue 1529 elif codons_only and n != CODONPOSITIONS: 1530 continue 1531 # as characters have been excluded, the partitions must be adjusted 1532 # if a partition is empty, it will be omitted from the charpartition command 1533 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1534 names = _sort_keys_by_values(p) 1535 newpartition = {} 1536 for sn in names: 1537 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1538 if nsp: 1539 newpartition[sn] = nsp 1540 if newpartition: 1541 if include_codons and n == CODONPOSITIONS: 1542 command = 'codonposset' 1543 else: 1544 command = 'charpartition' 1545 setsb.append('%s %s = %s' % (command, safename(n), 1546 ', '.join('%s: %s' % (sn, _compact4nexus(newpartition[sn])) 1547 for sn in names if sn in newpartition))) 1548 # now write charpartititions, much easier than charpartitions 1549 for n, p in self.taxpartitions.items(): 1550 names = _sort_keys_by_values(p) 1551 newpartition = {} 1552 for sn in names: 1553 nsp = [t for t in p[sn] if t not in delete] 1554 if nsp: 1555 newpartition[sn] = nsp 1556 if newpartition: 1557 setsb.append('taxpartition %s = %s' % (safename(n), 1558 ', '.join('%s: %s' % (safename(sn), 1559 ' '.join(safename(x) for x in newpartition[sn])) 1560 for sn in names if sn in newpartition))) 1561 # add 'end' and return everything 1562 setsb.append('end;\n') 1563 if len(setsb) == 2: # begin and end only 1564 return '' 1565 else: 1566 return ';\n'.join(setsb)
1567
1568 - def export_fasta(self, filename=None, width=70):
1569 """Writes matrix into a fasta file.""" 1570 if not filename: 1571 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1572 filename = '.'.join(self.filename.split('.')[:-1]) + '.fas' 1573 else: 1574 filename = self.filename + '.fas' 1575 with open(filename, 'w') as fh: 1576 for taxon in self.taxlabels: 1577 fh.write('>' + safename(taxon) + '\n') 1578 for i in range(0, len(str(self.matrix[taxon])), width): 1579 fh.write(str(self.matrix[taxon])[i:i + width] + '\n') 1580 return filename
1581
1582 - def export_phylip(self, filename=None):
1583 """Writes matrix into a PHYLIP file. 1584 1585 Note that this writes a relaxed PHYLIP format file, where the names 1586 are not truncated, nor checked for invalid characters.""" 1587 if not filename: 1588 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1589 filename = '.'.join(self.filename.split('.')[:-1]) + '.phy' 1590 else: 1591 filename = self.filename + '.phy' 1592 with open(filename, 'w') as fh: 1593 fh.write('%d %d\n' % (self.ntax, self.nchar)) 1594 for taxon in self.taxlabels: 1595 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1596 return filename
1597
1598 - def constant(self, matrix=None, delete=(), exclude=()):
1599 """Return a list with all constant characters.""" 1600 if not matrix: 1601 matrix = self.matrix 1602 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1603 if not undelete: 1604 return None 1605 elif len(undelete) == 1: 1606 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1607 # get the first sequence and expand all ambiguous values 1608 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1609 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1610 1611 for taxon in undelete[1:]: 1612 newconstant = [] 1613 for site in constant: 1614 # print '%d (paup=%d)' % (site[0],site[0]+1), 1615 seqsite = matrix[taxon][site[0]].upper() 1616 # print seqsite,'checked against',site[1],'\t', 1617 if seqsite == self.missing or \ 1618 (seqsite == self.gap and self.options['gapmode'].lower() == 'missing') or \ 1619 seqsite == site[1]: 1620 # missing or same as before -> ok 1621 newconstant.append(site) 1622 elif (seqsite in site[1] or 1623 site[1] == self.missing or 1624 (self.options['gapmode'].lower() == 'missing' and 1625 site[1] == self.gap)): 1626 # subset of an ambig or only missing in previous -> take subset 1627 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1628 elif seqsite in self.ambiguous_values: 1629 # is it an ambig: check the intersection with prev. values 1630 intersect = set(self.ambiguous_values[seqsite]).intersection(set(