Package Bio :: Package Nexus :: Module Nexus
[hide private]
[frames] | no frames]

Source Code for Module Bio.Nexus.Nexus

   1  # Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license. Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5  # 
   6  # Bug reports welcome: fkauff@biologie.uni-kl.de or on Biopython's bugzilla. 
   7  """Nexus class. Parse the contents of a NEXUS file. 
   8   
   9  Based upon 'NEXUS: An extensible file format for systematic information' 
  10  Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 
  11  """ 
  12  from __future__ import print_function 
  13   
  14  from Bio._py3k import zip 
  15  from Bio._py3k import range 
  16  from Bio._py3k import basestring 
  17   
  18  from functools import reduce 
  19  import copy 
  20  import math 
  21  import random 
  22  import sys 
  23   
  24  from Bio import File 
  25  from Bio.Alphabet import IUPAC 
  26  from Bio.Data import IUPACData 
  27  from Bio.Seq import Seq 
  28   
  29  from .Trees import Tree 
  30   
  31  __docformat__ = "restructuredtext en" 
  32   
  33  INTERLEAVE = 70 
  34  SPECIAL_COMMANDS = ['charstatelabels', 'charlabels', 'taxlabels', 'taxset', 
  35                      'charset', 'charpartition', 'taxpartition', 'matrix', 
  36                      'tree', 'utree', 'translate', 'codonposset', 'title'] 
  37  KNOWN_NEXUS_BLOCKS = ['trees', 'data', 'characters', 'taxa', 'sets', 'codons'] 
  38  PUNCTUATION = '()[]{}/\,;:=*\'"`+-<>' 
  39  MRBAYESSAFE = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_' 
  40  WHITESPACE = ' \t\n' 
  41  # SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 
  42  SPECIALCOMMENTS = ['&'] # supported special comment ('tree' command), all others are ignored 
  43  CHARSET = 'chars' 
  44  TAXSET = 'taxa' 
  45  CODONPOSITIONS = 'codonpositions' 
  46  DEFAULTNEXUS = '#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; ' 
  47   
  48   
49 -class NexusError(Exception):
50 pass
51 52
53 -class CharBuffer(object):
54 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 55 56 This class is not intended for public use (any more). 57 """
58 - def __init__(self, string):
59 if string: 60 self.buffer = list(string) 61 else: 62 self.buffer = []
63
64 - def peek(self):
65 if self.buffer: 66 return self.buffer[0] 67 else: 68 return None
69
70 - def peek_nonwhitespace(self):
71 b = ''.join(self.buffer).strip() 72 if b: 73 return b[0] 74 else: 75 return None
76
77 - def __next__(self):
78 if self.buffer: 79 return self.buffer.pop(0) 80 else: 81 return None
82 83 if sys.version_info[0] < 3:
84 - def next(self):
85 """Deprecated Python 2 style alias for Python 3 style __next__ method.""" 86 return self.__next__()
87
88 - def next_nonwhitespace(self):
89 while True: 90 p = next(self) 91 if p is None: 92 break 93 if p not in WHITESPACE: 94 return p 95 return None
96
97 - def skip_whitespace(self):
98 while self.buffer[0] in WHITESPACE: 99 self.buffer = self.buffer[1:]
100
101 - def next_until(self, target):
102 for t in target: 103 try: 104 pos = self.buffer.index(t) 105 except ValueError: 106 pass 107 else: 108 found = ''.join(self.buffer[:pos]) 109 self.buffer = self.buffer[pos:] 110 return found 111 else: 112 return None
113
114 - def peek_word(self, word):
115 return ''.join(self.buffer[:len(word)]) == word
116
117 - def next_word(self):
118 """Return the next NEXUS word from a string. 119 120 This deals with single and double quotes, whitespace and punctuation. 121 """ 122 123 word = [] 124 quoted = False 125 first = self.next_nonwhitespace() # get first character 126 if not first: # return empty if only whitespace left 127 return None 128 word.append(first) 129 if first == "'": # word starts with a quote 130 quoted = "'" 131 elif first == '"': 132 quoted = '"' 133 elif first in PUNCTUATION: # if it's punctuation, return immediately 134 return first 135 while True: 136 c = self.peek() 137 if c == quoted: # a quote? 138 word.append(next(self)) # store quote 139 if self.peek() == quoted: # double quote 140 skip = next(self) # skip second quote 141 elif quoted: # second single quote ends word 142 break 143 elif quoted: 144 word.append(next(self)) # if quoted, then add anything 145 elif not c or c in PUNCTUATION or c in WHITESPACE: 146 # if not quoted and special character, stop 147 break 148 else: 149 word.append(next(self)) # standard character 150 return ''.join(word)
151
152 - def rest(self):
153 """Return the rest of the string without parsing.""" 154 return ''.join(self.buffer)
155 156
157 -class StepMatrix(object):
158 """Calculate a stepmatrix for weighted parsimony. 159 160 See Wheeler (1990), Cladistics 6:269-275. 161 """ 162
163 - def __init__(self, symbols, gap):
164 self.data = {} 165 self.symbols = sorted(symbols) 166 if gap: 167 self.symbols.append(gap) 168 for x in self.symbols: 169 for y in [s for s in self.symbols if s != x]: 170 self.set(x, y, 0)
171
172 - def set(self, x, y, value):
173 if x > y: 174 x, y = y, x 175 self.data[x + y] = value
176
177 - def add(self, x, y, value):
178 if x > y: 179 x, y = y, x 180 self.data[x + y] += value
181
182 - def sum(self):
183 return reduce(lambda x, y: x+y, self.data.values())
184
185 - def transformation(self):
186 total = self.sum() 187 if total != 0: 188 for k in self.data: 189 self.data[k] = self.data[k] / float(total) 190 return self
191
192 - def weighting(self):
193 for k in self.data: 194 if self.data[k] != 0: 195 self.data[k] = -math.log(self.data[k]) 196 return self
197
198 - def smprint(self, name='your_name_here'):
199 matrix = 'usertype %s stepmatrix=%d\n' % (name, len(self.symbols)) 200 matrix += ' %s\n' % ' '.join(self.symbols) 201 for x in self.symbols: 202 matrix += '[%s]'.ljust(8) % x 203 for y in self.symbols: 204 if x == y: 205 matrix += ' . ' 206 else: 207 if x > y: 208 x1, y1 = y, x 209 else: 210 x1, y1 = x, y 211 if self.data[x1 + y1] == 0: 212 matrix += 'inf. ' 213 else: 214 matrix += '%2.2f'.ljust(10) % (self.data[x1 + y1]) 215 matrix += '\n' 216 matrix += ';\n' 217 return matrix
218 219
220 -def safename(name, mrbayes=False):
221 """Return a taxon identifier according to NEXUS standard. 222 223 Wrap quotes around names with punctuation or whitespace, and double 224 single quotes. 225 226 mrbayes=True: write names without quotes, whitespace or punctuation 227 for the mrbayes software package. 228 """ 229 if mrbayes: 230 safe = name.replace(' ', '_') 231 safe = ''.join(c for c in safe if c in MRBAYESSAFE) 232 else: 233 safe = name.replace("'", "''") 234 if set(safe).intersection(set(WHITESPACE+PUNCTUATION)): 235 safe = "'" + safe + "'" 236 return safe
237 238
239 -def quotestrip(word):
240 """Remove quotes and/or double quotes around identifiers.""" 241 if not word: 242 return None 243 while (word.startswith("'") and word.endswith("'")) or (word.startswith('"') and word.endswith('"')): 244 word = word[1:-1] 245 return word
246 247
248 -def get_start_end(sequence, skiplist=['-', '?']):
249 """Return position of first and last character which is not in skiplist. 250 251 Skiplist defaults to ['-','?']).""" 252 253 length = len(sequence) 254 if length == 0: 255 return None, None 256 end = length - 1 257 while end >= 0 and (sequence[end] in skiplist): 258 end -= 1 259 start = 0 260 while start < length and (sequence[start] in skiplist): 261 start += 1 262 if start == length and end == -1: # empty sequence 263 return -1, -1 264 else: 265 return start, end
266 267
268 -def _sort_keys_by_values(p):
269 """Returns a sorted list of keys of p sorted by values of p.""" 270 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
271 272
273 -def _make_unique(l):
274 """Check that all values in list are unique and return a pruned and sorted list.""" 275 return sorted(set(l))
276 277
278 -def _unique_label(previous_labels, label):
279 """Returns a unique name if label is already in previous_labels.""" 280 while label in previous_labels: 281 if label.split('.')[-1].startswith('copy'): 282 label = '.'.join(label.split('.')[:-1]) \ 283 + '.copy' + str(eval('0'+label.split('.')[-1][4:])+1) 284 else: 285 label += '.copy' 286 return label
287 288
289 -def _seqmatrix2strmatrix(matrix):
290 """Converts a Seq-object matrix to a plain sequence-string matrix.""" 291 return dict((t, str(matrix[t])) for t in matrix)
292 293
294 -def _compact4nexus(orig_list):
295 """Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 296 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 297 """ 298 299 if not orig_list: 300 return '' 301 orig_list = sorted(set(orig_list)) 302 shortlist = [] 303 clist = orig_list[:] 304 clist.append(clist[-1] + .5) # dummy value makes it easier 305 while len(clist) > 1: 306 step = 1 307 for i, x in enumerate(clist): 308 if x == clist[0] + i*step: # are we still in the right step? 309 continue 310 elif i == 1 and len(clist) > 3 and clist[i+1] - x == x - clist[0]: 311 # second element, and possibly at least 3 elements to link, 312 # and the next one is in the right step 313 step = x - clist[0] 314 else: # pattern broke, add all values before current position to new list 315 sub = clist[:i] 316 if len(sub) == 1: 317 shortlist.append(str(sub[0]+1)) 318 else: 319 if step == 1: 320 shortlist.append('%d-%d' % (sub[0]+1, sub[-1]+1)) 321 else: 322 shortlist.append('%d-%d\\%d' % (sub[0]+1, sub[-1]+1, step)) 323 clist = clist[i:] 324 break 325 return ' '.join(shortlist)
326 327
328 -def combine(matrices):
329 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 330 331 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 332 Character sets, character partitions and taxon sets are prefixed, readjusted 333 and present in the combined matrix. 334 """ 335 336 if not matrices: 337 return None 338 name = matrices[0][0] 339 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 340 mixed_datatypes = (len(set(n[1].datatype for n in matrices)) > 1) 341 if mixed_datatypes: 342 # dealing with mixed matrices is application specific. 343 # You take care of that yourself! 344 combined.datatype = 'None' 345 # raise NexusError('Matrices must be of same datatype') 346 combined.charlabels = None 347 combined.statelabels = None 348 combined.interleave = False 349 combined.translate = None 350 351 # rename taxon sets and character sets and name them with prefix 352 for cn, cs in combined.charsets.items(): 353 combined.charsets['%s.%s' % (name, cn)]=cs 354 del combined.charsets[cn] 355 for tn, ts in combined.taxsets.items(): 356 combined.taxsets['%s.%s' % (name, tn)]=ts 357 del combined.taxsets[tn] 358 # previous partitions usually don't make much sense in combined matrix 359 # just initiate one new partition parted by single matrices 360 combined.charpartitions = {'combined': {name: list(range(combined.nchar))}} 361 for n, m in matrices[1:]: # add all other matrices 362 both = [t for t in combined.taxlabels if t in m.taxlabels] 363 combined_only = [t for t in combined.taxlabels if t not in both] 364 m_only = [t for t in m.taxlabels if t not in both] 365 for t in both: 366 # concatenate sequences and unify gap and missing character symbols 367 combined.matrix[t] += Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 368 # replace date of missing taxa with symbol for missing data 369 for t in combined_only: 370 combined.matrix[t] += Seq(combined.missing*m.nchar, combined.alphabet) 371 for t in m_only: 372 combined.matrix[t] = Seq(combined.missing*combined.nchar, combined.alphabet) + \ 373 Seq(str(m.matrix[t]).replace(m.gap, combined.gap).replace(m.missing, combined.missing), combined.alphabet) 374 combined.taxlabels.extend(m_only) # new taxon list 375 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 376 combined.charsets['%s.%s' % (n, cn)] = [x+combined.nchar for x in cs] 377 if m.taxsets: 378 if not combined.taxsets: 379 combined.taxsets = {} 380 # update taxon sets 381 combined.taxsets.update(dict(('%s.%s' % (n, tn), ts) 382 for tn, ts in m.taxsets.items())) 383 # update new charpartition 384 combined.charpartitions['combined'][n] = list(range(combined.nchar, combined.nchar+m.nchar)) 385 # update charlabels 386 if m.charlabels: 387 if not combined.charlabels: 388 combined.charlabels = {} 389 combined.charlabels.update(dict((combined.nchar + i, label) 390 for (i, label) in m.charlabels.items())) 391 combined.nchar += m.nchar # update nchar and ntax 392 combined.ntax += len(m_only) 393 394 # some prefer partitions, some charsets: 395 # make separate charset for ecah initial dataset 396 for c in combined.charpartitions['combined']: 397 combined.charsets[c] = combined.charpartitions['combined'][c] 398 399 return combined
400 401
402 -def _kill_comments_and_break_lines(text):
403 """Delete []-delimited comments out of a file and break into lines separated by ';'. 404 405 stripped_text=_kill_comments_and_break_lines(text): 406 Nested and multiline comments are allowed. [ and ] symbols within single 407 or double quotes are ignored, newline ends a quote, all symbols with quotes are 408 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 409 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 410 Quotes inside special [& and [\ are treated as normal characters, 411 but no nesting inside these special comments allowed (like [& [\ ]]). 412 ';' ist deleted from end of line. 413 414 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 415 """ 416 contents = iter(text) 417 newtext = [] 418 newline = [] 419 quotelevel = '' 420 speciallevel = False 421 commlevel = 0 422 # Parse with one character look ahead (for special comments) 423 t2 = next(contents) 424 while True: 425 t = t2 426 try: 427 t2 = next(contents) 428 except StopIteration: 429 t2 = None 430 if t is None: 431 break 432 if t == quotelevel and not (commlevel or speciallevel): 433 # matching quote ends quotation 434 quotelevel = '' 435 elif not quotelevel and not (commlevel or speciallevel) and (t == '"' or t == "'"): 436 # single or double quote starts quotation 437 quotelevel=t 438 elif not quotelevel and t == '[': 439 # opening bracket outside a quote 440 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 441 speciallevel = True 442 else: 443 commlevel += 1 444 elif not quotelevel and t == ']': 445 # closing bracket ioutside a quote 446 if speciallevel: 447 speciallevel = False 448 else: 449 commlevel -= 1 450 if commlevel < 0: 451 raise NexusError('Nexus formatting error: unmatched ]') 452 continue 453 if commlevel == 0: 454 # copy if we're not in comment 455 if t == ';' and not quotelevel: 456 newtext.append(''.join(newline)) 457 newline=[] 458 else: 459 newline.append(t) 460 # level of comments should be 0 at the end of the file 461 if newline: 462 newtext.append('\n'.join(newline)) 463 if commlevel > 0: 464 raise NexusError('Nexus formatting error: unmatched [') 465 return newtext
466 467
468 -def _adjust_lines(lines):
469 """Adjust linebreaks to match ';', strip leading/trailing whitespace. 470 471 list_of_commandlines=_adjust_lines(input_text) 472 Lines are adjusted so that no linebreaks occur within a commandline 473 (except matrix command line) 474 """ 475 formatted_lines = [] 476 for l in lines: 477 # Convert line endings 478 l = l.replace('\r\n', '\n').replace('\r', '\n').strip() 479 if l.lower().startswith('matrix'): 480 formatted_lines.append(l) 481 else: 482 l = l.replace('\n', ' ') 483 if l: 484 formatted_lines.append(l) 485 return formatted_lines
486 487
488 -def _replace_parenthesized_ambigs(seq, rev_ambig_values):
489 """Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code.""" 490 491 opening = seq.find('(') 492 while opening > -1: 493 closing = seq.find(')') 494 if closing < 0: 495 raise NexusError('Missing closing parenthesis in: '+seq) 496 elif closing < opening: 497 raise NexusError('Missing opening parenthesis in: '+seq) 498 ambig = ''.join(sorted(seq[opening+1:closing])) 499 ambig_code = rev_ambig_values[ambig.upper()] 500 if ambig != ambig.upper(): 501 ambig_code = ambig_code.lower() 502 seq = seq[:opening] + ambig_code + seq[closing+1:] 503 opening = seq.find('(') 504 return seq
505 506
507 -class Commandline(object):
508 """Represent a commandline as command and options.""" 509
510 - def __init__(self, line, title):
511 self.options = {} 512 options = [] 513 self.command = None 514 try: 515 # Assume matrix (all other command lines have been stripped of \n) 516 self.command, options = line.strip().split('\n', 1) 517 except ValueError: # Not matrix 518 # self.command,options=line.split(' ',1) # no: could be tab or spaces (translate...) 519 self.command = line.split()[0] 520 options=' '.join(line.split()[1:]) 521 self.command = self.command.strip().lower() 522 if self.command in SPECIAL_COMMANDS: 523 # special command that need newlines and order of options preserved 524 self.options = options.strip() 525 else: 526 if len(options) > 0: 527 try: 528 options = options.replace('=', ' = ').split() 529 valued_indices = [(n-1, n, n+1) for n in range(len(options)) 530 if options[n] == '=' and n != 0 and n != len((options))] 531 indices = [] 532 for sl in valued_indices: 533 indices.extend(sl) 534 token_indices = [n for n in range(len(options)) if n not in indices] 535 for opt in valued_indices: 536 # self.options[options[opt[0]].lower()] = options[opt[2]].lower() 537 self.options[options[opt[0]].lower()] = options[opt[2]] 538 for token in token_indices: 539 self.options[options[token].lower()] = None 540 except ValueError: 541 raise NexusError('Incorrect formatting in line: %s' % line)
542 543
544 -class Block(object):
545 """Represent a NEXUS block with block name and list of commandlines."""
546 - def __init__(self, title=None):
547 self.title = title 548 self.commandlines = []
549 550
551 -class Nexus(object):
552
553 - def __init__(self, input=None):
554 self.ntax = 0 # number of taxa 555 self.nchar = 0 # number of characters 556 self.unaltered_taxlabels = [] # taxlabels as the appear in the input file (incl. duplicates, etc.) 557 self.taxlabels = [] # labels for taxa, ordered by their id 558 self.charlabels = None # ... and for characters 559 self.statelabels = None # ... and for states 560 self.datatype = 'dna' # (standard), dna, rna, nucleotide, protein 561 self.respectcase = False # case sensitivity 562 self.missing = '?' # symbol for missing characters 563 self.gap = '-' # symbol for gap 564 self.symbols = None # set of symbols 565 self.equate = None # set of symbol synonyms 566 self.matchchar = None # matching char for matrix representation 567 self.labels = None # left, right, no 568 self.transpose = False # whether matrix is transposed 569 self.interleave = False # whether matrix is interleaved 570 self.tokens = False # unsupported 571 self.eliminate = None # unsupported 572 self.matrix = None # ... 573 self.unknown_blocks = [] # blocks we don't care about 574 self.taxsets = {} 575 self.charsets = {} 576 self.charpartitions = {} 577 self.taxpartitions = {} 578 self.trees = [] # list of Trees (instances of Tree class) 579 self.translate = None # Dict to translate taxon <-> taxon numbers 580 self.structured = [] # structured input representation 581 self.set = {} # dict of the set command to set various options 582 self.options = {} # dict of the options command in the data block 583 self.codonposset = None # name of the charpartition that defines codon positions 584 585 # some defaults 586 self.options['gapmode'] = 'missing' 587 588 if input: 589 self.read(input) 590 else: 591 self.read(DEFAULTNEXUS)
592
593 - def get_original_taxon_order(self):
594 """Included for backwards compatibility (DEPRECATED).""" 595 return self.taxlabels
596
597 - def set_original_taxon_order(self, value):
598 """Included for backwards compatibility (DEPRECATED).""" 599 self.taxlabels = value
600 601 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 602
603 - def read(self, input):
604 """Read and parse NEXUS input (a filename, file-handle, or string).""" 605 606 # 1. Assume we have the name of a file in the execution dir or a 607 # file-like object. 608 # Note we need to add parsing of the path to dir/filename 609 try: 610 with File.as_handle(input, 'rU') as fp: 611 file_contents = fp.read() 612 self.filename = getattr(fp, 'name', 'Unknown_nexus_file') 613 except (TypeError, IOError, AttributeError): 614 # 2 Assume we have a string from a fh.read() 615 if isinstance(input, basestring): 616 file_contents = input 617 self.filename = 'input_string' 618 else: 619 print(input.strip()[:50]) 620 raise NexusError('Unrecognized input: %s ...' % input[:100]) 621 file_contents = file_contents.strip() 622 if file_contents.startswith('#NEXUS'): 623 file_contents = file_contents[6:] 624 commandlines = _get_command_lines(file_contents) 625 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 626 for i, cl in enumerate(commandlines): 627 try: 628 if cl[:6].upper() == '#NEXUS': 629 commandlines[i] = cl[6:].strip() 630 except: 631 pass 632 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 633 nexus_block_gen = self._get_nexus_block(commandlines) 634 while True: 635 try: 636 title, contents = next(nexus_block_gen) 637 except StopIteration: 638 break 639 if title in KNOWN_NEXUS_BLOCKS: 640 self._parse_nexus_block(title, contents) 641 else: 642 self._unknown_nexus_block(title, contents)
643
644 - def _get_nexus_block(self, file_contents):
645 """Generator for looping through Nexus blocks.""" 646 inblock = False 647 blocklines = [] 648 while file_contents: 649 cl = file_contents.pop(0) 650 if cl.lower().startswith('begin'): 651 if not inblock: 652 inblock = True 653 title = cl.split()[1].lower() 654 else: 655 raise NexusError('Illegal block nesting in block %s' % title) 656 elif cl.lower().startswith('end'): 657 if inblock: 658 inblock = False 659 yield title, blocklines 660 blocklines = [] 661 else: 662 raise NexusError('Unmatched \'end\'.') 663 elif inblock: 664 blocklines.append(cl)
665
666 - def _unknown_nexus_block(self, title, contents):
667 block = Block() 668 block.commandlines.append(contents) 669 block.title = title 670 self.unknown_blocks.append(block)
671
672 - def _parse_nexus_block(self, title, contents):
673 """Parse a known Nexus Block (PRIVATE).""" 674 # attached the structered block representation 675 self._apply_block_structure(title, contents) 676 # now check for taxa,characters,data blocks. If this stuff is defined more than once 677 # the later occurences will override the previous ones. 678 block = self.structured[-1] 679 for line in block.commandlines: 680 try: 681 getattr(self, '_' + line.command)(line.options) 682 except AttributeError: 683 raise NexusError('Unknown command: %s ' % line.command)
684
685 - def _title(self, options):
686 pass
687 690
691 - def _dimensions(self, options):
692 if 'ntax' in options: 693 self.ntax = eval(options['ntax']) 694 if 'nchar' in options: 695 self.nchar = eval(options['nchar'])
696
697 - def _format(self, options):
698 # print options 699 # we first need to test respectcase, then symbols (which depends on respectcase) 700 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 701 # dicts for ambiguous values and alphabet 702 if 'respectcase' in options: 703 self.respectcase = True 704 # adjust symbols to for respectcase 705 if 'symbols' in options: 706 self.symbols = options['symbols'] 707 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or\ 708 (self.symbold.startswith("'") and self.symbols.endswith("'")): 709 self.symbols = self.symbols[1:-1].replace(' ', '') 710 if not self.respectcase: 711 self.symbols = self.symbols.lower() + self.symbols.upper() 712 self.symbols = list(set(self.symbols)) 713 if 'datatype' in options: 714 self.datatype = options['datatype'].lower() 715 if self.datatype == 'dna' or self.datatype == 'nucleotide': 716 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 717 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 718 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 719 elif self.datatype == 'rna': 720 self.alphabet = IUPAC.IUPACAmbiguousDNA() # fresh instance! 721 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 722 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 723 elif self.datatype == 'protein': 724 # TODO - Should this not be ExtendedIUPACProtein? 725 self.alphabet = IUPAC.IUPACProtein() # fresh instance 726 self.ambiguous_values = {'B': 'DN', 'Z': 'EQ', 'X': IUPACData.protein_letters} 727 # that's how PAUP handles it 728 self.unambiguous_letters = IUPACData.protein_letters + '*' # stop-codon 729 elif self.datatype == 'standard': 730 raise NexusError('Datatype standard is not yet supported.') 731 # self.alphabet = None 732 # self.ambiguous_values = {} 733 # if not self.symbols: 734 # self.symbols = '01' # if nothing else defined, then 0 and 1 are the default states 735 # self.unambiguous_letters = self.symbols 736 else: 737 raise NexusError('Unsupported datatype: ' + self.datatype) 738 self.valid_characters = ''.join(self.ambiguous_values) + self.unambiguous_letters 739 if not self.respectcase: 740 self.valid_characters = self.valid_characters.lower() + self.valid_characters.upper() 741 # we have to sort the reverse ambig coding dict key characters: 742 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 743 rev=dict((i[1], i[0]) for i in self.ambiguous_values.items() if i[0]!='X') 744 self.rev_ambiguous_values = {} 745 for (k, v) in rev.items(): 746 key = sorted(c for c in k) 747 self.rev_ambiguous_values[''.join(key)] = v 748 # overwrite symbols for datype rna,dna,nucleotide 749 if self.datatype in ['dna', 'rna', 'nucleotide']: 750 self.symbols = self.alphabet.letters 751 if self.missing not in self.ambiguous_values: 752 self.ambiguous_values[self.missing] = self.unambiguous_letters+self.gap 753 self.ambiguous_values[self.gap] = self.gap 754 elif self.datatype == 'standard': 755 if not self.symbols: 756 self.symbols = ['1', '0'] 757 if 'missing' in options: 758 self.missing = options['missing'][0] 759 if 'gap' in options: 760 self.gap = options['gap'][0] 761 if 'equate' in options: 762 self.equate = options['equate'] 763 if 'matchchar' in options: 764 self.matchchar = options['matchchar'][0] 765 if 'labels' in options: 766 self.labels = options['labels'] 767 if 'transpose' in options: 768 raise NexusError('TRANSPOSE is not supported!') 769 self.transpose = True 770 if 'interleave' in options: 771 if options['interleave'] is None or options['interleave'].lower() == 'yes': 772 self.interleave = True 773 if 'tokens' in options: 774 self.tokens = True 775 if 'notokens' in options: 776 self.tokens = False
777
778 - def _set(self, options):
779 self.set = options
780
781 - def _options(self, options):
782 self.options = options
783
784 - def _eliminate(self, options):
785 self.eliminate = options
786
787 - def _taxlabels(self, options):
788 """Get taxon labels (PRIVATE). 789 790 As the taxon names are already in the matrix, this is superfluous 791 except for transpose matrices, which are currently unsupported anyway. 792 Thus, we ignore the taxlabels command to make handling of duplicate 793 taxon names easier. 794 """ 795 pass
796 # self.taxlabels = [] 797 # opts = CharBuffer(options) 798 # while True: 799 # taxon = quotestrip(opts.next_word()) 800 # if not taxon: 801 # break 802 # self.taxlabels.append(taxon) 803
804 - def _check_taxlabels(self, taxon):
805 """Check for presence of taxon in self.taxlabels.""" 806 # According to NEXUS standard, underscores shall be treated as spaces..., 807 # so checking for identity is more difficult 808 nextaxa = dict((t.replace(' ', '_'), t) for t in self.taxlabels) 809 nexid = taxon.replace(' ', '_') 810 return nextaxa.get(nexid)
811
812 - def _charlabels(self, options):
813 self.charlabels = {} 814 opts = CharBuffer(options) 815 while True: 816 # get id and state 817 w = opts.next_word() 818 if w is None: # McClade saves and reads charlabel-lists with terminal comma?! 819 break 820 identifier = self._resolve(w, set_type=CHARSET) 821 state = quotestrip(opts.next_word()) 822 self.charlabels[identifier] = state 823 # check for comma or end of command 824 c = opts.next_nonwhitespace() 825 if c is None: 826 break 827 elif c != ',': 828 raise NexusError('Missing \',\' in line %s.' % options)
829
830 - def _charstatelabels(self, options):
831 # warning: charstatelabels supports only charlabels-syntax! 832 self._charlabels(options)
833
834 - def _statelabels(self, options):
835 # self.charlabels = options 836 # print 'Command statelabels is not supported and will be ignored.' 837 pass
838
839 - def _matrix(self, options):
840 if not self.ntax or not self.nchar: 841 raise NexusError('Dimensions must be specified before matrix!') 842 self.matrix = {} 843 taxcount = 0 844 first_matrix_block = True 845 846 # eliminate empty lines and leading/trailing whitespace 847 lines = [l.strip() for l in options.split('\n') if l.strip() != ''] 848 lineiter = iter(lines) 849 while True: 850 try: 851 l = next(lineiter) 852 except StopIteration: 853 if taxcount < self.ntax: 854 raise NexusError('Not enough taxa in matrix.') 855 elif taxcount > self.ntax: 856 raise NexusError('Too many taxa in matrix.') 857 else: 858 break 859 # count the taxa and check for interleaved matrix 860 taxcount += 1 861 # print taxcount 862 if taxcount > self.ntax: 863 if not self.interleave: 864 raise NexusError('Too many taxa in matrix - should matrix be interleaved?') 865 else: 866 taxcount = 1 867 first_matrix_block = False 868 # get taxon name and sequence 869 linechars = CharBuffer(l) 870 id = quotestrip(linechars.next_word()) 871 l = linechars.rest().strip() 872 chars = '' 873 if self.interleave: 874 # interleaved matrix 875 # print 'In interleave' 876 if l: 877 chars = ''.join(l.split()) 878 else: 879 chars = ''.join(next(lineiter).split()) 880 else: 881 # non-interleaved matrix 882 chars = ''.join(l.split()) 883 while len(chars)<self.nchar: 884 l = next(lineiter) 885 chars += ''.join(l.split()) 886 iupac_seq = Seq(_replace_parenthesized_ambigs(chars, self.rev_ambiguous_values), self.alphabet) 887 # first taxon has the reference sequence if matchhar is used 888 if taxcount == 1: 889 refseq = iupac_seq 890 else: 891 if self.matchchar: 892 while True: 893 p = str(iupac_seq).find(self.matchchar) 894 if p == -1: 895 break 896 iupac_seq = Seq(str(iupac_seq)[:p]+refseq[p]+str(iupac_seq)[p+1:], self.alphabet) 897 # check for invalid characters 898 for i, c in enumerate(str(iupac_seq)): 899 if c not in self.valid_characters and c != self.gap and c != self.missing: 900 raise NexusError("Taxon %s: Illegal character %s in sequence %s " 901 "(check dimensions/interleaving)" % (id, c, iupac_seq)) 902 # add sequence to matrix 903 if first_matrix_block: 904 self.unaltered_taxlabels.append(id) 905 id = _unique_label(list(self.matrix.keys()), id) 906 self.matrix[id] = iupac_seq 907 self.taxlabels.append(id) 908 else: 909 # taxon names need to be in the same order in each interleaved block 910 id = _unique_label(self.taxlabels[:taxcount-1], id) 911 taxon_present = self._check_taxlabels(id) 912 if taxon_present: 913 self.matrix[taxon_present] += iupac_seq 914 else: 915 raise NexusError("Taxon %s not in first block of interleaved " 916 "matrix. Check matrix dimensions and interleave." % id) 917 # check all sequences for length according to nchar 918 for taxon in self.matrix: 919 if len(self.matrix[taxon]) != self.nchar: 920 raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' 921 % (self.nchar, len(self.matrix[taxon]), taxon)) 922 # check that taxlabels is identical with matrix.keys. If not, it's a problem 923 matrixkeys = sorted(self.matrix) 924 taxlabelssort = sorted(self.taxlabels[:]) 925 assert matrixkeys == taxlabelssort, \ 926 "ERROR: TAXLABELS must be identical with MATRIX. " + \ 927 "Please Report this as a bug, and send in data file."
928
929 - def _translate(self, options):
930 self.translate = {} 931 opts = CharBuffer(options) 932 while True: 933 try: 934 # get id and state 935 identifier = int(opts.next_word()) 936 label = quotestrip(opts.next_word()) 937 self.translate[identifier] = label 938 # check for comma or end of command 939 c = opts.next_nonwhitespace() 940 if c is None: 941 break 942 elif c != ',': 943 raise NexusError('Missing \',\' in line %s.' % options) 944 except NexusError: 945 raise 946 except: 947 raise NexusError('Format error in line %s.' % options)
948
949 - def _utree(self, options):
950 """Some software (clustalx) uses 'utree' to denote an unrooted tree.""" 951 self._tree(options)
952
953 - def _tree(self, options):
954 opts = CharBuffer(options) 955 if opts.peek_nonwhitespace() == '*': 956 # a star can be used to make it the default tree in some software packages 957 dummy = opts.next_nonwhitespace() 958 name = opts.next_word() 959 if opts.next_nonwhitespace() != '=': 960 raise NexusError('Syntax error in tree description: %s' 961 % options[:50]) 962 rooted = False 963 weight = 1.0 964 while opts.peek_nonwhitespace() == '[': 965 open = opts.next_nonwhitespace() 966 symbol = next(opts) 967 if symbol != '&': 968 raise NexusError('Illegal special comment [%s...] in tree description: %s' 969 % (symbol, options[:50])) 970 special = next(opts) 971 value = opts.next_until(']') 972 closing = next(opts) 973 if special == 'R': 974 rooted = True 975 elif special == 'U': 976 rooted = False 977 elif special == 'W': 978 weight = float(value) 979 tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip()) 980 # if there's an active translation table, translate 981 if self.translate: 982 for n in tree.get_terminals(): 983 try: 984 tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)]) 985 except (ValueError, KeyError): 986 raise NexusError('Unable to substitute %s using \'translate\' data.' 987 % tree.node(n).data.taxon) 988 self.trees.append(tree)
989
990 - def _apply_block_structure(self, title, lines):
991 block = Block('') 992 block.title = title 993 for line in lines: 994 block.commandlines.append(Commandline(line, title)) 995 self.structured.append(block)
996
997 - def _taxset(self, options):
998 name, taxa = self._get_indices(options, set_type=TAXSET) 999 self.taxsets[name] = _make_unique(taxa)
1000
1001 - def _charset(self, options):
1002 name, sites = self._get_indices(options, set_type=CHARSET) 1003 self.charsets[name] = _make_unique(sites)
1004
1005 - def _taxpartition(self, options):
1006 taxpartition = {} 1007 quotelevel = False 1008 opts = CharBuffer(options) 1009 name=self._name_n_vector(opts) 1010 if not name: 1011 raise NexusError('Formatting error in taxpartition: %s ' % options) 1012 # now collect thesubbpartitions and parse them 1013 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1014 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1015 sub = '' 1016 while True: 1017 w = next(opts) 1018 if w is None or (w == ',' and not quotelevel): 1019 subname, subindices = self._get_indices(sub, set_type=TAXSET, separator=':') 1020 taxpartition[subname] = _make_unique(subindices) 1021 sub = '' 1022 if w is None: 1023 break 1024 else: 1025 if w == "'": 1026 quotelevel = not quotelevel 1027 sub += w 1028 self.taxpartitions[name] = taxpartition
1029
1030 - def _codonposset(self, options):
1031 """Read codon positions from a codons block as written from McClade. 1032 1033 Here codonposset is just a fancy name for a character partition with 1034 the name CodonPositions and the partitions N,1,2,3 1035 """ 1036 1037 prev_partitions = list(self.charpartitions.keys()) 1038 self._charpartition(options) 1039 # mcclade calls it CodonPositions, but you never know... 1040 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1041 if codonname == [] or len(codonname) > 1: 1042 raise NexusError('Formatting Error in codonposset: %s ' % options) 1043 else: 1044 self.codonposset = codonname[0]
1045
1046 - def _codeset(self, options):
1047 pass
1048
1049 - def _charpartition(self, options):
1050 charpartition = {} 1051 quotelevel = False 1052 opts = CharBuffer(options) 1053 name = self._name_n_vector(opts) 1054 if not name: 1055 raise NexusError('Formatting error in charpartition: %s ' % options) 1056 # now collect thesubbpartitions and parse them 1057 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1058 sub = '' 1059 while True: 1060 w = next(opts) 1061 if w is None or (w == ',' and not quotelevel): 1062 subname, subindices = self._get_indices(sub, set_type=CHARSET, separator=':') 1063 charpartition[subname] = _make_unique(subindices) 1064 sub = '' 1065 if w is None: 1066 break 1067 else: 1068 if w == "'": 1069 quotelevel = not quotelevel 1070 sub += w 1071 self.charpartitions[name]=charpartition
1072
1073 - def _get_indices(self, options, set_type=CHARSET, separator='='):
1074 """Parse the taxset/charset specification (PRIVATE). 1075 1076 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1077 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1078 """ 1079 opts = CharBuffer(options) 1080 name = self._name_n_vector(opts, separator=separator) 1081 indices = self._parse_list(opts, set_type=set_type) 1082 if indices is None: 1083 raise NexusError('Formatting error in line: %s ' % options) 1084 return name, indices
1085
1086 - def _name_n_vector(self, opts, separator='='):
1087 """Extract name and check that it's not in vector format.""" 1088 rest = opts.rest() 1089 name = opts.next_word() 1090 # we ignore * before names 1091 if name == '*': 1092 name = opts.next_word() 1093 if not name: 1094 raise NexusError('Formatting error in line: %s ' % rest) 1095 name = quotestrip(name) 1096 if opts.peek_nonwhitespace == '(': 1097 open = opts.next_nonwhitespace() 1098 qualifier = open.next_word() 1099 close = opts.next_nonwhitespace() 1100 if qualifier.lower() == 'vector': 1101 raise NexusError('Unsupported VECTOR format in line %s' 1102 % (opts)) 1103 elif qualifier.lower() != 'standard': 1104 raise NexusError('Unknown qualifier %s in line %s' 1105 % (qualifier, opts)) 1106 if opts.next_nonwhitespace() != separator: 1107 raise NexusError('Formatting error in line: %s ' % rest) 1108 return name
1109
1110 - def _parse_list(self, options_buffer, set_type):
1111 """Parse a NEXUS list (PRIVATE). 1112 1113 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1114 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1115 """ 1116 plain_list = [] 1117 if options_buffer.peek_nonwhitespace(): 1118 try: 1119 # capture all possible exceptions and treat them as formatting 1120 # errors, if they are not NexusError 1121 while True: 1122 identifier = options_buffer.next_word() # next list element 1123 if not identifier: # end of list? 1124 break 1125 start = self._resolve(identifier, set_type=set_type) 1126 if options_buffer.peek_nonwhitespace() == '-': # followd by - 1127 end = start 1128 step = 1 1129 # get hyphen and end of range 1130 hyphen = options_buffer.next_nonwhitespace() 1131 end = self._resolve(options_buffer.next_word(), set_type=set_type) 1132 if set_type == CHARSET: 1133 if options_buffer.peek_nonwhitespace() == '\\': # followd by \ 1134 backslash = options_buffer.next_nonwhitespace() 1135 step = int(options_buffer.next_word()) # get backslash and step 1136 plain_list.extend(range(start, end+1, step)) 1137 else: 1138 if isinstance(start, list) or isinstance(end, list): 1139 raise NexusError('Name if character sets not allowed in range definition: %s' 1140 % identifier) 1141 start = self.taxlabels.index(start) 1142 end = self.taxlabels.index(end) 1143 taxrange = self.taxlabels[start:end+1] 1144 plain_list.extend(taxrange) 1145 else: 1146 if isinstance(start, list): # start was the name of charset or taxset 1147 plain_list.extend(start) 1148 else: # start was an ordinary identifier 1149 plain_list.append(start) 1150 except NexusError: 1151 raise 1152 except: 1153 return None 1154 return plain_list
1155
1156 - def _resolve(self, identifier, set_type=None):
1157 """Translate identifier in list into character/taxon index. 1158 1159 Characters (which are referred to by their index in Nexus.py): 1160 Plain numbers are returned minus 1 (Nexus indices to python indices) 1161 Text identifiers are translated into their indices (if plain character identifiers), 1162 the first hit in charlabels is returned (charlabels don't need to be unique) 1163 or the range of indices is returned (if names of character sets). 1164 Taxa (which are referred to by their unique name in Nexus.py): 1165 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1166 Names are returned unchanged (if plain taxon identifiers), or the names in 1167 the corresponding taxon set is returned. 1168 """ 1169 identifier = quotestrip(identifier) 1170 if not set_type: 1171 raise NexusError('INTERNAL ERROR: Need type to resolve identifier.') 1172 if set_type == CHARSET: 1173 try: 1174 n = int(identifier) 1175 except ValueError: 1176 if self.charlabels and identifier in self.charlabels.values(): 1177 for k in self.charlabels: 1178 if self.charlabels[k] == identifier: 1179 return k 1180 elif self.charsets and identifier in self.charsets: 1181 return self.charsets[identifier] 1182 else: 1183 raise NexusError('Unknown character identifier: %s' 1184 % identifier) 1185 else: 1186 if n <= self.nchar: 1187 return n-1 1188 else: 1189 raise NexusError('Illegal character identifier: %d>nchar (=%d).' 1190 % (identifier, self.nchar)) 1191 elif set_type == TAXSET: 1192 try: 1193 n = int(identifier) 1194 except ValueError: 1195 taxlabels_id = self._check_taxlabels(identifier) 1196 if taxlabels_id: 1197 return taxlabels_id 1198 elif self.taxsets and identifier in self.taxsets: 1199 return self.taxsets[identifier] 1200 else: 1201 raise NexusError('Unknown taxon identifier: %s' 1202 % identifier) 1203 else: 1204 if n > 0 and n <= self.ntax: 1205 return self.taxlabels[n-1] 1206 else: 1207 raise NexusError('Illegal taxon identifier: %d>ntax (=%d).' 1208 % (identifier, self.ntax)) 1209 else: 1210 raise NexusError('Unknown set specification: %s.'% set_type)
1211
1212 - def _stateset(self, options):
1213 # Not implemented 1214 pass
1215
1216 - def _changeset(self, options):
1217 # Not implemented 1218 pass
1219
1220 - def _treeset(self, options):
1221 # Not implemented 1222 pass
1223
1224 - def _treepartition(self, options):
1225 # Not implemented 1226 pass
1227
1228 - def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None, 1229 interleave=False, exclude=[], delete=[], 1230 charpartition=None, comment='', mrbayes=False):
1231 """Writes a nexus file for each partition in charpartition. 1232 1233 Only non-excluded characters and non-deleted taxa are included, 1234 just the data block is written. 1235 """ 1236 1237 if not matrix: 1238 matrix = self.matrix 1239 if not matrix: 1240 return 1241 if not filename: 1242 filename = self.filename 1243 if charpartition: 1244 pfilenames = {} 1245 for p in charpartition: 1246 total_exclude = [] + exclude 1247 total_exclude.extend(c for c in range(self.nchar) if c not in charpartition[p]) 1248 total_exclude = _make_unique(total_exclude) 1249 pcomment = comment + '\nPartition: ' + p + '\n' 1250 dot = filename.rfind('.') 1251 if dot > 0: 1252 pfilename = filename[:dot] + '_' + p + '.data' 1253 else: 1254 pfilename = filename+'_'+p 1255 pfilenames[p] = pfilename 1256 self.write_nexus_data(filename=pfilename, matrix=matrix, blocksize=blocksize, 1257 interleave=interleave, exclude=total_exclude, delete=delete, 1258 comment=pcomment, append_sets=False, mrbayes=mrbayes) 1259 return pfilenames 1260 else: 1261 fn=self.filename+'.data' 1262 self.write_nexus_data(filename=fn, matrix=matrix, blocksize=blocksize, 1263 interleave=interleave, exclude=exclude, delete=delete, 1264 comment=comment, append_sets=False, mrbayes=mrbayes) 1265 return fn
1266
1267 - def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[], 1268 blocksize=None, interleave=False, interleave_by_partition=False, 1269 comment=None, omit_NEXUS=False, append_sets=True, mrbayes=False, 1270 codons_block=True):
1271 """Writes a nexus file with data and sets block to a file or handle. 1272 1273 Character sets and partitions are appended by default, and are 1274 adjusted according to excluded characters (i.e. character sets 1275 still point to the same sites (not necessarily same positions), 1276 without including the deleted characters. 1277 1278 - filename - Either a filename as a string (which will be opened, 1279 written to and closed), or a handle object (which will 1280 be written to but NOT closed). 1281 - interleave_by_partition - Optional name of partition (string) 1282 - omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1283 start of the file is omitted. 1284 1285 Returns the filename/handle used to write the data. 1286 """ 1287 if not matrix: 1288 matrix = self.matrix 1289 if not matrix: 1290 return 1291 if not filename: 1292 filename = self.filename 1293 if [t for t in delete if not self._check_taxlabels(t)]: 1294 raise NexusError('Unknown taxa: %s' 1295 % ', '.join(set(delete).difference(set(self.taxlabels)))) 1296 if interleave_by_partition: 1297 if interleave_by_partition not in self.charpartitions: 1298 raise NexusError('Unknown partition: %r' % interleave_by_partition) 1299 else: 1300 partition = self.charpartitions[interleave_by_partition] 1301 # we need to sort the partition names by starting position before we exclude characters 1302 names = _sort_keys_by_values(partition) 1303 newpartition = {} 1304 for p in partition: 1305 newpartition[p] = [c for c in partition[p] if c not in exclude] 1306 # how many taxa and how many characters are left? 1307 undelete = [taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete] 1308 cropped_matrix = _seqmatrix2strmatrix(self.crop_matrix(matrix, exclude=exclude, delete=delete)) 1309 ntax_adjusted = len(undelete) 1310 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1311 if not undelete or (undelete and undelete[0] == ''): 1312 return 1313 1314 with File.as_handle(filename, mode='w') as fh: 1315 if not omit_NEXUS: 1316 fh.write('#NEXUS\n') 1317 if comment: 1318 fh.write('[' + comment + ']\n') 1319 fh.write('begin data;\n') 1320 fh.write('\tdimensions ntax=%d nchar=%d;\n' % (ntax_adjusted, nchar_adjusted)) 1321 fh.write('\tformat datatype=' + self.datatype) 1322 if self.respectcase: 1323 fh.write(' respectcase') 1324 if self.missing: 1325 fh.write(' missing=' + self.missing) 1326 if self.gap: 1327 fh.write(' gap=' + self.gap) 1328 if self.matchchar: 1329 fh.write(' matchchar=' + self.matchchar) 1330 if self.labels: 1331 fh.write(' labels=' + self.labels) 1332 if self.equate: 1333 fh.write(' equate=' + self.equate) 1334 if interleave or interleave_by_partition: 1335 fh.write(' interleave') 1336 fh.write(';\n') 1337 # if self.taxlabels: 1338 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1339 if self.charlabels: 1340 newcharlabels = self._adjust_charlabels(exclude=exclude) 1341 clkeys = sorted(newcharlabels) 1342 fh.write('charlabels ' 1343 + ', '.join("%s %s" % (k+1, safename(newcharlabels[k])) for k in clkeys) 1344 + ';\n') 1345 fh.write('matrix\n') 1346 if not blocksize: 1347 if interleave: 1348 blocksize = 70 1349 else: 1350 blocksize = self.nchar 1351 # delete deleted taxa and ecxclude excluded characters... 1352 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1353 if interleave_by_partition: 1354 # interleave by partitions, but adjust partitions with regard to excluded characters 1355 seek = 0 1356 for p in names: 1357 fh.write('[%s: %s]\n' % (interleave_by_partition, p)) 1358 if len(newpartition[p]) > 0: 1359 for taxon in undelete: 1360 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1361 fh.write(cropped_matrix[taxon][seek:seek+len(newpartition[p])]+'\n') 1362 fh.write('\n') 1363 else: 1364 fh.write('[empty]\n\n') 1365 seek += len(newpartition[p]) 1366 elif interleave: 1367 for seek in range(0, nchar_adjusted, blocksize): 1368 for taxon in undelete: 1369 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1370 fh.write(cropped_matrix[taxon][seek:seek+blocksize]+'\n') 1371 fh.write('\n') 1372 else: 1373 for taxon in undelete: 1374 if blocksize<nchar_adjusted: 1375 fh.write(safename(taxon, mrbayes=mrbayes)+'\n') 1376 else: 1377 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength+1)) 1378 taxon_seq = cropped_matrix[taxon] 1379 for seek in range(0, nchar_adjusted, blocksize): 1380 fh.write(taxon_seq[seek:seek+blocksize]+'\n') 1381 del taxon_seq 1382 fh.write(';\nend;\n') 1383 if append_sets: 1384 if codons_block: 1385 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, include_codons=False)) 1386 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes, codons_only=True)) 1387 else: 1388 fh.write(self.append_sets(exclude=exclude, delete=delete, mrbayes=mrbayes)) 1389 return filename
1390
1391 - def append_sets(self, exclude=[], delete=[], mrbayes=False, include_codons=True, codons_only=False):
1392 """Returns a sets block.""" 1393 if not self.charsets and not self.taxsets and not self.charpartitions: 1394 return '' 1395 if codons_only: 1396 setsb = ['\nbegin codons'] 1397 else: 1398 setsb = ['\nbegin sets'] 1399 # - now if characters have been excluded, the character sets need to be adjusted, 1400 # so that they still point to the right character positions 1401 # calculate a list of offsets: for each deleted character, the following character position 1402 # in the new file will have an additional offset of -1 1403 offset = 0 1404 offlist = [] 1405 for c in range(self.nchar): 1406 if c in exclude: 1407 offset += 1 1408 offlist.append(-1) # dummy value as these character positions are excluded 1409 else: 1410 offlist.append(c-offset) 1411 # now adjust each of the character sets 1412 if not codons_only: 1413 for n, ns in self.charsets.items(): 1414 cset = [offlist[c] for c in ns if c not in exclude] 1415 if cset: 1416 setsb.append('charset %s = %s' % (safename(n), _compact4nexus(cset))) 1417 for n, s in self.taxsets.items(): 1418 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1419 if tset: 1420 setsb.append('taxset %s = %s' % (safename(n), ' '.join(tset))) 1421 for n, p in self.charpartitions.items(): 1422 if not include_codons and n == CODONPOSITIONS: 1423 continue 1424 elif codons_only and n != CODONPOSITIONS: 1425 continue 1426 # as characters have been excluded, the partitions must be adjusted 1427 # if a partition is empty, it will be omitted from the charpartition command 1428 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1429 names = _sort_keys_by_values(p) 1430 newpartition = {} 1431 for sn in names: 1432 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1433 if nsp: 1434 newpartition[sn] = nsp 1435 if newpartition: 1436 if include_codons and n == CODONPOSITIONS: 1437 command = 'codonposset' 1438 else: 1439 command = 'charpartition' 1440 setsb.append('%s %s = %s' % (command, safename(n), 1441 ', '.join('%s: %s' % (sn, _compact4nexus(newpartition[sn])) 1442 for sn in names if sn in newpartition))) 1443 # now write charpartititions, much easier than charpartitions 1444 for n, p in self.taxpartitions.items(): 1445 names = _sort_keys_by_values(p) 1446 newpartition = {} 1447 for sn in names: 1448 nsp = [t for t in p[sn] if t not in delete] 1449 if nsp: 1450 newpartition[sn]=nsp 1451 if newpartition: 1452 setsb.append('taxpartition %s = %s' % (safename(n), 1453 ', '.join('%s: %s' % (safename(sn), 1454 ' '.join(safename(x) for x in newpartition[sn])) 1455 for sn in names if sn in newpartition))) 1456 # add 'end' and return everything 1457 setsb.append('end;\n') 1458 if len(setsb) == 2: # begin and end only 1459 return '' 1460 else: 1461 return ';\n'.join(setsb)
1462
1463 - def export_fasta(self, filename=None, width=70):
1464 """Writes matrix into a fasta file.""" 1465 if not filename: 1466 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1467 filename = '.'.join(self.filename.split('.')[:-1])+'.fas' 1468 else: 1469 filename = self.filename+'.fas' 1470 with open(filename, 'w') as fh: 1471 for taxon in self.taxlabels: 1472 fh.write('>' + safename(taxon) + '\n') 1473 for i in range(0, len(str(self.matrix[taxon])), width): 1474 fh.write(str(self.matrix[taxon])[i:i+width] + '\n') 1475 return filename
1476
1477 - def export_phylip(self, filename=None):
1478 """Writes matrix into a PHYLIP file. 1479 1480 Note that this writes a relaxed PHYLIP format file, where the names 1481 are not truncated, nor checked for invalid characters.""" 1482 if not filename: 1483 if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup', 'nexus', 'nex', 'dat']: 1484 filename = '.'.join(self.filename.split('.')[:-1])+'.phy' 1485 else: 1486 filename = self.filename+'.phy' 1487 with open(filename, 'w') as fh: 1488 fh.write('%d %d\n' % (self.ntax, self.nchar)) 1489 for taxon in self.taxlabels: 1490 fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon]))) 1491 return filename
1492
1493 - def constant(self, matrix=None, delete=[], exclude=[]):
1494 """Return a list with all constant characters.""" 1495 if not matrix: 1496 matrix = self.matrix 1497 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1498 if not undelete: 1499 return None 1500 elif len(undelete) == 1: 1501 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1502 # get the first sequence and expand all ambiguous values 1503 constant = [(x, self.ambiguous_values.get(n.upper(), n.upper())) for 1504 x, n in enumerate(str(matrix[undelete[0]])) if x not in exclude] 1505 1506 for taxon in undelete[1:]: 1507 newconstant = [] 1508 for site in constant: 1509 # print '%d (paup=%d)' % (site[0],site[0]+1), 1510 seqsite = matrix[taxon][site[0]].upper() 1511 # print seqsite,'checked against',site[1],'\t', 1512 if seqsite == self.missing \ 1513 or (seqsite == self.gap and self.options['gapmode'].lower() == 'missing') \ 1514 or seqsite == site[1]: 1515 # missing or same as before -> ok 1516 newconstant.append(site) 1517 elif seqsite in site[1] \ 1518 or site[1] == self.missing \ 1519 or (self.options['gapmode'].lower() == 'missing' and site[1] == self.gap): 1520 # subset of an ambig or only missing in previous -> take subset 1521 newconstant.append((site[0], self.ambiguous_values.get(seqsite, seqsite))) 1522 elif seqsite in self.ambiguous_values: 1523 # is it an ambig: check the intersection with prev. values 1524 intersect = set(self.ambiguous_values[seqsite]).intersection(set(site[1])) 1525 if intersect: 1526 newconstant.append((site[0], ''.join(intersect))) 1527 # print 'ok' 1528 # else: 1529 # print 'failed' 1530 # else: 1531 # print 'failed' 1532 constant = newconstant 1533 cpos = [s[0] for s in constant] 1534 return cpos
1535
1536 - def cstatus(self, site, delete=[], narrow=True):
1537 """Summarize character. 1538 1539 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1540 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1541 """ 1542 undelete = [t for t in self.taxlabels if t not in delete] 1543 if not undelete: 1544 return None 1545 cstatus = [] 1546 for t in undelete: 1547 c = self.matrix[t][