Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010, 2012-2014, 2016-2017 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.AlignIO support for the "nexus" file format. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  See also the Bio.Nexus module (which this code calls internally), 
 13  as this offers more than just accessing the alignment or its 
 14  sequences as SeqRecord objects. 
 15  """ 
 16   
 17  from __future__ import print_function 
 18   
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio.Nexus import Nexus 
 21  from Bio.Align import MultipleSeqAlignment 
 22  from .Interfaces import AlignmentWriter 
 23  from Bio import Alphabet 
 24   
 25   
 26  # You can get a couple of example files here: 
 27  # http://www.molecularevolution.org/resources/fileformats/ 
 28   
 29   
 30  # This is a generator function! 
31 -def NexusIterator(handle, seq_count=None):
32 """Return SeqRecord objects from a Nexus file. 33 34 Thus uses the Bio.Nexus module to do the hard work. 35 36 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 37 (and not use it directly). 38 39 NOTE - We only expect ONE alignment matrix per Nexus file, 40 meaning this iterator will only yield one MultipleSeqAlignment. 41 """ 42 n = Nexus.Nexus(handle) 43 if not n.matrix: 44 # No alignment found 45 return 46 47 # Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 48 # The original names and the modified names are kept in these two lists: 49 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 50 51 if seq_count and seq_count != len(n.unaltered_taxlabels): 52 raise ValueError("Found %i sequences, but seq_count=%i" 53 % (len(n.unaltered_taxlabels), seq_count)) 54 55 # TODO - Can we extract any annotation too? 56 records = (SeqRecord(n.matrix[new_name], id=new_name, 57 name=old_name, description="") 58 for old_name, new_name 59 in zip(n.unaltered_taxlabels, n.taxlabels)) 60 # All done 61 yield MultipleSeqAlignment(records, n.alphabet)
62 63
64 -class NexusWriter(AlignmentWriter):
65 """Nexus alignment writer. 66 67 Note that Nexus files are only expected to hold ONE alignment 68 matrix. 69 70 You are expected to call this class via the Bio.AlignIO.write() or 71 Bio.SeqIO.write() functions. 72 """ 73
74 - def write_file(self, alignments):
75 """Use this to write an entire file containing the given alignments. 76 77 Arguments: 78 - alignments - A list or iterator returning MultipleSeqAlignment objects. 79 This should hold ONE and only one alignment. 80 81 """ 82 align_iter = iter(alignments) # Could have been a list 83 try: 84 first_alignment = next(align_iter) 85 except StopIteration: 86 first_alignment = None 87 if first_alignment is None: 88 # Nothing to write! 89 return 0 90 91 # Check there is only one alignment... 92 try: 93 second_alignment = next(align_iter) 94 except StopIteration: 95 second_alignment = None 96 if second_alignment is not None: 97 raise ValueError("We can only write one Alignment to a Nexus file.") 98 99 # Good. Actually write the single alignment, 100 self.write_alignment(first_alignment) 101 return 1 # we only support writing one alignment!
102
103 - def write_alignment(self, alignment, interleave=None):
104 """Write an alignment to file. 105 106 Creates an empty Nexus object, adds the sequences 107 and then gets Nexus to prepare the output. 108 Default interleave behaviour: Interleave if columns > 1000 109 --> Override with interleave=[True/False] 110 """ 111 if len(alignment) == 0: 112 raise ValueError("Must have at least one sequence") 113 columns = alignment.get_alignment_length() 114 if columns == 0: 115 raise ValueError("Non-empty sequences are required") 116 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 117 + "format datatype=%s; end;" \ 118 % self._classify_alphabet_for_nexus(alignment._alphabet) 119 n = Nexus.Nexus(minimal_record) 120 n.alphabet = alignment._alphabet 121 for record in alignment: 122 n.add_sequence(record.id, str(record.seq)) 123 124 # Note: MrBayes may choke on large alignments if not interleaved 125 if interleave is None: 126 interleave = (columns > 1000) 127 n.write_nexus_data(self.handle, interleave=interleave)
128
129 - def _classify_alphabet_for_nexus(self, alphabet):
130 """Return 'protein', 'dna', or 'rna' based on the alphabet (PRIVATE). 131 132 Raises an exception if this is not possible. 133 """ 134 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 135 a = Alphabet._get_base_alphabet(alphabet) 136 137 if not isinstance(a, Alphabet.Alphabet): 138 raise TypeError("Invalid alphabet") 139 elif isinstance(a, Alphabet.ProteinAlphabet): 140 return "protein" 141 elif isinstance(a, Alphabet.DNAAlphabet): 142 return "dna" 143 elif isinstance(a, Alphabet.RNAAlphabet): 144 return "rna" 145 else: 146 # Must be something like NucleotideAlphabet or 147 # just the generic Alphabet (default for fasta files) 148 raise ValueError("Need a DNA, RNA or Protein alphabet")
149 150 151 if __name__ == "__main__": 152 from Bio._utils import run_doctest 153 run_doctest(verbose=0) 154