Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for the "nexus" file format. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  See also the Bio.Nexus module (which this code calls internally), 
 12  as this offers more than just accessing the alignment or its 
 13  sequences as SeqRecord objects. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Nexus import Nexus 
 20  from Bio.Align import MultipleSeqAlignment 
 21  from .Interfaces import AlignmentWriter 
 22  from Bio import Alphabet 
 23   
 24   
 25  # You can get a couple of example files here: 
 26  # http://www.molecularevolution.org/resources/fileformats/ 
 27   
 28   
 29  # This is a generator function! 
30 -def NexusIterator(handle, seq_count=None):
31 """Returns SeqRecord objects from a Nexus file. 32 33 Thus uses the Bio.Nexus module to do the hard work. 34 35 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 36 (and not use it directly). 37 38 NOTE - We only expect ONE alignment matrix per Nexus file, 39 meaning this iterator will only yield one MultipleSeqAlignment. 40 """ 41 n = Nexus.Nexus(handle) 42 if not n.matrix: 43 # No alignment found 44 raise StopIteration 45 46 # Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 47 # The original names and the modified names are kept in these two lists: 48 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 49 50 if seq_count and seq_count != len(n.unaltered_taxlabels): 51 raise ValueError("Found %i sequences, but seq_count=%i" 52 % (len(n.unaltered_taxlabels), seq_count)) 53 54 # TODO - Can we extract any annotation too? 55 records = (SeqRecord(n.matrix[new_name], id=new_name, 56 name=old_name, description="") 57 for old_name, new_name 58 in zip(n.unaltered_taxlabels, n.taxlabels)) 59 # All done 60 yield MultipleSeqAlignment(records, n.alphabet)
61 62
63 -class NexusWriter(AlignmentWriter):
64 """Nexus alignment writer. 65 66 Note that Nexus files are only expected to hold ONE alignment 67 matrix. 68 69 You are expected to call this class via the Bio.AlignIO.write() or 70 Bio.SeqIO.write() functions. 71 """
72 - def write_file(self, alignments):
73 """Use this to write an entire file containing the given alignments. 74 75 Arguments: 76 77 - alignments - A list or iterator returning MultipleSeqAlignment objects. 78 This should hold ONE and only one alignment. 79 """ 80 align_iter = iter(alignments) # Could have been a list 81 try: 82 first_alignment = next(align_iter) 83 except StopIteration: 84 first_alignment = None 85 if first_alignment is None: 86 # Nothing to write! 87 return 0 88 89 # Check there is only one alignment... 90 try: 91 second_alignment = next(align_iter) 92 except StopIteration: 93 second_alignment = None 94 if second_alignment is not None: 95 raise ValueError("We can only write one Alignment to a Nexus file.") 96 97 # Good. Actually write the single alignment, 98 self.write_alignment(first_alignment) 99 return 1 # we only support writing one alignment!
100
101 - def write_alignment(self, alignment):
102 # Creates an empty Nexus object, adds the sequences, 103 # and then gets Nexus to prepare the output. 104 if len(alignment) == 0: 105 raise ValueError("Must have at least one sequence") 106 columns = alignment.get_alignment_length() 107 if columns == 0: 108 raise ValueError("Non-empty sequences are required") 109 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 110 + "format datatype=%s; end;" \ 111 % self._classify_alphabet_for_nexus(alignment._alphabet) 112 n = Nexus.Nexus(minimal_record) 113 n.alphabet = alignment._alphabet 114 for record in alignment: 115 n.add_sequence(record.id, str(record.seq)) 116 # For smaller alignments, don't bother to interleave. 117 # For larger alginments, interleave to avoid very long lines 118 # in the output - something MrBayes can't handle. 119 # TODO - Default to always interleaving? 120 n.write_nexus_data(self.handle, interleave=(columns > 1000))
121
122 - def _classify_alphabet_for_nexus(self, alphabet):
123 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 124 125 Raises an exception if this is not possible.""" 126 # Get the base alphabet (underneath any Gapped or StopCodon encoding) 127 a = Alphabet._get_base_alphabet(alphabet) 128 129 if not isinstance(a, Alphabet.Alphabet): 130 raise TypeError("Invalid alphabet") 131 elif isinstance(a, Alphabet.ProteinAlphabet): 132 return "protein" 133 elif isinstance(a, Alphabet.DNAAlphabet): 134 return "dna" 135 elif isinstance(a, Alphabet.RNAAlphabet): 136 return "rna" 137 else: 138 # Must be something like NucleotideAlphabet or 139 # just the generic Alphabet (default for fasta files) 140 raise ValueError("Need a DNA, RNA or Protein alphabet")
141 142 if __name__ == "__main__": 143 from Bio._py3k import StringIO 144 print("Quick self test") 145 print("") 146 print("Repeated names without a TAXA block") 147 handle = StringIO("""#NEXUS 148 [TITLE: NoName] 149 150 begin data; 151 dimensions ntax=4 nchar=50; 152 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 153 154 matrix 155 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 156 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 157 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 158 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 159 ; 160 end; 161 """) 162 for a in NexusIterator(handle): 163 print(a) 164 for r in a: 165 print("%r %s %s" % (r.seq, r.name, r.id)) 166 print("Done") 167 168 print("") 169 print("Repeated names with a TAXA block") 170 handle = StringIO("""#NEXUS 171 [TITLE: NoName] 172 173 begin taxa 174 CYS1_DICDI 175 ALEU_HORVU 176 CATH_HUMAN 177 CYS1_DICDI; 178 end; 179 180 begin data; 181 dimensions ntax=4 nchar=50; 182 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 183 184 matrix 185 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 186 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 187 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 188 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 189 ; 190 end; 191 """) 192 for a in NexusIterator(handle): 193 print(a) 194 for r in a: 195 print("%r %s %s" % (r.seq, r.name, r.id)) 196 print("Done") 197 print("") 198 print("Reading an empty file") 199 assert 0 == len(list(NexusIterator(StringIO()))) 200 print("Done") 201 print("") 202 print("Writing...") 203 204 handle = StringIO() 205 NexusWriter(handle).write_file([a]) 206 handle.seek(0) 207 print(handle.read()) 208 209 handle = StringIO() 210 try: 211 NexusWriter(handle).write_file([a, a]) 212 assert False, "Should have rejected more than one alignment!" 213 except ValueError: 214 pass 215