Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for the "nexus" file format. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  See also the Bio.Nexus module (which this code calls internally), 
 12  as this offers more than just accessing the alignment or its 
 13  sequences as SeqRecord objects. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Nexus import Nexus 
 20  from Bio.Align import MultipleSeqAlignment 
 21  from .Interfaces import AlignmentWriter 
 22  from Bio import Alphabet 
 23   
 24  #You can get a couple of example files here: 
 25  #http://www.molecularevolution.org/resources/fileformats/ 
 26   
 27   
 28  #This is a generator function! 
29 -def NexusIterator(handle, seq_count=None):
30 """Returns SeqRecord objects from a Nexus file. 31 32 Thus uses the Bio.Nexus module to do the hard work. 33 34 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 35 (and not use it directly). 36 37 NOTE - We only expect ONE alignment matrix per Nexus file, 38 meaning this iterator will only yield one MultipleSeqAlignment. 39 """ 40 n = Nexus.Nexus(handle) 41 if not n.matrix: 42 #No alignment found 43 raise StopIteration 44 45 #Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 46 #The original names and the modified names are kept in these two lists: 47 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 48 49 if seq_count and seq_count != len(n.unaltered_taxlabels): 50 raise ValueError("Found %i sequences, but seq_count=%i" 51 % (len(n.unaltered_taxlabels), seq_count)) 52 53 #ToDo - Can we extract any annotation too? 54 records = (SeqRecord(n.matrix[new_name], id=new_name, 55 name=old_name, description="") 56 for old_name, new_name 57 in zip(n.unaltered_taxlabels, n.taxlabels)) 58 #All done 59 yield MultipleSeqAlignment(records, n.alphabet)
60 61
62 -class NexusWriter(AlignmentWriter):
63 """Nexus alignment writer. 64 65 Note that Nexus files are only expected to hold ONE alignment 66 matrix. 67 68 You are expected to call this class via the Bio.AlignIO.write() or 69 Bio.SeqIO.write() functions. 70 """
71 - def write_file(self, alignments):
72 """Use this to write an entire file containing the given alignments. 73 74 alignments - A list or iterator returning MultipleSeqAlignment objects. 75 This should hold ONE and only one alignment. 76 """ 77 align_iter = iter(alignments) # Could have been a list 78 try: 79 first_alignment = next(align_iter) 80 except StopIteration: 81 first_alignment = None 82 if first_alignment is None: 83 #Nothing to write! 84 return 0 85 86 #Check there is only one alignment... 87 try: 88 second_alignment = next(align_iter) 89 except StopIteration: 90 second_alignment = None 91 if second_alignment is not None: 92 raise ValueError("We can only write one Alignment to a Nexus file.") 93 94 #Good. Actually write the single alignment, 95 self.write_alignment(first_alignment) 96 return 1 # we only support writing one alignment!
97
98 - def write_alignment(self, alignment):
99 #Creates an empty Nexus object, adds the sequences, 100 #and then gets Nexus to prepare the output. 101 if len(alignment) == 0: 102 raise ValueError("Must have at least one sequence") 103 columns = alignment.get_alignment_length() 104 if columns == 0: 105 raise ValueError("Non-empty sequences are required") 106 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 107 + "format datatype=%s; end;" \ 108 % self._classify_alphabet_for_nexus(alignment._alphabet) 109 n = Nexus.Nexus(minimal_record) 110 n.alphabet = alignment._alphabet 111 for record in alignment: 112 n.add_sequence(record.id, str(record.seq)) 113 #For smaller alignments, don't bother to interleave. 114 #For larger alginments, interleave to avoid very long lines 115 #in the output - something MrBayes can't handle. 116 #TODO - Default to always interleaving? 117 n.write_nexus_data(self.handle, interleave=(columns > 1000))
118
119 - def _classify_alphabet_for_nexus(self, alphabet):
120 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 121 122 Raises an exception if this is not possible.""" 123 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 124 a = Alphabet._get_base_alphabet(alphabet) 125 126 if not isinstance(a, Alphabet.Alphabet): 127 raise TypeError("Invalid alphabet") 128 elif isinstance(a, Alphabet.ProteinAlphabet): 129 return "protein" 130 elif isinstance(a, Alphabet.DNAAlphabet): 131 return "dna" 132 elif isinstance(a, Alphabet.RNAAlphabet): 133 return "rna" 134 else: 135 #Must be something like NucleotideAlphabet or 136 #just the generic Alphabet (default for fasta files) 137 raise ValueError("Need a DNA, RNA or Protein alphabet")
138 139 if __name__ == "__main__": 140 from Bio._py3k import StringIO 141 print("Quick self test") 142 print("") 143 print("Repeated names without a TAXA block") 144 handle = StringIO("""#NEXUS 145 [TITLE: NoName] 146 147 begin data; 148 dimensions ntax=4 nchar=50; 149 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 150 151 matrix 152 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 153 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 154 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 155 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 156 ; 157 end; 158 """) 159 for a in NexusIterator(handle): 160 print(a) 161 for r in a: 162 print("%r %s %s" % (r.seq, r.name, r.id)) 163 print("Done") 164 165 print("") 166 print("Repeated names with a TAXA block") 167 handle = StringIO("""#NEXUS 168 [TITLE: NoName] 169 170 begin taxa 171 CYS1_DICDI 172 ALEU_HORVU 173 CATH_HUMAN 174 CYS1_DICDI; 175 end; 176 177 begin data; 178 dimensions ntax=4 nchar=50; 179 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 180 181 matrix 182 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 183 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 184 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 185 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 186 ; 187 end; 188 """) 189 for a in NexusIterator(handle): 190 print(a) 191 for r in a: 192 print("%r %s %s" % (r.seq, r.name, r.id)) 193 print("Done") 194 print("") 195 print("Reading an empty file") 196 assert 0 == len(list(NexusIterator(StringIO()))) 197 print("Done") 198 print("") 199 print("Writing...") 200 201 handle = StringIO() 202 NexusWriter(handle).write_file([a]) 203 handle.seek(0) 204 print(handle.read()) 205 206 handle = StringIO() 207 try: 208 NexusWriter(handle).write_file([a, a]) 209 assert False, "Should have rejected more than one alignment!" 210 except ValueError: 211 pass 212