Package Bio :: Package AlignIO :: Module EmbossIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.EmbossIO

  1  # Copyright 2008-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "emboss" alignment output from EMBOSS tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the EMBOSS pairs/simple file format, for 
 12  example from the alignret, water and needle tools. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.Align import MultipleSeqAlignment 
 20  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 21   
 22   
23 -class EmbossWriter(SequentialAlignmentWriter):
24 """Emboss alignment writer (WORK IN PROGRESS). 25 26 Writes a simplfied version of the EMBOSS pairs/simple file format. 27 A lot of the information their tools record in their headers is not 28 available and is omitted. 29 """ 30
31 - def write_header(self):
32 handle = self.handle 33 handle.write("########################################\n") 34 handle.write("# Program: Biopython\n") 35 try: 36 handle.write("# Report_file: %s\n" % handle.name) 37 except AttributeError: 38 pass 39 handle.write("########################################\n")
40 45
46 - def write_alignment(self, alignment):
47 """Use this to write (another) single alignment to an open file.""" 48 handle = self.handle 49 handle.write("#=======================================\n") 50 handle.write("#\n") 51 handle.write("# Aligned_sequences: %i\n" % len(alignment)) 52 for i, record in enumerate(alignment): 53 handle.write("# %i: %s\n" % (i + 1, record.id)) 54 handle.write("#\n") 55 handle.write("# Length: %i\n" % alignment.get_alignment_length()) 56 handle.write("#\n") 57 handle.write("#=======================================\n") 58 handle.write("\n") 59 # ... 60 assert False
61 62
63 -class EmbossIterator(AlignmentIterator):
64 """Emboss alignment iterator. 65 66 For reading the (pairwise) alignments from EMBOSS tools in what they 67 call the "pairs" and "simple" formats. 68 """ 69 70 _header = None # for caching lines between __next__ calls 71
72 - def __next__(self):
73 74 handle = self.handle 75 76 if self._header is None: 77 line = handle.readline() 78 else: 79 # Header we saved from when we were parsing 80 # the previous alignment. 81 line = self._header 82 self._header = None 83 84 if not line: 85 raise StopIteration 86 87 while line.rstrip() != "#=======================================": 88 line = handle.readline() 89 if not line: 90 raise StopIteration 91 92 length_of_seqs = None 93 number_of_seqs = None 94 ids = [] 95 seqs = [] 96 97 while line[0] == "#": 98 # Read in the rest of this alignment header, 99 # try and discover the number of records expected 100 # and their length 101 parts = line[1:].split(":", 1) 102 key = parts[0].lower().strip() 103 if key == "aligned_sequences": 104 number_of_seqs = int(parts[1].strip()) 105 assert len(ids) == 0 106 # Should now expect the record identifiers... 107 for i in range(number_of_seqs): 108 line = handle.readline() 109 parts = line[1:].strip().split(":", 1) 110 assert i + 1 == int(parts[0].strip()) 111 ids.append(parts[1].strip()) 112 assert len(ids) == number_of_seqs 113 if key == "length": 114 length_of_seqs = int(parts[1].strip()) 115 116 # And read in another line... 117 line = handle.readline() 118 119 if number_of_seqs is None: 120 raise ValueError("Number of sequences missing!") 121 if length_of_seqs is None: 122 raise ValueError("Length of sequences missing!") 123 124 if self.records_per_alignment is not None \ 125 and self.records_per_alignment != number_of_seqs: 126 raise ValueError("Found %i records in this alignment, told to expect %i" 127 % (number_of_seqs, self.records_per_alignment)) 128 129 seqs = ["" for id in ids] 130 seq_starts = [] 131 index = 0 132 133 # Parse the seqs 134 while line: 135 if len(line) > 21: 136 id_start = line[:21].strip().split(None, 1) 137 seq_end = line[21:].strip().split(None, 1) 138 if len(id_start) == 2 and len(seq_end) == 2: 139 # identifier, seq start position, seq, seq end position 140 # (an aligned seq is broken up into multiple lines) 141 id, start = id_start 142 seq, end = seq_end 143 if start == end: 144 # Special case, either a single letter is present, 145 # or no letters at all. 146 if seq.replace("-", "") == "": 147 start = int(start) 148 end = int(end) 149 else: 150 start = int(start) - 1 151 end = int(end) 152 else: 153 assert seq.replace("-", "") != "", repr(line) 154 start = int(start) - 1 # python counting 155 end = int(end) 156 157 # The identifier is truncated... 158 assert 0 <= index and index < number_of_seqs, \ 159 "Expected index %i in range [0,%i)" \ 160 % (index, number_of_seqs) 161 assert id == ids[index] or id == ids[index][:len(id)] 162 163 if len(seq_starts) == index: 164 # Record the start 165 seq_starts.append(start) 166 167 # Check the start... 168 if start == end: 169 assert seq.replace("-", "") == "", line 170 else: 171 assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \ 172 "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \ 173 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 174 start, line) 175 176 seqs[index] += seq 177 178 # Check the end ... 179 assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \ 180 "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \ 181 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 182 seq_starts[index], end, line) 183 184 index += 1 185 if index >= number_of_seqs: 186 index = 0 187 else: 188 # just a start value, this is just alignment annotation (?) 189 # print "Skipping: " + line.rstrip() 190 pass 191 elif line.strip() == "": 192 # Just a spacer? 193 pass 194 else: 195 print(line) 196 assert False 197 198 line = handle.readline() 199 if line.rstrip() == "#---------------------------------------" \ 200 or line.rstrip() == "#=======================================": 201 # End of alignment 202 self._header = line 203 break 204 205 assert index == 0 206 207 if self.records_per_alignment is not None \ 208 and self.records_per_alignment != len(ids): 209 raise ValueError("Found %i records in this alignment, told to expect %i" 210 % (len(ids), self.records_per_alignment)) 211 212 records = [] 213 for id, seq in zip(ids, seqs): 214 if len(seq) != length_of_seqs: 215 # EMBOSS 2.9.0 is known to use spaces instead of minus signs 216 # for leading gaps, and thus fails to parse. This old version 217 # is still used as of Dec 2008 behind the EBI SOAP webservice: 218 # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl 219 raise ValueError("Error parsing alignment - sequences of " 220 "different length? You could be using an " 221 "old version of EMBOSS.") 222 records.append(SeqRecord(Seq(seq, self.alphabet), 223 id=id, description=id)) 224 return MultipleSeqAlignment(records, self.alphabet)
225