Package Bio :: Package AlignIO :: Module EmbossIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.EmbossIO

  1  # Copyright 2008-2016 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.AlignIO support for "emboss" alignment output from EMBOSS tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the EMBOSS pairs/simple file format, for 
 13  example from the alignret, water and needle tools. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio.Align import MultipleSeqAlignment 
 21  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 22   
 23   
24 -class EmbossWriter(SequentialAlignmentWriter):
25 """Emboss alignment writer (WORK IN PROGRESS). 26 27 Writes a simplfied version of the EMBOSS pairs/simple file format. 28 A lot of the information their tools record in their headers is not 29 available and is omitted. 30 """ 31
32 - def write_header(self):
33 handle = self.handle 34 handle.write("########################################\n") 35 handle.write("# Program: Biopython\n") 36 try: 37 handle.write("# Report_file: %s\n" % handle.name) 38 except AttributeError: 39 pass 40 handle.write("########################################\n")
41 46
47 - def write_alignment(self, alignment):
48 """Use this to write (another) single alignment to an open file.""" 49 handle = self.handle 50 handle.write("#=======================================\n") 51 handle.write("#\n") 52 handle.write("# Aligned_sequences: %i\n" % len(alignment)) 53 for i, record in enumerate(alignment): 54 handle.write("# %i: %s\n" % (i + 1, record.id)) 55 handle.write("#\n") 56 handle.write("# Length: %i\n" % alignment.get_alignment_length()) 57 handle.write("#\n") 58 handle.write("#=======================================\n") 59 handle.write("\n") 60 # ... 61 assert False
62 63
64 -class EmbossIterator(AlignmentIterator):
65 """Emboss alignment iterator. 66 67 For reading the (pairwise) alignments from EMBOSS tools in what they 68 call the "pairs" and "simple" formats. 69 """ 70 71 _header = None # for caching lines between __next__ calls 72
73 - def __next__(self):
74 75 handle = self.handle 76 77 if self._header is None: 78 line = handle.readline() 79 else: 80 # Header we saved from when we were parsing 81 # the previous alignment. 82 line = self._header 83 self._header = None 84 85 if not line: 86 raise StopIteration 87 88 while line.rstrip() != "#=======================================": 89 line = handle.readline() 90 if not line: 91 raise StopIteration 92 93 length_of_seqs = None 94 number_of_seqs = None 95 ids = [] 96 seqs = [] 97 98 while line[0] == "#": 99 # Read in the rest of this alignment header, 100 # try and discover the number of records expected 101 # and their length 102 parts = line[1:].split(":", 1) 103 key = parts[0].lower().strip() 104 if key == "aligned_sequences": 105 number_of_seqs = int(parts[1].strip()) 106 assert len(ids) == 0 107 # Should now expect the record identifiers... 108 for i in range(number_of_seqs): 109 line = handle.readline() 110 parts = line[1:].strip().split(":", 1) 111 assert i + 1 == int(parts[0].strip()) 112 ids.append(parts[1].strip()) 113 assert len(ids) == number_of_seqs 114 if key == "length": 115 length_of_seqs = int(parts[1].strip()) 116 117 # And read in another line... 118 line = handle.readline() 119 120 if number_of_seqs is None: 121 raise ValueError("Number of sequences missing!") 122 if length_of_seqs is None: 123 raise ValueError("Length of sequences missing!") 124 125 if self.records_per_alignment is not None \ 126 and self.records_per_alignment != number_of_seqs: 127 raise ValueError("Found %i records in this alignment, told to expect %i" 128 % (number_of_seqs, self.records_per_alignment)) 129 130 seqs = ["" for id in ids] 131 seq_starts = [] 132 index = 0 133 134 # Parse the seqs 135 while line: 136 if len(line) > 21: 137 id_start = line[:21].strip().split(None, 1) 138 seq_end = line[21:].strip().split(None, 1) 139 if len(id_start) == 2 and len(seq_end) == 2: 140 # identifier, seq start position, seq, seq end position 141 # (an aligned seq is broken up into multiple lines) 142 id, start = id_start 143 seq, end = seq_end 144 if start == end: 145 # Special case, either a single letter is present, 146 # or no letters at all. 147 if seq.replace("-", "") == "": 148 start = int(start) 149 end = int(end) 150 else: 151 start = int(start) - 1 152 end = int(end) 153 else: 154 assert seq.replace("-", "") != "", repr(line) 155 start = int(start) - 1 # python counting 156 end = int(end) 157 158 # The identifier is truncated... 159 assert 0 <= index and index < number_of_seqs, \ 160 "Expected index %i in range [0,%i)" \ 161 % (index, number_of_seqs) 162 assert id == ids[index] or id == ids[index][:len(id)] 163 164 if len(seq_starts) == index: 165 # Record the start 166 seq_starts.append(start) 167 168 # Check the start... 169 if start == end: 170 assert seq.replace("-", "") == "", line 171 else: 172 assert start - seq_starts[index] == len(seqs[index].replace("-", "")), \ 173 "Found %i chars so far for sequence %i (%s, %s), line says start %i:\n%s" \ 174 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 175 start, line) 176 177 seqs[index] += seq 178 179 # Check the end ... 180 assert end == seq_starts[index] + len(seqs[index].replace("-", "")), \ 181 "Found %i chars so far for sequence %i (%s, %s, start=%i), file says end %i:\n%s" \ 182 % (len(seqs[index].replace("-", "")), index, id, repr(seqs[index]), 183 seq_starts[index], end, line) 184 185 index += 1 186 if index >= number_of_seqs: 187 index = 0 188 else: 189 # just a start value, this is just alignment annotation (?) 190 # print "Skipping: " + line.rstrip() 191 pass 192 elif line.strip() == "": 193 # Just a spacer? 194 pass 195 else: 196 print(line) 197 assert False 198 199 line = handle.readline() 200 if line.rstrip() == "#---------------------------------------" \ 201 or line.rstrip() == "#=======================================": 202 # End of alignment 203 self._header = line 204 break 205 206 assert index == 0 207 208 if self.records_per_alignment is not None \ 209 and self.records_per_alignment != len(ids): 210 raise ValueError("Found %i records in this alignment, told to expect %i" 211 % (len(ids), self.records_per_alignment)) 212 213 records = [] 214 for id, seq in zip(ids, seqs): 215 if len(seq) != length_of_seqs: 216 # EMBOSS 2.9.0 is known to use spaces instead of minus signs 217 # for leading gaps, and thus fails to parse. This old version 218 # is still used as of Dec 2008 behind the EBI SOAP webservice: 219 # http://www.ebi.ac.uk/Tools/webservices/wsdl/WSEmboss.wsdl 220 raise ValueError("Error parsing alignment - sequences of " 221 "different length? You could be using an " 222 "old version of EMBOSS.") 223 records.append(SeqRecord(Seq(seq, self.alphabet), 224 id=id, description=id)) 225 return MultipleSeqAlignment(records, self.alphabet)
226