Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11   
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Align import MultipleSeqAlignment 
 17  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 18   
 19  __docformat__ = "restructuredtext en" 
 20   
 21   
22 -class ClustalWriter(SequentialAlignmentWriter):
23 """Clustalw alignment writer.""" 24
25 - def write_alignment(self, alignment):
26 """Use this to write (another) single alignment to an open file.""" 27 28 if len(alignment) == 0: 29 raise ValueError("Must have at least one sequence") 30 if alignment.get_alignment_length() == 0: 31 # This doubles as a check for an alignment object 32 raise ValueError("Non-empty sequences are required") 33 34 # Old versions of the parser in Bio.Clustalw used a ._version property, 35 try: 36 version = str(alignment._version) 37 except AttributeError: 38 version = "" 39 if not version: 40 version = '1.81' 41 if version.startswith("2."): 42 # e.g. 2.0.x 43 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 44 else: 45 # e.g. 1.81 or 1.83 46 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 47 48 cur_char = 0 49 max_length = len(alignment[0]) 50 51 if max_length <= 0: 52 raise ValueError("Non-empty sequences are required") 53 54 # keep displaying sequences until we reach the end 55 while cur_char != max_length: 56 # calculate the number of sequences to show, which will 57 # be less if we are at the end of the sequence 58 if (cur_char + 50) > max_length: 59 show_num = max_length - cur_char 60 else: 61 show_num = 50 62 63 # go through all of the records and print out the sequences 64 # when we output, we do a nice 80 column output, although this 65 # may result in truncation of the ids. 66 for record in alignment: 67 # Make sure we don't get any spaces in the record 68 # identifier when output in the file by replacing 69 # them with underscores: 70 line = record.id[0:30].replace(" ", "_").ljust(36) 71 line += str(record.seq[cur_char:(cur_char + show_num)]) 72 output += line + "\n" 73 74 # now we need to print out the star info, if we've got it 75 # This was stored by Bio.Clustalw using a ._star_info property. 76 if hasattr(alignment, "_star_info") and alignment._star_info != '': 77 output += (" " * 36) + \ 78 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 79 80 output += "\n" 81 cur_char += show_num 82 83 # Want a trailing blank new line in case the output is concatenated 84 self.handle.write(output + "\n")
85 86
87 -class ClustalIterator(AlignmentIterator):
88 """Clustalw alignment iterator.""" 89 90 _header = None # for caching lines between __next__ calls 91
92 - def __next__(self):
93 handle = self.handle 94 95 if self._header is None: 96 line = handle.readline() 97 else: 98 # Header we saved from when we were parsing 99 # the previous alignment. 100 line = self._header 101 self._header = None 102 103 if not line: 104 raise StopIteration 105 106 # Whitelisted headers we know about 107 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS','Kalign'] 108 if line.strip().split()[0] not in known_headers: 109 raise ValueError("%s is not a known CLUSTAL header: %s" % 110 (line.strip().split()[0], 111 ", ".join(known_headers))) 112 113 # find the clustal version in the header line 114 version = None 115 for word in line.split(): 116 if word[0] == '(' and word[-1] == ')': 117 word = word[1:-1] 118 if word[0] in '0123456789': 119 version = word 120 break 121 122 # There should be two blank lines after the header line 123 line = handle.readline() 124 while line.strip() == "": 125 line = handle.readline() 126 127 # If the alignment contains entries with the same sequence 128 # identifier (not a good idea - but seems possible), then this 129 # dictionary based parser will merge their sequences. Fix this? 130 ids = [] 131 seqs = [] 132 consensus = "" 133 seq_cols = None # Used to extract the consensus 134 135 # Use the first block to get the sequence identifiers 136 while True: 137 if line[0] != " " and line.strip() != "": 138 # Sequences identifier... 139 fields = line.rstrip().split() 140 141 # We expect there to be two fields, there can be an optional 142 # "sequence number" field containing the letter count. 143 if len(fields) < 2 or len(fields) > 3: 144 raise ValueError("Could not parse line:\n%s" % line) 145 146 ids.append(fields[0]) 147 seqs.append(fields[1]) 148 149 # Record the sequence position to get the consensus 150 if seq_cols is None: 151 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 152 end = start + len(fields[1]) 153 seq_cols = slice(start, end) 154 del start, end 155 assert fields[1] == line[seq_cols] 156 157 if len(fields) == 3: 158 # This MAY be an old style file with a letter count... 159 try: 160 letters = int(fields[2]) 161 except ValueError: 162 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 163 if len(fields[1].replace("-", "")) != letters: 164 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 165 elif line[0] == " ": 166 # Sequence consensus line... 167 assert len(ids) == len(seqs) 168 assert len(ids) > 0 169 assert seq_cols is not None 170 consensus = line[seq_cols] 171 assert not line[:seq_cols.start].strip() 172 assert not line[seq_cols.stop:].strip() 173 # Check for blank line (or end of file) 174 line = handle.readline() 175 assert line.strip() == "" 176 break 177 else: 178 # No consensus 179 break 180 line = handle.readline() 181 if not line: 182 break # end of file 183 184 assert line.strip() == "" 185 assert seq_cols is not None 186 187 # Confirm all same length 188 for s in seqs: 189 assert len(s) == len(seqs[0]) 190 if consensus: 191 assert len(consensus) == len(seqs[0]) 192 193 # Loop over any remaining blocks... 194 done = False 195 while not done: 196 # There should be a blank line between each block. 197 # Also want to ignore any consensus line from the 198 # previous block. 199 while (not line) or line.strip() == "": 200 line = handle.readline() 201 if not line: 202 break # end of file 203 if not line: 204 break # end of file 205 206 if line.split(None, 1)[0] in known_headers: 207 # Found concatenated alignment. 208 done = True 209 self._header = line 210 break 211 212 for i in range(len(ids)): 213 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 214 fields = line.rstrip().split() 215 216 # We expect there to be two fields, there can be an optional 217 # "sequence number" field containing the letter count. 218 if len(fields) < 2 or len(fields) > 3: 219 raise ValueError("Could not parse line:\n%s" % repr(line)) 220 221 if fields[0] != ids[i]: 222 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" 223 % (fields[0], ids[i])) 224 225 if fields[1] != line[seq_cols]: 226 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 227 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 228 end = start + len(fields[1]) 229 seq_cols = slice(start, end) 230 del start, end 231 232 # Append the sequence 233 seqs[i] += fields[1] 234 assert len(seqs[i]) == len(seqs[0]) 235 236 if len(fields) == 3: 237 # This MAY be an old style file with a letter count... 238 try: 239 letters = int(fields[2]) 240 except ValueError: 241 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 242 if len(seqs[i].replace("-", "")) != letters: 243 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 244 245 # Read in the next line 246 line = handle.readline() 247 # There should now be a consensus line 248 if consensus: 249 assert line[0] == " " 250 assert seq_cols is not None 251 consensus += line[seq_cols] 252 assert len(consensus) == len(seqs[0]) 253 assert not line[:seq_cols.start].strip() 254 assert not line[seq_cols.stop:].strip() 255 # Read in the next line 256 line = handle.readline() 257 258 assert len(ids) == len(seqs) 259 if len(seqs) == 0 or len(seqs[0]) == 0: 260 raise StopIteration 261 262 if self.records_per_alignment is not None \ 263 and self.records_per_alignment != len(ids): 264 raise ValueError("Found %i records in this alignment, told to expect %i" 265 % (len(ids), self.records_per_alignment)) 266 267 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 268 for (i, s) in zip(ids, seqs)) 269 alignment = MultipleSeqAlignment(records, self.alphabet) 270 # TODO - Handle alignment annotation better, for now 271 # mimic the old parser in Bio.Clustalw 272 if version: 273 alignment._version = version 274 if consensus: 275 alignment_length = len(seqs[0]) 276 assert len(consensus) == alignment_length, \ 277 "Alignment length is %i, consensus length is %i, '%s'" \ 278 % (alignment_length, len(consensus), consensus) 279 alignment._star_info = consensus 280 return alignment
281