Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11   
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Align import MultipleSeqAlignment 
 17  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 18   
 19   
20 -class ClustalWriter(SequentialAlignmentWriter):
21 """Clustalw alignment writer.""" 22
23 - def write_alignment(self, alignment):
24 """Use this to write (another) single alignment to an open file.""" 25 26 if len(alignment) == 0: 27 raise ValueError("Must have at least one sequence") 28 if alignment.get_alignment_length() == 0: 29 # This doubles as a check for an alignment object 30 raise ValueError("Non-empty sequences are required") 31 32 # Old versions of the parser in Bio.Clustalw used a ._version property, 33 try: 34 version = str(alignment._version) 35 except AttributeError: 36 version = "" 37 if not version: 38 version = '1.81' 39 if version.startswith("2."): 40 # e.g. 2.0.x 41 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 42 else: 43 # e.g. 1.81 or 1.83 44 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 45 46 cur_char = 0 47 max_length = len(alignment[0]) 48 49 if max_length <= 0: 50 raise ValueError("Non-empty sequences are required") 51 52 # keep displaying sequences until we reach the end 53 while cur_char != max_length: 54 # calculate the number of sequences to show, which will 55 # be less if we are at the end of the sequence 56 if (cur_char + 50) > max_length: 57 show_num = max_length - cur_char 58 else: 59 show_num = 50 60 61 # go through all of the records and print out the sequences 62 # when we output, we do a nice 80 column output, although this 63 # may result in truncation of the ids. 64 for record in alignment: 65 # Make sure we don't get any spaces in the record 66 # identifier when output in the file by replacing 67 # them with underscores: 68 line = record.id[0:30].replace(" ", "_").ljust(36) 69 line += str(record.seq[cur_char:(cur_char + show_num)]) 70 output += line + "\n" 71 72 # now we need to print out the star info, if we've got it 73 # This was stored by Bio.Clustalw using a ._star_info property. 74 if hasattr(alignment, "_star_info") and alignment._star_info != '': 75 output += (" " * 36) + \ 76 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 77 78 output += "\n" 79 cur_char += show_num 80 81 # Want a trailing blank new line in case the output is concatenated 82 self.handle.write(output + "\n")
83 84
85 -class ClustalIterator(AlignmentIterator):
86 """Clustalw alignment iterator.""" 87 88 _header = None # for caching lines between __next__ calls 89
90 - def __next__(self):
91 handle = self.handle 92 93 if self._header is None: 94 line = handle.readline() 95 else: 96 # Header we saved from when we were parsing 97 # the previous alignment. 98 line = self._header 99 self._header = None 100 101 if not line: 102 raise StopIteration 103 104 # Whitelisted headers we know about 105 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS', 'Kalign'] 106 if line.strip().split()[0] not in known_headers: 107 raise ValueError("%s is not a known CLUSTAL header: %s" % 108 (line.strip().split()[0], 109 ", ".join(known_headers))) 110 111 # find the clustal version in the header line 112 version = None 113 for word in line.split(): 114 if word[0] == '(' and word[-1] == ')': 115 word = word[1:-1] 116 if word[0] in '0123456789': 117 version = word 118 break 119 120 # There should be two blank lines after the header line 121 line = handle.readline() 122 while line.strip() == "": 123 line = handle.readline() 124 125 # If the alignment contains entries with the same sequence 126 # identifier (not a good idea - but seems possible), then this 127 # dictionary based parser will merge their sequences. Fix this? 128 ids = [] 129 seqs = [] 130 consensus = "" 131 seq_cols = None # Used to extract the consensus 132 133 # Use the first block to get the sequence identifiers 134 while True: 135 if line[0] != " " and line.strip() != "": 136 # Sequences identifier... 137 fields = line.rstrip().split() 138 139 # We expect there to be two fields, there can be an optional 140 # "sequence number" field containing the letter count. 141 if len(fields) < 2 or len(fields) > 3: 142 raise ValueError("Could not parse line:\n%s" % line) 143 144 ids.append(fields[0]) 145 seqs.append(fields[1]) 146 147 # Record the sequence position to get the consensus 148 if seq_cols is None: 149 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 150 end = start + len(fields[1]) 151 seq_cols = slice(start, end) 152 del start, end 153 assert fields[1] == line[seq_cols] 154 155 if len(fields) == 3: 156 # This MAY be an old style file with a letter count... 157 try: 158 letters = int(fields[2]) 159 except ValueError: 160 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 161 if len(fields[1].replace("-", "")) != letters: 162 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 163 elif line[0] == " ": 164 # Sequence consensus line... 165 assert len(ids) == len(seqs) 166 assert len(ids) > 0 167 assert seq_cols is not None 168 consensus = line[seq_cols] 169 assert not line[:seq_cols.start].strip() 170 assert not line[seq_cols.stop:].strip() 171 # Check for blank line (or end of file) 172 line = handle.readline() 173 assert line.strip() == "" 174 break 175 else: 176 # No consensus 177 break 178 line = handle.readline() 179 if not line: 180 break # end of file 181 182 assert line.strip() == "" 183 assert seq_cols is not None 184 185 # Confirm all same length 186 for s in seqs: 187 assert len(s) == len(seqs[0]) 188 if consensus: 189 assert len(consensus) == len(seqs[0]) 190 191 # Loop over any remaining blocks... 192 done = False 193 while not done: 194 # There should be a blank line between each block. 195 # Also want to ignore any consensus line from the 196 # previous block. 197 while (not line) or line.strip() == "": 198 line = handle.readline() 199 if not line: 200 break # end of file 201 if not line: 202 break # end of file 203 204 if line.split(None, 1)[0] in known_headers: 205 # Found concatenated alignment. 206 done = True 207 self._header = line 208 break 209 210 for i in range(len(ids)): 211 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 212 fields = line.rstrip().split() 213 214 # We expect there to be two fields, there can be an optional 215 # "sequence number" field containing the letter count. 216 if len(fields) < 2 or len(fields) > 3: 217 raise ValueError("Could not parse line:\n%s" % repr(line)) 218 219 if fields[0] != ids[i]: 220 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" 221 % (fields[0], ids[i])) 222 223 if fields[1] != line[seq_cols]: 224 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 225 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 226 end = start + len(fields[1]) 227 seq_cols = slice(start, end) 228 del start, end 229 230 # Append the sequence 231 seqs[i] += fields[1] 232 assert len(seqs[i]) == len(seqs[0]) 233 234 if len(fields) == 3: 235 # This MAY be an old style file with a letter count... 236 try: 237 letters = int(fields[2]) 238 except ValueError: 239 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 240 if len(seqs[i].replace("-", "")) != letters: 241 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 242 243 # Read in the next line 244 line = handle.readline() 245 # There should now be a consensus line 246 if consensus: 247 assert line[0] == " " 248 assert seq_cols is not None 249 consensus += line[seq_cols] 250 assert len(consensus) == len(seqs[0]) 251 assert not line[:seq_cols.start].strip() 252 assert not line[seq_cols.stop:].strip() 253 # Read in the next line 254 line = handle.readline() 255 256 assert len(ids) == len(seqs) 257 if len(seqs) == 0 or len(seqs[0]) == 0: 258 raise StopIteration 259 260 if self.records_per_alignment is not None \ 261 and self.records_per_alignment != len(ids): 262 raise ValueError("Found %i records in this alignment, told to expect %i" 263 % (len(ids), self.records_per_alignment)) 264 265 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 266 for (i, s) in zip(ids, seqs)) 267 alignment = MultipleSeqAlignment(records, self.alphabet) 268 # TODO - Handle alignment annotation better, for now 269 # mimic the old parser in Bio.Clustalw 270 if version: 271 alignment._version = version 272 if consensus: 273 alignment_length = len(seqs[0]) 274 assert len(consensus) == alignment_length, \ 275 "Alignment length is %i, consensus length is %i, '%s'" \ 276 % (alignment_length, len(consensus), consensus) 277 alignment._star_info = consensus 278 return alignment
279