Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11   
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Align import MultipleSeqAlignment 
 17  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 18   
 19  __docformat__ = "restructuredtext en" 
 20   
 21   
22 -class ClustalWriter(SequentialAlignmentWriter):
23 """Clustalw alignment writer.""" 24
25 - def write_alignment(self, alignment):
26 """Use this to write (another) single alignment to an open file.""" 27 28 if len(alignment) == 0: 29 raise ValueError("Must have at least one sequence") 30 if alignment.get_alignment_length() == 0: 31 # This doubles as a check for an alignment object 32 raise ValueError("Non-empty sequences are required") 33 34 # Old versions of the parser in Bio.Clustalw used a ._version property, 35 try: 36 version = str(alignment._version) 37 except AttributeError: 38 version = "" 39 if not version: 40 version = '1.81' 41 if version.startswith("2."): 42 # e.g. 2.0.x 43 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 44 else: 45 # e.g. 1.81 or 1.83 46 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 47 48 cur_char = 0 49 max_length = len(alignment[0]) 50 51 if max_length <= 0: 52 raise ValueError("Non-empty sequences are required") 53 54 # keep displaying sequences until we reach the end 55 while cur_char != max_length: 56 # calculate the number of sequences to show, which will 57 # be less if we are at the end of the sequence 58 if (cur_char + 50) > max_length: 59 show_num = max_length - cur_char 60 else: 61 show_num = 50 62 63 # go through all of the records and print out the sequences 64 # when we output, we do a nice 80 column output, although this 65 # may result in truncation of the ids. 66 for record in alignment: 67 # Make sure we don't get any spaces in the record 68 # identifier when output in the file by replacing 69 # them with underscores: 70 line = record.id[0:30].replace(" ", "_").ljust(36) 71 line += str(record.seq[cur_char:(cur_char + show_num)]) 72 output += line + "\n" 73 74 # now we need to print out the star info, if we've got it 75 # This was stored by Bio.Clustalw using a ._star_info property. 76 if hasattr(alignment, "_star_info") and alignment._star_info != '': 77 output += (" " * 36) + \ 78 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 79 80 output += "\n" 81 cur_char += show_num 82 83 # Want a trailing blank new line in case the output is concatenated 84 self.handle.write(output + "\n")
85 86
87 -class ClustalIterator(AlignmentIterator):
88 """Clustalw alignment iterator.""" 89
90 - def __next__(self):
91 handle = self.handle 92 try: 93 # Header we saved from when we were parsing 94 # the previous alignment. 95 line = self._header 96 del self._header 97 except AttributeError: 98 line = handle.readline() 99 if not line: 100 raise StopIteration 101 102 # Whitelisted headers we know about 103 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS'] 104 if line.strip().split()[0] not in known_headers: 105 raise ValueError("%s is not a known CLUSTAL header: %s" % 106 (line.strip().split()[0], 107 ", ".join(known_headers))) 108 109 # find the clustal version in the header line 110 version = None 111 for word in line.split(): 112 if word[0] == '(' and word[-1] == ')': 113 word = word[1:-1] 114 if word[0] in '0123456789': 115 version = word 116 break 117 118 # There should be two blank lines after the header line 119 line = handle.readline() 120 while line.strip() == "": 121 line = handle.readline() 122 123 # If the alignment contains entries with the same sequence 124 # identifier (not a good idea - but seems possible), then this 125 # dictionary based parser will merge their sequences. Fix this? 126 ids = [] 127 seqs = [] 128 consensus = "" 129 seq_cols = None # Used to extract the consensus 130 131 # Use the first block to get the sequence identifiers 132 while True: 133 if line[0] != " " and line.strip() != "": 134 # Sequences identifier... 135 fields = line.rstrip().split() 136 137 # We expect there to be two fields, there can be an optional 138 # "sequence number" field containing the letter count. 139 if len(fields) < 2 or len(fields) > 3: 140 raise ValueError("Could not parse line:\n%s" % line) 141 142 ids.append(fields[0]) 143 seqs.append(fields[1]) 144 145 # Record the sequence position to get the consensus 146 if seq_cols is None: 147 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 148 end = start + len(fields[1]) 149 seq_cols = slice(start, end) 150 del start, end 151 assert fields[1] == line[seq_cols] 152 153 if len(fields) == 3: 154 # This MAY be an old style file with a letter count... 155 try: 156 letters = int(fields[2]) 157 except ValueError: 158 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 159 if len(fields[1].replace("-", "")) != letters: 160 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 161 elif line[0] == " ": 162 # Sequence consensus line... 163 assert len(ids) == len(seqs) 164 assert len(ids) > 0 165 assert seq_cols is not None 166 consensus = line[seq_cols] 167 assert not line[:seq_cols.start].strip() 168 assert not line[seq_cols.stop:].strip() 169 # Check for blank line (or end of file) 170 line = handle.readline() 171 assert line.strip() == "" 172 break 173 else: 174 # No consensus 175 break 176 line = handle.readline() 177 if not line: 178 break # end of file 179 180 assert line.strip() == "" 181 assert seq_cols is not None 182 183 # Confirm all same length 184 for s in seqs: 185 assert len(s) == len(seqs[0]) 186 if consensus: 187 assert len(consensus) == len(seqs[0]) 188 189 # Loop over any remaining blocks... 190 done = False 191 while not done: 192 # There should be a blank line between each block. 193 # Also want to ignore any consensus line from the 194 # previous block. 195 while (not line) or line.strip() == "": 196 line = handle.readline() 197 if not line: 198 break # end of file 199 if not line: 200 break # end of file 201 202 if line.split(None, 1)[0] in known_headers: 203 # Found concatenated alignment. 204 done = True 205 self._header = line 206 break 207 208 for i in range(len(ids)): 209 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 210 fields = line.rstrip().split() 211 212 # We expect there to be two fields, there can be an optional 213 # "sequence number" field containing the letter count. 214 if len(fields) < 2 or len(fields) > 3: 215 raise ValueError("Could not parse line:\n%s" % repr(line)) 216 217 if fields[0] != ids[i]: 218 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" 219 % (fields[0], ids[i])) 220 221 if fields[1] != line[seq_cols]: 222 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 223 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 224 end = start + len(fields[1]) 225 seq_cols = slice(start, end) 226 del start, end 227 228 # Append the sequence 229 seqs[i] += fields[1] 230 assert len(seqs[i]) == len(seqs[0]) 231 232 if len(fields) == 3: 233 # This MAY be an old style file with a letter count... 234 try: 235 letters = int(fields[2]) 236 except ValueError: 237 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 238 if len(seqs[i].replace("-", "")) != letters: 239 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 240 241 # Read in the next line 242 line = handle.readline() 243 # There should now be a consensus line 244 if consensus: 245 assert line[0] == " " 246 assert seq_cols is not None 247 consensus += line[seq_cols] 248 assert len(consensus) == len(seqs[0]) 249 assert not line[:seq_cols.start].strip() 250 assert not line[seq_cols.stop:].strip() 251 # Read in the next line 252 line = handle.readline() 253 254 assert len(ids) == len(seqs) 255 if len(seqs) == 0 or len(seqs[0]) == 0: 256 raise StopIteration 257 258 if self.records_per_alignment is not None \ 259 and self.records_per_alignment != len(ids): 260 raise ValueError("Found %i records in this alignment, told to expect %i" 261 % (len(ids), self.records_per_alignment)) 262 263 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 264 for (i, s) in zip(ids, seqs)) 265 alignment = MultipleSeqAlignment(records, self.alphabet) 266 # TODO - Handle alignment annotation better, for now 267 # mimic the old parser in Bio.Clustalw 268 if version: 269 alignment._version = version 270 if consensus: 271 alignment_length = len(seqs[0]) 272 assert len(consensus) == alignment_length, \ 273 "Alignment length is %i, consensus length is %i, '%s'" \ 274 % (alignment_length, len(consensus), consensus) 275 alignment._star_info = consensus 276 return alignment
277