Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11   
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Align import MultipleSeqAlignment 
 17  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 18   
 19   
20 -class ClustalWriter(SequentialAlignmentWriter):
21 """Clustalw alignment writer.""" 22
23 - def write_alignment(self, alignment):
24 """Use this to write (another) single alignment to an open file.""" 25 26 if len(alignment) == 0: 27 raise ValueError("Must have at least one sequence") 28 if alignment.get_alignment_length() == 0: 29 # This doubles as a check for an alignment object 30 raise ValueError("Non-empty sequences are required") 31 32 # Old versions of the parser in Bio.Clustalw used a ._version property, 33 try: 34 version = str(alignment._version) 35 except AttributeError: 36 version = "" 37 if not version: 38 version = '1.81' 39 if version.startswith("2."): 40 # e.g. 2.0.x 41 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 42 else: 43 # e.g. 1.81 or 1.83 44 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 45 46 cur_char = 0 47 max_length = len(alignment[0]) 48 49 if max_length <= 0: 50 raise ValueError("Non-empty sequences are required") 51 52 # keep displaying sequences until we reach the end 53 while cur_char != max_length: 54 # calculate the number of sequences to show, which will 55 # be less if we are at the end of the sequence 56 if (cur_char + 50) > max_length: 57 show_num = max_length - cur_char 58 else: 59 show_num = 50 60 61 # go through all of the records and print out the sequences 62 # when we output, we do a nice 80 column output, although this 63 # may result in truncation of the ids. 64 for record in alignment: 65 # Make sure we don't get any spaces in the record 66 # identifier when output in the file by replacing 67 # them with underscores: 68 line = record.id[0:30].replace(" ", "_").ljust(36) 69 line += str(record.seq[cur_char:(cur_char + show_num)]) 70 output += line + "\n" 71 72 # now we need to print out the star info, if we've got it 73 # This was stored by Bio.Clustalw using a ._star_info property. 74 if hasattr(alignment, "_star_info") and alignment._star_info != '': 75 output += (" " * 36) + \ 76 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 77 78 output += "\n" 79 cur_char += show_num 80 81 # Want a trailing blank new line in case the output is concatenated 82 self.handle.write(output + "\n")
83 84
85 -class ClustalIterator(AlignmentIterator):
86 """Clustalw alignment iterator.""" 87 88 _header = None # for caching lines between __next__ calls 89
90 - def __next__(self):
91 handle = self.handle 92 93 if self._header is None: 94 line = handle.readline() 95 else: 96 # Header we saved from when we were parsing 97 # the previous alignment. 98 line = self._header 99 self._header = None 100 101 if not line: 102 raise StopIteration 103 104 # Whitelisted headers we know about 105 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS', 'Kalign'] 106 if line.strip().split()[0] not in known_headers: 107 raise ValueError("%s is not a known CLUSTAL header: %s" % 108 (line.strip().split()[0], 109 ", ".join(known_headers))) 110 111 # find the clustal version in the header line 112 version = None 113 for word in line.split(): 114 if word[0] == '(' and word[-1] == ')': 115 word = word[1:-1] 116 if word[0] in '0123456789': 117 version = word 118 break 119 120 # There should be two blank lines after the header line 121 line = handle.readline() 122 while line.strip() == "": 123 line = handle.readline() 124 125 # If the alignment contains entries with the same sequence 126 # identifier (not a good idea - but seems possible), then this 127 # dictionary based parser will merge their sequences. Fix this? 128 ids = [] 129 seqs = [] 130 consensus = "" 131 seq_cols = None # Used to extract the consensus 132 133 # Use the first block to get the sequence identifiers 134 while True: 135 if line[0] != " " and line.strip() != "": 136 # Sequences identifier... 137 fields = line.rstrip().split() 138 139 # We expect there to be two fields, there can be an optional 140 # "sequence number" field containing the letter count. 141 if len(fields) < 2 or len(fields) > 3: 142 raise ValueError("Could not parse line:\n%s" % line) 143 144 ids.append(fields[0]) 145 seqs.append(fields[1]) 146 147 # Record the sequence position to get the consensus 148 if seq_cols is None: 149 start = len(fields[0]) + \ 150 line[len(fields[0]):].find(fields[1]) 151 end = start + len(fields[1]) 152 seq_cols = slice(start, end) 153 del start, end 154 assert fields[1] == line[seq_cols] 155 156 if len(fields) == 3: 157 # This MAY be an old style file with a letter count... 158 try: 159 letters = int(fields[2]) 160 except ValueError: 161 raise ValueError("Could not parse line, " 162 "bad sequence number:\n%s" % line) 163 if len(fields[1].replace("-", "")) != letters: 164 raise ValueError("Could not parse line, " 165 "invalid sequence number:\n%s" % line) 166 elif line[0] == " ": 167 # Sequence consensus line... 168 assert len(ids) == len(seqs) 169 assert len(ids) > 0 170 assert seq_cols is not None 171 consensus = line[seq_cols] 172 assert not line[:seq_cols.start].strip() 173 assert not line[seq_cols.stop:].strip() 174 # Check for blank line (or end of file) 175 line = handle.readline() 176 assert line.strip() == "" 177 break 178 else: 179 # No consensus 180 break 181 line = handle.readline() 182 if not line: 183 break # end of file 184 185 assert line.strip() == "" 186 assert seq_cols is not None 187 188 # Confirm all same length 189 for s in seqs: 190 assert len(s) == len(seqs[0]) 191 if consensus: 192 assert len(consensus) == len(seqs[0]) 193 194 # Loop over any remaining blocks... 195 done = False 196 while not done: 197 # There should be a blank line between each block. 198 # Also want to ignore any consensus line from the 199 # previous block. 200 while (not line) or line.strip() == "": 201 line = handle.readline() 202 if not line: 203 break # end of file 204 if not line: 205 break # end of file 206 207 if line.split(None, 1)[0] in known_headers: 208 # Found concatenated alignment. 209 done = True 210 self._header = line 211 break 212 213 for i in range(len(ids)): 214 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 215 fields = line.rstrip().split() 216 217 # We expect there to be two fields, there can be an optional 218 # "sequence number" field containing the letter count. 219 if len(fields) < 2 or len(fields) > 3: 220 raise ValueError("Could not parse line:\n%s" % repr(line)) 221 222 if fields[0] != ids[i]: 223 raise ValueError("Identifiers out of order? " 224 "Got '%s' but expected '%s'" 225 % (fields[0], ids[i])) 226 227 if fields[1] != line[seq_cols]: 228 start = len(fields[0]) + \ 229 line[len(fields[0]):].find(fields[1]) 230 assert start == seq_cols.start, \ 231 'Old location %s -> %i:XX' % (seq_cols, start) 232 end = start + len(fields[1]) 233 seq_cols = slice(start, end) 234 del start, end 235 236 # Append the sequence 237 seqs[i] += fields[1] 238 assert len(seqs[i]) == len(seqs[0]) 239 240 if len(fields) == 3: 241 # This MAY be an old style file with a letter count... 242 try: 243 letters = int(fields[2]) 244 except ValueError: 245 raise ValueError("Could not parse line, " 246 "bad sequence number:\n%s" % 247 line) 248 if len(seqs[i].replace("-", "")) != letters: 249 raise ValueError("Could not parse line, " 250 "invalid sequence number:\n%s" % line) 251 252 # Read in the next line 253 line = handle.readline() 254 # There should now be a consensus line 255 if consensus: 256 assert line[0] == " " 257 assert seq_cols is not None 258 consensus += line[seq_cols] 259 assert len(consensus) == len(seqs[0]) 260 assert not line[:seq_cols.start].strip() 261 assert not line[seq_cols.stop:].strip() 262 # Read in the next line 263 line = handle.readline() 264 265 assert len(ids) == len(seqs) 266 if len(seqs) == 0 or len(seqs[0]) == 0: 267 raise StopIteration 268 269 if self.records_per_alignment is not None and \ 270 self.records_per_alignment != len(ids): 271 raise ValueError("Found %i records in this alignment, " 272 "told to expect %i" 273 % (len(ids), self.records_per_alignment)) 274 275 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 276 for (i, s) in zip(ids, seqs)) 277 alignment = MultipleSeqAlignment(records, self.alphabet) 278 # TODO - Handle alignment annotation better, for now 279 # mimic the old parser in Bio.Clustalw 280 if version: 281 alignment._version = version 282 if consensus: 283 alignment_length = len(seqs[0]) 284 assert len(consensus) == alignment_length, \ 285 "Alignment length is %i, consensus length is %i, '%s'" \ 286 % (alignment_length, len(consensus), consensus) 287 alignment._star_info = consensus 288 return alignment
289