Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11  """ 
 12   
 13  from __future__ import print_function 
 14   
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio.Align import MultipleSeqAlignment 
 18  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 19   
 20   
21 -class ClustalWriter(SequentialAlignmentWriter):
22 """Clustalw alignment writer.""" 23
24 - def write_alignment(self, alignment):
25 """Use this to write (another) single alignment to an open file.""" 26 if len(alignment) == 0: 27 raise ValueError("Must have at least one sequence") 28 if alignment.get_alignment_length() == 0: 29 # This doubles as a check for an alignment object 30 raise ValueError("Non-empty sequences are required") 31 32 # Old versions of the parser in Bio.Clustalw used a ._version property, 33 try: 34 version = str(alignment._version) 35 except AttributeError: 36 version = "" 37 if not version: 38 version = '1.81' 39 if version.startswith("2."): 40 # e.g. 2.0.x 41 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 42 else: 43 # e.g. 1.81 or 1.83 44 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 45 46 cur_char = 0 47 max_length = len(alignment[0]) 48 49 if max_length <= 0: 50 raise ValueError("Non-empty sequences are required") 51 52 # keep displaying sequences until we reach the end 53 while cur_char != max_length: 54 # calculate the number of sequences to show, which will 55 # be less if we are at the end of the sequence 56 if (cur_char + 50) > max_length: 57 show_num = max_length - cur_char 58 else: 59 show_num = 50 60 61 # go through all of the records and print out the sequences 62 # when we output, we do a nice 80 column output, although this 63 # may result in truncation of the ids. 64 for record in alignment: 65 # Make sure we don't get any spaces in the record 66 # identifier when output in the file by replacing 67 # them with underscores: 68 line = record.id[0:30].replace(" ", "_").ljust(36) 69 line += str(record.seq[cur_char:(cur_char + show_num)]) 70 output += line + "\n" 71 72 # now we need to print out the star info, if we've got it 73 # This was stored by Bio.Clustalw using a ._star_info property. 74 if hasattr(alignment, "_star_info") and alignment._star_info != '': 75 output += (" " * 36) + \ 76 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 77 78 output += "\n" 79 cur_char += show_num 80 81 # Want a trailing blank new line in case the output is concatenated 82 self.handle.write(output + "\n")
83 84
85 -class ClustalIterator(AlignmentIterator):
86 """Clustalw alignment iterator.""" 87 88 _header = None # for caching lines between __next__ calls 89
90 - def __next__(self):
91 handle = self.handle 92 93 if self._header is None: 94 line = handle.readline() 95 else: 96 # Header we saved from when we were parsing 97 # the previous alignment. 98 line = self._header 99 self._header = None 100 101 if not line: 102 raise StopIteration 103 104 # Whitelisted headers we know about 105 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS', 'Kalign'] 106 if line.strip().split()[0] not in known_headers: 107 raise ValueError("%s is not a known CLUSTAL header: %s" % 108 (line.strip().split()[0], 109 ", ".join(known_headers))) 110 111 # find the clustal version in the header line 112 version = None 113 for word in line.split(): 114 if word[0] == '(' and word[-1] == ')': 115 word = word[1:-1] 116 if word[0] in '0123456789': 117 version = word 118 break 119 120 # There should be two blank lines after the header line 121 line = handle.readline() 122 while line.strip() == "": 123 line = handle.readline() 124 125 # If the alignment contains entries with the same sequence 126 # identifier (not a good idea - but seems possible), then this 127 # dictionary based parser will merge their sequences. Fix this? 128 ids = [] 129 seqs = [] 130 consensus = "" 131 seq_cols = None # Used to extract the consensus 132 133 # Use the first block to get the sequence identifiers 134 while True: 135 if line[0] != " " and line.strip() != "": 136 # Sequences identifier... 137 fields = line.rstrip().split() 138 139 # We expect there to be two fields, there can be an optional 140 # "sequence number" field containing the letter count. 141 if len(fields) < 2 or len(fields) > 3: 142 raise ValueError("Could not parse line:\n%s" % line) 143 144 ids.append(fields[0]) 145 seqs.append(fields[1]) 146 147 # Record the sequence position to get the consensus 148 if seq_cols is None: 149 start = len(fields[0]) + \ 150 line[len(fields[0]):].find(fields[1]) 151 end = start + len(fields[1]) 152 seq_cols = slice(start, end) 153 del start, end 154 assert fields[1] == line[seq_cols] 155 156 if len(fields) == 3: 157 # This MAY be an old style file with a letter count... 158 try: 159 letters = int(fields[2]) 160 except ValueError: 161 raise ValueError("Could not parse line, " 162 "bad sequence number:\n%s" % line) 163 if len(fields[1].replace("-", "")) != letters: 164 raise ValueError("Could not parse line, " 165 "invalid sequence number:\n%s" % line) 166 elif line[0] == " ": 167 # Sequence consensus line... 168 assert len(ids) == len(seqs) 169 assert len(ids) > 0 170 assert seq_cols is not None 171 consensus = line[seq_cols] 172 assert not line[:seq_cols.start].strip() 173 assert not line[seq_cols.stop:].strip() 174 # Check for blank line (or end of file) 175 line = handle.readline() 176 assert line.strip() == "" 177 break 178 else: 179 # No consensus 180 break 181 line = handle.readline() 182 if not line: 183 break # end of file 184 185 assert line.strip() == "" 186 assert seq_cols is not None 187 188 # Confirm all same length 189 for s in seqs: 190 assert len(s) == len(seqs[0]) 191 if consensus: 192 assert len(consensus) == len(seqs[0]) 193 194 # Loop over any remaining blocks... 195 done = False 196 while not done: 197 # There should be a blank line between each block. 198 # Also want to ignore any consensus line from the 199 # previous block. 200 while (not line) or line.strip() == "": 201 line = handle.readline() 202 if not line: 203 break # end of file 204 if not line: 205 break # end of file 206 207 if line.split(None, 1)[0] in known_headers: 208 # Found concatenated alignment. 209 done = True 210 self._header = line 211 break 212 213 for i in range(len(ids)): 214 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 215 fields = line.rstrip().split() 216 217 # We expect there to be two fields, there can be an optional 218 # "sequence number" field containing the letter count. 219 if len(fields) < 2 or len(fields) > 3: 220 raise ValueError("Could not parse line:\n%s" % repr(line)) 221 222 if fields[0] != ids[i]: 223 raise ValueError("Identifiers out of order? " 224 "Got '%s' but expected '%s'" 225 % (fields[0], ids[i])) 226 227 if fields[1] != line[seq_cols]: 228 start = len(fields[0]) + \ 229 line[len(fields[0]):].find(fields[1]) 230 assert start == seq_cols.start, \ 231 'Old location %s -> %i:XX' % (seq_cols, start) 232 end = start + len(fields[1]) 233 seq_cols = slice(start, end) 234 del start, end 235 236 # Append the sequence 237 seqs[i] += fields[1] 238 assert len(seqs[i]) == len(seqs[0]) 239 240 if len(fields) == 3: 241 # This MAY be an old style file with a letter count... 242 try: 243 letters = int(fields[2]) 244 except ValueError: 245 raise ValueError("Could not parse line, " 246 "bad sequence number:\n%s" % 247 line) 248 if len(seqs[i].replace("-", "")) != letters: 249 raise ValueError("Could not parse line, " 250 "invalid sequence number:\n%s" % line) 251 252 # Read in the next line 253 line = handle.readline() 254 # There should now be a consensus line 255 if consensus: 256 assert line[0] == " " 257 assert seq_cols is not None 258 consensus += line[seq_cols] 259 assert len(consensus) == len(seqs[0]) 260 assert not line[:seq_cols.start].strip() 261 assert not line[seq_cols.stop:].strip() 262 # Read in the next line 263 line = handle.readline() 264 265 assert len(ids) == len(seqs) 266 if len(seqs) == 0 or len(seqs[0]) == 0: 267 raise StopIteration 268 269 if self.records_per_alignment is not None and \ 270 self.records_per_alignment != len(ids): 271 raise ValueError("Found %i records in this alignment, " 272 "told to expect %i" 273 % (len(ids), self.records_per_alignment)) 274 275 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 276 for (i, s) in zip(ids, seqs)) 277 alignment = MultipleSeqAlignment(records, self.alphabet) 278 # TODO - Handle alignment annotation better, for now 279 # mimic the old parser in Bio.Clustalw 280 if version: 281 alignment._version = version 282 if consensus: 283 alignment_length = len(seqs[0]) 284 assert len(consensus) == alignment_length, \ 285 "Alignment length is %i, consensus length is %i, '%s'" \ 286 % (alignment_length, len(consensus), consensus) 287 alignment._star_info = consensus 288 return alignment
289