Package Bio :: Package AlignIO :: Module ClustalIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.ClustalIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "clustal" output from CLUSTAL W and other tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11   
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio.Align import MultipleSeqAlignment 
 17  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 18   
 19   
20 -class ClustalWriter(SequentialAlignmentWriter):
21 """Clustalw alignment writer."""
22 - def write_alignment(self, alignment):
23 """Use this to write (another) single alignment to an open file.""" 24 25 if len(alignment) == 0: 26 raise ValueError("Must have at least one sequence") 27 if alignment.get_alignment_length() == 0: 28 #This doubles as a check for an alignment object 29 raise ValueError("Non-empty sequences are required") 30 31 #Old versions of the parser in Bio.Clustalw used a ._version property, 32 try: 33 version = str(alignment._version) 34 except AttributeError: 35 version = "" 36 if not version: 37 version = '1.81' 38 if version.startswith("2."): 39 #e.g. 2.0.x 40 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version 41 else: 42 #e.g. 1.81 or 1.83 43 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version 44 45 cur_char = 0 46 max_length = len(alignment[0]) 47 48 if max_length <= 0: 49 raise ValueError("Non-empty sequences are required") 50 51 # keep displaying sequences until we reach the end 52 while cur_char != max_length: 53 # calculate the number of sequences to show, which will 54 # be less if we are at the end of the sequence 55 if (cur_char + 50) > max_length: 56 show_num = max_length - cur_char 57 else: 58 show_num = 50 59 60 # go through all of the records and print out the sequences 61 # when we output, we do a nice 80 column output, although this 62 # may result in truncation of the ids. 63 for record in alignment: 64 #Make sure we don't get any spaces in the record 65 #identifier when output in the file by replacing 66 #them with underscores: 67 line = record.id[0:30].replace(" ", "_").ljust(36) 68 line += str(record.seq[cur_char:(cur_char + show_num)]) 69 output += line + "\n" 70 71 # now we need to print out the star info, if we've got it 72 # This was stored by Bio.Clustalw using a ._star_info property. 73 if hasattr(alignment, "_star_info") and alignment._star_info != '': 74 output += (" " * 36) + \ 75 alignment._star_info[cur_char:(cur_char + show_num)] + "\n" 76 77 output += "\n" 78 cur_char += show_num 79 80 # Want a trailing blank new line in case the output is concatenated 81 self.handle.write(output + "\n")
82 83
84 -class ClustalIterator(AlignmentIterator):
85 """Clustalw alignment iterator.""" 86
87 - def __next__(self):
88 handle = self.handle 89 try: 90 #Header we saved from when we were parsing 91 #the previous alignment. 92 line = self._header 93 del self._header 94 except AttributeError: 95 line = handle.readline() 96 if not line: 97 raise StopIteration 98 99 #Whitelisted headers we know about 100 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE', 'MSAPROBS'] 101 if line.strip().split()[0] not in known_headers: 102 raise ValueError("%s is not a known CLUSTAL header: %s" % 103 (line.strip().split()[0], 104 ", ".join(known_headers))) 105 106 # find the clustal version in the header line 107 version = None 108 for word in line.split(): 109 if word[0] == '(' and word[-1] == ')': 110 word = word[1:-1] 111 if word[0] in '0123456789': 112 version = word 113 break 114 115 #There should be two blank lines after the header line 116 line = handle.readline() 117 while line.strip() == "": 118 line = handle.readline() 119 120 #If the alignment contains entries with the same sequence 121 #identifier (not a good idea - but seems possible), then this 122 #dictionary based parser will merge their sequences. Fix this? 123 ids = [] 124 seqs = [] 125 consensus = "" 126 seq_cols = None # Used to extract the consensus 127 128 #Use the first block to get the sequence identifiers 129 while True: 130 if line[0] != " " and line.strip() != "": 131 #Sequences identifier... 132 fields = line.rstrip().split() 133 134 #We expect there to be two fields, there can be an optional 135 #"sequence number" field containing the letter count. 136 if len(fields) < 2 or len(fields) > 3: 137 raise ValueError("Could not parse line:\n%s" % line) 138 139 ids.append(fields[0]) 140 seqs.append(fields[1]) 141 142 #Record the sequence position to get the consensus 143 if seq_cols is None: 144 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 145 end = start + len(fields[1]) 146 seq_cols = slice(start, end) 147 del start, end 148 assert fields[1] == line[seq_cols] 149 150 if len(fields) == 3: 151 #This MAY be an old style file with a letter count... 152 try: 153 letters = int(fields[2]) 154 except ValueError: 155 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 156 if len(fields[1].replace("-", "")) != letters: 157 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 158 elif line[0] == " ": 159 #Sequence consensus line... 160 assert len(ids) == len(seqs) 161 assert len(ids) > 0 162 assert seq_cols is not None 163 consensus = line[seq_cols] 164 assert not line[:seq_cols.start].strip() 165 assert not line[seq_cols.stop:].strip() 166 #Check for blank line (or end of file) 167 line = handle.readline() 168 assert line.strip() == "" 169 break 170 else: 171 #No consensus 172 break 173 line = handle.readline() 174 if not line: 175 break # end of file 176 177 assert line.strip() == "" 178 assert seq_cols is not None 179 180 #Confirm all same length 181 for s in seqs: 182 assert len(s) == len(seqs[0]) 183 if consensus: 184 assert len(consensus) == len(seqs[0]) 185 186 #Loop over any remaining blocks... 187 done = False 188 while not done: 189 #There should be a blank line between each block. 190 #Also want to ignore any consensus line from the 191 #previous block. 192 while (not line) or line.strip() == "": 193 line = handle.readline() 194 if not line: 195 break # end of file 196 if not line: 197 break # end of file 198 199 if line.split(None, 1)[0] in known_headers: 200 #Found concatenated alignment. 201 done = True 202 self._header = line 203 break 204 205 for i in range(len(ids)): 206 assert line[0] != " ", "Unexpected line:\n%s" % repr(line) 207 fields = line.rstrip().split() 208 209 #We expect there to be two fields, there can be an optional 210 #"sequence number" field containing the letter count. 211 if len(fields) < 2 or len(fields) > 3: 212 raise ValueError("Could not parse line:\n%s" % repr(line)) 213 214 if fields[0] != ids[i]: 215 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" 216 % (fields[0], ids[i])) 217 218 if fields[1] != line[seq_cols]: 219 start = len(fields[0]) + line[len(fields[0]):].find(fields[1]) 220 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start) 221 end = start + len(fields[1]) 222 seq_cols = slice(start, end) 223 del start, end 224 225 #Append the sequence 226 seqs[i] += fields[1] 227 assert len(seqs[i]) == len(seqs[0]) 228 229 if len(fields) == 3: 230 #This MAY be an old style file with a letter count... 231 try: 232 letters = int(fields[2]) 233 except ValueError: 234 raise ValueError("Could not parse line, bad sequence number:\n%s" % line) 235 if len(seqs[i].replace("-", "")) != letters: 236 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line) 237 238 #Read in the next line 239 line = handle.readline() 240 #There should now be a consensus line 241 if consensus: 242 assert line[0] == " " 243 assert seq_cols is not None 244 consensus += line[seq_cols] 245 assert len(consensus) == len(seqs[0]) 246 assert not line[:seq_cols.start].strip() 247 assert not line[seq_cols.stop:].strip() 248 #Read in the next line 249 line = handle.readline() 250 251 assert len(ids) == len(seqs) 252 if len(seqs) == 0 or len(seqs[0]) == 0: 253 raise StopIteration 254 255 if self.records_per_alignment is not None \ 256 and self.records_per_alignment != len(ids): 257 raise ValueError("Found %i records in this alignment, told to expect %i" 258 % (len(ids), self.records_per_alignment)) 259 260 records = (SeqRecord(Seq(s, self.alphabet), id=i, description=i) 261 for (i, s) in zip(ids, seqs)) 262 alignment = MultipleSeqAlignment(records, self.alphabet) 263 #TODO - Handle alignment annotation better, for now 264 #mimic the old parser in Bio.Clustalw 265 if version: 266 alignment._version = version 267 if consensus: 268 alignment_length = len(seqs[0]) 269 assert len(consensus) == alignment_length, \ 270 "Alignment length is %i, consensus length is %i, '%s'" \ 271 % (alignment_length, len(consensus), consensus) 272 alignment._star_info = consensus 273 return alignment
274 275 if __name__ == "__main__": 276 print("Running a quick self-test") 277 278 #This is a truncated version of the example in Tests/cw02.aln 279 #Notice the inclusion of sequence numbers (right hand side) 280 aln_example1 = \ 281 """CLUSTAL W (1.81) multiple sequence alignment 282 283 284 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50 285 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41 286 * *: :: :. :* : :. : . :* :: . 287 288 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100 289 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65 290 : ** **:... *.*** .. 291 292 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150 293 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92 294 .:* * *: .* :* : :* .* 295 296 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200 297 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141 298 *::. . .:: :*..* :* .* .. . : . : 299 300 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210 301 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151 302 *. .:: : . 303 """ 304 305 #This example is a truncated version of the dataset used here: 306 #http://virgil.ruc.dk/kurser/Sekvens/Treedraw.htm 307 #with the last record repeated twice (deliberate toture test) 308 aln_example2 = \ 309 """CLUSTAL X (1.83) multiple sequence alignment 310 311 312 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG 313 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG 314 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG 315 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG 316 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG 317 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA 318 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA 319 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 320 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG 321 : . : :. 322 323 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL 324 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE 325 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG 326 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG 327 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS 328 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA 329 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG 330 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 331 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS 332 ** .: *::::. : :. . ..: 333 334 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI 335 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI 336 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV 337 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI 338 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL 339 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV 340 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII 341 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 342 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV 343 *.: . * . * *: : 344 345 """ 346 347 from Bio._py3k import StringIO 348 349 alignments = list(ClustalIterator(StringIO(aln_example1))) 350 assert 1 == len(alignments) 351 assert alignments[0]._version == "1.81" 352 alignment = alignments[0] 353 assert 2 == len(alignment) 354 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069" 355 assert alignment[1].id == "gi|671626|emb|CAA85685.1|" 356 assert str(alignment[0].seq) == \ 357 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \ 358 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \ 359 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \ 360 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \ 361 "VPTTRAQRRA" 362 363 alignments = list(ClustalIterator(StringIO(aln_example2))) 364 assert 1 == len(alignments) 365 assert alignments[0]._version == "1.83" 366 alignment = alignments[0] 367 assert 9 == len(alignment) 368 assert alignment[-1].id == "HISJ_E_COLI" 369 assert str(alignment[-1].seq) == \ 370 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \ 371 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \ 372 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV" 373 374 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)): 375 print("Alignment with %i records of length %i" \ 376 % (len(alignment), 377 alignment.get_alignment_length())) 378 379 print("Checking empty file...") 380 assert 0 == len(list(ClustalIterator(StringIO("")))) 381 382 print("Checking write/read...") 383 alignments = list(ClustalIterator(StringIO(aln_example1))) \ 384 + list(ClustalIterator(StringIO(aln_example2)))*2 385 handle = StringIO() 386 ClustalWriter(handle).write_file(alignments) 387 handle.seek(0) 388 for i, a in enumerate(ClustalIterator(handle)): 389 assert a.get_alignment_length() == alignments[i].get_alignment_length() 390 handle.seek(0) 391 392 print("Testing write/read when there is only one sequence...") 393 alignment = alignment[0:1] 394 handle = StringIO() 395 ClustalWriter(handle).write_file([alignment]) 396 handle.seek(0) 397 for i, a in enumerate(ClustalIterator(handle)): 398 assert a.get_alignment_length() == alignment.get_alignment_length() 399 assert len(a) == 1 400 401 aln_example3 = \ 402 """CLUSTAL 2.0.9 multiple sequence alignment 403 404 405 Test1seq ------------------------------------------------------------ 406 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC 407 AT3G20900.1-CDS ------------------------------------------------------------ 408 409 410 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT 411 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT 412 AT3G20900.1-CDS ------------------------------------------------------------ 413 414 415 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 416 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC 417 AT3G20900.1-CDS ------------------------------------------------------------ 418 419 420 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA 421 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA 422 AT3G20900.1-CDS ------------------------------------------------------------ 423 424 425 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA 426 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA 427 AT3G20900.1-CDS ------------------------------------------------------------ 428 429 430 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 431 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT 432 AT3G20900.1-CDS ------------------------------------------------------------ 433 434 435 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 436 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT 437 AT3G20900.1-CDS ------------------------------------------------------------ 438 439 440 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 441 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT 442 AT3G20900.1-CDS ------------------------------------------------------ATGAAC 443 * 444 445 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 446 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT 447 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT 448 * *** ***** * * ** **************************** 449 450 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 451 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 452 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC 453 ******* ** * **** *************************************** 454 455 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT 456 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 457 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT 458 **************************************** ******************* 459 460 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT- 461 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG 462 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG 463 ************************* 464 """ 465 alignments = list(ClustalIterator(StringIO(aln_example3))) 466 assert 1 == len(alignments) 467 assert alignments[0]._version == "2.0.9" 468 469 print("The End") 470