Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2011 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 13  from standard phylip format in the following ways: 
 14   
 15   * No whitespace is allowed in the sequence ID. 
 16   * No truncation is performed. Instead, sequence IDs are padded to the longest 
 17     ID length, rather than 10 characters. A space separates the sequence 
 18     identifier from the sequence. 
 19   
 20  Relaxed phylip is supported by RAxML and PHYML. 
 21   
 22  Note 
 23  ==== 
 24  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 25  a dot/period (".") in a sequence is interpreted as meaning the same 
 26  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 27  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 28   
 29     "a period was also previously allowed but it is no longer allowed, 
 30     because it sometimes is used in different senses in other programs" 
 31   
 32  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 33  for reading and writing. Older versions did nothing special with a dot/period. 
 34  """ 
 35  import string 
 36   
 37  from Bio.Seq import Seq 
 38  from Bio.SeqRecord import SeqRecord 
 39  from Bio.Align import MultipleSeqAlignment 
 40  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 41   
 42  _PHYLIP_ID_WIDTH = 10 
 43   
 44   
45 -class PhylipWriter(SequentialAlignmentWriter):
46 """Phylip alignment writer.""" 47
48 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
49 """Use this to write (another) single alignment to an open file. 50 51 This code will write interlaced alignments (when the sequences are 52 longer than 50 characters). 53 54 Note that record identifiers are strictly truncated to id_width, 55 defaulting to the value required to comply with the PHYLIP standard. 56 57 For more information on the file format, please see: 58 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 59 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 60 """ 61 handle = self.handle 62 63 if len(alignment) == 0: 64 raise ValueError("Must have at least one sequence") 65 length_of_seqs = alignment.get_alignment_length() 66 for record in alignment: 67 if length_of_seqs != len(record.seq): 68 raise ValueError("Sequences must all be the same length") 69 if length_of_seqs <= 0: 70 raise ValueError("Non-empty sequences are required") 71 72 # Check for repeated identifiers... 73 # Apply this test *after* cleaning the identifiers 74 names = [] 75 seqs = [] 76 for record in alignment: 77 """ 78 Quoting the PHYLIP version 3.6 documentation: 79 80 The name should be ten characters in length, filled out to 81 the full ten characters by blanks if shorter. Any printable 82 ASCII/ISO character is allowed in the name, except for 83 parentheses ("(" and ")"), square brackets ("[" and "]"), 84 colon (":"), semicolon (";") and comma (","). If you forget 85 to extend the names to ten characters in length by blanks, 86 the program [i.e. PHYLIP] will get out of synchronization 87 with the contents of the data file, and an error message will 88 result. 89 90 Note that Tab characters count as only one character in the 91 species names. Their inclusion can cause trouble. 92 """ 93 name = record.id.strip() 94 #Either remove the banned characters, or map them to something 95 #else like an underscore "_" or pipe "|" character... 96 for char in "[](),": 97 name = name.replace(char, "") 98 for char in ":;": 99 name = name.replace(char, "|") 100 name = name[:id_width] 101 if name in names: 102 raise ValueError("Repeated name %r (originally %r), " 103 "possibly due to truncation" 104 % (name, record.id)) 105 names.append(name) 106 sequence = str(record.seq) 107 if "." in sequence: 108 # Do this check here (once per record, not once per block) 109 raise ValueError("PHYLIP format no longer allows dots in " 110 "sequence") 111 seqs.append(sequence) 112 113 # From experimentation, the use of tabs is not understood by the 114 # EMBOSS suite. The nature of the expected white space is not 115 # defined in the PHYLIP documentation, simply "These are in free 116 # format, separated by blanks". We'll use spaces to keep EMBOSS 117 # happy. 118 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 119 block = 0 120 while True: 121 for name, sequence in zip(names, seqs): 122 if block == 0: 123 #Write name (truncated/padded to id_width characters) 124 #Now truncate and right pad to expected length. 125 handle.write(name[:id_width].ljust(id_width)) 126 else: 127 #write indent 128 handle.write(" " * id_width) 129 #Write five chunks of ten letters per line... 130 for chunk in range(0, 5): 131 i = block*50 + chunk*10 132 seq_segment = sequence[i:i+10] 133 #TODO - Force any gaps to be '-' character? Look at the 134 #alphabet... 135 #TODO - How to cope with '?' or '.' in the sequence? 136 handle.write(" %s" % seq_segment) 137 if i+10 > length_of_seqs: 138 break 139 handle.write("\n") 140 block = block+1 141 if block*50 > length_of_seqs: 142 break 143 handle.write("\n")
144 145
146 -class PhylipIterator(AlignmentIterator):
147 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 148 149 Record identifiers are limited to at most 10 characters. 150 151 It only copes with interlaced phylip files! Sequential files won't work 152 where the sequences are split over multiple lines. 153 154 For more information on the file format, please see: 155 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 156 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 157 """ 158 159 # Default truncation length 160 id_width = _PHYLIP_ID_WIDTH 161
162 - def _is_header(self, line):
163 line = line.strip() 164 parts = filter(None, line.split()) 165 if len(parts) != 2: 166 return False # First line should have two integers 167 try: 168 number_of_seqs = int(parts[0]) 169 length_of_seqs = int(parts[1]) 170 return True 171 except ValueError: 172 return False # First line should have two integers
173
174 - def _split_id(self, line):
175 """ 176 Extracts the sequence ID from a Phylip line, returning a tuple 177 containing: 178 179 (sequence_id, sequence_residues) 180 181 The first 10 characters in the line are are the sequence id, the 182 remainder are sequence data. 183 """ 184 seq_id = line[:self.id_width].strip() 185 seq = line[self.id_width:].strip().replace(' ', '') 186 return seq_id, seq
187
188 - def next(self):
189 handle = self.handle 190 191 try: 192 #Header we saved from when we were parsing 193 #the previous alignment. 194 line = self._header 195 del self._header 196 except AttributeError: 197 line = handle.readline() 198 199 if not line: 200 raise StopIteration 201 line = line.strip() 202 parts = filter(None, line.split()) 203 if len(parts) != 2: 204 raise ValueError("First line should have two integers") 205 try: 206 number_of_seqs = int(parts[0]) 207 length_of_seqs = int(parts[1]) 208 except ValueError: 209 raise ValueError("First line should have two integers") 210 211 assert self._is_header(line) 212 213 if self.records_per_alignment is not None \ 214 and self.records_per_alignment != number_of_seqs: 215 raise ValueError("Found %i records in this alignment, told to expect %i" 216 % (number_of_seqs, self.records_per_alignment)) 217 218 ids = [] 219 seqs = [] 220 221 # By default, expects STRICT truncation / padding to 10 characters. 222 # Does not require any whitespace between name and seq. 223 for i in xrange(number_of_seqs): 224 line = handle.readline().rstrip() 225 sequence_id, s = self._split_id(line) 226 ids.append(sequence_id) 227 if "." in s: 228 raise ValueError("PHYLIP format no longer allows dots in sequence") 229 seqs.append([s]) 230 231 #Look for further blocks 232 line = "" 233 while True: 234 #Skip any blank lines between blocks... 235 while "" == line.strip(): 236 line = handle.readline() 237 if not line: 238 break # end of file 239 if not line: 240 break # end of file 241 242 if self._is_header(line): 243 #Looks like the start of a concatenated alignment 244 self._header = line 245 break 246 247 #print "New block..." 248 for i in xrange(number_of_seqs): 249 s = line.strip().replace(" ", "") 250 if "." in s: 251 raise ValueError("PHYLIP format no longer allows dots in sequence") 252 seqs[i].append(s) 253 line = handle.readline() 254 if (not line) and i+1 < number_of_seqs: 255 raise ValueError("End of file mid-block") 256 if not line: 257 break # end of file 258 259 records = (SeqRecord(Seq("".join(s), self.alphabet), 260 id=i, name=i, description=i) 261 for (i, s) in zip(ids, seqs)) 262 return MultipleSeqAlignment(records, self.alphabet)
263 264 265 # Relaxed Phylip
266 -class RelaxedPhylipWriter(PhylipWriter):
267 """ 268 Relaxed Phylip format writer 269 """ 270
271 - def write_alignment(self, alignment):
272 """ 273 Write a relaxed phylip alignment 274 """ 275 # Check inputs 276 for name in (s.id.strip() for s in alignment): 277 if any(c in name for c in string.whitespace): 278 raise ValueError("Whitespace not allowed in identifier: %s" 279 % name) 280 281 # Calculate a truncation length - maximum length of sequence ID plus a 282 # single character for padding 283 # If no sequences, set id_width to 1. super(...) call will raise a 284 # ValueError 285 if len(alignment) == 0: 286 id_width = 1 287 else: 288 id_width = max((len(s.id.strip()) for s in alignment)) + 1 289 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
290 291
292 -class RelaxedPhylipIterator(PhylipIterator):
293 """ 294 Relaxed Phylip format Iterator 295 """ 296
297 - def _split_id(self, line):
298 """Returns the ID, sequence data from a line 299 Extracts the sequence ID from a Phylip line, returning a tuple 300 containing: 301 302 (sequence_id, sequence_residues) 303 304 For relaxed format - split at the first whitespace character 305 """ 306 seq_id, sequence = line.split(None, 1) 307 sequence = sequence.strip().replace(" ", "") 308 return seq_id, sequence
309 310
311 -class SequentialPhylipWriter(SequentialAlignmentWriter):
312 """ 313 Sequential Phylip format Writer 314 """
315 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
316 handle = self.handle 317 318 if len(alignment) == 0: 319 raise ValueError("Must have at least one sequence") 320 length_of_seqs = alignment.get_alignment_length() 321 for record in alignment: 322 if length_of_seqs != len(record.seq): 323 raise ValueError("Sequences must all be the same length") 324 if length_of_seqs <= 0: 325 raise ValueError("Non-empty sequences are required") 326 327 # Check for repeated identifiers... 328 # Apply this test *after* cleaning the identifiers 329 names = [] 330 for record in alignment: 331 name = record.id.strip() 332 #Either remove the banned characters, or map them to something 333 #else like an underscore "_" or pipe "|" character... 334 for char in "[](),": 335 name = name.replace(char, "") 336 for char in ":;": 337 name = name.replace(char, "|") 338 name = name[:id_width] 339 if name in names: 340 raise ValueError("Repeated name %r (originally %r), " 341 "possibly due to truncation" 342 % (name, record.id)) 343 names.append(name) 344 345 # From experimentation, the use of tabs is not understood by the 346 # EMBOSS suite. The nature of the expected white space is not 347 # defined in the PHYLIP documentation, simply "These are in free 348 # format, separated by blanks". We'll use spaces to keep EMBOSS 349 # happy. 350 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 351 for name, record in zip(names, alignment): 352 sequence = str(record.seq) 353 if "." in sequence: 354 raise ValueError("PHYLIP format no longer allows dots in " 355 "sequence") 356 handle.write(name[:id_width].ljust(id_width)) 357 # Write the entire sequence to one line (see sequential format 358 # notes in the SequentialPhylipIterator docstring 359 handle.write(sequence) 360 handle.write("\n")
361 362
363 -class SequentialPhylipIterator(PhylipIterator):
364 """ 365 Sequential Phylip format Iterator 366 367 The sequential format carries the same restrictions as the normal 368 interleaved one, with the difference being that the sequences are listed 369 sequentially, each sequence written in its entirety before the start of 370 the next. According to the PHYLIP documentation for input file formatting, 371 newlines and spaces may optionally be entered at any point in the sequences. 372 """
373 - def next(self):
374 handle = self.handle 375 376 try: 377 #Header we saved from when we were parsing 378 #the previous alignment. 379 line = self._header 380 del self._header 381 except AttributeError: 382 line = handle.readline() 383 384 if not line: 385 raise StopIteration 386 line = line.strip() 387 parts = filter(None, line.split()) 388 if len(parts) != 2: 389 raise ValueError("First line should have two integers") 390 try: 391 number_of_seqs = int(parts[0]) 392 length_of_seqs = int(parts[1]) 393 except ValueError: 394 raise ValueError("First line should have two integers") 395 396 assert self._is_header(line) 397 398 if self.records_per_alignment is not None \ 399 and self.records_per_alignment != number_of_seqs: 400 raise ValueError("Found %i records in this alignment, told to expect %i" 401 % (number_of_seqs, self.records_per_alignment)) 402 403 ids = [] 404 seqs = [] 405 406 # By default, expects STRICT truncation / padding to 10 characters. 407 # Does not require any whitespace between name and seq. 408 for i in xrange(number_of_seqs): 409 line = handle.readline().rstrip() 410 sequence_id, s = self._split_id(line) 411 ids.append(sequence_id) 412 while len(s) < length_of_seqs: 413 # The sequence may be split into multiple lines 414 line = handle.readline().strip() 415 if not line: 416 break 417 if line == "": 418 continue 419 s = "".join([s, line.strip().replace(" ", "")]) 420 if len(s) > length_of_seqs: 421 raise ValueError("Found a record of length %i, should be %i" 422 % (len(s), length_of_seqs)) 423 if "." in s: 424 raise ValueError("PHYLIP format no longer allows dots in sequence") 425 seqs.append(s) 426 while True: 427 # Find other alignments in the file 428 line = handle.readline() 429 if not line: 430 break 431 if self._is_header(line): 432 self._header = line 433 break 434 435 records = (SeqRecord(Seq(s, self.alphabet), 436 id=i, name=i, description=i) 437 for (i, s) in zip(ids, seqs)) 438 return MultipleSeqAlignment(records, self.alphabet)
439 440 441 if __name__ == "__main__": 442 print "Running short mini-test" 443 444 phylip_text = """ 8 286 445 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 446 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 447 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 448 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 449 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 450 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 451 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 452 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 453 454 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 455 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 456 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 457 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 458 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 459 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 460 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 461 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 462 463 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 464 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 465 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 466 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 467 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 468 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 469 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 470 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 471 472 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 473 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 474 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 475 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 476 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 477 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 478 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 479 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 480 481 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 482 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 483 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 484 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 485 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 486 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 487 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 488 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 489 490 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 491 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 492 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 493 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 494 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 495 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 496 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 497 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 498 """ 499 500 from cStringIO import StringIO 501 handle = StringIO(phylip_text) 502 count = 0 503 for alignment in PhylipIterator(handle): 504 for record in alignment: 505 count = count+1 506 print record.id 507 #print str(record.seq) 508 assert count == 8 509 510 expected = """mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 511 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 512 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 513 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 514 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ", "").replace("\n", "").upper() 515 assert str(record.seq).replace("-", "") == expected 516 517 #From here: 518 #http://atgc.lirmm.fr/phyml/usersguide.html 519 phylip_text2 = """5 60 520 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 521 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 522 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 523 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 524 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 525 526 GAAATGGTCAATATTACAAGGT 527 GAAATGGTCAACATTAAAAGAT 528 GAAATCGTCAATATTAAAAGGT 529 GAAATGGTCAATCTTAAAAGGT 530 GAAATGGTCAATATTAAAAGGT""" 531 532 phylip_text3 = """5 60 533 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 534 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 535 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 536 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 537 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 538 539 handle = StringIO(phylip_text2) 540 list2 = list(PhylipIterator(handle)) 541 handle.close() 542 assert len(list2) == 1 543 assert len(list2[0]) == 5 544 545 handle = StringIO(phylip_text3) 546 list3 = list(PhylipIterator(handle)) 547 handle.close() 548 assert len(list3) == 1 549 assert len(list3[0]) == 5 550 551 for i in range(0, 5): 552 list2[0][i].id == list3[0][i].id 553 str(list2[0][i].seq) == str(list3[0][i].seq) 554 555 #From here: 556 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 557 #Note the lack of any white space between names 2 and 3 and their seqs. 558 phylip_text4 = """ 5 42 559 Turkey AAGCTNGGGC ATTTCAGGGT 560 Salmo gairAAGCCTTGGC AGTGCAGGGT 561 H. SapiensACCGGTTGGC CGTTCAGGGT 562 Chimp AAACCCTTGC CGTTACGCTT 563 Gorilla AAACCCTTGC CGGTACGCTT 564 565 GAGCCCGGGC AATACAGGGT AT 566 GAGCCGTGGC CGGGCACGGT AT 567 ACAGGTTGGC CGTTCAGGGT AA 568 AAACCGAGGC CGGGACACTC AT 569 AAACCATTGC CGGTACGCTT AA""" 570 571 #From here: 572 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 573 phylip_text5 = """ 5 42 574 Turkey AAGCTNGGGC ATTTCAGGGT 575 GAGCCCGGGC AATACAGGGT AT 576 Salmo gairAAGCCTTGGC AGTGCAGGGT 577 GAGCCGTGGC CGGGCACGGT AT 578 H. SapiensACCGGTTGGC CGTTCAGGGT 579 ACAGGTTGGC CGTTCAGGGT AA 580 Chimp AAACCCTTGC CGTTACGCTT 581 AAACCGAGGC CGGGACACTC AT 582 Gorilla AAACCCTTGC CGGTACGCTT 583 AAACCATTGC CGGTACGCTT AA""" 584 585 phylip_text5a = """ 5 42 586 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 587 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 588 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 589 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 590 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 591 592 handle = StringIO(phylip_text4) 593 list4 = list(PhylipIterator(handle)) 594 handle.close() 595 assert len(list4) == 1 596 assert len(list4[0]) == 5 597 598 handle = StringIO(phylip_text5) 599 try: 600 list5 = list(PhylipIterator(handle)) 601 assert len(list5) == 1 602 assert len(list5[0]) == 5 603 print "That should have failed..." 604 except ValueError: 605 print "Evil multiline non-interlaced example failed as expected" 606 handle.close() 607 608 handle = StringIO(phylip_text5a) 609 list5 = list(PhylipIterator(handle)) 610 handle.close() 611 assert len(list5) == 1 612 assert len(list4[0]) == 5 613 614 print "Concatenation" 615 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 616 assert len(list(PhylipIterator(handle))) == 2 617 618 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 619 assert len(list(PhylipIterator(handle))) == 3 620 621 print "OK" 622 623 print "Checking write/read" 624 handle = StringIO() 625 PhylipWriter(handle).write_file(list5) 626 handle.seek(0) 627 list6 = list(PhylipIterator(handle)) 628 assert len(list5) == len(list6) 629 for a1, a2 in zip(list5, list6): 630 assert len(a1) == len(a2) 631 for r1, r2 in zip(a1, a2): 632 assert r1.id == r2.id 633 assert str(r1.seq) == str(r2.seq) 634 print "Done" 635