Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 12  from standard phylip format in the following ways: 
 13   
 14   - No whitespace is allowed in the sequence ID. 
 15   - No truncation is performed. Instead, sequence IDs are padded to the longest 
 16     ID length, rather than 10 characters. A space separates the sequence 
 17     identifier from the sequence. 
 18   
 19  Relaxed phylip is supported by RAxML and PHYML. 
 20   
 21  Note 
 22  ==== 
 23   
 24  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 25  a dot/period (".") in a sequence is interpreted as meaning the same 
 26  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 27  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 28   
 29  "a period was also previously allowed but it is no longer allowed, 
 30  because it sometimes is used in different senses in other programs" 
 31   
 32  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 33  for reading and writing. Older versions did nothing special with a dot/period. 
 34  """ 
 35  from __future__ import print_function 
 36   
 37  import string 
 38   
 39  from Bio._py3k import range 
 40   
 41  from Bio.Seq import Seq 
 42  from Bio.SeqRecord import SeqRecord 
 43  from Bio.Align import MultipleSeqAlignment 
 44  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 45   
 46  __docformat__ = "restructuredtext en" 
 47   
 48  _PHYLIP_ID_WIDTH = 10 
 49   
 50   
class PhylipWriter(SequentialAlignmentWriter):
    """Phylip alignment writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Use this to write (another) single alignment to an open file.

        This code will write interlaced alignments (when the sequences are
        longer than 50 characters).

        Note that record identifiers are strictly truncated to id_width,
        defaulting to the value required to comply with the PHYLIP standard.

        Raises ValueError if the alignment has no records, if the records
        differ in length or are empty, if two cleaned/truncated identifiers
        collide, or if any sequence contains a dot. All of these checks run
        before anything is written to the handle.

        For more information on the file format, please see:
        http://evolution.genetics.washington.edu/phylip/doc/sequence.html
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seen_names = set()  # same content as names, for O(1) lookups
        seqs = []
        for record in alignment:
            # Quoting the PHYLIP version 3.6 documentation:
            #
            # The name should be ten characters in length, filled out to
            # the full ten characters by blanks if shorter. Any printable
            # ASCII/ISO character is allowed in the name, except for
            # parentheses ("(" and ")"), square brackets ("[" and "]"),
            # colon (":"), semicolon (";") and comma (","). If you forget
            # to extend the names to ten characters in length by blanks,
            # the program [i.e. PHYLIP] will get out of synchronization
            # with the contents of the data file, and an error message will
            # result.
            #
            # Note that Tab characters count as only one character in the
            # species names. Their inclusion can cause trouble.
            name = record.id.strip()
            # Either remove the banned characters, or map them to something
            # else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)
            sequence = str(record.seq)
            if "." in sequence:
                # Do this check here (once per record, not once per block)
                raise ValueError("PHYLIP format no longer allows dots in "
                                 "sequence")
            seqs.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))

        # One block per 50 columns. Stepping a range over the alignment
        # length yields exactly ceil(length / 50) blocks, so a length that is
        # an exact multiple of 50 no longer gets a trailing block made only
        # of indentation.
        for block_start in range(0, length_of_seqs, 50):
            if block_start:
                # Blank line separating this block from the previous one
                handle.write("\n")
            for name, sequence in zip(names, seqs):
                if block_start == 0:
                    # Write name (truncated/padded to id_width characters)
                    handle.write(name[:id_width].ljust(id_width))
                else:
                    # write indent
                    handle.write(" " * id_width)
                # Write five chunks of ten letters per line...
                for chunk_start in range(block_start, block_start + 50, 10):
                    # TODO - Force any gaps to be '-' character? Look at the
                    # alphabet...
                    # TODO - How to cope with '?' or '.' in the sequence?
                    handle.write(" %s"
                                 % sequence[chunk_start:chunk_start + 10])
                    # Strict '>' is kept from the original: a sequence that
                    # ends exactly on a chunk boundary (other than the end
                    # of the line) is followed by one empty chunk, i.e. a
                    # trailing space, so per-line output is unchanged.
                    if chunk_start + 10 > length_of_seqs:
                        break
                handle.write("\n")
150 151
class PhylipIterator(AlignmentIterator):
    """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.

    Record identifiers are limited to at most 10 characters.

    It only copes with interlaced phylip files!  Sequential files won't work
    where the sequences are split over multiple lines.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    # Default truncation length
    id_width = _PHYLIP_ID_WIDTH

    def _is_header(self, line):
        """Return True if the line consists of exactly two integers (PRIVATE)."""
        # str.split() with no argument never yields empty strings and
        # ignores leading/trailing whitespace, so no extra filtering needed.
        parts = line.split()
        if len(parts) != 2:
            return False  # First line should have two integers
        try:
            int(parts[0])
            int(parts[1])
        except ValueError:
            return False  # First line should have two integers
        return True

    def _split_id(self, line):
        """Extracts the sequence ID from a Phylip line (PRIVATE).

        Returning a tuple containing: (sequence_id, sequence_residues)

        The first id_width characters in the line (10 by default) are the
        sequence id, the remainder are sequence data.
        """
        seq_id = line[:self.id_width].strip()
        seq = line[self.id_width:].strip().replace(' ', '')
        return seq_id, seq

    def __next__(self):
        """Parse the next alignment from the handle and return it.

        Raises StopIteration when the handle is exhausted, and ValueError if
        the header is not two integers, if the number of sequences differs
        from self.records_per_alignment (when that is not None), if a
        sequence contains a dot, or if the file ends part way through a
        block.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            # The second field must be an integer too, but its value is not
            # used by this (interlaced) parser.
            int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            line = handle.readline()
            if not line:
                # readline() returns "" only at end of file; without this
                # check a truncated file produced records with an empty id
                # and an empty sequence.
                raise ValueError("End of file mid-block")
            sequence_id, s = self._split_id(line.rstrip())
            ids.append(sequence_id)
            if "." in s:
                raise ValueError("PHYLIP format no longer allows dots in sequence")
            seqs.append([s])

        # Look for further blocks
        line = ""
        while True:
            # Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            if number_of_seqs < 1:
                # The loop below would not consume this line, so the outer
                # loop would spin forever on it.
                raise ValueError("Found sequence data, but the header "
                                 "declares no sequences")

            # New block: one line per sequence, in the same order as the
            # first block, with no identifiers.
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError("PHYLIP format no longer allows dots in sequence")
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(chunks), self.alphabet),
                             id=seq_id, name=seq_id, description=seq_id)
                   for (seq_id, chunks) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
267 268 269 # Relaxed Phylip
class RelaxedPhylipWriter(PhylipWriter):
    """Relaxed Phylip format writer."""

    def write_alignment(self, alignment):
        """Write a relaxed phylip alignment.

        Identifiers are not truncated to ten characters; instead the id
        column is made one character wider than the longest stripped
        identifier. Raises ValueError if any stripped identifier contains
        whitespace.
        """
        stripped_ids = [record.id.strip() for record in alignment]

        # Check inputs
        for name in stripped_ids:
            for blank in string.whitespace:
                if blank in name:
                    raise ValueError("Whitespace not allowed in identifier: %s"
                                     % name)

        # Column width is the longest identifier plus a single character of
        # padding. With no sequences fall back to a width of 1; the parent
        # class call below will then raise a ValueError.
        if stripped_ids:
            id_width = max(len(name) for name in stripped_ids) + 1
        else:
            id_width = 1
        super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
294 295
class RelaxedPhylipIterator(PhylipIterator):
    """Relaxed Phylip format Iterator."""

    def _split_id(self, line):
        """Split a relaxed Phylip line into its ID and residues (PRIVATE).

        The line is divided at the first run of whitespace. Everything before
        it is the sequence ID; everything after it, with spaces removed, is
        the sequence data. Returns the tuple (sequence_id, sequence_residues).
        """
        # Unpacking fails with ValueError if the line has fewer than two
        # whitespace-separated fields.
        identifier, residues = line.split(None, 1)
        return identifier, residues.strip().replace(" ", "")
309 310
class SequentialPhylipWriter(SequentialAlignmentWriter):
    """Sequential Phylip format Writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Write a single alignment in sequential Phylip format.

        Each record is written on one line: the cleaned identifier,
        truncated and padded to id_width characters, immediately followed by
        the whole sequence.

        Raises ValueError if the alignment has no records, if the records
        differ in length or are empty, if two cleaned/truncated identifiers
        collide, or if any sequence contains a dot. All of these checks run
        before anything is written to the handle, so invalid input does not
        leave partial output behind.
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seen_names = set()  # same content as names, for O(1) lookups
        seqs = []
        for record in alignment:
            name = record.id.strip()
            # Either remove the banned characters, or map them to something
            # else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)
            sequence = str(record.seq)
            if "." in sequence:
                raise ValueError("PHYLIP format no longer allows dots in "
                                 "sequence")
            seqs.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        for name, sequence in zip(names, seqs):
            handle.write(name[:id_width].ljust(id_width))
            # Write the entire sequence to one line (see sequential format
            # notes in the SequentialPhylipIterator docstring)
            handle.write(sequence)
            handle.write("\n")
360 361
class SequentialPhylipIterator(PhylipIterator):
    """Sequential Phylip format Iterator.

    The sequential format carries the same restrictions as the normal
    interleaved one, with the difference being that the sequences are listed
    sequentially, each sequence written in its entirety before the start of
    the next. According to the PHYLIP documentation for input file formatting,
    newlines and spaces may optionally be entered at any point in the sequences.
    """

    def __next__(self):
        """Parse the next alignment from the handle and return it.

        Raises StopIteration when the handle is exhausted, and ValueError if
        the header is not two integers, if the number of sequences differs
        from self.records_per_alignment (when that is not None), if a
        record grows longer than the length given in the header, or if a
        sequence contains a dot.
        """
        handle = self.handle

        try:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            del self._header
        except AttributeError:
            line = handle.readline()

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None \
                and self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for _ in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline()
                if not line:
                    # readline() returns "" only at end of file
                    break
                fragment = line.strip().replace(" ", "")
                if not fragment:
                    # A blank line inside a sequence is allowed; skip it.
                    # (Previously the line was stripped before the
                    # end-of-file test, so a blank line ended the record.)
                    continue
                s = "".join([s, fragment])
                if len(s) > length_of_seqs:
                    raise ValueError("Found a record of length %i, should be %i"
                                     % (len(s), length_of_seqs))
            if "." in s:
                raise ValueError("PHYLIP format no longer allows dots in sequence")
            seqs.append(s)

        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(residues, self.alphabet),
                             id=seq_id, name=seq_id, description=seq_id)
                   for (seq_id, residues) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
438