Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2013 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 12  from standard phylip format in the following ways: 
 13   
 14   - No whitespace is allowed in the sequence ID. 
 15   - No truncation is performed. Instead, sequence IDs are padded to the longest 
 16     ID length, rather than 10 characters. A space separates the sequence 
 17     identifier from the sequence. 
 18   
 19  Relaxed phylip is supported by RAxML and PHYML. 
 20   
 21  Note 
 22  ==== 
 23   
 24  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 25  a dot/period (".") in a sequence is interpreted as meaning the same 
 26  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 27  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 28   
 29  "a period was also previously allowed but it is no longer allowed, 
 30  because it sometimes is used in different senses in other programs" 
 31   
 32  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 33  for reading and writing. Older versions did nothing special with a dot/period. 
 34  """ 
 35  from __future__ import print_function 
 36   
 37  import string 
 38   
 39  from Bio._py3k import range 
 40   
 41  from Bio.Seq import Seq 
 42  from Bio.SeqRecord import SeqRecord 
 43  from Bio.Align import MultipleSeqAlignment 
 44  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 45   
 46   
 47  _PHYLIP_ID_WIDTH = 10 
 48   
 49   
50 -class PhylipWriter(SequentialAlignmentWriter):
51 """Phylip alignment writer.""" 52
53 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
54 """Use this to write (another) single alignment to an open file. 55 56 This code will write interlaced alignments (when the sequences are 57 longer than 50 characters). 58 59 Note that record identifiers are strictly truncated to id_width, 60 defaulting to the value required to comply with the PHYLIP standard. 61 62 For more information on the file format, please see: 63 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 64 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 65 """ 66 handle = self.handle 67 68 if len(alignment) == 0: 69 raise ValueError("Must have at least one sequence") 70 length_of_seqs = alignment.get_alignment_length() 71 for record in alignment: 72 if length_of_seqs != len(record.seq): 73 raise ValueError("Sequences must all be the same length") 74 if length_of_seqs <= 0: 75 raise ValueError("Non-empty sequences are required") 76 77 # Check for repeated identifiers... 78 # Apply this test *after* cleaning the identifiers 79 names = [] 80 seqs = [] 81 for record in alignment: 82 """ 83 Quoting the PHYLIP version 3.6 documentation: 84 85 The name should be ten characters in length, filled out to 86 the full ten characters by blanks if shorter. Any printable 87 ASCII/ISO character is allowed in the name, except for 88 parentheses ("(" and ")"), square brackets ("[" and "]"), 89 colon (":"), semicolon (";") and comma (","). If you forget 90 to extend the names to ten characters in length by blanks, 91 the program [i.e. PHYLIP] will get out of synchronization 92 with the contents of the data file, and an error message will 93 result. 94 95 Note that Tab characters count as only one character in the 96 species names. Their inclusion can cause trouble. 97 """ 98 name = record.id.strip() 99 # Either remove the banned characters, or map them to something 100 # else like an underscore "_" or pipe "|" character... 101 for char in "[](),": 102 name = name.replace(char, "") 103 for char in ":;": 104 name = name.replace(char, "|") 105 name = name[:id_width] 106 if name in names: 107 raise ValueError("Repeated name %r (originally %r), " 108 "possibly due to truncation" 109 % (name, record.id)) 110 names.append(name) 111 sequence = str(record.seq) 112 if "." in sequence: 113 # Do this check here (once per record, not once per block) 114 raise ValueError("PHYLIP format no longer allows dots in " 115 "sequence") 116 seqs.append(sequence) 117 118 # From experimentation, the use of tabs is not understood by the 119 # EMBOSS suite. The nature of the expected white space is not 120 # defined in the PHYLIP documentation, simply "These are in free 121 # format, separated by blanks". We'll use spaces to keep EMBOSS 122 # happy. 123 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 124 block = 0 125 while True: 126 for name, sequence in zip(names, seqs): 127 if block == 0: 128 # Write name (truncated/padded to id_width characters) 129 # Now truncate and right pad to expected length. 130 handle.write(name[:id_width].ljust(id_width)) 131 else: 132 # write indent 133 handle.write(" " * id_width) 134 # Write five chunks of ten letters per line... 135 for chunk in range(0, 5): 136 i = block * 50 + chunk * 10 137 seq_segment = sequence[i:i + 10] 138 # TODO - Force any gaps to be '-' character? Look at the 139 # alphabet... 140 # TODO - How to cope with '?' or '.' in the sequence? 141 handle.write(" %s" % seq_segment) 142 if i + 10 > length_of_seqs: 143 break 144 handle.write("\n") 145 block += 1 146 if block * 50 > length_of_seqs: 147 break 148 handle.write("\n")
149 150
151 -class PhylipIterator(AlignmentIterator):
152 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 153 154 Record identifiers are limited to at most 10 characters. 155 156 It only copes with interlaced phylip files! Sequential files won't work 157 where the sequences are split over multiple lines. 158 159 For more information on the file format, please see: 160 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 161 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 162 """ 163 164 # Default truncation length 165 id_width = _PHYLIP_ID_WIDTH 166 167 _header = None # for caching lines between __next__ calls 168
169 - def _is_header(self, line):
170 line = line.strip() 171 parts = [x for x in line.split() if x] 172 if len(parts) != 2: 173 return False # First line should have two integers 174 try: 175 number_of_seqs = int(parts[0]) 176 length_of_seqs = int(parts[1]) 177 return True 178 except ValueError: 179 return False # First line should have two integers
180
181 - def _split_id(self, line):
182 """Extracts the sequence ID from a Phylip line (PRIVATE). 183 184 Returning a tuple containing: (sequence_id, sequence_residues) 185 186 The first 10 characters in the line are are the sequence id, the 187 remainder are sequence data. 188 """ 189 seq_id = line[:self.id_width].strip() 190 seq = line[self.id_width:].strip().replace(' ', '') 191 return seq_id, seq
192
193 - def __next__(self):
194 handle = self.handle 195 196 if self._header is None: 197 line = handle.readline() 198 else: 199 # Header we saved from when we were parsing 200 # the previous alignment. 201 line = self._header 202 self._header = None 203 204 if not line: 205 raise StopIteration 206 line = line.strip() 207 parts = [x for x in line.split() if x] 208 if len(parts) != 2: 209 raise ValueError("First line should have two integers") 210 try: 211 number_of_seqs = int(parts[0]) 212 length_of_seqs = int(parts[1]) 213 except ValueError: 214 raise ValueError("First line should have two integers") 215 216 assert self._is_header(line) 217 218 if self.records_per_alignment is not None \ 219 and self.records_per_alignment != number_of_seqs: 220 raise ValueError("Found %i records in this alignment, told to expect %i" 221 % (number_of_seqs, self.records_per_alignment)) 222 223 ids = [] 224 seqs = [] 225 226 # By default, expects STRICT truncation / padding to 10 characters. 227 # Does not require any whitespace between name and seq. 228 for i in range(number_of_seqs): 229 line = handle.readline().rstrip() 230 sequence_id, s = self._split_id(line) 231 ids.append(sequence_id) 232 if "." in s: 233 raise ValueError("PHYLIP format no longer allows dots in sequence") 234 seqs.append([s]) 235 236 # Look for further blocks 237 line = "" 238 while True: 239 # Skip any blank lines between blocks... 240 while "" == line.strip(): 241 line = handle.readline() 242 if not line: 243 break # end of file 244 if not line: 245 break # end of file 246 247 if self._is_header(line): 248 # Looks like the start of a concatenated alignment 249 self._header = line 250 break 251 252 # print "New block..." 253 for i in range(number_of_seqs): 254 s = line.strip().replace(" ", "") 255 if "." in s: 256 raise ValueError("PHYLIP format no longer allows dots in sequence") 257 seqs[i].append(s) 258 line = handle.readline() 259 if (not line) and i + 1 < number_of_seqs: 260 raise ValueError("End of file mid-block") 261 if not line: 262 break # end of file 263 264 records = (SeqRecord(Seq("".join(s), self.alphabet), 265 id=i, name=i, description=i) 266 for (i, s) in zip(ids, seqs)) 267 return MultipleSeqAlignment(records, self.alphabet)
268 269 270 # Relaxed Phylip
271 -class RelaxedPhylipWriter(PhylipWriter):
272 """ 273 Relaxed Phylip format writer 274 """ 275
276 - def write_alignment(self, alignment):
277 """ 278 Write a relaxed phylip alignment 279 """ 280 # Check inputs 281 for name in (s.id.strip() for s in alignment): 282 if any(c in name for c in string.whitespace): 283 raise ValueError("Whitespace not allowed in identifier: %s" 284 % name) 285 286 # Calculate a truncation length - maximum length of sequence ID plus a 287 # single character for padding 288 # If no sequences, set id_width to 1. super(...) call will raise a 289 # ValueError 290 if len(alignment) == 0: 291 id_width = 1 292 else: 293 id_width = max((len(s.id.strip()) for s in alignment)) + 1 294 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
295 296
297 -class RelaxedPhylipIterator(PhylipIterator):
298 """Relaxed Phylip format Iterator.""" 299
300 - def _split_id(self, line):
301 """Extracts the sequence ID from a Phylip line (PRIVATE). 302 303 Returns a tuple containing: (sequence_id, sequence_residues) 304 305 For relaxed format split at the first whitespace character. 306 """ 307 seq_id, sequence = line.split(None, 1) 308 sequence = sequence.strip().replace(" ", "") 309 return seq_id, sequence
310 311
312 -class SequentialPhylipWriter(SequentialAlignmentWriter):
313 """Sequential Phylip format Writer.""" 314
315 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
316 handle = self.handle 317 318 if len(alignment) == 0: 319 raise ValueError("Must have at least one sequence") 320 length_of_seqs = alignment.get_alignment_length() 321 for record in alignment: 322 if length_of_seqs != len(record.seq): 323 raise ValueError("Sequences must all be the same length") 324 if length_of_seqs <= 0: 325 raise ValueError("Non-empty sequences are required") 326 327 # Check for repeated identifiers... 328 # Apply this test *after* cleaning the identifiers 329 names = [] 330 for record in alignment: 331 name = record.id.strip() 332 # Either remove the banned characters, or map them to something 333 # else like an underscore "_" or pipe "|" character... 334 for char in "[](),": 335 name = name.replace(char, "") 336 for char in ":;": 337 name = name.replace(char, "|") 338 name = name[:id_width] 339 if name in names: 340 raise ValueError("Repeated name %r (originally %r), " 341 "possibly due to truncation" 342 % (name, record.id)) 343 names.append(name) 344 345 # From experimentation, the use of tabs is not understood by the 346 # EMBOSS suite. The nature of the expected white space is not 347 # defined in the PHYLIP documentation, simply "These are in free 348 # format, separated by blanks". We'll use spaces to keep EMBOSS 349 # happy. 350 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 351 for name, record in zip(names, alignment): 352 sequence = str(record.seq) 353 if "." in sequence: 354 raise ValueError("PHYLIP format no longer allows dots in " 355 "sequence") 356 handle.write(name[:id_width].ljust(id_width)) 357 # Write the entire sequence to one line (see sequential format 358 # notes in the SequentialPhylipIterator docstring 359 handle.write(sequence) 360 handle.write("\n")
361 362
363 -class SequentialPhylipIterator(PhylipIterator):
364 """ 365 Sequential Phylip format Iterator 366 367 The sequential format carries the same restrictions as the normal 368 interleaved one, with the difference being that the sequences are listed 369 sequentially, each sequence written in its entirety before the start of 370 the next. According to the PHYLIP documentation for input file formatting, 371 newlines and spaces may optionally be entered at any point in the sequences. 372 """ 373 374 _header = None # for caching lines between __next__ calls 375
376 - def __next__(self):
377 handle = self.handle 378 379 if self._header is None: 380 line = handle.readline() 381 else: 382 # Header we saved from when we were parsing 383 # the previous alignment. 384 line = self._header 385 self._header = None 386 387 if not line: 388 raise StopIteration 389 line = line.strip() 390 parts = [x for x in line.split() if x] 391 if len(parts) != 2: 392 raise ValueError("First line should have two integers") 393 try: 394 number_of_seqs = int(parts[0]) 395 length_of_seqs = int(parts[1]) 396 except ValueError: 397 raise ValueError("First line should have two integers") 398 399 assert self._is_header(line) 400 401 if self.records_per_alignment is not None \ 402 and self.records_per_alignment != number_of_seqs: 403 raise ValueError("Found %i records in this alignment, told to expect %i" 404 % (number_of_seqs, self.records_per_alignment)) 405 406 ids = [] 407 seqs = [] 408 409 # By default, expects STRICT truncation / padding to 10 characters. 410 # Does not require any whitespace between name and seq. 411 for i in range(number_of_seqs): 412 line = handle.readline().rstrip() 413 sequence_id, s = self._split_id(line) 414 ids.append(sequence_id) 415 while len(s) < length_of_seqs: 416 # The sequence may be split into multiple lines 417 line = handle.readline().strip() 418 if not line: 419 break 420 if line == "": 421 continue 422 s = "".join([s, line.strip().replace(" ", "")]) 423 if len(s) > length_of_seqs: 424 raise ValueError("Found a record of length %i, should be %i" 425 % (len(s), length_of_seqs)) 426 if "." in s: 427 raise ValueError("PHYLIP format no longer allows dots in sequence") 428 seqs.append(s) 429 while True: 430 # Find other alignments in the file 431 line = handle.readline() 432 if not line: 433 break 434 if self._is_header(line): 435 self._header = line 436 break 437 438 records = (SeqRecord(Seq(s, self.alphabet), 439 id=i, name=i, description=i) 440 for (i, s) in zip(ids, seqs)) 441 return MultipleSeqAlignment(records, self.alphabet)
442