Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # 
  4  # This file is part of the Biopython distribution and governed by your 
  5  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  6  # Please see the LICENSE file that should have been included as part of this 
  7  # package. 
  8  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  9   
 10  You are expected to use this module via the Bio.AlignIO functions (or the 
 11  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 12   
 13  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 14  from standard phylip format in the following ways: 
 15   
 16   - No whitespace is allowed in the sequence ID. 
 17   - No truncation is performed. Instead, sequence IDs are padded to the longest 
 18     ID length, rather than 10 characters. A space separates the sequence 
 19     identifier from the sequence. 
 20   
 21  Relaxed phylip is supported by RAxML and PHYML. 
 22   
 23  Note 
 24  ==== 
 25   
 26  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 27  a dot/period (".") in a sequence is interpreted as meaning the same 
 28  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 29  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 30   
 31  "a period was also previously allowed but it is no longer allowed, 
 32  because it sometimes is used in different senses in other programs" 
 33   
 34  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 35  for reading and writing. Older versions did nothing special with a dot/period. 
 36  """ 
 37  from __future__ import print_function 
 38   
 39  import string 
 40   
 41  from Bio._py3k import range 
 42   
 43  from Bio.Seq import Seq 
 44  from Bio.SeqRecord import SeqRecord 
 45  from Bio.Align import MultipleSeqAlignment 
 46  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 47   
 48   
 49  _PHYLIP_ID_WIDTH = 10 
 50  _NO_DOTS = "PHYLIP format no longer allows dots in sequence" 
 51   
 52   
53 -class PhylipWriter(SequentialAlignmentWriter):
54 """Phylip alignment writer.""" 55
56 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
57 """Use this to write (another) single alignment to an open file. 58 59 This code will write interlaced alignments (when the sequences are 60 longer than 50 characters). 61 62 Note that record identifiers are strictly truncated to id_width, 63 defaulting to the value required to comply with the PHYLIP standard. 64 65 For more information on the file format, please see: 66 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 67 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 68 """ 69 handle = self.handle 70 71 if len(alignment) == 0: 72 raise ValueError("Must have at least one sequence") 73 length_of_seqs = alignment.get_alignment_length() 74 for record in alignment: 75 if length_of_seqs != len(record.seq): 76 raise ValueError("Sequences must all be the same length") 77 if length_of_seqs <= 0: 78 raise ValueError("Non-empty sequences are required") 79 80 # Check for repeated identifiers... 81 # Apply this test *after* cleaning the identifiers 82 names = [] 83 seqs = [] 84 for record in alignment: 85 """ 86 Quoting the PHYLIP version 3.6 documentation: 87 88 The name should be ten characters in length, filled out to 89 the full ten characters by blanks if shorter. Any printable 90 ASCII/ISO character is allowed in the name, except for 91 parentheses ("(" and ")"), square brackets ("[" and "]"), 92 colon (":"), semicolon (";") and comma (","). If you forget 93 to extend the names to ten characters in length by blanks, 94 the program [i.e. PHYLIP] will get out of synchronization 95 with the contents of the data file, and an error message will 96 result. 97 98 Note that Tab characters count as only one character in the 99 species names. Their inclusion can cause trouble. 100 """ 101 name = record.id.strip() 102 # Either remove the banned characters, or map them to something 103 # else like an underscore "_" or pipe "|" character... 104 for char in "[](),": 105 name = name.replace(char, "") 106 for char in ":;": 107 name = name.replace(char, "|") 108 name = name[:id_width] 109 if name in names: 110 raise ValueError("Repeated name %r (originally %r), " 111 "possibly due to truncation" 112 % (name, record.id)) 113 names.append(name) 114 sequence = str(record.seq) 115 if "." in sequence: 116 # Do this check here (once per record, not once per block) 117 raise ValueError(_NO_DOTS) 118 seqs.append(sequence) 119 120 # From experimentation, the use of tabs is not understood by the 121 # EMBOSS suite. The nature of the expected white space is not 122 # defined in the PHYLIP documentation, simply "These are in free 123 # format, separated by blanks". We'll use spaces to keep EMBOSS 124 # happy. 125 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 126 block = 0 127 while True: 128 for name, sequence in zip(names, seqs): 129 if block == 0: 130 # Write name (truncated/padded to id_width characters) 131 # Now truncate and right pad to expected length. 132 handle.write(name[:id_width].ljust(id_width)) 133 else: 134 # write indent 135 handle.write(" " * id_width) 136 # Write five chunks of ten letters per line... 137 for chunk in range(0, 5): 138 i = block * 50 + chunk * 10 139 seq_segment = sequence[i:i + 10] 140 # TODO - Force any gaps to be '-' character? Look at the 141 # alphabet... 142 # TODO - How to cope with '?' or '.' in the sequence? 143 handle.write(" %s" % seq_segment) 144 if i + 10 > length_of_seqs: 145 break 146 handle.write("\n") 147 block += 1 148 if block * 50 > length_of_seqs: 149 break 150 handle.write("\n")
151 152
153 -class PhylipIterator(AlignmentIterator):
154 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 155 156 Record identifiers are limited to at most 10 characters. 157 158 It only copes with interlaced phylip files! Sequential files won't work 159 where the sequences are split over multiple lines. 160 161 For more information on the file format, please see: 162 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 163 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 164 """ 165 166 # Default truncation length 167 id_width = _PHYLIP_ID_WIDTH 168 169 _header = None # for caching lines between __next__ calls 170
171 - def _is_header(self, line):
172 line = line.strip() 173 parts = [x for x in line.split() if x] 174 if len(parts) != 2: 175 return False # First line should have two integers 176 try: 177 number_of_seqs = int(parts[0]) 178 length_of_seqs = int(parts[1]) 179 return True 180 except ValueError: 181 return False # First line should have two integers
182
183 - def _split_id(self, line):
184 """Extract the sequence ID from a Phylip line (PRIVATE). 185 186 Returning a tuple containing: (sequence_id, sequence_residues) 187 188 The first 10 characters in the line are are the sequence id, the 189 remainder are sequence data. 190 """ 191 seq_id = line[:self.id_width].strip() 192 seq = line[self.id_width:].strip().replace(' ', '') 193 return seq_id, seq
194
195 - def __next__(self):
196 """Parse the next alignment from the handle.""" 197 handle = self.handle 198 199 if self._header is None: 200 line = handle.readline() 201 else: 202 # Header we saved from when we were parsing 203 # the previous alignment. 204 line = self._header 205 self._header = None 206 207 if not line: 208 raise StopIteration 209 line = line.strip() 210 parts = [x for x in line.split() if x] 211 if len(parts) != 2: 212 raise ValueError("First line should have two integers") 213 try: 214 number_of_seqs = int(parts[0]) 215 length_of_seqs = int(parts[1]) 216 except ValueError: 217 raise ValueError("First line should have two integers") 218 219 assert self._is_header(line) 220 221 if self.records_per_alignment is not None and \ 222 self.records_per_alignment != number_of_seqs: 223 raise ValueError("Found %i records in this alignment, " 224 "told to expect %i" 225 % (number_of_seqs, self.records_per_alignment)) 226 227 ids = [] 228 seqs = [] 229 230 # By default, expects STRICT truncation / padding to 10 characters. 231 # Does not require any whitespace between name and seq. 232 for i in range(number_of_seqs): 233 line = handle.readline().rstrip() 234 sequence_id, s = self._split_id(line) 235 ids.append(sequence_id) 236 if "." in s: 237 raise ValueError(_NO_DOTS) 238 seqs.append([s]) 239 240 # Look for further blocks 241 line = "" 242 while True: 243 # Skip any blank lines between blocks... 244 while "" == line.strip(): 245 line = handle.readline() 246 if not line: 247 break # end of file 248 if not line: 249 break # end of file 250 251 if self._is_header(line): 252 # Looks like the start of a concatenated alignment 253 self._header = line 254 break 255 256 # print "New block..." 257 for i in range(number_of_seqs): 258 s = line.strip().replace(" ", "") 259 if "." in s: 260 raise ValueError(_NO_DOTS) 261 seqs[i].append(s) 262 line = handle.readline() 263 if (not line) and i + 1 < number_of_seqs: 264 raise ValueError("End of file mid-block") 265 if not line: 266 break # end of file 267 268 records = (SeqRecord(Seq("".join(s), self.alphabet), 269 id=i, name=i, description=i) 270 for (i, s) in zip(ids, seqs)) 271 return MultipleSeqAlignment(records, self.alphabet)
272 273 274 # Relaxed Phylip
275 -class RelaxedPhylipWriter(PhylipWriter):
276 """Relaxed Phylip format writer.""" 277
278 - def write_alignment(self, alignment):
279 """Write a relaxed phylip alignment.""" 280 # Check inputs 281 for name in (s.id.strip() for s in alignment): 282 if any(c in name for c in string.whitespace): 283 raise ValueError("Whitespace not allowed in identifier: %s" 284 % name) 285 286 # Calculate a truncation length - maximum length of sequence ID plus a 287 # single character for padding 288 # If no sequences, set id_width to 1. super(...) call will raise a 289 # ValueError 290 if len(alignment) == 0: 291 id_width = 1 292 else: 293 id_width = max((len(s.id.strip()) for s in alignment)) + 1 294 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
295 296
297 -class RelaxedPhylipIterator(PhylipIterator):
298 """Relaxed Phylip format Iterator.""" 299
300 - def _split_id(self, line):
301 """Extract the sequence ID from a Phylip line (PRIVATE). 302 303 Returns a tuple containing: (sequence_id, sequence_residues) 304 305 For relaxed format split at the first whitespace character. 306 """ 307 seq_id, sequence = line.split(None, 1) 308 sequence = sequence.strip().replace(" ", "") 309 return seq_id, sequence
310 311
312 -class SequentialPhylipWriter(SequentialAlignmentWriter):
313 """Sequential Phylip format Writer.""" 314
315 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
316 """Write a Phylip alignment to the handle.""" 317 handle = self.handle 318 319 if len(alignment) == 0: 320 raise ValueError("Must have at least one sequence") 321 length_of_seqs = alignment.get_alignment_length() 322 for record in alignment: 323 if length_of_seqs != len(record.seq): 324 raise ValueError("Sequences must all be the same length") 325 if length_of_seqs <= 0: 326 raise ValueError("Non-empty sequences are required") 327 328 # Check for repeated identifiers... 329 # Apply this test *after* cleaning the identifiers 330 names = [] 331 for record in alignment: 332 name = record.id.strip() 333 # Either remove the banned characters, or map them to something 334 # else like an underscore "_" or pipe "|" character... 335 for char in "[](),": 336 name = name.replace(char, "") 337 for char in ":;": 338 name = name.replace(char, "|") 339 name = name[:id_width] 340 if name in names: 341 raise ValueError("Repeated name %r (originally %r), " 342 "possibly due to truncation" 343 % (name, record.id)) 344 names.append(name) 345 346 # From experimentation, the use of tabs is not understood by the 347 # EMBOSS suite. The nature of the expected white space is not 348 # defined in the PHYLIP documentation, simply "These are in free 349 # format, separated by blanks". We'll use spaces to keep EMBOSS 350 # happy. 351 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 352 for name, record in zip(names, alignment): 353 sequence = str(record.seq) 354 if "." in sequence: 355 raise ValueError(_NO_DOTS) 356 handle.write(name[:id_width].ljust(id_width)) 357 # Write the entire sequence to one line (see sequential format 358 # notes in the SequentialPhylipIterator docstring 359 handle.write(sequence) 360 handle.write("\n")
361 362
363 -class SequentialPhylipIterator(PhylipIterator):
364 """Sequential Phylip format Iterator. 365 366 The sequential format carries the same restrictions as the normal 367 interleaved one, with the difference being that the sequences are listed 368 sequentially, each sequence written in its entirety before the start of 369 the next. According to the PHYLIP documentation for input file 370 formatting, newlines and spaces may optionally be entered at any point 371 in the sequences. 372 """ 373 374 _header = None # for caching lines between __next__ calls 375
376 - def __next__(self):
377 handle = self.handle 378 379 if self._header is None: 380 line = handle.readline() 381 else: 382 # Header we saved from when we were parsing 383 # the previous alignment. 384 line = self._header 385 self._header = None 386 387 if not line: 388 raise StopIteration 389 line = line.strip() 390 parts = [x for x in line.split() if x] 391 if len(parts) != 2: 392 raise ValueError("First line should have two integers") 393 try: 394 number_of_seqs = int(parts[0]) 395 length_of_seqs = int(parts[1]) 396 except ValueError: 397 raise ValueError("First line should have two integers") 398 399 assert self._is_header(line) 400 401 if self.records_per_alignment is not None and \ 402 self.records_per_alignment != number_of_seqs: 403 raise ValueError("Found %i records in this alignment, " 404 "told to expect %i" 405 % (number_of_seqs, self.records_per_alignment)) 406 407 ids = [] 408 seqs = [] 409 410 # By default, expects STRICT truncation / padding to 10 characters. 411 # Does not require any whitespace between name and seq. 412 for i in range(number_of_seqs): 413 line = handle.readline().rstrip() 414 sequence_id, s = self._split_id(line) 415 ids.append(sequence_id) 416 while len(s) < length_of_seqs: 417 # The sequence may be split into multiple lines 418 line = handle.readline().strip() 419 if not line: 420 break 421 if line == "": 422 continue 423 s = "".join([s, line.strip().replace(" ", "")]) 424 if len(s) > length_of_seqs: 425 raise ValueError("Found a record of length %i, " 426 "should be %i" 427 % (len(s), length_of_seqs)) 428 if "." in s: 429 raise ValueError(_NO_DOTS) 430 seqs.append(s) 431 while True: 432 # Find other alignments in the file 433 line = handle.readline() 434 if not line: 435 break 436 if self._is_header(line): 437 self._header = line 438 break 439 440 records = (SeqRecord(Seq(s, self.alphabet), 441 id=i, name=i, description=i) 442 for (i, s) in zip(ids, seqs)) 443 return MultipleSeqAlignment(records, self.alphabet)
444