Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # 
  4  # This file is part of the Biopython distribution and governed by your 
  5  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  6  # Please see the LICENSE file that should have been included as part of this 
  7  # package. 
  8  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  9   
 10  You are expected to use this module via the Bio.AlignIO functions (or the 
 11  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 12   
 13  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 14  from standard phylip format in the following ways: 
 15   
 16   - No whitespace is allowed in the sequence ID. 
 17   - No truncation is performed. Instead, sequence IDs are padded to the longest 
 18     ID length, rather than 10 characters. A space separates the sequence 
 19     identifier from the sequence. 
 20   
 21  Relaxed phylip is supported by RAxML and PHYML. 
 22   
 23  Note 
 24  ==== 
 25   
 26  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 27  a dot/period (".") in a sequence is interpreted as meaning the same 
 28  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 29  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 30   
 31  "a period was also previously allowed but it is no longer allowed, 
 32  because it sometimes is used in different senses in other programs" 
 33   
 34  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 35  for reading and writing. Older versions did nothing special with a dot/period. 
 36  """ 
 37  from __future__ import print_function 
 38   
 39  import string 
 40   
 41  from Bio._py3k import range 
 42   
 43  from Bio.Seq import Seq 
 44  from Bio.SeqRecord import SeqRecord 
 45  from Bio.Align import MultipleSeqAlignment 
 46  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 47   
 48   
 49  _PHYLIP_ID_WIDTH = 10 
 50  _NO_DOTS = "PHYLIP format no longer allows dots in sequence" 
 51   
 52   
53 -class PhylipWriter(SequentialAlignmentWriter):
54 """Phylip alignment writer.""" 55
56 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
57 """Use this to write (another) single alignment to an open file. 58 59 This code will write interlaced alignments (when the sequences are 60 longer than 50 characters). 61 62 Note that record identifiers are strictly truncated to id_width, 63 defaulting to the value required to comply with the PHYLIP standard. 64 65 For more information on the file format, please see: 66 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 67 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 68 """ 69 handle = self.handle 70 71 if len(alignment) == 0: 72 raise ValueError("Must have at least one sequence") 73 length_of_seqs = alignment.get_alignment_length() 74 for record in alignment: 75 if length_of_seqs != len(record.seq): 76 raise ValueError("Sequences must all be the same length") 77 if length_of_seqs <= 0: 78 raise ValueError("Non-empty sequences are required") 79 80 # Check for repeated identifiers... 81 # Apply this test *after* cleaning the identifiers 82 names = [] 83 seqs = [] 84 for record in alignment: 85 """ 86 Quoting the PHYLIP version 3.6 documentation: 87 88 The name should be ten characters in length, filled out to 89 the full ten characters by blanks if shorter. Any printable 90 ASCII/ISO character is allowed in the name, except for 91 parentheses ("(" and ")"), square brackets ("[" and "]"), 92 colon (":"), semicolon (";") and comma (","). If you forget 93 to extend the names to ten characters in length by blanks, 94 the program [i.e. PHYLIP] will get out of synchronization 95 with the contents of the data file, and an error message will 96 result. 97 98 Note that Tab characters count as only one character in the 99 species names. Their inclusion can cause trouble. 100 """ 101 name = record.id.strip() 102 # Either remove the banned characters, or map them to something 103 # else like an underscore "_" or pipe "|" character... 104 for char in "[](),": 105 name = name.replace(char, "") 106 for char in ":;": 107 name = name.replace(char, "|") 108 name = name[:id_width] 109 if name in names: 110 raise ValueError("Repeated name %r (originally %r), " 111 "possibly due to truncation" 112 % (name, record.id)) 113 names.append(name) 114 sequence = str(record.seq) 115 if "." in sequence: 116 # Do this check here (once per record, not once per block) 117 raise ValueError(_NO_DOTS) 118 seqs.append(sequence) 119 120 # From experimentation, the use of tabs is not understood by the 121 # EMBOSS suite. The nature of the expected white space is not 122 # defined in the PHYLIP documentation, simply "These are in free 123 # format, separated by blanks". We'll use spaces to keep EMBOSS 124 # happy. 125 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 126 block = 0 127 while True: 128 for name, sequence in zip(names, seqs): 129 if block == 0: 130 # Write name (truncated/padded to id_width characters) 131 # Now truncate and right pad to expected length. 132 handle.write(name[:id_width].ljust(id_width)) 133 else: 134 # write indent 135 handle.write(" " * id_width) 136 # Write five chunks of ten letters per line... 137 for chunk in range(0, 5): 138 i = block * 50 + chunk * 10 139 seq_segment = sequence[i:i + 10] 140 # TODO - Force any gaps to be '-' character? Look at the 141 # alphabet... 142 # TODO - How to cope with '?' or '.' in the sequence? 143 handle.write(" %s" % seq_segment) 144 if i + 10 > length_of_seqs: 145 break 146 handle.write("\n") 147 block += 1 148 if block * 50 > length_of_seqs: 149 break 150 handle.write("\n")
151 152
153 -class PhylipIterator(AlignmentIterator):
154 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 155 156 Record identifiers are limited to at most 10 characters. 157 158 It only copes with interlaced phylip files! Sequential files won't work 159 where the sequences are split over multiple lines. 160 161 For more information on the file format, please see: 162 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 163 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 164 """ 165 166 # Default truncation length 167 id_width = _PHYLIP_ID_WIDTH 168 169 _header = None # for caching lines between __next__ calls 170
171 - def _is_header(self, line):
172 line = line.strip() 173 parts = [x for x in line.split() if x] 174 if len(parts) != 2: 175 return False # First line should have two integers 176 try: 177 number_of_seqs = int(parts[0]) 178 length_of_seqs = int(parts[1]) 179 return True 180 except ValueError: 181 return False # First line should have two integers
182
183 - def _split_id(self, line):
184 """Extracts the sequence ID from a Phylip line (PRIVATE). 185 186 Returning a tuple containing: (sequence_id, sequence_residues) 187 188 The first 10 characters in the line are are the sequence id, the 189 remainder are sequence data. 190 """ 191 seq_id = line[:self.id_width].strip() 192 seq = line[self.id_width:].strip().replace(' ', '') 193 return seq_id, seq
194
195 - def __next__(self):
196 handle = self.handle 197 198 if self._header is None: 199 line = handle.readline() 200 else: 201 # Header we saved from when we were parsing 202 # the previous alignment. 203 line = self._header 204 self._header = None 205 206 if not line: 207 raise StopIteration 208 line = line.strip() 209 parts = [x for x in line.split() if x] 210 if len(parts) != 2: 211 raise ValueError("First line should have two integers") 212 try: 213 number_of_seqs = int(parts[0]) 214 length_of_seqs = int(parts[1]) 215 except ValueError: 216 raise ValueError("First line should have two integers") 217 218 assert self._is_header(line) 219 220 if self.records_per_alignment is not None and \ 221 self.records_per_alignment != number_of_seqs: 222 raise ValueError("Found %i records in this alignment, " 223 "told to expect %i" 224 % (number_of_seqs, self.records_per_alignment)) 225 226 ids = [] 227 seqs = [] 228 229 # By default, expects STRICT truncation / padding to 10 characters. 230 # Does not require any whitespace between name and seq. 231 for i in range(number_of_seqs): 232 line = handle.readline().rstrip() 233 sequence_id, s = self._split_id(line) 234 ids.append(sequence_id) 235 if "." in s: 236 raise ValueError(_NO_DOTS) 237 seqs.append([s]) 238 239 # Look for further blocks 240 line = "" 241 while True: 242 # Skip any blank lines between blocks... 243 while "" == line.strip(): 244 line = handle.readline() 245 if not line: 246 break # end of file 247 if not line: 248 break # end of file 249 250 if self._is_header(line): 251 # Looks like the start of a concatenated alignment 252 self._header = line 253 break 254 255 # print "New block..." 256 for i in range(number_of_seqs): 257 s = line.strip().replace(" ", "") 258 if "." in s: 259 raise ValueError(_NO_DOTS) 260 seqs[i].append(s) 261 line = handle.readline() 262 if (not line) and i + 1 < number_of_seqs: 263 raise ValueError("End of file mid-block") 264 if not line: 265 break # end of file 266 267 records = (SeqRecord(Seq("".join(s), self.alphabet), 268 id=i, name=i, description=i) 269 for (i, s) in zip(ids, seqs)) 270 return MultipleSeqAlignment(records, self.alphabet)
271 272 273 # Relaxed Phylip
274 -class RelaxedPhylipWriter(PhylipWriter):
275 """ 276 Relaxed Phylip format writer 277 """ 278
279 - def write_alignment(self, alignment):
280 """ 281 Write a relaxed phylip alignment 282 """ 283 # Check inputs 284 for name in (s.id.strip() for s in alignment): 285 if any(c in name for c in string.whitespace): 286 raise ValueError("Whitespace not allowed in identifier: %s" 287 % name) 288 289 # Calculate a truncation length - maximum length of sequence ID plus a 290 # single character for padding 291 # If no sequences, set id_width to 1. super(...) call will raise a 292 # ValueError 293 if len(alignment) == 0: 294 id_width = 1 295 else: 296 id_width = max((len(s.id.strip()) for s in alignment)) + 1 297 super(RelaxedPhylipWriter, self).write_alignment(alignment, id_width)
298 299
300 -class RelaxedPhylipIterator(PhylipIterator):
301 """Relaxed Phylip format Iterator.""" 302
303 - def _split_id(self, line):
304 """Extracts the sequence ID from a Phylip line (PRIVATE). 305 306 Returns a tuple containing: (sequence_id, sequence_residues) 307 308 For relaxed format split at the first whitespace character. 309 """ 310 seq_id, sequence = line.split(None, 1) 311 sequence = sequence.strip().replace(" ", "") 312 return seq_id, sequence
313 314
315 -class SequentialPhylipWriter(SequentialAlignmentWriter):
316 """Sequential Phylip format Writer.""" 317
318 - def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
319 handle = self.handle 320 321 if len(alignment) == 0: 322 raise ValueError("Must have at least one sequence") 323 length_of_seqs = alignment.get_alignment_length() 324 for record in alignment: 325 if length_of_seqs != len(record.seq): 326 raise ValueError("Sequences must all be the same length") 327 if length_of_seqs <= 0: 328 raise ValueError("Non-empty sequences are required") 329 330 # Check for repeated identifiers... 331 # Apply this test *after* cleaning the identifiers 332 names = [] 333 for record in alignment: 334 name = record.id.strip() 335 # Either remove the banned characters, or map them to something 336 # else like an underscore "_" or pipe "|" character... 337 for char in "[](),": 338 name = name.replace(char, "") 339 for char in ":;": 340 name = name.replace(char, "|") 341 name = name[:id_width] 342 if name in names: 343 raise ValueError("Repeated name %r (originally %r), " 344 "possibly due to truncation" 345 % (name, record.id)) 346 names.append(name) 347 348 # From experimentation, the use of tabs is not understood by the 349 # EMBOSS suite. The nature of the expected white space is not 350 # defined in the PHYLIP documentation, simply "These are in free 351 # format, separated by blanks". We'll use spaces to keep EMBOSS 352 # happy. 353 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 354 for name, record in zip(names, alignment): 355 sequence = str(record.seq) 356 if "." in sequence: 357 raise ValueError(_NO_DOTS) 358 handle.write(name[:id_width].ljust(id_width)) 359 # Write the entire sequence to one line (see sequential format 360 # notes in the SequentialPhylipIterator docstring 361 handle.write(sequence) 362 handle.write("\n")
363 364
365 -class SequentialPhylipIterator(PhylipIterator):
366 """Sequential Phylip format Iterator. 367 368 The sequential format carries the same restrictions as the normal 369 interleaved one, with the difference being that the sequences are listed 370 sequentially, each sequence written in its entirety before the start of 371 the next. According to the PHYLIP documentation for input file 372 formatting, newlines and spaces may optionally be entered at any point 373 in the sequences. 374 """ 375 376 _header = None # for caching lines between __next__ calls 377
378 - def __next__(self):
379 handle = self.handle 380 381 if self._header is None: 382 line = handle.readline() 383 else: 384 # Header we saved from when we were parsing 385 # the previous alignment. 386 line = self._header 387 self._header = None 388 389 if not line: 390 raise StopIteration 391 line = line.strip() 392 parts = [x for x in line.split() if x] 393 if len(parts) != 2: 394 raise ValueError("First line should have two integers") 395 try: 396 number_of_seqs = int(parts[0]) 397 length_of_seqs = int(parts[1]) 398 except ValueError: 399 raise ValueError("First line should have two integers") 400 401 assert self._is_header(line) 402 403 if self.records_per_alignment is not None and \ 404 self.records_per_alignment != number_of_seqs: 405 raise ValueError("Found %i records in this alignment, " 406 "told to expect %i" 407 % (number_of_seqs, self.records_per_alignment)) 408 409 ids = [] 410 seqs = [] 411 412 # By default, expects STRICT truncation / padding to 10 characters. 413 # Does not require any whitespace between name and seq. 414 for i in range(number_of_seqs): 415 line = handle.readline().rstrip() 416 sequence_id, s = self._split_id(line) 417 ids.append(sequence_id) 418 while len(s) < length_of_seqs: 419 # The sequence may be split into multiple lines 420 line = handle.readline().strip() 421 if not line: 422 break 423 if line == "": 424 continue 425 s = "".join([s, line.strip().replace(" ", "")]) 426 if len(s) > length_of_seqs: 427 raise ValueError("Found a record of length %i, " 428 "should be %i" 429 % (len(s), length_of_seqs)) 430 if "." in s: 431 raise ValueError(_NO_DOTS) 432 seqs.append(s) 433 while True: 434 # Find other alignments in the file 435 line = handle.readline() 436 if not line: 437 break 438 if self._is_header(line): 439 self._header = line 440 break 441 442 records = (SeqRecord(Seq(s, self.alphabet), 443 id=i, name=i, description=i) 444 for (i, s) in zip(ids, seqs)) 445 return MultipleSeqAlignment(records, self.alphabet)
446