Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2016 by Peter Cock.  All rights reserved. 
  2  # Revisions copyright 2011 Brandon Invergo. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """AlignIO support for "phylip" format from Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Support for "relaxed phylip" format is also provided. Relaxed phylip differs 
 12  from standard phylip format in the following ways: 
 13   
 14   - No whitespace is allowed in the sequence ID. 
 15   - No truncation is performed. Instead, sequence IDs are padded to the longest 
 16     ID length, rather than 10 characters. A space separates the sequence 
 17     identifier from the sequence. 
 18   
 19  Relaxed phylip is supported by RAxML and PHYML. 
 20   
 21  Note 
 22  ==== 
 23   
 24  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 25  a dot/period (".") in a sequence is interpreted as meaning the same 
 26  character as in the first sequence.  The PHYLIP documentation from 3.3 to 3.69 
 27  http://evolution.genetics.washington.edu/phylip/doc/sequence.html says: 
 28   
 29  "a period was also previously allowed but it is no longer allowed, 
 30  because it sometimes is used in different senses in other programs" 
 31   
 32  Biopython 1.58 or later treats dots/periods in the sequence as invalid, both 
 33  for reading and writing. Older versions did nothing special with a dot/period. 
 34  """ 
 35  from __future__ import print_function 
 36   
 37  import string 
 38   
 39  from Bio._py3k import range 
 40   
 41  from Bio.Seq import Seq 
 42  from Bio.SeqRecord import SeqRecord 
 43  from Bio.Align import MultipleSeqAlignment 
 44  from .Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 45   
 46   
 47  _PHYLIP_ID_WIDTH = 10 
 48  _NO_DOTS = "PHYLIP format no longer allows dots in sequence" 
 49   
 50   
class PhylipWriter(SequentialAlignmentWriter):
    """Phylip alignment writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Use this to write (another) single alignment to an open file.

        This code will write interlaced alignments (when the sequences are
        longer than 50 characters).

        Note that record identifiers are strictly truncated to id_width,
        defaulting to the value required to comply with the PHYLIP standard.

        Arguments:
         - alignment - the alignment to write; it must support len(),
           iteration over its records and get_alignment_length().
         - id_width - number of characters each identifier is truncated
           and padded to.

        Raises ValueError if the alignment is empty, if the sequences differ
        in length or are empty, if two identifiers become identical after
        cleaning and truncation, or if any sequence contains a dot.

        All validation is done before anything is written to the handle.

        For more information on the file format, please see:
        http://evolution.genetics.washington.edu/phylip/doc/sequence.html
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Quoting the PHYLIP version 3.6 documentation:
        #
        #   The name should be ten characters in length, filled out to
        #   the full ten characters by blanks if shorter. Any printable
        #   ASCII/ISO character is allowed in the name, except for
        #   parentheses ("(" and ")"), square brackets ("[" and "]"),
        #   colon (":"), semicolon (";") and comma (","). If you forget
        #   to extend the names to ten characters in length by blanks,
        #   the program [i.e. PHYLIP] will get out of synchronization
        #   with the contents of the data file, and an error message will
        #   result.
        #
        #   Note that Tab characters count as only one character in the
        #   species names. Their inclusion can cause trouble.

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seqs = []
        seen_names = set()  # constant-time duplicate detection
        for record in alignment:
            name = record.id.strip()
            # Either remove the banned characters, or map them to something
            # else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)
            sequence = str(record.seq)
            if "." in sequence:
                # Do this check here (once per record, not once per block)
                raise ValueError(_NO_DOTS)
            seqs.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        block = 0
        while True:
            for name, sequence in zip(names, seqs):
                if block == 0:
                    # Write name (already truncated), right padded to
                    # id_width characters.
                    handle.write(name.ljust(id_width))
                else:
                    # write indent
                    handle.write(" " * id_width)
                # Write five chunks of ten letters per line...
                for chunk in range(0, 5):
                    i = block * 50 + chunk * 10
                    seq_segment = sequence[i:i + 10]
                    # TODO - Force any gaps to be '-' character? Look at the
                    # alphabet...
                    # TODO - How to cope with '?' or '.' in the sequence?
                    handle.write(" %s" % seq_segment)
                    if i + 10 > length_of_seqs:
                        break
                handle.write("\n")
            block += 1
            # Use >= (not >): when the alignment length is an exact multiple
            # of 50 every column has been written by now, and another pass
            # would only emit a block of whitespace-only lines.
            if block * 50 >= length_of_seqs:
                break
            handle.write("\n")
149 150
class PhylipIterator(AlignmentIterator):
    """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator.

    Record identifiers are limited to at most 10 characters.

    It only copes with interlaced phylip files! Sequential files won't work
    where the sequences are split over multiple lines.

    For more information on the file format, please see:
    http://evolution.genetics.washington.edu/phylip/doc/sequence.html
    http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
    """

    # Default truncation length
    id_width = _PHYLIP_ID_WIDTH

    _header = None  # for caching lines between __next__ calls

    def _is_header(self, line):
        """Return True if the line consists of exactly two integers (PRIVATE)."""
        parts = line.split()
        if len(parts) != 2:
            return False  # First line should have two integers
        try:
            int(parts[0])
            int(parts[1])
        except ValueError:
            return False  # First line should have two integers
        return True

    def _split_id(self, line):
        """Extract the sequence ID from a Phylip line (PRIVATE).

        Returning a tuple containing: (sequence_id, sequence_residues)

        The first id_width characters in the line are the sequence id, the
        remainder are sequence data (with any spaces removed).
        """
        seq_id = line[:self.id_width].strip()
        seq = line[self.id_width:].strip().replace(' ', '')
        return seq_id, seq

    def __next__(self):
        """Parse the next alignment from the handle.

        Returns a MultipleSeqAlignment. Raises StopIteration when there is
        no more input, and ValueError if the header line is not two
        integers, if the number of records disagrees with
        records_per_alignment, if a sequence contains a dot, or if the
        file ends part way through a block.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            # The alignment length is validated as an integer but not
            # otherwise used by this parser.
            int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None and \
                self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, "
                             "told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline()
            if not line:
                # readline() only returns "" at end of file; report the
                # truncation the same way as for the later blocks rather
                # than silently creating a record with no id or sequence.
                raise ValueError("End of file mid-block")
            sequence_id, s = self._split_id(line.rstrip())
            ids.append(sequence_id)
            if "." in s:
                raise ValueError(_NO_DOTS)
            seqs.append([s])

        # Look for further blocks
        line = ""
        while True:
            # Skip any blank lines between blocks...
            while "" == line.strip():
                line = handle.readline()
                if not line:
                    break  # end of file
            if not line:
                break  # end of file

            if self._is_header(line):
                # Looks like the start of a concatenated alignment
                self._header = line
                break

            # Each further block holds one line of residues per record,
            # in the same order as the first block.
            for i in range(number_of_seqs):
                s = line.strip().replace(" ", "")
                if "." in s:
                    raise ValueError(_NO_DOTS)
                seqs[i].append(s)
                line = handle.readline()
                if (not line) and i + 1 < number_of_seqs:
                    raise ValueError("End of file mid-block")
            if not line:
                break  # end of file

        records = (SeqRecord(Seq("".join(s), self.alphabet),
                             id=i, name=i, description=i)
                   for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
269 270 271 # Relaxed Phylip
class RelaxedPhylipWriter(PhylipWriter):
    """Relaxed Phylip format writer."""

    def write_alignment(self, alignment):
        """Write a relaxed phylip alignment.

        Identifiers are not truncated: the identifier column is made one
        character wider than the longest (stripped) identifier, and the
        parent class writer does the actual output.

        Raises ValueError if any identifier contains whitespace.
        """
        stripped_ids = [entry.id.strip() for entry in alignment]

        # Check inputs - whitespace inside an identifier is not allowed
        for identifier in stripped_ids:
            for blank in string.whitespace:
                if blank in identifier:
                    raise ValueError("Whitespace not allowed in identifier: %s"
                                     % identifier)

        # Column width is the longest identifier plus a single character of
        # padding. With no sequences this gives a width of 1, and the parent
        # class call below raises the ValueError.
        if stripped_ids:
            longest = max(len(identifier) for identifier in stripped_ids)
        else:
            longest = 0
        super(RelaxedPhylipWriter, self).write_alignment(alignment,
                                                         longest + 1)
296 297
class RelaxedPhylipIterator(PhylipIterator):
    """Relaxed Phylip format Iterator."""

    def _split_id(self, line):
        """Extract the sequence ID from a Phylip line (PRIVATE).

        Returns a tuple containing: (sequence_id, sequence_residues)

        For relaxed format split at the first whitespace character.

        Raises ValueError if the line does not contain both an identifier
        and some sequence data (for example a blank line).
        """
        parts = line.split(None, 1)
        if len(parts) != 2:
            # Without this check the tuple unpacking below fails with an
            # unhelpful "values to unpack" ValueError.
            raise ValueError("Expected a sequence identifier followed by "
                             "whitespace and sequence data, found %r"
                             % line)
        seq_id, sequence = parts
        sequence = sequence.strip().replace(" ", "")
        return seq_id, sequence
311 312
class SequentialPhylipWriter(SequentialAlignmentWriter):
    """Sequential Phylip format Writer."""

    def write_alignment(self, alignment, id_width=_PHYLIP_ID_WIDTH):
        """Write a single alignment in sequential PHYLIP format.

        Each record is written on one line: the cleaned identifier,
        truncated and padded to id_width characters, followed directly by
        the whole sequence.

        Arguments:
         - alignment - the alignment to write; it must support len(),
           iteration over its records and get_alignment_length().
         - id_width - number of characters each identifier is truncated
           and padded to.

        Raises ValueError if the alignment is empty, if the sequences differ
        in length or are empty, if two identifiers become identical after
        cleaning and truncation, or if any sequence contains a dot.

        All validation is done before anything is written to the handle.
        """
        handle = self.handle

        if len(alignment) == 0:
            raise ValueError("Must have at least one sequence")
        length_of_seqs = alignment.get_alignment_length()
        for record in alignment:
            if length_of_seqs != len(record.seq):
                raise ValueError("Sequences must all be the same length")
        if length_of_seqs <= 0:
            raise ValueError("Non-empty sequences are required")

        # Check for repeated identifiers...
        # Apply this test *after* cleaning the identifiers
        names = []
        seen_names = set()  # constant-time duplicate detection
        for record in alignment:
            name = record.id.strip()
            # Either remove the banned characters, or map them to something
            # else like an underscore "_" or pipe "|" character...
            for char in "[](),":
                name = name.replace(char, "")
            for char in ":;":
                name = name.replace(char, "|")
            name = name[:id_width]
            if name in seen_names:
                raise ValueError("Repeated name %r (originally %r), "
                                 "possibly due to truncation"
                                 % (name, record.id))
            seen_names.add(name)
            names.append(name)

        # Reject dots up front, so that a bad record cannot leave a
        # partially written alignment on the handle.
        seqs = []
        for record in alignment:
            sequence = str(record.seq)
            if "." in sequence:
                raise ValueError(_NO_DOTS)
            seqs.append(sequence)

        # From experimentation, the use of tabs is not understood by the
        # EMBOSS suite. The nature of the expected white space is not
        # defined in the PHYLIP documentation, simply "These are in free
        # format, separated by blanks". We'll use spaces to keep EMBOSS
        # happy.
        handle.write(" %i %s\n" % (len(alignment), length_of_seqs))
        for name, sequence in zip(names, seqs):
            handle.write(name.ljust(id_width))
            # Write the entire sequence to one line (see sequential format
            # notes in the SequentialPhylipIterator docstring)
            handle.write(sequence)
            handle.write("\n")
361 362
class SequentialPhylipIterator(PhylipIterator):
    """Sequential Phylip format Iterator.

    The sequential format carries the same restrictions as the normal
    interleaved one, with the difference being that the sequences are listed
    sequentially, each sequence written in its entirety before the start of
    the next. According to the PHYLIP documentation for input file
    formatting, newlines and spaces may optionally be entered at any point
    in the sequences.
    """

    _header = None  # for caching lines between __next__ calls

    def __next__(self):
        """Parse the next alignment from the handle.

        Returns a MultipleSeqAlignment. Raises StopIteration when there is
        no more input, and ValueError if the header line is not two
        integers, if the number of records disagrees with
        records_per_alignment, if continuation lines make a record longer
        than the length given in the header, or if a sequence contains a
        dot.
        """
        handle = self.handle

        if self._header is None:
            line = handle.readline()
        else:
            # Header we saved from when we were parsing
            # the previous alignment.
            line = self._header
            self._header = None

        if not line:
            raise StopIteration
        parts = line.split()
        if len(parts) != 2:
            raise ValueError("First line should have two integers")
        try:
            number_of_seqs = int(parts[0])
            length_of_seqs = int(parts[1])
        except ValueError:
            raise ValueError("First line should have two integers")

        if self.records_per_alignment is not None and \
                self.records_per_alignment != number_of_seqs:
            raise ValueError("Found %i records in this alignment, "
                             "told to expect %i"
                             % (number_of_seqs, self.records_per_alignment))

        ids = []
        seqs = []

        # By default, expects STRICT truncation / padding to 10 characters.
        # Does not require any whitespace between name and seq.
        for i in range(number_of_seqs):
            line = handle.readline().rstrip()
            sequence_id, s = self._split_id(line)
            ids.append(sequence_id)
            while len(s) < length_of_seqs:
                # The sequence may be split into multiple lines
                line = handle.readline()
                if not line:
                    # readline() only returns "" at end of file
                    break
                line = line.strip()
                if not line:
                    # Blank lines are allowed inside a sequence. Test for
                    # end of file before stripping, otherwise a blank line
                    # looks like end of file and the record is cut short.
                    continue
                s = "".join([s, line.replace(" ", "")])
                if len(s) > length_of_seqs:
                    raise ValueError("Found a record of length %i, "
                                     "should be %i"
                                     % (len(s), length_of_seqs))
            if "." in s:
                raise ValueError(_NO_DOTS)
            seqs.append(s)
        while True:
            # Find other alignments in the file
            line = handle.readline()
            if not line:
                break
            if self._is_header(line):
                self._header = line
                break

        records = (SeqRecord(Seq(s, self.alphabet),
                             id=i, name=i, description=i)
                   for (i, s) in zip(ids, seqs))
        return MultipleSeqAlignment(records, self.alphabet)
444