Package Bio :: Package SeqIO :: Module IgIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.IgIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing IntelliGenetics format files as 
  7  # SeqRecord objects.  This file format appears to be the same as the MASE 
  8  # multiple sequence alignment format. 
  9   
 10  """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Alphabet import single_letter_alphabet 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20   
 21  __docformat__ = "restructuredtext en" 
 22   
 23   
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    Arguments:
     - handle - input file (read one line at a time via ``readline()``)
     - alphabet - optional alphabet, passed to the ``Seq`` of every record

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The first line after the commentary is the record title; it is used
    (with trailing whitespace removed) as both the ``id`` and the ``name``
    of the SeqRecord.  All following lines, up to the next line starting
    with a semi-colon or the end of the file, are joined to form the
    sequence.

    Raises ValueError if the first record does not start with a semi-colon,
    if the file ends inside a record's commentary (i.e. before its title
    line), or if a digit one is found anywhere in the sequence other than
    as the final (terminator) character.
    """
    # Skip any file header text before the first record (;; lines).
    # readline() returns an empty string at end of file, which also ends
    # this loop (an empty or header-only file simply yields no records).
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    # Only the first record needs this check; every later record is entered
    # because the sequence loop below stopped on a line starting with ';'.
    if line and line[0] != ";":
        raise ValueError(
            "Records should start with ';' and not:\n%s" % repr(line))

    while line:
        # Try and agree with SeqRecord convention from the GenBank parser,
        # (and followed in the SwissProt parser) which stores the comments
        # as a long string with newlines under annotations key 'comment'.

        # Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()

        if not line:
            # The file stopped in the middle of the commentary, so there is
            # no title and no sequence.  Report the truncation rather than
            # yield a record with an empty identifier.
            raise ValueError(
                "Premature end of file, expected a title line after the "
                "record's comment lines.")
        title = line.rstrip()

        # Collect sequence lines until the next record (';') or end of file.
        seq_lines = []
        line = handle.readline()
        while line and line[0] != ";":
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq_str = "".join(seq_lines)

        if seq_str.endswith("1"):
            # Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        # Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    # The loop above only finishes once readline() has returned an empty
    # string, so the whole file has been consumed at this point.
if __name__ == "__main__":
    # Quick manual self test: parse every ".txt" file in the IntelliGenetics
    # test directory (relative to the current working directory) and print
    # the identifier and length of each record found.
    print("Running quick self test")

    import os

    path = "../../Tests/IntelliGenetics/"
    if not os.path.isdir(path):
        print("Could not find input files")
    else:
        for filename in os.listdir(path):
            extension = os.path.splitext(filename)[-1]
            if extension != ".txt":
                continue
            # Blank line, then the file name underlined with dashes.
            underline = "-" * len(filename)
            print("")
            print(filename)
            print(underline)
            with open(os.path.join(path, filename)) as handle:
                for record in IgIterator(handle):
                    print("%s %i" % (record.id, len(record)))
        print("Done")