Package Bio :: Package SeqIO :: Module IgIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.IgIO

  1  # Copyright 2008-2015 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. 
  8   
  9  This module is for reading and writing IntelliGenetics format files as 
 10  SeqRecord objects.  This file format appears to be the same as the MASE 
 11  multiple sequence alignment format. 
 12   
 13  You are expected to use this module via the Bio.SeqIO functions. 
 14  """ 
 15   
 16  from __future__ import print_function 
 17   
 18  from Bio.Alphabet import single_letter_alphabet 
 19  from Bio.Seq import Seq 
 20  from Bio.SeqRecord import SeqRecord 
 21   
 22   
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The line following the commentary is the record title; with trailing
    whitespace removed it is used as both the id and the name of the
    SeqRecord. The remaining lines, up to the next line starting with a
    semi-colon or the end of the file, are joined (dropping trailing
    whitespace and spaces) to give the sequence. A single trailing digit
    one, the optional terminator, is removed.

    Raises ValueError if a record does not start with a semi-colon, if the
    file ends before the title line of a record, or if a digit one is found
    anywhere other than the very end of a sequence.

    Examples
    --------
    >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
    ...     for record in IgIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    ...
    A_U455 length 303
    B_HXB2R length 306
    C_UG268A length 267
    D_ELI length 309
    F_BZ163A length 309
    O_ANT70 length 342
    O_MVP5180 length 348
    CPZGAB length 309
    CPZANT length 309
    A_ROD length 390
    B_EHOA length 420
    D_MM251 length 390
    STM_STM length 387
    VER_AGM3 length 354
    GRI_AGM677 length 264
    SAB_SAB1C length 219
    SYK_SYK length 330

    """
    # Skip any file header text before the first record (;; lines).
    # readline() gives an empty string at end of file, which also stops
    # this loop, so an empty or header-only file simply yields nothing.
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    # Each pass through this loop consumes exactly one record; 'line' always
    # holds the first unprocessed line (empty string once the file is done).
    while line:
        if line[0] != ";":
            raise ValueError(
                "Records should start with ';' and not:\n%r" % line)

        # Try and agree with SeqRecord convention from the GenBank parser,
        # (and followed in the SwissProt parser) which stores the comments
        # as a long string with newlines under annotations key 'comment'.

        # Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()

        if not line:
            # The file stopped inside the comment block, so there is no
            # title or sequence. Report it rather than silently yielding
            # a record with an empty identifier and an empty sequence.
            raise ValueError(
                "Premature end of file in record comments, "
                "expected a title line.")
        title = line.rstrip()

        # Gather sequence lines until the next record (';') or end of file.
        seq_lines = []
        line = handle.readline()
        while line and line[0] != ";":
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()
        seq_str = "".join(seq_lines)

        if seq_str.endswith("1"):
            # Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        # Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    # The loop above only finishes once readline() has returned an empty
    # string, so we are at the end of the file here.
if __name__ == "__main__":
    # When executed directly as a script, hand over to the run_doctest
    # helper from Bio._utils (called with verbose=0).
    from Bio import _utils
    _utils.run_doctest(verbose=0)