Package Bio :: Package SeqIO :: Module IgIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.IgIO

  1  # Copyright 2008-2015 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing IntelliGenetics format files as 
  7  # SeqRecord objects.  This file format appears to be the same as the MASE 
  8  # multiple sequence alignment format. 
  9   
 10  """Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Alphabet import single_letter_alphabet 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20   
 21   
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The line following the commentary is used (with trailing whitespace
    removed) as both the id and the name of the SeqRecord.  The sequence
    is built from all following lines up to the next line starting with a
    semi-colon (or the end of the file), with spaces removed and the
    optional terminating digit one dropped.

    Raises ValueError if a record does not start with a semi-colon, if the
    file ends after a record's commentary lines but before its title line,
    or if a digit one is found anywhere in the sequence other than as the
    final terminator.

    Example:

    >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
    ...     for record in IgIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    ...
    A_U455 length 303
    B_HXB2R length 306
    C_UG268A length 267
    D_ELI length 309
    F_BZ163A length 309
    O_ANT70 length 342
    O_MVP5180 length 348
    CPZGAB length 309
    CPZANT length 309
    A_ROD length 390
    B_EHOA length 420
    D_MM251 length 390
    STM_STM length 387
    VER_AGM3 length 354
    GRI_AGM677 length 264
    SAB_SAB1C length 219
    SYK_SYK length 330

    """
    # Skip any file header text before the first record (;; lines).
    # readline() returns "" at the end of the file, which also ends this
    # loop, so an empty or header-only file simply yields no records.
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    # Each pass handles one record; ``line`` always holds the first unread
    # line of the next record, or "" once the file is exhausted.
    while line:
        if line[0] != ";":
            raise ValueError(
                "Records should start with ';' and not:\n%r" % line)

        # Try and agree with SeqRecord convention from the GenBank parser,
        # (and followed in the SwissProt parser) which stores the comments
        # as a long string with newlines under annotations key 'comment'.

        # Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()

        if not line:
            # The file ended inside the commentary, so there is no title
            # line.  Fail loudly rather than yielding a record with an
            # empty identifier and an empty sequence.
            raise ValueError(
                "Premature end of file: record has comment lines "
                "but no title line.")
        title = line.rstrip()

        # Collect sequence lines until the next record (';') or the end
        # of the file.
        seq_lines = []
        line = handle.readline()
        while line and line[0] != ";":
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()

        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            # Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        # Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    # The loop above only exits once readline() has returned "", so we are
    # at the end of the file here and no further check is needed.
if __name__ == "__main__":
    # When executed directly as a script, run this module's doctests
    # quietly using Biopython's shared doctest helper.
    from Bio import _utils
    _utils.run_doctest(verbose=0)