Package Bio :: Package SeqIO :: Module TabIO
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqIO.TabIO

  1  # Copyright 2008-2015 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Bio.SeqIO support for the "tab" (simple tab separated) file format. 
  7   
  8  You are expected to use this module via the Bio.SeqIO functions. 
  9   
 10  The "tab" format is an ad-hoc plain text file format where each sequence is 
 11  on one (long) line.  Each line contains the identifier/description, followed 
 12  by a tab, followed by the sequence.  For example, consider the following 
 13  short FASTA format file:: 
 14   
 15      >ID123456 possible binding site? 
 16      CATCNAGATGACACTACGACTACGACTCAGACTAC 
 17      >ID123457 random sequence 
 18      ACACTACGACTACGACTCAGACTACAAN 
 19   
 20  Apart from the descriptions, this can be represented in the simple two column 
 21  tab separated format as follows:: 
 22   
 23      ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC 
 24      ID123457(tab)ACACTACGACTACGACTCAGACTACAAN 
 25   
 26  When reading this file, "ID123456" or "ID123457" will be taken as the record's 
 27  .id and .name property.  There is no other information to record. 
 28   
 29  Similarly, when writing to this format, Biopython will ONLY record the record's 
 30  .id and .seq (and not the description or any other information) as in the 
 31  example above. 
 32  """ 
 33   
 34  from __future__ import print_function 
 35   
 36  from Bio.Alphabet import single_letter_alphabet 
 37  from Bio.Seq import Seq 
 38  from Bio.SeqRecord import SeqRecord 
 39  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 40   
 41   
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over tab separated lines (as SeqRecord objects).

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    Arguments:
     - handle - input file
     - alphabet - optional alphabet

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.

    Any blank lines are ignored.

    Raises ValueError if a non-blank line does not contain exactly one tab.

    Example:

    >>> with open("GenBank/NC_005816.tsv") as handle:
    ...     for record in TabIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    gi|45478712|ref|NP_995567.1| length 340
    gi|45478713|ref|NP_995568.1| length 260
    gi|45478714|ref|NP_995569.1| length 64
    gi|45478715|ref|NP_995570.1| length 123
    gi|45478716|ref|NP_995571.1| length 145
    gi|45478717|ref|NP_995572.1| length 357
    gi|45478718|ref|NP_995573.1| length 138
    gi|45478719|ref|NP_995574.1| length 312
    gi|45478720|ref|NP_995575.1| length 99
    gi|45478721|ref|NP_995576.1| length 90

    """
    for line in handle:
        try:
            # Unpacking fails unless the line has exactly one tab.
            title, seq = line.split("\t")
        except ValueError:
            # Only the unpacking failure is handled here; a bare except
            # would also trap KeyboardInterrupt/SystemExit/GeneratorExit
            # and misreport them as a malformed line.
            if line.strip() == "":
                # It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the" +
                             " title and sequence, this line has %i tabs: %r"
                             % (line.count("\t"), line))
        title = title.strip()
        seq = seq.strip()  # removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title, name=title,
                        description="")
89 90
class TabWriter(SequentialSequenceWriter):
    """Writer for the simple two column tab separated format.

    Every record becomes one line of the form "id(tab)sequence".

    The description, name and any other annotation are not recorded.
    """

    def write_record(self, record):
        """Write one record to the handle as a single tab separated line."""
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        identifier = self.clean(record.id)
        # _get_seq_string also catches the sequence being None
        sequence = self._get_seq_string(record)

        # A tab or line break inside either field would corrupt the
        # one-record-per-line, two-column layout, so check both fields
        # (identifier first, then sequence) before writing anything.
        for field in (identifier, sequence):
            for forbidden in ("\t", "\n", "\r"):
                assert forbidden not in field

        self.handle.write("%s\t%s\n" % (identifier, sequence))
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module,
    # with verbose output switched off.
    from Bio._utils import run_doctest

    run_doctest(verbose=0)