Package Bio :: Package SeqIO :: Module TabIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.TabIO

  1  # Copyright 2008-2017 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.SeqIO support for the "tab" (simple tab separated) file format. 
  8   
  9  You are expected to use this module via the Bio.SeqIO functions. 
 10   
 11  The "tab" format is an ad-hoc plain text file format where each sequence is 
 12  on one (long) line.  Each line contains the identifier/description, followed 
 13  by a tab, followed by the sequence.  For example, consider the following 
 14  short FASTA format file:: 
 15   
 16      >ID123456 possible binding site? 
 17      CATCNAGATGACACTACGACTACGACTCAGACTAC 
 18      >ID123457 random sequence 
 19      ACACTACGACTACGACTCAGACTACAAN 
 20   
 21  Apart from the descriptions, this can be represented in the simple two column 
 22  tab separated format as follows:: 
 23   
 24      ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC 
 25      ID123457(tab)ACACTACGACTACGACTCAGACTACAAN 
 26   
 27  When reading this file, "ID123456" or "ID123457" will be taken as the record's 
 28  .id and .name property.  There is no other information to record. 
 29   
 30  Similarly, when writing to this format, Biopython will ONLY record the record's 
 31  .id and .seq (and not the description or any other information) as in the 
 32  example above. 
 33  """ 
 34   
 35  from __future__ import print_function 
 36   
 37  from Bio.Alphabet import single_letter_alphabet 
 38  from Bio.Seq import Seq 
 39  from Bio.SeqRecord import SeqRecord 
 40  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 41  from Bio.SeqIO.Interfaces import _clean, _get_seq_string 
 42   
 43   
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Yield a SeqRecord for every tab separated line read from a handle.

    Each line is expected to hold exactly one tab character, splitting it
    into an identifier (first field) and the full sequence (second field).

    Arguments:
     - handle - input file, iterated over line by line
     - alphabet - optional alphabet, passed on to each Seq object

    The first field, stripped of surrounding whitespace, becomes both the
    record's .id and its .name (any spaces inside the text are kept). The
    second field, also stripped, is the sequence. The description is always
    the empty string.

    A line which does not split into exactly two fields is skipped if it
    only contains whitespace (a blank line), otherwise ValueError is raised.

    Examples
    --------
    >>> with open("GenBank/NC_005816.tsv") as handle:
    ...     for record in TabIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    gi|45478712|ref|NP_995567.1| length 340
    gi|45478713|ref|NP_995568.1| length 260
    gi|45478714|ref|NP_995569.1| length 64
    gi|45478715|ref|NP_995570.1| length 123
    gi|45478716|ref|NP_995571.1| length 145
    gi|45478717|ref|NP_995572.1| length 357
    gi|45478718|ref|NP_995573.1| length 138
    gi|45478719|ref|NP_995574.1| length 312
    gi|45478720|ref|NP_995575.1| length 99
    gi|45478721|ref|NP_995576.1| length 90

    """
    for line in handle:
        try:
            # Unpacking fails unless there is exactly one tab on the line.
            identifier, residues = line.split("\t")
        except ValueError:
            if not line.strip():
                # Nothing but whitespace: treat as a blank line and move on.
                continue
            tab_count = line.count("\t")
            raise ValueError("Each line should have one tab separating the"
                             " title and sequence, this line has %i tabs: %r"
                             % (tab_count, line))
        else:
            identifier = identifier.strip()
            # Stripping the sequence also drops the trailing new line.
            residues = residues.strip()
            yield SeqRecord(Seq(residues, alphabet),
                            id=identifier,
                            name=identifier,
                            description="")
91 92
class TabWriter(SequentialSequenceWriter):
    """Writer for the simple tab separated sequence format (OBSOLETE).

    Every record is written as a single "id(tab)sequence" line.

    The description, name and any other annotation are not recorded.

    This class is now obsolete. Please use the function ``as_tab`` instead,
    or the top level ``Bio.SeqIO.write()`` function with ``format="tab"``.
    """

    def write_record(self, record):
        """Write one record to the handle as a single tab separated line."""
        # Records may only be written after the header and before the footer.
        assert self._header_written and not self._footer_written
        # The flag is set before formatting, so it is already True even if
        # as_tab() rejects the record.
        self._record_written = True
        line = as_tab(record)
        self.handle.write(line)
110 111
def as_tab(record):
    """Return the record as a tab separated "id(tab)sequence" line.

    Only the record's .id and sequence are used. The returned string ends
    with a new line character.
    """
    identifier = _clean(record.id)
    sequence = _get_seq_string(record)  # Catches sequence being None
    # Neither field may contain a tab or line break, as either would corrupt
    # the one-record-per-line, two-column layout.
    # NOTE(review): these are assert statements, so the checks vanish when
    # Python runs with -O.
    for field in (identifier, sequence):
        for forbidden in ("\t", "\n", "\r"):
            assert forbidden not in field
    return "%s\t%s\n" % (identifier, sequence)


if __name__ == "__main__":
    # When executed directly as a script, run this module's doctests quietly.
    from Bio._utils import run_doctest
    run_doctest(verbose=0)