Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2015 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  from Bio.Alphabet import single_letter_alphabet 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 20   
 21   
22 -def SimpleFastaParser(handle):
23 """Generator function to iterate over Fasta records (as string tuples). 24 25 For each record a tuple of two strings is returned, the FASTA title 26 line (without the leading '>' character), and the sequence (with any 27 whitespace removed). The title line is not divided up into an 28 identifier (the first word) and comment or description. 29 30 >>> with open("Fasta/dups.fasta") as handle: 31 ... for values in SimpleFastaParser(handle): 32 ... print(values) 33 ... 34 ('alpha', 'ACGTA') 35 ('beta', 'CGTC') 36 ('gamma', 'CCGCC') 37 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 38 ('delta', 'CGCGC') 39 40 """ 41 # Skip any text before the first record (e.g. blank lines, comments) 42 while True: 43 line = handle.readline() 44 if line == "": 45 return # Premature end of file, or just empty? 46 if line[0] == ">": 47 break 48 49 while True: 50 if line[0] != ">": 51 raise ValueError( 52 "Records in Fasta files should start with '>' character") 53 title = line[1:].rstrip() 54 lines = [] 55 line = handle.readline() 56 while True: 57 if not line: 58 break 59 if line[0] == ">": 60 break 61 lines.append(line.rstrip()) 62 line = handle.readline() 63 64 # Remove trailing whitespace, and any internal spaces 65 # (and any embedded \r which are possible in mangled files 66 # when not opened in universal read lines mode) 67 yield title, "".join(lines).replace(" ", "").replace("\r", "") 68 69 if not line: 70 return # StopIteration 71 72 assert False, "Should not reach this line"
73 74
75 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
76 """Generator function to iterate over Fasta records (as SeqRecord objects). 77 78 Arguments: 79 80 - handle - input file 81 - alphabet - optional alphabet 82 - title2ids - A function that, when given the title of the FASTA 83 file (without the beginning >), will return the id, name and 84 description (in that order) for the record as a tuple of strings. 85 If this is not given, then the entire title line will be used 86 as the description, and the first word as the id and name. 87 88 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 89 with no custom handling of the title lines: 90 91 >>> with open("Fasta/dups.fasta") as handle: 92 ... for record in FastaIterator(handle): 93 ... print(record.id) 94 ... 95 alpha 96 beta 97 gamma 98 alpha 99 delta 100 101 However, you can supply a title2ids function to alter this: 102 103 >>> def take_upper(title): 104 ... return title.split(None, 1)[0].upper(), "", title 105 >>> with open("Fasta/dups.fasta") as handle: 106 ... for record in FastaIterator(handle, title2ids=take_upper): 107 ... print(record.id) 108 ... 109 ALPHA 110 BETA 111 GAMMA 112 ALPHA 113 DELTA 114 115 """ 116 if title2ids: 117 for title, sequence in SimpleFastaParser(handle): 118 id, name, descr = title2ids(title) 119 yield SeqRecord(Seq(sequence, alphabet), 120 id=id, name=name, description=descr) 121 else: 122 for title, sequence in SimpleFastaParser(handle): 123 try: 124 first_word = title.split(None, 1)[0] 125 except IndexError: 126 assert not title, repr(title) 127 # Should we use SeqRecord default for no ID? 128 first_word = "" 129 yield SeqRecord(Seq(sequence, alphabet), 130 id=first_word, name=first_word, description=title)
131 132
133 -class FastaWriter(SequentialSequenceWriter):
134 """Class to write Fasta format files."""
135 - def __init__(self, handle, wrap=60, record2title=None):
136 """Create a Fasta writer. 137 138 Arguments: 139 140 - handle - Handle to an output file, e.g. as returned 141 by open(filename, "w") 142 - wrap - Optional line length used to wrap sequence lines. 143 Defaults to wrapping the sequence at 60 characters 144 Use zero (or None) for no wrapping, giving a single 145 long line for the sequence. 146 - record2title - Optional function to return the text to be 147 used for the title line of each record. By default 148 a combination of the record.id and record.description 149 is used. If the record.description starts with the 150 record.id, then just the record.description is used. 151 152 You can either use:: 153 154 handle = open(filename, "w") 155 writer = FastaWriter(handle) 156 writer.write_file(myRecords) 157 handle.close() 158 159 Or, follow the sequential file writer system, for example:: 160 161 handle = open(filename, "w") 162 writer = FastaWriter(handle) 163 writer.write_header() # does nothing for Fasta files 164 ... 165 Multiple writer.write_record() and/or writer.write_records() calls 166 ... 167 writer.write_footer() # does nothing for Fasta files 168 handle.close() 169 170 """ 171 SequentialSequenceWriter.__init__(self, handle) 172 self.wrap = None 173 if wrap: 174 if wrap < 1: 175 raise ValueError 176 self.wrap = wrap 177 self.record2title = record2title
178
179 - def write_record(self, record):
180 """Write a single Fasta record to the file.""" 181 assert self._header_written 182 assert not self._footer_written 183 self._record_written = True 184 185 if self.record2title: 186 title = self.clean(self.record2title(record)) 187 else: 188 id = self.clean(record.id) 189 description = self.clean(record.description) 190 if description and description.split(None, 1)[0] == id: 191 # The description includes the id at the start 192 title = description 193 elif description: 194 title = "%s %s" % (id, description) 195 else: 196 title = id 197 198 assert "\n" not in title 199 assert "\r" not in title 200 self.handle.write(">%s\n" % title) 201 202 data = self._get_seq_string(record) # Catches sequence being None 203 204 assert "\n" not in data 205 assert "\r" not in data 206 207 if self.wrap: 208 for i in range(0, len(data), self.wrap): 209 self.handle.write(data[i:i + self.wrap] + "\n") 210 else: 211 self.handle.write(data + "\n")
212 213 if __name__ == "__main__": 214 from Bio._utils import run_doctest 215 run_doctest(verbose=0) 216