Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2014 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  from Bio.Alphabet import single_letter_alphabet 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 20   
 21  __docformat__ = "restructuredtext en" 
 22   
 23   
24 -def SimpleFastaParser(handle):
25 """Generator function to iterate over Fasta records (as string tuples). 26 27 For each record a tuple of two strings is returned, the FASTA title 28 line (without the leading '>' character), and the sequence (with any 29 whitespace removed). The title line is not divided up into an 30 identifier (the first word) and comment or description. 31 32 >>> with open("Fasta/dups.fasta") as handle: 33 ... for values in SimpleFastaParser(handle): 34 ... print(values) 35 ... 36 ('alpha', 'ACGTA') 37 ('beta', 'CGTC') 38 ('gamma', 'CCGCC') 39 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 40 ('delta', 'CGCGC') 41 42 """ 43 # Skip any text before the first record (e.g. blank lines, comments) 44 while True: 45 line = handle.readline() 46 if line == "": 47 return # Premature end of file, or just empty? 48 if line[0] == ">": 49 break 50 51 while True: 52 if line[0] != ">": 53 raise ValueError( 54 "Records in Fasta files should start with '>' character") 55 title = line[1:].rstrip() 56 lines = [] 57 line = handle.readline() 58 while True: 59 if not line: 60 break 61 if line[0] == ">": 62 break 63 lines.append(line.rstrip()) 64 line = handle.readline() 65 66 # Remove trailing whitespace, and any internal spaces 67 # (and any embedded \r which are possible in mangled files 68 # when not opened in universal read lines mode) 69 yield title, "".join(lines).replace(" ", "").replace("\r", "") 70 71 if not line: 72 return # StopIteration 73 74 assert False, "Should not reach this line"
75 76
77 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
78 """Generator function to iterate over Fasta records (as SeqRecord objects). 79 80 Arguments: 81 82 - handle - input file 83 - alphabet - optional alphabet 84 - title2ids - A function that, when given the title of the FASTA 85 file (without the beginning >), will return the id, name and 86 description (in that order) for the record as a tuple of strings. 87 If this is not given, then the entire title line will be used 88 as the description, and the first word as the id and name. 89 90 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 91 with no custom handling of the title lines: 92 93 >>> with open("Fasta/dups.fasta") as handle: 94 ... for record in FastaIterator(handle): 95 ... print(record.id) 96 ... 97 alpha 98 beta 99 gamma 100 alpha 101 delta 102 103 However, you can supply a title2ids function to alter this: 104 105 >>> def take_upper(title): 106 ... return title.split(None, 1)[0].upper(), "", title 107 >>> with open("Fasta/dups.fasta") as handle: 108 ... for record in FastaIterator(handle, title2ids=take_upper): 109 ... print(record.id) 110 ... 111 ALPHA 112 BETA 113 GAMMA 114 ALPHA 115 DELTA 116 117 """ 118 if title2ids: 119 for title, sequence in SimpleFastaParser(handle): 120 id, name, descr = title2ids(title) 121 yield SeqRecord(Seq(sequence, alphabet), 122 id=id, name=name, description=descr) 123 else: 124 for title, sequence in SimpleFastaParser(handle): 125 try: 126 first_word = title.split(None, 1)[0] 127 except IndexError: 128 assert not title, repr(title) 129 # Should we use SeqRecord default for no ID? 130 first_word = "" 131 yield SeqRecord(Seq(sequence, alphabet), 132 id=first_word, name=first_word, description=title)
133 134
135 -class FastaWriter(SequentialSequenceWriter):
136 """Class to write Fasta format files."""
137 - def __init__(self, handle, wrap=60, record2title=None):
138 """Create a Fasta writer. 139 140 Arguements: 141 142 - handle - Handle to an output file, e.g. as returned 143 by open(filename, "w") 144 - wrap - Optional line length used to wrap sequence lines. 145 Defaults to wrapping the sequence at 60 characters 146 Use zero (or None) for no wrapping, giving a single 147 long line for the sequence. 148 - record2title - Optional function to return the text to be 149 used for the title line of each record. By default 150 a combination of the record.id and record.description 151 is used. If the record.description starts with the 152 record.id, then just the record.description is used. 153 154 You can either use:: 155 156 handle = open(filename, "w") 157 writer = FastaWriter(handle) 158 writer.write_file(myRecords) 159 handle.close() 160 161 Or, follow the sequential file writer system, for example:: 162 163 handle = open(filename, "w") 164 writer = FastaWriter(handle) 165 writer.write_header() # does nothing for Fasta files 166 ... 167 Multiple writer.write_record() and/or writer.write_records() calls 168 ... 169 writer.write_footer() # does nothing for Fasta files 170 handle.close() 171 172 """ 173 SequentialSequenceWriter.__init__(self, handle) 174 self.wrap = None 175 if wrap: 176 if wrap < 1: 177 raise ValueError 178 self.wrap = wrap 179 self.record2title = record2title
180
181 - def write_record(self, record):
182 """Write a single Fasta record to the file.""" 183 assert self._header_written 184 assert not self._footer_written 185 self._record_written = True 186 187 if self.record2title: 188 title = self.clean(self.record2title(record)) 189 else: 190 id = self.clean(record.id) 191 description = self.clean(record.description) 192 if description and description.split(None, 1)[0] == id: 193 # The description includes the id at the start 194 title = description 195 elif description: 196 title = "%s %s" % (id, description) 197 else: 198 title = id 199 200 assert "\n" not in title 201 assert "\r" not in title 202 self.handle.write(">%s\n" % title) 203 204 data = self._get_seq_string(record) # Catches sequence being None 205 206 assert "\n" not in data 207 assert "\r" not in data 208 209 if self.wrap: 210 for i in range(0, len(data), self.wrap): 211 self.handle.write(data[i:i + self.wrap] + "\n") 212 else: 213 self.handle.write(data + "\n")
214 215 if __name__ == "__main__": 216 print("Running quick self test") 217 218 import os 219 from Bio.Alphabet import generic_protein, generic_nucleotide 220 221 # Download the files from here: 222 # ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans 223 fna_filename = "NC_005213.fna" 224 faa_filename = "NC_005213.faa" 225
226 - def genbank_name_function(text):
227 text, descr = text.split(None, 1) 228 id = text.split("|")[3] 229 name = id.split(".", 1)[0] 230 return id, name, descr
231 245 246 if os.path.isfile(fna_filename): 247 print("--------") 248 print("FastaIterator (single sequence)") 249 with open(fna_filename, "r") as h: 250 iterator = FastaIterator(h, alphabet=generic_nucleotide, 251 title2ids=genbank_name_function) 252 count = 0 253 for record in iterator: 254 count += 1 255 print_record(record) 256 assert count == 1 257 print(str(record.__class__)) 258 259 if os.path.isfile(faa_filename): 260 print("--------") 261 print("FastaIterator (multiple sequences)") 262 with open(faa_filename, "r") as h: 263 iterator = FastaIterator(h, alphabet=generic_protein, 264 title2ids=genbank_name_function) 265 count = 0 266 for record in iterator: 267 count += 1 268 print_record(record) 269 break 270 assert count > 0 271 print(str(record.__class__)) 272 273 from Bio._py3k import StringIO 274 print("--------") 275 print("FastaIterator (empty input file)") 276 # Just to make sure no errors happen 277 iterator = FastaIterator(StringIO("")) 278 count = 0 279 for record in iterator: 280 count += 1 281 assert count == 0 282 283 print("Done") 284