Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from __future__ import print_function 
 15   
 16  from Bio.Alphabet import single_letter_alphabet 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 20   
 21   
22 -def SimpleFastaParser(handle):
23 """Generator function to iterate over Fasta records (as string tuples). 24 25 For each record a tuple of two strings is returned, the FASTA title 26 line (without the leading '>' character), and the sequence (with any 27 whitespace removed). The title line is not divided up into an 28 identifier (the first word) and comment or description. 29 30 >>> with open("Fasta/dups.fasta") as handle: 31 ... for values in SimpleFastaParser(handle): 32 ... print(values) 33 ... 34 ('alpha', 'ACGTA') 35 ('beta', 'CGTC') 36 ('gamma', 'CCGCC') 37 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 38 ('delta', 'CGCGC') 39 40 """ 41 #Skip any text before the first record (e.g. blank lines, comments) 42 while True: 43 line = handle.readline() 44 if line == "": 45 return # Premature end of file, or just empty? 46 if line[0] == ">": 47 break 48 49 while True: 50 if line[0] != ">": 51 raise ValueError( 52 "Records in Fasta files should start with '>' character") 53 title = line[1:].rstrip() 54 lines = [] 55 line = handle.readline() 56 while True: 57 if not line: 58 break 59 if line[0] == ">": 60 break 61 lines.append(line.rstrip()) 62 line = handle.readline() 63 64 #Remove trailing whitespace, and any internal spaces 65 #(and any embedded \r which are possible in mangled files 66 #when not opened in universal read lines mode) 67 yield title, "".join(lines).replace(" ", "").replace("\r", "") 68 69 if not line: 70 return # StopIteration 71 72 assert False, "Should not reach this line"
73 74
75 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
76 """Generator function to iterate over Fasta records (as SeqRecord objects). 77 78 handle - input file 79 alphabet - optional alphabet 80 title2ids - A function that, when given the title of the FASTA 81 file (without the beginning >), will return the id, name and 82 description (in that order) for the record as a tuple of strings. 83 84 If this is not given, then the entire title line will be used 85 as the description, and the first word as the id and name. 86 87 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 88 with no custom handling of the title lines: 89 90 >>> with open("Fasta/dups.fasta") as handle: 91 ... for record in FastaIterator(handle): 92 ... print(record.id) 93 ... 94 alpha 95 beta 96 gamma 97 alpha 98 delta 99 100 However, you can supply a title2ids function to alter this: 101 102 >>> def take_upper(title): 103 ... return title.split(None, 1)[0].upper(), "", title 104 >>> with open("Fasta/dups.fasta") as handle: 105 ... for record in FastaIterator(handle, title2ids=take_upper): 106 ... print(record.id) 107 ... 108 ALPHA 109 BETA 110 GAMMA 111 ALPHA 112 DELTA 113 114 """ 115 if title2ids: 116 for title, sequence in SimpleFastaParser(handle): 117 id, name, descr = title2ids(title) 118 yield SeqRecord(Seq(sequence, alphabet), 119 id=id, name=name, description=descr) 120 else: 121 for title, sequence in SimpleFastaParser(handle): 122 try: 123 first_word = title.split(None, 1)[0] 124 except IndexError: 125 assert not title, repr(title) 126 #Should we use SeqRecord default for no ID? 127 first_word = "" 128 yield SeqRecord(Seq(sequence, alphabet), 129 id=first_word, name=first_word, description=title)
130 131
132 -class FastaWriter(SequentialSequenceWriter):
133 """Class to write Fasta format files."""
134 - def __init__(self, handle, wrap=60, record2title=None):
135 """Create a Fasta writer. 136 137 handle - Handle to an output file, e.g. as returned 138 by open(filename, "w") 139 wrap - Optional line length used to wrap sequence lines. 140 Defaults to wrapping the sequence at 60 characters 141 Use zero (or None) for no wrapping, giving a single 142 long line for the sequence. 143 record2title - Optional function to return the text to be 144 used for the title line of each record. By default 145 a combination of the record.id and record.description 146 is used. If the record.description starts with the 147 record.id, then just the record.description is used. 148 149 You can either use:: 150 151 handle = open(filename, "w") 152 myWriter = FastaWriter(handle) 153 writer.write_file(myRecords) 154 handle.close() 155 156 Or, follow the sequential file writer system, for example: 157 158 handle = open(filename, "w") 159 myWriter = FastaWriter(handle) 160 writer.write_header() # does nothing for Fasta files 161 ... 162 Multiple calls to writer.write_record() and/or writer.write_records() 163 ... 164 writer.write_footer() # does nothing for Fasta files 165 handle.close() 166 167 """ 168 SequentialSequenceWriter.__init__(self, handle) 169 #self.handle = handle 170 self.wrap = None 171 if wrap: 172 if wrap < 1: 173 raise ValueError 174 self.wrap = wrap 175 self.record2title = record2title
176
177 - def write_record(self, record):
178 """Write a single Fasta record to the file.""" 179 assert self._header_written 180 assert not self._footer_written 181 self._record_written = True 182 183 if self.record2title: 184 title = self.clean(self.record2title(record)) 185 else: 186 id = self.clean(record.id) 187 description = self.clean(record.description) 188 189 #if description[:len(id)]==id: 190 if description and description.split(None, 1)[0] == id: 191 #The description includes the id at the start 192 title = description 193 elif description: 194 title = "%s %s" % (id, description) 195 else: 196 title = id 197 198 assert "\n" not in title 199 assert "\r" not in title 200 self.handle.write(">%s\n" % title) 201 202 data = self._get_seq_string(record) # Catches sequence being None 203 204 assert "\n" not in data 205 assert "\r" not in data 206 207 if self.wrap: 208 for i in range(0, len(data), self.wrap): 209 self.handle.write(data[i:i + self.wrap] + "\n") 210 else: 211 self.handle.write(data + "\n")
212 213 if __name__ == "__main__": 214 print("Running quick self test") 215 216 import os 217 from Bio.Alphabet import generic_protein, generic_nucleotide 218 219 #Download the files from here: 220 #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans 221 fna_filename = "NC_005213.fna" 222 faa_filename = "NC_005213.faa" 223
224 - def genbank_name_function(text):
225 text, descr = text.split(None, 1) 226 id = text.split("|")[3] 227 name = id.split(".", 1)[0] 228 return id, name, descr
229 243 244 if os.path.isfile(fna_filename): 245 print("--------") 246 print("FastaIterator (single sequence)") 247 with open(fna_filename, "r") as h: 248 iterator = FastaIterator(h, alphabet=generic_nucleotide, title2ids=genbank_name_function) 249 count = 0 250 for record in iterator: 251 count += 1 252 print_record(record) 253 assert count == 1 254 print(str(record.__class__)) 255 256 if os.path.isfile(faa_filename): 257 print("--------") 258 print("FastaIterator (multiple sequences)") 259 with open(faa_filename, "r") as h: 260 iterator = FastaIterator(h, alphabet=generic_protein, title2ids=genbank_name_function) 261 count = 0 262 for record in iterator: 263 count += 1 264 print_record(record) 265 break 266 assert count > 0 267 print(str(record.__class__)) 268 269 from Bio._py3k import StringIO 270 print("--------") 271 print("FastaIterator (empty input file)") 272 #Just to make sure no errors happen 273 iterator = FastaIterator(StringIO("")) 274 count = 0 275 for record in iterator: 276 count += 1 277 assert count == 0 278 279 print("Done") 280