Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from Bio.Alphabet import single_letter_alphabet 
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 18   
 19   
20 -def SimpleFastaParser(handle):
21 """Generator function to iterator over Fasta records (as string tuples). 22 23 For each record a tuple of two strings is returned, the FASTA title 24 line (without the leading '>' character), and the sequence (with any 25 whitespace removed). The title line is not divided up into an 26 identifier (the first word) and comment or description. 27 28 >>> for values in SimpleFastaParser(open("Fasta/dups.fasta")): 29 ... print values 30 ('alpha', 'ACGTA') 31 ('beta', 'CGTC') 32 ('gamma', 'CCGCC') 33 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 34 ('delta', 'CGCGC') 35 36 """ 37 #Skip any text before the first record (e.g. blank lines, comments) 38 while True: 39 line = handle.readline() 40 if line == "": 41 return # Premature end of file, or just empty? 42 if line[0] == ">": 43 break 44 45 while True: 46 if line[0] != ">": 47 raise ValueError( 48 "Records in Fasta files should start with '>' character") 49 title = line[1:].rstrip() 50 lines = [] 51 line = handle.readline() 52 while True: 53 if not line: 54 break 55 if line[0] == ">": 56 break 57 lines.append(line.rstrip()) 58 line = handle.readline() 59 60 #Remove trailing whitespace, and any internal spaces 61 #(and any embedded \r which are possible in mangled files 62 #when not opened in universal read lines mode) 63 yield title, "".join(lines).replace(" ", "").replace("\r", "") 64 65 if not line: 66 return # StopIteration 67 68 assert False, "Should not reach this line"
69 70
71 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
72 """Generator function to iterate over Fasta records (as SeqRecord objects). 73 74 handle - input file 75 alphabet - optional alphabet 76 title2ids - A function that, when given the title of the FASTA 77 file (without the beginning >), will return the id, name and 78 description (in that order) for the record as a tuple of strings. 79 80 If this is not given, then the entire title line will be used 81 as the description, and the first word as the id and name. 82 83 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 84 with no custom handling of the title lines: 85 86 >>> for record in FastaIterator(open("Fasta/dups.fasta")): 87 ... print record.id 88 alpha 89 beta 90 gamma 91 alpha 92 delta 93 94 However, you can supply a title2ids function to alter this: 95 96 >>> def take_upper(title): 97 ... return title.split(None,1)[0].upper(), "", title 98 >>> for record in FastaIterator(open("Fasta/dups.fasta"), title2ids=take_upper): 99 ... print record.id 100 ALPHA 101 BETA 102 GAMMA 103 ALPHA 104 DELTA 105 106 """ 107 if title2ids: 108 for title, sequence in SimpleFastaParser(handle): 109 id, name, descr = title2ids(title) 110 yield SeqRecord(Seq(sequence, alphabet), 111 id=id, name=name, description=descr) 112 else: 113 for title, sequence in SimpleFastaParser(handle): 114 try: 115 first_word = title.split(None, 1)[0] 116 except IndexError: 117 assert not title, repr(title) 118 #Should we use SeqRecord default for no ID? 119 first_word = "" 120 yield SeqRecord(Seq(sequence, alphabet), 121 id=first_word, name=first_word, description=title)
122 123
124 -class FastaWriter(SequentialSequenceWriter):
125 """Class to write Fasta format files."""
126 - def __init__(self, handle, wrap=60, record2title=None):
127 """Create a Fasta writer. 128 129 handle - Handle to an output file, e.g. as returned 130 by open(filename, "w") 131 wrap - Optional line length used to wrap sequence lines. 132 Defaults to wrapping the sequence at 60 characters 133 Use zero (or None) for no wrapping, giving a single 134 long line for the sequence. 135 record2title - Optional function to return the text to be 136 used for the title line of each record. By default the 137 a combination of the record.id and record.description 138 is used. If the record.description starts with the 139 record.id, then just the record.description is used. 140 141 You can either use: 142 143 myWriter = FastaWriter(open(filename,"w")) 144 writer.write_file(myRecords) 145 146 Or, follow the sequential file writer system, for example: 147 148 myWriter = FastaWriter(open(filename,"w")) 149 writer.write_header() # does nothing for Fasta files 150 ... 151 Multiple calls to writer.write_record() and/or writer.write_records() 152 ... 153 writer.write_footer() # does nothing for Fasta files 154 writer.close() 155 """ 156 SequentialSequenceWriter.__init__(self, handle) 157 #self.handle = handle 158 self.wrap = None 159 if wrap: 160 if wrap < 1: 161 raise ValueError 162 self.wrap = wrap 163 self.record2title = record2title
164
165 - def write_record(self, record):
166 """Write a single Fasta record to the file.""" 167 assert self._header_written 168 assert not self._footer_written 169 self._record_written = True 170 171 if self.record2title: 172 title = self.clean(self.record2title(record)) 173 else: 174 id = self.clean(record.id) 175 description = self.clean(record.description) 176 177 #if description[:len(id)]==id: 178 if description and description.split(None, 1)[0] == id: 179 #The description includes the id at the start 180 title = description 181 elif description: 182 title = "%s %s" % (id, description) 183 else: 184 title = id 185 186 assert "\n" not in title 187 assert "\r" not in title 188 self.handle.write(">%s\n" % title) 189 190 data = self._get_seq_string(record) # Catches sequence being None 191 192 assert "\n" not in data 193 assert "\r" not in data 194 195 if self.wrap: 196 for i in range(0, len(data), self.wrap): 197 self.handle.write(data[i:i + self.wrap] + "\n") 198 else: 199 self.handle.write(data + "\n")
200 201 if __name__ == "__main__": 202 print "Running quick self test" 203 204 import os 205 from Bio.Alphabet import generic_protein, generic_nucleotide 206 207 #Download the files from here: 208 #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans 209 fna_filename = "NC_005213.fna" 210 faa_filename = "NC_005213.faa" 211
212 - def genbank_name_function(text):
213 text, descr = text.split(None, 1) 214 id = text.split("|")[3] 215 name = id.split(".", 1)[0] 216 return id, name, descr
217 231 232 if os.path.isfile(fna_filename): 233 print "--------" 234 print "FastaIterator (single sequence)" 235 iterator = FastaIterator(open(fna_filename, "r"), alphabet=generic_nucleotide, title2ids=genbank_name_function) 236 count = 0 237 for record in iterator: 238 count += 1 239 print_record(record) 240 assert count == 1 241 print str(record.__class__) 242 243 if os.path.isfile(faa_filename): 244 print "--------" 245 print "FastaIterator (multiple sequences)" 246 iterator = FastaIterator(open(faa_filename, "r"), alphabet=generic_protein, title2ids=genbank_name_function) 247 count = 0 248 for record in iterator: 249 count += 1 250 print_record(record) 251 break 252 assert count > 0 253 print str(record.__class__) 254 255 from cStringIO import StringIO 256 print "--------" 257 print "FastaIterator (empty input file)" 258 #Just to make sure no errors happen 259 iterator = FastaIterator(StringIO("")) 260 count = 0 261 for record in iterator: 262 count += 1 263 assert count == 0 264 265 print "Done" 266