Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2015 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Alphabet import single_letter_alphabet 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 21   
 22   
23 -def SimpleFastaParser(handle):
24 """Iterate over Fasta records as string tuples. 25 26 For each record a tuple of two strings is returned, the FASTA title 27 line (without the leading '>' character), and the sequence (with any 28 whitespace removed). The title line is not divided up into an 29 identifier (the first word) and comment or description. 30 31 >>> with open("Fasta/dups.fasta") as handle: 32 ... for values in SimpleFastaParser(handle): 33 ... print(values) 34 ... 35 ('alpha', 'ACGTA') 36 ('beta', 'CGTC') 37 ('gamma', 'CCGCC') 38 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 39 ('delta', 'CGCGC') 40 41 """ 42 # Skip any text before the first record (e.g. blank lines, comments) 43 while True: 44 line = handle.readline() 45 if line == "": 46 return # Premature end of file, or just empty? 47 if line[0] == ">": 48 break 49 50 while True: 51 if line[0] != ">": 52 raise ValueError( 53 "Records in Fasta files should start with '>' character") 54 title = line[1:].rstrip() 55 lines = [] 56 line = handle.readline() 57 while True: 58 if not line: 59 break 60 if line[0] == ">": 61 break 62 lines.append(line.rstrip()) 63 line = handle.readline() 64 65 # Remove trailing whitespace, and any internal spaces 66 # (and any embedded \r which are possible in mangled files 67 # when not opened in universal read lines mode) 68 yield title, "".join(lines).replace(" ", "").replace("\r", "") 69 70 if not line: 71 return # StopIteration 72 73 assert False, "Should not reach this line"
74 75
76 -def FastaTwoLineParser(handle):
77 """Iterate over no-wrapping Fasta records as string tuples. 78 79 Functionally the same as SimpleFastaParser but with a strict 80 interpretation of the FASTA format as exactly two lines per 81 record, the greater-than-sign identifier with description, 82 and the sequence with no line wrapping. 83 84 Any line wrapping will raise an exception, as will excess blank 85 lines (other than the special case of a zero-length sequence 86 as the second line of a record). 87 88 Examples 89 -------- 90 This file uses two lines per FASTA record: 91 92 >>> with open("Fasta/aster_no_wrap.pro") as handle: 93 ... for title, seq in FastaTwoLineParser(handle): 94 ... print("%s = %s..." % (title, seq[:3])) 95 ... 96 gi|3298468|dbj|BAA31520.1| SAMIPF = GGH... 97 98 This equivalent file uses line wrapping: 99 100 >>> with open("Fasta/aster.pro") as handle: 101 ... for title, seq in FastaTwoLineParser(handle): 102 ... print("%s = %s..." % (title, seq[:3])) 103 ... 104 Traceback (most recent call last): 105 ... 106 ValueError: Expected FASTA record starting with '>' character. Perhaps this file is using FASTA line wrapping? Got: 'MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI' 107 108 """ 109 line = handle.readline() 110 # If this is an empty file, while loop is skipped. 111 while line: 112 if line[0] != ">": 113 if line.strip(): 114 raise ValueError("Expected FASTA record starting with '>' character. " 115 "Perhaps this file is using FASTA line wrapping? " 116 "Got: %r" % line) 117 else: 118 raise ValueError("Expected FASTA record starting with '>' character. " 119 "If using two-lines-per-record, there should be no " 120 "blank lines between records, or at the end of file.") 121 title = line[1:].rstrip() 122 line = handle.readline() 123 if not line: 124 raise ValueError("Premature end of FASTA file (or this is not strict " 125 "two-line-per-record FASTA format), expected one line " 126 "of sequence following: '>%s'" % title) 127 elif line[0] == ">": 128 raise ValueError("Two '>' FASTA lines in a row. Missing sequence line " 129 "if this is strict two-line-per-record FASTA format. " 130 "Have '>%s' and '%s'" % (title, line)) 131 yield title, line.strip() 132 line = handle.readline() 133 134 assert not line, "Should be at end of file, but line=%r" % line
135 136
137 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
138 """Iterate over Fasta records as SeqRecord objects. 139 140 Arguments: 141 - handle - input file 142 - alphabet - optional alphabet 143 - title2ids - A function that, when given the title of the FASTA 144 file (without the beginning >), will return the id, name and 145 description (in that order) for the record as a tuple of strings. 146 If this is not given, then the entire title line will be used 147 as the description, and the first word as the id and name. 148 149 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 150 with no custom handling of the title lines: 151 152 >>> with open("Fasta/dups.fasta") as handle: 153 ... for record in FastaIterator(handle): 154 ... print(record.id) 155 ... 156 alpha 157 beta 158 gamma 159 alpha 160 delta 161 162 However, you can supply a title2ids function to alter this: 163 164 >>> def take_upper(title): 165 ... return title.split(None, 1)[0].upper(), "", title 166 >>> with open("Fasta/dups.fasta") as handle: 167 ... for record in FastaIterator(handle, title2ids=take_upper): 168 ... print(record.id) 169 ... 170 ALPHA 171 BETA 172 GAMMA 173 ALPHA 174 DELTA 175 176 """ 177 if title2ids: 178 for title, sequence in SimpleFastaParser(handle): 179 id, name, descr = title2ids(title) 180 yield SeqRecord(Seq(sequence, alphabet), 181 id=id, name=name, description=descr) 182 else: 183 for title, sequence in SimpleFastaParser(handle): 184 try: 185 first_word = title.split(None, 1)[0] 186 except IndexError: 187 assert not title, repr(title) 188 # Should we use SeqRecord default for no ID? 189 first_word = "" 190 yield SeqRecord(Seq(sequence, alphabet), 191 id=first_word, name=first_word, description=title)
192 193
194 -def FastaTwoLineIterator(handle, alphabet=single_letter_alphabet):
195 """Iterate over two-line Fasta records (as SeqRecord objects). 196 197 Arguments: 198 - handle - input file 199 - alphabet - optional alphabet 200 201 This uses a strict interpretation of the FASTA as requiring 202 exactly two lines per record (no line wrapping). 203 204 Only the default title to ID/name/description parsing offered 205 by the relaxed FASTA parser is offered. 206 """ 207 for title, sequence in FastaTwoLineParser(handle): 208 try: 209 first_word = title.split(None, 1)[0] 210 except IndexError: 211 assert not title, repr(title) 212 # Should we use SeqRecord default for no ID? 213 first_word = "" 214 yield SeqRecord(Seq(sequence, alphabet), 215 id=first_word, name=first_word, description=title)
216 217
218 -class FastaWriter(SequentialSequenceWriter):
219 """Class to write Fasta format files.""" 220
221 - def __init__(self, handle, wrap=60, record2title=None):
222 """Create a Fasta writer. 223 224 Arguments: 225 - handle - Handle to an output file, e.g. as returned 226 by open(filename, "w") 227 - wrap - Optional line length used to wrap sequence lines. 228 Defaults to wrapping the sequence at 60 characters 229 Use zero (or None) for no wrapping, giving a single 230 long line for the sequence. 231 - record2title - Optional function to return the text to be 232 used for the title line of each record. By default 233 a combination of the record.id and record.description 234 is used. If the record.description starts with the 235 record.id, then just the record.description is used. 236 237 You can either use:: 238 239 handle = open(filename, "w") 240 writer = FastaWriter(handle) 241 writer.write_file(myRecords) 242 handle.close() 243 244 Or, follow the sequential file writer system, for example:: 245 246 handle = open(filename, "w") 247 writer = FastaWriter(handle) 248 writer.write_header() # does nothing for Fasta files 249 ... 250 Multiple writer.write_record() and/or writer.write_records() calls 251 ... 252 writer.write_footer() # does nothing for Fasta files 253 handle.close() 254 255 """ 256 SequentialSequenceWriter.__init__(self, handle) 257 self.wrap = None 258 if wrap: 259 if wrap < 1: 260 raise ValueError 261 self.wrap = wrap 262 self.record2title = record2title
263
264 - def write_record(self, record):
265 """Write a single Fasta record to the file.""" 266 assert self._header_written 267 assert not self._footer_written 268 self._record_written = True 269 270 if self.record2title: 271 title = self.clean(self.record2title(record)) 272 else: 273 id = self.clean(record.id) 274 description = self.clean(record.description) 275 if description and description.split(None, 1)[0] == id: 276 # The description includes the id at the start 277 title = description 278 elif description: 279 title = "%s %s" % (id, description) 280 else: 281 title = id 282 283 assert "\n" not in title 284 assert "\r" not in title 285 self.handle.write(">%s\n" % title) 286 287 data = self._get_seq_string(record) # Catches sequence being None 288 289 assert "\n" not in data 290 assert "\r" not in data 291 292 if self.wrap: 293 for i in range(0, len(data), self.wrap): 294 self.handle.write(data[i:i + self.wrap] + "\n") 295 else: 296 self.handle.write(data + "\n")
297 298
299 -class FastaTwoLineWriter(FastaWriter):
300 """Class to write 2-line per record Fasta format files. 301 302 This means we write the sequence information without line 303 wrapping, and will always write a blank line for an empty 304 sequence. 305 """ 306
307 - def __init__(self, handle, record2title=None):
308 """Create a Fasta writer. 309 310 Arguments: 311 - handle - Handle to an output file, e.g. as returned 312 by open(filename, "w") 313 - record2title - Optional function to return the text to be 314 used for the title line of each record. By default 315 a combination of the record.id and record.description 316 is used. If the record.description starts with the 317 record.id, then just the record.description is used. 318 319 You can either use:: 320 321 handle = open(filename, "w") 322 writer = FastaWriter(handle) 323 writer.write_file(myRecords) 324 handle.close() 325 326 Or, follow the sequential file writer system, for example:: 327 328 handle = open(filename, "w") 329 writer = FastaWriter(handle) 330 writer.write_header() # does nothing for Fasta files 331 ... 332 Multiple writer.write_record() and/or writer.write_records() calls 333 ... 334 writer.write_footer() # does nothing for Fasta files 335 handle.close() 336 337 """ 338 super(FastaTwoLineWriter, self).__init__(handle, wrap=None, 339 record2title=record2title)
340 341 342 if __name__ == "__main__": 343 from Bio._utils import run_doctest 344 run_doctest(verbose=0) 345