Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.FastaIO

  1  # Copyright 2006-2017 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # This module is for reading and writing FASTA format files as SeqRecord 
  7  # objects.  The code is partly inspired  by earlier Biopython modules, 
  8  # Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA 
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions. 
 13  """ 
 14   
 15  from __future__ import print_function 
 16   
 17  from Bio.Alphabet import single_letter_alphabet 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 21  from Bio.SeqIO.Interfaces import _clean, _get_seq_string 
 22   
 23   
24 -def SimpleFastaParser(handle):
25 """Iterate over Fasta records as string tuples. 26 27 For each record a tuple of two strings is returned, the FASTA title 28 line (without the leading '>' character), and the sequence (with any 29 whitespace removed). The title line is not divided up into an 30 identifier (the first word) and comment or description. 31 32 >>> with open("Fasta/dups.fasta") as handle: 33 ... for values in SimpleFastaParser(handle): 34 ... print(values) 35 ... 36 ('alpha', 'ACGTA') 37 ('beta', 'CGTC') 38 ('gamma', 'CCGCC') 39 ('alpha (again - this is a duplicate entry to test the indexing code)', 'ACGTA') 40 ('delta', 'CGCGC') 41 42 """ 43 # Skip any text before the first record (e.g. blank lines, comments) 44 while True: 45 line = handle.readline() 46 if line == "": 47 return # Premature end of file, or just empty? 48 if line[0] == ">": 49 break 50 51 while True: 52 if line[0] != ">": 53 raise ValueError( 54 "Records in Fasta files should start with '>' character") 55 title = line[1:].rstrip() 56 lines = [] 57 line = handle.readline() 58 while True: 59 if not line: 60 break 61 if line[0] == ">": 62 break 63 lines.append(line.rstrip()) 64 line = handle.readline() 65 66 # Remove trailing whitespace, and any internal spaces 67 # (and any embedded \r which are possible in mangled files 68 # when not opened in universal read lines mode) 69 yield title, "".join(lines).replace(" ", "").replace("\r", "") 70 71 if not line: 72 return # StopIteration 73 74 assert False, "Should not reach this line"
75 76
77 -def FastaTwoLineParser(handle):
78 """Iterate over no-wrapping Fasta records as string tuples. 79 80 Functionally the same as SimpleFastaParser but with a strict 81 interpretation of the FASTA format as exactly two lines per 82 record, the greater-than-sign identifier with description, 83 and the sequence with no line wrapping. 84 85 Any line wrapping will raise an exception, as will excess blank 86 lines (other than the special case of a zero-length sequence 87 as the second line of a record). 88 89 Examples 90 -------- 91 This file uses two lines per FASTA record: 92 93 >>> with open("Fasta/aster_no_wrap.pro") as handle: 94 ... for title, seq in FastaTwoLineParser(handle): 95 ... print("%s = %s..." % (title, seq[:3])) 96 ... 97 gi|3298468|dbj|BAA31520.1| SAMIPF = GGH... 98 99 This equivalent file uses line wrapping: 100 101 >>> with open("Fasta/aster.pro") as handle: 102 ... for title, seq in FastaTwoLineParser(handle): 103 ... print("%s = %s..." % (title, seq[:3])) 104 ... 105 Traceback (most recent call last): 106 ... 107 ValueError: Expected FASTA record starting with '>' character. Perhaps this file is using FASTA line wrapping? Got: 'MTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI' 108 109 """ 110 line = handle.readline() 111 # If this is an empty file, while loop is skipped. 112 while line: 113 if line[0] != ">": 114 if line.strip(): 115 raise ValueError("Expected FASTA record starting with '>' character. " 116 "Perhaps this file is using FASTA line wrapping? " 117 "Got: %r" % line) 118 else: 119 raise ValueError("Expected FASTA record starting with '>' character. " 120 "If using two-lines-per-record, there should be no " 121 "blank lines between records, or at the end of file.") 122 title = line[1:].rstrip() 123 line = handle.readline() 124 if not line: 125 raise ValueError("Premature end of FASTA file (or this is not strict " 126 "two-line-per-record FASTA format), expected one line " 127 "of sequence following: '>%s'" % title) 128 elif line[0] == ">": 129 raise ValueError("Two '>' FASTA lines in a row. Missing sequence line " 130 "if this is strict two-line-per-record FASTA format. " 131 "Have '>%s' and '%s'" % (title, line)) 132 yield title, line.strip() 133 line = handle.readline() 134 135 assert not line, "Should be at end of file, but line=%r" % line
136 137
138 -def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
139 """Iterate over Fasta records as SeqRecord objects. 140 141 Arguments: 142 - handle - input file 143 - alphabet - optional alphabet 144 - title2ids - A function that, when given the title of the FASTA 145 file (without the beginning >), will return the id, name and 146 description (in that order) for the record as a tuple of strings. 147 If this is not given, then the entire title line will be used 148 as the description, and the first word as the id and name. 149 150 By default this will act like calling Bio.SeqIO.parse(handle, "fasta") 151 with no custom handling of the title lines: 152 153 >>> with open("Fasta/dups.fasta") as handle: 154 ... for record in FastaIterator(handle): 155 ... print(record.id) 156 ... 157 alpha 158 beta 159 gamma 160 alpha 161 delta 162 163 However, you can supply a title2ids function to alter this: 164 165 >>> def take_upper(title): 166 ... return title.split(None, 1)[0].upper(), "", title 167 >>> with open("Fasta/dups.fasta") as handle: 168 ... for record in FastaIterator(handle, title2ids=take_upper): 169 ... print(record.id) 170 ... 171 ALPHA 172 BETA 173 GAMMA 174 ALPHA 175 DELTA 176 177 """ 178 if title2ids: 179 for title, sequence in SimpleFastaParser(handle): 180 id, name, descr = title2ids(title) 181 yield SeqRecord(Seq(sequence, alphabet), 182 id=id, name=name, description=descr) 183 else: 184 for title, sequence in SimpleFastaParser(handle): 185 try: 186 first_word = title.split(None, 1)[0] 187 except IndexError: 188 assert not title, repr(title) 189 # Should we use SeqRecord default for no ID? 190 first_word = "" 191 yield SeqRecord(Seq(sequence, alphabet), 192 id=first_word, name=first_word, description=title)
193 194
195 -def FastaTwoLineIterator(handle, alphabet=single_letter_alphabet):
196 """Iterate over two-line Fasta records (as SeqRecord objects). 197 198 Arguments: 199 - handle - input file 200 - alphabet - optional alphabet 201 202 This uses a strict interpretation of the FASTA as requiring 203 exactly two lines per record (no line wrapping). 204 205 Only the default title to ID/name/description parsing offered 206 by the relaxed FASTA parser is offered. 207 """ 208 for title, sequence in FastaTwoLineParser(handle): 209 try: 210 first_word = title.split(None, 1)[0] 211 except IndexError: 212 assert not title, repr(title) 213 # Should we use SeqRecord default for no ID? 214 first_word = "" 215 yield SeqRecord(Seq(sequence, alphabet), 216 id=first_word, name=first_word, description=title)
217 218
219 -class FastaWriter(SequentialSequenceWriter):
220 """Class to write Fasta format files (OBSOLETE). 221 222 Please use the ``as_fasta`` function instead, or the top level 223 ``Bio.SeqIO.write()`` function instead using ``format="fasta"``. 224 """ 225
226 - def __init__(self, handle, wrap=60, record2title=None):
227 """Create a Fasta writer (OBSOLETE). 228 229 Arguments: 230 - handle - Handle to an output file, e.g. as returned 231 by open(filename, "w") 232 - wrap - Optional line length used to wrap sequence lines. 233 Defaults to wrapping the sequence at 60 characters 234 Use zero (or None) for no wrapping, giving a single 235 long line for the sequence. 236 - record2title - Optional function to return the text to be 237 used for the title line of each record. By default 238 a combination of the record.id and record.description 239 is used. If the record.description starts with the 240 record.id, then just the record.description is used. 241 242 You can either use:: 243 244 handle = open(filename, "w") 245 writer = FastaWriter(handle) 246 writer.write_file(myRecords) 247 handle.close() 248 249 Or, follow the sequential file writer system, for example:: 250 251 handle = open(filename, "w") 252 writer = FastaWriter(handle) 253 writer.write_header() # does nothing for Fasta files 254 ... 255 Multiple writer.write_record() and/or writer.write_records() calls 256 ... 257 writer.write_footer() # does nothing for Fasta files 258 handle.close() 259 260 """ 261 SequentialSequenceWriter.__init__(self, handle) 262 self.wrap = None 263 if wrap: 264 if wrap < 1: 265 raise ValueError 266 self.wrap = wrap 267 self.record2title = record2title
268
269 - def write_record(self, record):
270 """Write a single Fasta record to the file.""" 271 assert self._header_written 272 assert not self._footer_written 273 self._record_written = True 274 275 if self.record2title: 276 title = self.clean(self.record2title(record)) 277 else: 278 id = self.clean(record.id) 279 description = self.clean(record.description) 280 if description and description.split(None, 1)[0] == id: 281 # The description includes the id at the start 282 title = description 283 elif description: 284 title = "%s %s" % (id, description) 285 else: 286 title = id 287 288 assert "\n" not in title 289 assert "\r" not in title 290 self.handle.write(">%s\n" % title) 291 292 data = self._get_seq_string(record) # Catches sequence being None 293 294 assert "\n" not in data 295 assert "\r" not in data 296 297 if self.wrap: 298 for i in range(0, len(data), self.wrap): 299 self.handle.write(data[i:i + self.wrap] + "\n") 300 else: 301 self.handle.write(data + "\n")
302 303
304 -class FastaTwoLineWriter(FastaWriter):
305 """Class to write 2-line per record Fasta format files (OBSOLETE). 306 307 This means we write the sequence information without line 308 wrapping, and will always write a blank line for an empty 309 sequence. 310 """ 311
312 - def __init__(self, handle, record2title=None):
313 """Create a 2-line per record Fasta writer (OBSOLETE). 314 315 Arguments: 316 - handle - Handle to an output file, e.g. as returned 317 by open(filename, "w") 318 - record2title - Optional function to return the text to be 319 used for the title line of each record. By default 320 a combination of the record.id and record.description 321 is used. If the record.description starts with the 322 record.id, then just the record.description is used. 323 324 You can either use:: 325 326 handle = open(filename, "w") 327 writer = FastaWriter(handle) 328 writer.write_file(myRecords) 329 handle.close() 330 331 Or, follow the sequential file writer system, for example:: 332 333 handle = open(filename, "w") 334 writer = FastaWriter(handle) 335 writer.write_header() # does nothing for Fasta files 336 ... 337 Multiple writer.write_record() and/or writer.write_records() calls 338 ... 339 writer.write_footer() # does nothing for Fasta files 340 handle.close() 341 342 """ 343 super(FastaTwoLineWriter, self).__init__(handle, wrap=None, 344 record2title=record2title)
345 346
347 -def as_fasta(record):
348 """Turn a SeqRecord into a FASTA formated string. 349 350 This is used internally by the SeqRecord's .format("fasta") 351 method and by the SeqIO.write(..., ..., "fasta") function. 352 """ 353 id = _clean(record.id) 354 description = _clean(record.description) 355 if description and description.split(None, 1)[0] == id: 356 # The description includes the id at the start 357 title = description 358 elif description: 359 title = "%s %s" % (id, description) 360 else: 361 title = id 362 assert "\n" not in title 363 assert "\r" not in title 364 lines = [">%s\n" % title] 365 366 data = _get_seq_string(record) # Catches sequence being None 367 assert "\n" not in data 368 assert "\r" not in data 369 for i in range(0, len(data), 60): 370 lines.append(data[i:i + 60] + "\n") 371 372 return "".join(lines)
373 374
375 -def as_fasta_2line(record):
376 """Turn a SeqRecord into a two-line FASTA formated string. 377 378 This is used internally by the SeqRecord's .format("fasta-2line") 379 method and by the SeqIO.write(..., ..., "fasta-2line") function. 380 """ 381 id = _clean(record.id) 382 description = _clean(record.description) 383 if description and description.split(None, 1)[0] == id: 384 # The description includes the id at the start 385 title = description 386 elif description: 387 title = "%s %s" % (id, description) 388 else: 389 title = id 390 assert "\n" not in title 391 assert "\r" not in title 392 393 data = _get_seq_string(record) # Catches sequence being None 394 assert "\n" not in data 395 assert "\r" not in data 396 397 return ">%s\n%s\n" % (title, data)
398 399 400 if __name__ == "__main__": 401 from Bio._utils import run_doctest 402 run_doctest(verbose=0) 403