Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2016 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  referred to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from __future__ import print_function 
 24   
 25  from Bio.Seq import Seq 
 26  from Bio.SeqRecord import SeqRecord 
 27  from Bio.Align import MultipleSeqAlignment 
 28  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 29  from Bio.Alphabet import Gapped 
 30   
 31   
32 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
33 """Extract alignment region (PRIVATE). 34 35 Helper function for the main parsing code. 36 37 To get the actual pairwise alignment sequences, we must first 38 translate the un-gapped sequence based coordinates into positions 39 in the gapped sequence (which may have a flanking region shown 40 using leading - characters). To date, I have never seen any 41 trailing flanking region shown in the m10 file, but the 42 following code should also cope with that. 43 44 Note that this code seems to work fine even when the "sq_offset" 45 entries are prsent as a result of using the -X command line option. 46 """ 47 align_stripped = alignment_seq_with_flanking.strip("-") 48 display_start = int(annotation['al_display_start']) 49 if int(annotation['al_start']) <= int(annotation['al_stop']): 50 start = int(annotation['al_start']) - display_start 51 end = int(annotation['al_stop']) - display_start + 1 52 else: 53 # FASTA has flipped this sequence... 54 start = display_start \ 55 - int(annotation['al_start']) 56 end = display_start \ 57 - int(annotation['al_stop']) + 1 58 end += align_stripped.count("-") 59 if start < 0 or start >= end or end > len(align_stripped): 60 raise ValueError("Problem with sequence start/stop,\n%s[%i:%i]\n%s" 61 % (alignment_seq_with_flanking, start, end, annotation)) 62 return align_stripped[start:end]
63 64
65 -def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
66 """Alignment iterator for the FASTA tool's pairwise alignment output. 67 68 This is for reading the pairwise alignments output by Bill Pearson's 69 FASTA program when called with the -m 10 command line option for machine 70 readable output. For more details about the FASTA tools, see the website 71 http://fasta.bioch.virginia.edu/ and the paper: 72 73 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 74 75 This class is intended to be used via the Bio.AlignIO.parse() function 76 by specifying the format as "fasta-m10" as shown in the following code:: 77 78 from Bio import AlignIO 79 handle = ... 80 for a in AlignIO.parse(handle, "fasta-m10"): 81 assert len(a) == 2, "Should be pairwise!" 82 print("Alignment length %i" % a.get_alignment_length()) 83 for record in a: 84 print("%s %s %s" % (record.seq, record.name, record.id)) 85 86 Note that this is not a full blown parser for all the information 87 in the FASTA output - for example, most of the header and all of the 88 footer is ignored. Also, the alignments are not batched according to 89 the input queries. 90 91 Also note that there can be up to about 30 letters of flanking region 92 included in the raw FASTA output as contextual information. This is NOT 93 part of the alignment itself, and is not included in the resulting 94 MultipleSeqAlignment objects returned. 95 """ 96 if alphabet is None: 97 alphabet = single_letter_alphabet 98 99 state_PREAMBLE = -1 100 state_NONE = 0 101 state_QUERY_HEADER = 1 102 state_ALIGN_HEADER = 2 103 state_ALIGN_QUERY = 3 104 state_ALIGN_MATCH = 4 105 state_ALIGN_CONS = 5 106 107 def build_hsp(): 108 if not query_tags and not match_tags: 109 raise ValueError("No data for query %r, match %r" 110 % (query_id, match_id)) 111 assert query_tags, query_tags 112 assert match_tags, match_tags 113 evalue = align_tags.get("fa_expect") 114 q = "?" # Just for printing len(q) in debug below 115 m = "?" # Just for printing len(m) in debug below 116 tool = global_tags.get("tool", "").upper() 117 118 q = _extract_alignment_region(query_seq, query_tags) 119 if tool in ["TFASTX"] and len(match_seq) == len(q): 120 m = match_seq 121 # Quick hack until I can work out how -, * and / characters 122 # and the apparent mix of aa and bp coordinates works. 123 else: 124 m = _extract_alignment_region(match_seq, match_tags) 125 if len(q) != len(m): 126 message = """Darn... amino acids vs nucleotide coordinates? 127 tool: {0} 128 query_seq: {1} 129 query_tags: {2} 130 {3} length: {4} 131 match_seq: {5} 132 match_tags: {6} 133 {7} length: {8} 134 handle.name: {9} 135 """.format(tool, query_seq, query_tags, q, len(q), match_seq, match_tags, m, len(m), handle.name) 136 raise ValueError(message) 137 138 assert alphabet is not None 139 alignment = MultipleSeqAlignment([], alphabet) 140 141 # TODO - Introduce an annotated alignment class? 142 # See also Bio/AlignIO/MafIO.py for same requirement. 143 # For now, store the annotation a new private property: 144 alignment._annotations = {} 145 146 # Want to record both the query header tags, and the alignment tags. 147 for key, value in header_tags.items(): 148 alignment._annotations[key] = value 149 for key, value in align_tags.items(): 150 alignment._annotations[key] = value 151 152 # Query 153 # ===== 154 record = SeqRecord(Seq(q, alphabet), 155 id=query_id, 156 name="query", 157 description=query_descr, 158 annotations={"original_length": int(query_tags["sq_len"])}) 159 # TODO - handle start/end coordinates properly. Short term hack for now: 160 record._al_start = int(query_tags["al_start"]) 161 record._al_stop = int(query_tags["al_stop"]) 162 alignment.append(record) 163 164 # TODO - What if a specific alphabet has been requested? 165 # TODO - Use an IUPAC alphabet? 166 # TODO - Can FASTA output RNA? 167 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 168 if query_tags["sq_type"] == "D": 169 record.seq.alphabet = generic_dna 170 elif query_tags["sq_type"] == "p": 171 record.seq.alphabet = generic_protein 172 if "-" in q: 173 if not hasattr(record.seq.alphabet, "gap_char"): 174 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 175 176 # Match 177 # ===== 178 record = SeqRecord(Seq(m, alphabet), 179 id=match_id, 180 name="match", 181 description=match_descr, 182 annotations={"original_length": int(match_tags["sq_len"])}) 183 # TODO - handle start/end coordinates properly. Short term hack for now: 184 record._al_start = int(match_tags["al_start"]) 185 record._al_stop = int(match_tags["al_stop"]) 186 alignment.append(record) 187 188 # This is still a very crude way of dealing with the alphabet: 189 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 190 if match_tags["sq_type"] == "D": 191 record.seq.alphabet = generic_dna 192 elif match_tags["sq_type"] == "p": 193 record.seq.alphabet = generic_protein 194 if "-" in m: 195 if not hasattr(record.seq.alphabet, "gap_char"): 196 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 197 198 return alignment
199 200 state = state_PREAMBLE 201 query_id = None 202 match_id = None 203 query_descr = "" 204 match_descr = "" 205 global_tags = {} 206 header_tags = {} 207 align_tags = {} 208 query_tags = {} 209 match_tags = {} 210 query_seq = "" 211 match_seq = "" 212 cons_seq = "" 213 for line in handle: 214 if ">>>" in line and not line.startswith(">>>"): 215 if query_id and match_id: 216 # This happens on old FASTA output which lacked an end of 217 # query >>><<< marker line. 218 yield build_hsp() 219 state = state_NONE 220 query_descr = line[line.find(">>>") + 3:].strip() 221 query_id = query_descr.split(None, 1)[0] 222 match_id = None 223 header_tags = {} 224 align_tags = {} 225 query_tags = {} 226 match_tags = {} 227 query_seq = "" 228 match_seq = "" 229 cons_seq = "" 230 elif line.startswith("!! No "): 231 # e.g. 232 # !! No library sequences with E() < 0.5 233 # or on more recent versions, 234 # No sequences with E() < 0.05 235 assert state == state_NONE 236 assert not header_tags 237 assert not align_tags 238 assert not match_tags 239 assert not query_tags 240 assert match_id is None 241 assert not query_seq 242 assert not match_seq 243 assert not cons_seq 244 query_id = None 245 elif line.strip() in [">>><<<", ">>>///"]: 246 # End of query, possible end of all queries 247 if query_id and match_id: 248 yield build_hsp() 249 state = state_NONE 250 query_id = None 251 match_id = None 252 header_tags = {} 253 align_tags = {} 254 query_tags = {} 255 match_tags = {} 256 query_seq = "" 257 match_seq = "" 258 cons_seq = "" 259 elif line.startswith(">>>"): 260 # Should be start of a match! 261 assert query_id is not None 262 assert line[3:].split(", ", 1)[0] == query_id, line 263 assert match_id is None 264 assert not header_tags 265 assert not align_tags 266 assert not query_tags 267 assert not match_tags 268 assert not match_seq 269 assert not query_seq 270 assert not cons_seq 271 state = state_QUERY_HEADER 272 elif line.startswith(">>"): 273 # Should now be at start of a match alignment! 274 if query_id and match_id: 275 yield build_hsp() 276 align_tags = {} 277 query_tags = {} 278 match_tags = {} 279 query_seq = "" 280 match_seq = "" 281 cons_seq = "" 282 match_descr = line[2:].strip() 283 match_id = match_descr.split(None, 1)[0] 284 state = state_ALIGN_HEADER 285 elif line.startswith(">--"): 286 # End of one HSP 287 assert query_id and match_id, line 288 yield build_hsp() 289 # Clean up read for next HSP 290 # but reuse header_tags 291 align_tags = {} 292 query_tags = {} 293 match_tags = {} 294 query_seq = "" 295 match_seq = "" 296 cons_seq = "" 297 state = state_ALIGN_HEADER 298 elif line.startswith(">"): 299 if state == state_ALIGN_HEADER: 300 # Should be start of query alignment seq... 301 assert query_id is not None, line 302 assert match_id is not None, line 303 assert query_id.startswith(line[1:].split(None, 1)[0]), line 304 state = state_ALIGN_QUERY 305 elif state == state_ALIGN_QUERY: 306 # Should be start of match alignment seq 307 assert query_id is not None, line 308 assert match_id is not None, line 309 assert match_id.startswith(line[1:].split(None, 1)[0]), line 310 state = state_ALIGN_MATCH 311 elif state == state_NONE: 312 # Can get > as the last line of a histogram 313 pass 314 else: 315 raise RuntimeError("state %i got %r" % (state, line)) 316 elif line.startswith("; al_cons"): 317 assert state == state_ALIGN_MATCH, line 318 state = state_ALIGN_CONS 319 # Next line(s) should be consensus seq... 320 elif line.startswith("; "): 321 if ": " in line: 322 key, value = [s.strip() for s in line[2:].split(": ", 1)] 323 else: 324 import warnings 325 # Seen in lalign36, specifically version 36.3.4 Apr, 2011 326 # Fixed in version 36.3.5b Oct, 2011(preload8) 327 warnings.warn("Missing colon in line: %r" % line) 328 try: 329 key, value = [s.strip() for s in line[2:].split(" ", 1)] 330 except ValueError: 331 raise ValueError("Bad line: %r" % line) 332 if state == state_QUERY_HEADER: 333 header_tags[key] = value 334 elif state == state_ALIGN_HEADER: 335 align_tags[key] = value 336 elif state == state_ALIGN_QUERY: 337 query_tags[key] = value 338 elif state == state_ALIGN_MATCH: 339 match_tags[key] = value 340 else: 341 raise RuntimeError("Unexpected state %r, %r" % (state, line)) 342 elif state == state_ALIGN_QUERY: 343 query_seq += line.strip() 344 elif state == state_ALIGN_MATCH: 345 match_seq += line.strip() 346 elif state == state_ALIGN_CONS: 347 cons_seq += line.strip("\n") 348 elif state == state_PREAMBLE: 349 if line.startswith("#"): 350 global_tags["command"] = line[1:].strip() 351 elif line.startswith(" version "): 352 global_tags["version"] = line[9:].strip() 353 elif " compares a " in line: 354 global_tags["tool"] = line[:line.find(" compares a ")].strip() 355 elif " searches a " in line: 356 global_tags["tool"] = line[:line.find(" searches a ")].strip() 357 else: 358 pass 359