Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  This module contains a parser for the pairwise alignments produced by Bill 
 12  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 13  refered to as the "fasta-m10" file format (as we only support the machine 
 14  readable output format selected with the -m 10 command line option). 
 15   
 16  This module does NOT cover the generic "fasta" file format originally 
 17  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 18  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 19  which can also be used to store a multiple sequence alignments. 
 20  """ 
 21   
 22  from __future__ import print_function 
 23   
 24  from Bio.Seq import Seq 
 25  from Bio.SeqRecord import SeqRecord 
 26  from Bio.Align import MultipleSeqAlignment 
 27  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 28  from Bio.Alphabet import Gapped 
 29   
 30  __docformat__ = "restructuredtext en" 
 31   
32 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
33 """Helper function for the main parsing code (PRIVATE). 34 35 To get the actual pairwise alignment sequences, we must first 36 translate the un-gapped sequence based coordinates into positions 37 in the gapped sequence (which may have a flanking region shown 38 using leading - characters). To date, I have never seen any 39 trailing flanking region shown in the m10 file, but the 40 following code should also cope with that. 41 42 Note that this code seems to work fine even when the "sq_offset" 43 entries are prsent as a result of using the -X command line option. 44 """ 45 align_stripped = alignment_seq_with_flanking.strip("-") 46 display_start = int(annotation['al_display_start']) 47 if int(annotation['al_start']) <= int(annotation['al_stop']): 48 start = int(annotation['al_start']) \ 49 - display_start 50 end = int(annotation['al_stop']) \ 51 - display_start + 1 52 else: 53 # FASTA has flipped this sequence... 54 start = display_start \ 55 - int(annotation['al_start']) 56 end = display_start \ 57 - int(annotation['al_stop']) + 1 58 end += align_stripped.count("-") 59 assert 0 <= start and start < end and end <= len(align_stripped), \ 60 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 61 % (alignment_seq_with_flanking, start, end, annotation) 62 return align_stripped[start:end]
63 64
65 -def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
66 """Alignment iterator for the FASTA tool's pairwise alignment output. 67 68 This is for reading the pairwise alignments output by Bill Pearson's 69 FASTA program when called with the -m 10 command line option for machine 70 readable output. For more details about the FASTA tools, see the website 71 http://fasta.bioch.virginia.edu/ and the paper: 72 73 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 74 75 This class is intended to be used via the Bio.AlignIO.parse() function 76 by specifying the format as "fasta-m10" as shown in the following code:: 77 78 from Bio import AlignIO 79 handle = ... 80 for a in AlignIO.parse(handle, "fasta-m10"): 81 assert len(a) == 2, "Should be pairwise!" 82 print("Alignment length %i" % a.get_alignment_length()) 83 for record in a: 84 print("%s %s %s" % (record.seq, record.name, record.id)) 85 86 Note that this is not a full blown parser for all the information 87 in the FASTA output - for example, most of the header and all of the 88 footer is ignored. Also, the alignments are not batched according to 89 the input queries. 90 91 Also note that there can be up to about 30 letters of flanking region 92 included in the raw FASTA output as contextual information. This is NOT 93 part of the alignment itself, and is not included in the resulting 94 MultipleSeqAlignment objects returned. 95 """ 96 if alphabet is None: 97 alphabet = single_letter_alphabet 98 99 state_PREAMBLE = -1 100 state_NONE = 0 101 state_QUERY_HEADER = 1 102 state_ALIGN_HEADER = 2 103 state_ALIGN_QUERY = 3 104 state_ALIGN_MATCH = 4 105 state_ALIGN_CONS = 5 106 107 def build_hsp(): 108 if not query_tags and not match_tags: 109 raise ValueError("No data for query %r, match %r" 110 % (query_id, match_id)) 111 assert query_tags, query_tags 112 assert match_tags, match_tags 113 evalue = align_tags.get("fa_expect", None) 114 q = "?" # Just for printing len(q) in debug below 115 m = "?" # Just for printing len(m) in debug below 116 tool = global_tags.get("tool", "").upper() 117 try: 118 q = _extract_alignment_region(query_seq, query_tags) 119 if tool in ["TFASTX"] and len(match_seq) == len(q): 120 m = match_seq 121 # Quick hack until I can work out how -, * and / characters 122 # and the apparent mix of aa and bp coordinates works. 123 else: 124 m = _extract_alignment_region(match_seq, match_tags) 125 assert len(q) == len(m) 126 except AssertionError as err: 127 print("Darn... amino acids vs nucleotide coordinates?") 128 print(tool) 129 print(query_seq) 130 print(query_tags) 131 print("%s %i" % (q, len(q))) 132 print(match_seq) 133 print(match_tags) 134 print("%s %i" % (m, len(m))) 135 print(handle.name) 136 raise err 137 138 assert alphabet is not None 139 alignment = MultipleSeqAlignment([], alphabet) 140 141 # TODO - Introduce an annotated alignment class? 142 # For now, store the annotation a new private property: 143 alignment._annotations = {} 144 145 # Want to record both the query header tags, and the alignment tags. 146 for key, value in header_tags.items(): 147 alignment._annotations[key] = value 148 for key, value in align_tags.items(): 149 alignment._annotations[key] = value 150 151 # Query 152 # ===== 153 record = SeqRecord(Seq(q, alphabet), 154 id=query_id, 155 name="query", 156 description=query_descr, 157 annotations={"original_length": int(query_tags["sq_len"])}) 158 # TODO - handle start/end coordinates properly. Short term hack for now: 159 record._al_start = int(query_tags["al_start"]) 160 record._al_stop = int(query_tags["al_stop"]) 161 alignment.append(record) 162 163 # TODO - What if a specific alphabet has been requested? 164 # TODO - Use an IUPAC alphabet? 165 # TODO - Can FASTA output RNA? 166 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 167 if query_tags["sq_type"] == "D": 168 record.seq.alphabet = generic_dna 169 elif query_tags["sq_type"] == "p": 170 record.seq.alphabet = generic_protein 171 if "-" in q: 172 if not hasattr(record.seq.alphabet, "gap_char"): 173 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 174 175 # Match 176 # ===== 177 record = SeqRecord(Seq(m, alphabet), 178 id=match_id, 179 name="match", 180 description=match_descr, 181 annotations={"original_length": int(match_tags["sq_len"])}) 182 # TODO - handle start/end coordinates properly. Short term hack for now: 183 record._al_start = int(match_tags["al_start"]) 184 record._al_stop = int(match_tags["al_stop"]) 185 alignment.append(record) 186 187 # This is still a very crude way of dealing with the alphabet: 188 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 189 if match_tags["sq_type"] == "D": 190 record.seq.alphabet = generic_dna 191 elif match_tags["sq_type"] == "p": 192 record.seq.alphabet = generic_protein 193 if "-" in m: 194 if not hasattr(record.seq.alphabet, "gap_char"): 195 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 196 197 return alignment
198 199 state = state_PREAMBLE 200 query_id = None 201 match_id = None 202 query_descr = "" 203 match_descr = "" 204 global_tags = {} 205 header_tags = {} 206 align_tags = {} 207 query_tags = {} 208 match_tags = {} 209 query_seq = "" 210 match_seq = "" 211 cons_seq = "" 212 for line in handle: 213 if ">>>" in line and not line.startswith(">>>"): 214 if query_id and match_id: 215 # This happens on old FASTA output which lacked an end of 216 # query >>><<< marker line. 217 yield build_hsp() 218 state = state_NONE 219 query_descr = line[line.find(">>>")+3:].strip() 220 query_id = query_descr.split(None, 1)[0] 221 match_id = None 222 header_tags = {} 223 align_tags = {} 224 query_tags = {} 225 match_tags = {} 226 query_seq = "" 227 match_seq = "" 228 cons_seq = "" 229 elif line.startswith("!! No "): 230 # e.g. 231 # !! No library sequences with E() < 0.5 232 # or on more recent versions, 233 # No sequences with E() < 0.05 234 assert state == state_NONE 235 assert not header_tags 236 assert not align_tags 237 assert not match_tags 238 assert not query_tags 239 assert match_id is None 240 assert not query_seq 241 assert not match_seq 242 assert not cons_seq 243 query_id = None 244 elif line.strip() in [">>><<<", ">>>///"]: 245 # End of query, possible end of all queries 246 if query_id and match_id: 247 yield build_hsp() 248 state = state_NONE 249 query_id = None 250 match_id = None 251 header_tags = {} 252 align_tags = {} 253 query_tags = {} 254 match_tags = {} 255 query_seq = "" 256 match_seq = "" 257 cons_seq = "" 258 elif line.startswith(">>>"): 259 # Should be start of a match! 260 assert query_id is not None 261 assert line[3:].split(", ", 1)[0] == query_id, line 262 assert match_id is None 263 assert not header_tags 264 assert not align_tags 265 assert not query_tags 266 assert not match_tags 267 assert not match_seq 268 assert not query_seq 269 assert not cons_seq 270 state = state_QUERY_HEADER 271 elif line.startswith(">>"): 272 # Should now be at start of a match alignment! 273 if query_id and match_id: 274 yield build_hsp() 275 align_tags = {} 276 query_tags = {} 277 match_tags = {} 278 query_seq = "" 279 match_seq = "" 280 cons_seq = "" 281 match_descr = line[2:].strip() 282 match_id = match_descr.split(None, 1)[0] 283 state = state_ALIGN_HEADER 284 elif line.startswith(">--"): 285 # End of one HSP 286 assert query_id and match_id, line 287 yield build_hsp() 288 # Clean up read for next HSP 289 # but reuse header_tags 290 align_tags = {} 291 query_tags = {} 292 match_tags = {} 293 query_seq = "" 294 match_seq = "" 295 cons_seq = "" 296 state = state_ALIGN_HEADER 297 elif line.startswith(">"): 298 if state == state_ALIGN_HEADER: 299 # Should be start of query alignment seq... 300 assert query_id is not None, line 301 assert match_id is not None, line 302 assert query_id.startswith(line[1:].split(None, 1)[0]), line 303 state = state_ALIGN_QUERY 304 elif state == state_ALIGN_QUERY: 305 # Should be start of match alignment seq 306 assert query_id is not None, line 307 assert match_id is not None, line 308 assert match_id.startswith(line[1:].split(None, 1)[0]), line 309 state = state_ALIGN_MATCH 310 elif state == state_NONE: 311 # Can get > as the last line of a histogram 312 pass 313 else: 314 assert False, "state %i got %r" % (state, line) 315 elif line.startswith("; al_cons"): 316 assert state == state_ALIGN_MATCH, line 317 state = state_ALIGN_CONS 318 # Next line(s) should be consensus seq... 319 elif line.startswith("; "): 320 if ": " in line: 321 key, value = [s.strip() for s in line[2:].split(": ", 1)] 322 else: 323 import warnings 324 # Seen in lalign36, specifically version 36.3.4 Apr, 2011 325 # Fixed in version 36.3.5b Oct, 2011(preload8) 326 warnings.warn("Missing colon in line: %r" % line) 327 try: 328 key, value = [s.strip() for s in line[2:].split(" ", 1)] 329 except ValueError: 330 raise ValueError("Bad line: %r" % line) 331 if state == state_QUERY_HEADER: 332 header_tags[key] = value 333 elif state == state_ALIGN_HEADER: 334 align_tags[key] = value 335 elif state == state_ALIGN_QUERY: 336 query_tags[key] = value 337 elif state == state_ALIGN_MATCH: 338 match_tags[key] = value 339 else: 340 assert False, "Unexpected state %r, %r" % (state, line) 341 elif state == state_ALIGN_QUERY: 342 query_seq += line.strip() 343 elif state == state_ALIGN_MATCH: 344 match_seq += line.strip() 345 elif state == state_ALIGN_CONS: 346 cons_seq += line.strip("\n") 347 elif state == state_PREAMBLE: 348 if line.startswith("#"): 349 global_tags["command"] = line[1:].strip() 350 elif line.startswith(" version "): 351 global_tags["version"] = line[9:].strip() 352 elif " compares a " in line: 353 global_tags["tool"] = line[:line.find(" compares a ")].strip() 354 elif " searches a " in line: 355 global_tags["tool"] = line[:line.find(" searches a ")].strip() 356 else: 357 pass 358 359 360 if __name__ == "__main__": 361 print("Running a quick self-test") 362 363 # http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 364 simple_example = \ 365 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 366 FASTA searches a protein or DNA sequence data bank 367 version 34.26 January 12, 2007 368 Please cite: 369 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 370 371 Query library NC_002127.faa vs NC_009649.faa library 372 searching NC_009649.faa library 373 374 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 375 vs NC_009649.faa library 376 377 45119 residues in 180 sequences 378 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 379 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 380 Lambda= 0.175043 381 382 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 383 join: 36, opt: 24, open/ext: -10/-2, width: 16 384 Scan time: 0.000 385 The best scores are: opt bits E(180) 386 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 387 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 388 389 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 390 ; pg_name: /opt/fasta/fasta34 391 ; pg_ver: 34.26 392 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 393 ; pg_name: FASTA 394 ; pg_ver: 3.5 Sept 2006 395 ; pg_matrix: BL50 (15:-5) 396 ; pg_open-ext: -10 -2 397 ; pg_ktup: 2 398 ; pg_optcut: 24 399 ; pg_cgap: 36 400 ; mp_extrap: 60000 180 401 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 402 ; mp_KS: -0.0000 (N=0) at 8159228 403 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 404 ; fa_frame: f 405 ; fa_initn: 65 406 ; fa_init1: 43 407 ; fa_opt: 71 408 ; fa_z-score: 90.3 409 ; fa_bits: 24.9 410 ; fa_expect: 0.58 411 ; sw_score: 71 412 ; sw_ident: 0.250 413 ; sw_sim: 0.574 414 ; sw_overlap: 108 415 >gi|10955263| .. 416 ; sq_len: 107 417 ; sq_offset: 1 418 ; sq_type: p 419 ; al_start: 5 420 ; al_stop: 103 421 ; al_display_start: 1 422 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 423 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 424 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 425 >gi|152973457|ref|YP_001338508.1| .. 426 ; sq_len: 931 427 ; sq_type: p 428 ; al_start: 96 429 ; al_stop: 195 430 ; al_display_start: 66 431 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 432 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 433 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 434 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 435 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 436 ; fa_frame: f 437 ; fa_initn: 33 438 ; fa_init1: 33 439 ; fa_opt: 63 440 ; fa_z-score: 86.1 441 ; fa_bits: 23.1 442 ; fa_expect: 0.99 443 ; sw_score: 63 444 ; sw_ident: 0.266 445 ; sw_sim: 0.656 446 ; sw_overlap: 64 447 >gi|10955263| .. 448 ; sq_len: 107 449 ; sq_offset: 1 450 ; sq_type: p 451 ; al_start: 32 452 ; al_stop: 94 453 ; al_display_start: 2 454 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 455 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 456 LSRLMAD 457 >gi|152973588|ref|YP_001338639.1| .. 458 ; sq_len: 459 459 ; sq_type: p 460 ; al_start: 191 461 ; al_stop: 248 462 ; al_display_start: 161 463 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 464 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 465 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 466 SNKALKSQISALLSSIQNKAVADEKLTDQE 467 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 468 vs NC_009649.faa library 469 470 45119 residues in 180 sequences 471 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 472 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 473 Lambda= 0.179384 474 475 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 476 join: 36, opt: 24, open/ext: -10/-2, width: 16 477 Scan time: 0.000 478 The best scores are: opt bits E(180) 479 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 480 481 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 482 ; pg_name: /opt/fasta/fasta34 483 ; pg_ver: 34.26 484 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 485 ; pg_name: FASTA 486 ; pg_ver: 3.5 Sept 2006 487 ; pg_matrix: BL50 (15:-5) 488 ; pg_open-ext: -10 -2 489 ; pg_ktup: 2 490 ; pg_optcut: 24 491 ; pg_cgap: 36 492 ; mp_extrap: 60000 180 493 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 494 ; mp_KS: -0.0000 (N=0) at 8159228 495 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 496 ; fa_frame: f 497 ; fa_initn: 50 498 ; fa_init1: 50 499 ; fa_opt: 58 500 ; fa_z-score: 95.8 501 ; fa_bits: 22.9 502 ; fa_expect: 0.29 503 ; sw_score: 58 504 ; sw_ident: 0.289 505 ; sw_sim: 0.632 506 ; sw_overlap: 38 507 >gi|10955264| .. 508 ; sq_len: 126 509 ; sq_offset: 1 510 ; sq_type: p 511 ; al_start: 1 512 ; al_stop: 38 513 ; al_display_start: 1 514 ------------------------------MKKDKKYQIEAIKNKDKTLF 515 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 516 NGEKFSSYTLNKVTKTDEYN 517 >gi|152973462|ref|YP_001338513.1| .. 518 ; sq_len: 101 519 ; sq_type: p 520 ; al_start: 44 521 ; al_stop: 81 522 ; al_display_start: 14 523 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 524 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 525 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 526 vs NC_009649.faa library 527 528 45119 residues in 180 sequences 529 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 530 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 531 Lambda= 0.210386 532 533 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 534 join: 37, opt: 25, open/ext: -10/-2, width: 16 535 Scan time: 0.020 536 The best scores are: opt bits E(180) 537 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 538 539 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 540 ; pg_name: /opt/fasta/fasta34 541 ; pg_ver: 34.26 542 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 543 ; pg_name: FASTA 544 ; pg_ver: 3.5 Sept 2006 545 ; pg_matrix: BL50 (15:-5) 546 ; pg_open-ext: -10 -2 547 ; pg_ktup: 2 548 ; pg_optcut: 25 549 ; pg_cgap: 37 550 ; mp_extrap: 60000 180 551 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 552 ; mp_KS: -0.0000 (N=0) at 8159228 553 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 554 ; fa_frame: f 555 ; fa_initn: 52 556 ; fa_init1: 52 557 ; fa_opt: 70 558 ; fa_z-score: 105.5 559 ; fa_bits: 27.5 560 ; fa_expect: 0.082 561 ; sw_score: 70 562 ; sw_ident: 0.279 563 ; sw_sim: 0.651 564 ; sw_overlap: 43 565 >gi|10955265| .. 566 ; sq_len: 346 567 ; sq_offset: 1 568 ; sq_type: p 569 ; al_start: 197 570 ; al_stop: 238 571 ; al_display_start: 167 572 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 573 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 574 GEYFTENKPKYIIREIHQET 575 >gi|152973545|ref|YP_001338596.1| .. 576 ; sq_len: 242 577 ; sq_type: p 578 ; al_start: 52 579 ; al_stop: 94 580 ; al_display_start: 22 581 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 582 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 583 QDFAFTRKMRREARQVEQSW 584 >>><<< 585 586 587 579 residues in 3 query sequences 588 45119 residues in 180 library sequences 589 Scomplib [34.26] 590 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 591 Total Scan time: 0.020 Total Display time: 0.010 592 593 Function used was FASTA [version 34.26 January 12, 2007] 594 595 """ 596 597 from Bio._py3k import StringIO 598 599 alignments = list(FastaM10Iterator(StringIO(simple_example))) 600 assert len(alignments) == 4, len(alignments) 601 assert len(alignments[0]) == 2 602 for a in alignments: 603 print("Alignment %i sequences of length %i" 604 % (len(a), a.get_alignment_length())) 605 for r in a: 606 print("%s %s %i" % (r.seq, r.id, r.annotations["original_length"])) 607 # print(a.annotations) 608 print("Done") 609 610 import os 611 path = "../../Tests/Fasta/" 612 files = sorted(f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10") 613 for filename in files: 614 if os.path.splitext(filename)[-1] == ".m10": 615 print("") 616 print(filename) 617 print("=" * len(filename)) 618 for i, a in enumerate(FastaM10Iterator(open(os.path.join(path, filename)))): 619 print("#%i, %s" % (i+1, a)) 620 for r in a: 621 if "-" in r.seq: 622 assert r.seq.alphabet.gap_char == "-" 623 else: 624 assert not hasattr(r.seq.alphabet, "gap_char") 625