Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Seq import Seq 
 24  from Bio.SeqRecord import SeqRecord 
 25  from Bio.Align import MultipleSeqAlignment 
 26  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 27  from Bio.Alphabet import Gapped 
 28   
 29   
30 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
31 """Helper function for the main parsing code (PRIVATE). 32 33 To get the actual pairwise alignment sequences, we must first 34 translate the un-gapped sequence based coordinates into positions 35 in the gapped sequence (which may have a flanking region shown 36 using leading - characters). To date, I have never seen any 37 trailing flanking region shown in the m10 file, but the 38 following code should also cope with that. 39 40 Note that this code seems to work fine even when the "sq_offset" 41 entries are prsent as a result of using the -X command line option. 42 """ 43 align_stripped = alignment_seq_with_flanking.strip("-") 44 display_start = int(annotation['al_display_start']) 45 if int(annotation['al_start']) <= int(annotation['al_stop']): 46 start = int(annotation['al_start']) \ 47 - display_start 48 end = int(annotation['al_stop']) \ 49 - display_start + 1 50 else: 51 #FASTA has flipped this sequence... 52 start = display_start \ 53 - int(annotation['al_start']) 54 end = display_start \ 55 - int(annotation['al_stop']) + 1 56 end += align_stripped.count("-") 57 assert 0 <= start and start < end and end <= len(align_stripped), \ 58 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 59 % (alignment_seq_with_flanking, start, end, annotation) 60 return align_stripped[start:end]
61 62
63 -def FastaM10Iterator(handle, alphabet=single_letter_alphabet):
64 """Alignment iterator for the FASTA tool's pairwise alignment output. 65 66 This is for reading the pairwise alignments output by Bill Pearson's 67 FASTA program when called with the -m 10 command line option for machine 68 readable output. For more details about the FASTA tools, see the website 69 http://fasta.bioch.virginia.edu/ and the paper: 70 71 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 72 73 This class is intended to be used via the Bio.AlignIO.parse() function 74 by specifying the format as "fasta-m10" as shown in the following code: 75 76 from Bio import AlignIO 77 handle = ... 78 for a in AlignIO.parse(handle, "fasta-m10"): 79 assert len(a) == 2, "Should be pairwise!" 80 print "Alignment length %i" % a.get_alignment_length() 81 for record in a: 82 print record.seq, record.name, record.id 83 84 Note that this is not a full blown parser for all the information 85 in the FASTA output - for example, most of the header and all of the 86 footer is ignored. Also, the alignments are not batched according to 87 the input queries. 88 89 Also note that there can be up to about 30 letters of flanking region 90 included in the raw FASTA output as contextual information. This is NOT 91 part of the alignment itself, and is not included in the resulting 92 MultipleSeqAlignment objects returned. 93 """ 94 if alphabet is None: 95 alphabet = single_letter_alphabet 96 97 state_PREAMBLE = -1 98 state_NONE = 0 99 state_QUERY_HEADER = 1 100 state_ALIGN_HEADER = 2 101 state_ALIGN_QUERY = 3 102 state_ALIGN_MATCH = 4 103 state_ALIGN_CONS = 5 104 105 def build_hsp(): 106 if not query_tags and not match_tags: 107 raise ValueError("No data for query %r, match %r" 108 % (query_id, match_id)) 109 assert query_tags, query_tags 110 assert match_tags, match_tags 111 evalue = align_tags.get("fa_expect", None) 112 q = "?" # Just for printing len(q) in debug below 113 m = "?" # Just for printing len(m) in debug below 114 tool = global_tags.get("tool", "").upper() 115 try: 116 q = _extract_alignment_region(query_seq, query_tags) 117 if tool in ["TFASTX"] and len(match_seq) == len(q): 118 m = match_seq 119 #Quick hack until I can work out how -, * and / characters 120 #and the apparent mix of aa and bp coordinates works. 121 else: 122 m = _extract_alignment_region(match_seq, match_tags) 123 assert len(q) == len(m) 124 except AssertionError, err: 125 print "Darn... amino acids vs nucleotide coordinates?" 126 print tool 127 print query_seq 128 print query_tags 129 print q, len(q) 130 print match_seq 131 print match_tags 132 print m, len(m) 133 print handle.name 134 raise err 135 136 assert alphabet is not None 137 alignment = MultipleSeqAlignment([], alphabet) 138 139 #TODO - Introduce an annotated alignment class? 140 #For now, store the annotation a new private property: 141 alignment._annotations = {} 142 143 #Want to record both the query header tags, and the alignment tags. 144 for key, value in header_tags.iteritems(): 145 alignment._annotations[key] = value 146 for key, value in align_tags.iteritems(): 147 alignment._annotations[key] = value 148 149 #Query 150 #===== 151 record = SeqRecord(Seq(q, alphabet), 152 id=query_id, 153 name="query", 154 description=query_descr, 155 annotations={"original_length": int(query_tags["sq_len"])}) 156 #TODO - handle start/end coordinates properly. Short term hack for now: 157 record._al_start = int(query_tags["al_start"]) 158 record._al_stop = int(query_tags["al_stop"]) 159 alignment.append(record) 160 161 #TODO - What if a specific alphabet has been requested? 162 #TODO - Use an IUPAC alphabet? 163 #TODO - Can FASTA output RNA? 164 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 165 if query_tags["sq_type"] == "D": 166 record.seq.alphabet = generic_dna 167 elif query_tags["sq_type"] == "p": 168 record.seq.alphabet = generic_protein 169 if "-" in q: 170 if not hasattr(record.seq.alphabet, "gap_char"): 171 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 172 173 #Match 174 #===== 175 record = SeqRecord(Seq(m, alphabet), 176 id=match_id, 177 name="match", 178 description=match_descr, 179 annotations={"original_length": int(match_tags["sq_len"])}) 180 #TODO - handle start/end coordinates properly. Short term hack for now: 181 record._al_start = int(match_tags["al_start"]) 182 record._al_stop = int(match_tags["al_stop"]) 183 alignment.append(record) 184 185 #This is still a very crude way of dealing with the alphabet: 186 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 187 if match_tags["sq_type"] == "D": 188 record.seq.alphabet = generic_dna 189 elif match_tags["sq_type"] == "p": 190 record.seq.alphabet = generic_protein 191 if "-" in m: 192 if not hasattr(record.seq.alphabet, "gap_char"): 193 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 194 195 return alignment
196 197 state = state_PREAMBLE 198 query_id = None 199 match_id = None 200 query_descr = "" 201 match_descr = "" 202 global_tags = {} 203 header_tags = {} 204 align_tags = {} 205 query_tags = {} 206 match_tags = {} 207 query_seq = "" 208 match_seq = "" 209 cons_seq = "" 210 for line in handle: 211 if ">>>" in line and not line.startswith(">>>"): 212 if query_id and match_id: 213 #This happens on old FASTA output which lacked an end of 214 #query >>><<< marker line. 215 yield build_hsp() 216 state = state_NONE 217 query_descr = line[line.find(">>>")+3:].strip() 218 query_id = query_descr.split(None, 1)[0] 219 match_id = None 220 header_tags = {} 221 align_tags = {} 222 query_tags = {} 223 match_tags = {} 224 query_seq = "" 225 match_seq = "" 226 cons_seq = "" 227 elif line.startswith("!! No "): 228 #e.g. 229 #!! No library sequences with E() < 0.5 230 #or on more recent versions, 231 #No sequences with E() < 0.05 232 assert state == state_NONE 233 assert not header_tags 234 assert not align_tags 235 assert not match_tags 236 assert not query_tags 237 assert match_id is None 238 assert not query_seq 239 assert not match_seq 240 assert not cons_seq 241 query_id = None 242 elif line.strip() in [">>><<<", ">>>///"]: 243 #End of query, possible end of all queries 244 if query_id and match_id: 245 yield build_hsp() 246 state = state_NONE 247 query_id = None 248 match_id = None 249 header_tags = {} 250 align_tags = {} 251 query_tags = {} 252 match_tags = {} 253 query_seq = "" 254 match_seq = "" 255 cons_seq = "" 256 elif line.startswith(">>>"): 257 #Should be start of a match! 258 assert query_id is not None 259 assert line[3:].split(", ", 1)[0] == query_id, line 260 assert match_id is None 261 assert not header_tags 262 assert not align_tags 263 assert not query_tags 264 assert not match_tags 265 assert not match_seq 266 assert not query_seq 267 assert not cons_seq 268 state = state_QUERY_HEADER 269 elif line.startswith(">>"): 270 #Should now be at start of a match alignment! 271 if query_id and match_id: 272 yield build_hsp() 273 align_tags = {} 274 query_tags = {} 275 match_tags = {} 276 query_seq = "" 277 match_seq = "" 278 cons_seq = "" 279 match_descr = line[2:].strip() 280 match_id = match_descr.split(None, 1)[0] 281 state = state_ALIGN_HEADER 282 elif line.startswith(">--"): 283 #End of one HSP 284 assert query_id and match_id, line 285 yield build_hsp() 286 #Clean up read for next HSP 287 #but reuse header_tags 288 align_tags = {} 289 query_tags = {} 290 match_tags = {} 291 query_seq = "" 292 match_seq = "" 293 cons_seq = "" 294 state = state_ALIGN_HEADER 295 elif line.startswith(">"): 296 if state == state_ALIGN_HEADER: 297 #Should be start of query alignment seq... 298 assert query_id is not None, line 299 assert match_id is not None, line 300 assert query_id.startswith(line[1:].split(None, 1)[0]), line 301 state = state_ALIGN_QUERY 302 elif state == state_ALIGN_QUERY: 303 #Should be start of match alignment seq 304 assert query_id is not None, line 305 assert match_id is not None, line 306 assert match_id.startswith(line[1:].split(None, 1)[0]), line 307 state = state_ALIGN_MATCH 308 elif state == state_NONE: 309 #Can get > as the last line of a histogram 310 pass 311 else: 312 assert False, "state %i got %r" % (state, line) 313 elif line.startswith("; al_cons"): 314 assert state == state_ALIGN_MATCH, line 315 state = state_ALIGN_CONS 316 #Next line(s) should be consensus seq... 317 elif line.startswith("; "): 318 if ": " in line: 319 key, value = [s.strip() for s in line[2:].split(": ", 1)] 320 else: 321 import warnings 322 #Seen in lalign36, specifically version 36.3.4 Apr, 2011 323 #Fixed in version 36.3.5b Oct, 2011(preload8) 324 warnings.warn("Missing colon in line: %r" % line) 325 try: 326 key, value = [s.strip() for s in line[2:].split(" ", 1)] 327 except ValueError: 328 raise ValueError("Bad line: %r" % line) 329 if state == state_QUERY_HEADER: 330 header_tags[key] = value 331 elif state == state_ALIGN_HEADER: 332 align_tags[key] = value 333 elif state == state_ALIGN_QUERY: 334 query_tags[key] = value 335 elif state == state_ALIGN_MATCH: 336 match_tags[key] = value 337 else: 338 assert False, "Unexpected state %r, %r" % (state, line) 339 elif state == state_ALIGN_QUERY: 340 query_seq += line.strip() 341 elif state == state_ALIGN_MATCH: 342 match_seq += line.strip() 343 elif state == state_ALIGN_CONS: 344 cons_seq += line.strip("\n") 345 elif state == state_PREAMBLE: 346 if line.startswith("#"): 347 global_tags["command"] = line[1:].strip() 348 elif line.startswith(" version "): 349 global_tags["version"] = line[9:].strip() 350 elif " compares a " in line: 351 global_tags["tool"] = line[:line.find(" compares a ")].strip() 352 elif " searches a " in line: 353 global_tags["tool"] = line[:line.find(" searches a ")].strip() 354 else: 355 pass 356 357 358 if __name__ == "__main__": 359 print "Running a quick self-test" 360 361 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 362 simple_example = \ 363 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 364 FASTA searches a protein or DNA sequence data bank 365 version 34.26 January 12, 2007 366 Please cite: 367 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 368 369 Query library NC_002127.faa vs NC_009649.faa library 370 searching NC_009649.faa library 371 372 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 373 vs NC_009649.faa library 374 375 45119 residues in 180 sequences 376 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 377 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 378 Lambda= 0.175043 379 380 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 381 join: 36, opt: 24, open/ext: -10/-2, width: 16 382 Scan time: 0.000 383 The best scores are: opt bits E(180) 384 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 385 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 386 387 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 388 ; pg_name: /opt/fasta/fasta34 389 ; pg_ver: 34.26 390 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 391 ; pg_name: FASTA 392 ; pg_ver: 3.5 Sept 2006 393 ; pg_matrix: BL50 (15:-5) 394 ; pg_open-ext: -10 -2 395 ; pg_ktup: 2 396 ; pg_optcut: 24 397 ; pg_cgap: 36 398 ; mp_extrap: 60000 180 399 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 400 ; mp_KS: -0.0000 (N=0) at 8159228 401 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 402 ; fa_frame: f 403 ; fa_initn: 65 404 ; fa_init1: 43 405 ; fa_opt: 71 406 ; fa_z-score: 90.3 407 ; fa_bits: 24.9 408 ; fa_expect: 0.58 409 ; sw_score: 71 410 ; sw_ident: 0.250 411 ; sw_sim: 0.574 412 ; sw_overlap: 108 413 >gi|10955263| .. 414 ; sq_len: 107 415 ; sq_offset: 1 416 ; sq_type: p 417 ; al_start: 5 418 ; al_stop: 103 419 ; al_display_start: 1 420 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 421 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 422 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 423 >gi|152973457|ref|YP_001338508.1| .. 424 ; sq_len: 931 425 ; sq_type: p 426 ; al_start: 96 427 ; al_stop: 195 428 ; al_display_start: 66 429 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 430 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 431 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 432 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 433 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 434 ; fa_frame: f 435 ; fa_initn: 33 436 ; fa_init1: 33 437 ; fa_opt: 63 438 ; fa_z-score: 86.1 439 ; fa_bits: 23.1 440 ; fa_expect: 0.99 441 ; sw_score: 63 442 ; sw_ident: 0.266 443 ; sw_sim: 0.656 444 ; sw_overlap: 64 445 >gi|10955263| .. 446 ; sq_len: 107 447 ; sq_offset: 1 448 ; sq_type: p 449 ; al_start: 32 450 ; al_stop: 94 451 ; al_display_start: 2 452 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 453 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 454 LSRLMAD 455 >gi|152973588|ref|YP_001338639.1| .. 456 ; sq_len: 459 457 ; sq_type: p 458 ; al_start: 191 459 ; al_stop: 248 460 ; al_display_start: 161 461 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 462 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 463 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 464 SNKALKSQISALLSSIQNKAVADEKLTDQE 465 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 466 vs NC_009649.faa library 467 468 45119 residues in 180 sequences 469 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 470 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 471 Lambda= 0.179384 472 473 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 474 join: 36, opt: 24, open/ext: -10/-2, width: 16 475 Scan time: 0.000 476 The best scores are: opt bits E(180) 477 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 478 479 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 480 ; pg_name: /opt/fasta/fasta34 481 ; pg_ver: 34.26 482 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 483 ; pg_name: FASTA 484 ; pg_ver: 3.5 Sept 2006 485 ; pg_matrix: BL50 (15:-5) 486 ; pg_open-ext: -10 -2 487 ; pg_ktup: 2 488 ; pg_optcut: 24 489 ; pg_cgap: 36 490 ; mp_extrap: 60000 180 491 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 492 ; mp_KS: -0.0000 (N=0) at 8159228 493 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 494 ; fa_frame: f 495 ; fa_initn: 50 496 ; fa_init1: 50 497 ; fa_opt: 58 498 ; fa_z-score: 95.8 499 ; fa_bits: 22.9 500 ; fa_expect: 0.29 501 ; sw_score: 58 502 ; sw_ident: 0.289 503 ; sw_sim: 0.632 504 ; sw_overlap: 38 505 >gi|10955264| .. 506 ; sq_len: 126 507 ; sq_offset: 1 508 ; sq_type: p 509 ; al_start: 1 510 ; al_stop: 38 511 ; al_display_start: 1 512 ------------------------------MKKDKKYQIEAIKNKDKTLF 513 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 514 NGEKFSSYTLNKVTKTDEYN 515 >gi|152973462|ref|YP_001338513.1| .. 516 ; sq_len: 101 517 ; sq_type: p 518 ; al_start: 44 519 ; al_stop: 81 520 ; al_display_start: 14 521 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 522 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 523 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 524 vs NC_009649.faa library 525 526 45119 residues in 180 sequences 527 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 528 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 529 Lambda= 0.210386 530 531 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 532 join: 37, opt: 25, open/ext: -10/-2, width: 16 533 Scan time: 0.020 534 The best scores are: opt bits E(180) 535 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 536 537 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 538 ; pg_name: /opt/fasta/fasta34 539 ; pg_ver: 34.26 540 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 541 ; pg_name: FASTA 542 ; pg_ver: 3.5 Sept 2006 543 ; pg_matrix: BL50 (15:-5) 544 ; pg_open-ext: -10 -2 545 ; pg_ktup: 2 546 ; pg_optcut: 25 547 ; pg_cgap: 37 548 ; mp_extrap: 60000 180 549 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 550 ; mp_KS: -0.0000 (N=0) at 8159228 551 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 552 ; fa_frame: f 553 ; fa_initn: 52 554 ; fa_init1: 52 555 ; fa_opt: 70 556 ; fa_z-score: 105.5 557 ; fa_bits: 27.5 558 ; fa_expect: 0.082 559 ; sw_score: 70 560 ; sw_ident: 0.279 561 ; sw_sim: 0.651 562 ; sw_overlap: 43 563 >gi|10955265| .. 564 ; sq_len: 346 565 ; sq_offset: 1 566 ; sq_type: p 567 ; al_start: 197 568 ; al_stop: 238 569 ; al_display_start: 167 570 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 571 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 572 GEYFTENKPKYIIREIHQET 573 >gi|152973545|ref|YP_001338596.1| .. 574 ; sq_len: 242 575 ; sq_type: p 576 ; al_start: 52 577 ; al_stop: 94 578 ; al_display_start: 22 579 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 580 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 581 QDFAFTRKMRREARQVEQSW 582 >>><<< 583 584 585 579 residues in 3 query sequences 586 45119 residues in 180 library sequences 587 Scomplib [34.26] 588 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 589 Total Scan time: 0.020 Total Display time: 0.010 590 591 Function used was FASTA [version 34.26 January 12, 2007] 592 593 """ 594 595 from StringIO import StringIO 596 597 alignments = list(FastaM10Iterator(StringIO(simple_example))) 598 assert len(alignments) == 4, len(alignments) 599 assert len(alignments[0]) == 2 600 for a in alignments: 601 print "Alignment %i sequences of length %i" \ 602 % (len(a), a.get_alignment_length()) 603 for r in a: 604 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 605 #print a.annotations 606 print "Done" 607 608 import os 609 path = "../../Tests/Fasta/" 610 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 611 files.sort() 612 for filename in files: 613 if os.path.splitext(filename)[-1] == ".m10": 614 print 615 print filename 616 print "=" * len(filename) 617 for i, a in enumerate(FastaM10Iterator(open(os.path.join(path, filename)))): 618 print "#%i, %s" % (i+1, a) 619 for r in a: 620 if "-" in r.seq: 621 assert r.seq.alphabet.gap_char == "-" 622 else: 623 assert not hasattr(r.seq.alphabet, "gap_char") 624