Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Parser for ACE files output by PHRAP. 
  6   
  7  Written by Frank Kauff (fkauff@duke.edu) and 
  8  Cymon J. Cox (cymon@duke.edu) 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: 
 13  1) The function 'read' reads the whole file at once; 
 14  2) The function 'parse' reads the file contig after contig. 
 15   
 16  1) Parse whole ace file at once: 
 17   
 18          from Bio.Sequencing import Ace 
 19          acefilerecord=Ace.read(open('my_ace_file.ace')) 
 20   
 21  This gives you: 
 22          acefilerecord.ncontigs (the number of contigs in the ace file) 
 23          acefilerecord.nreads (the number of reads in the ace file) 
 24          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 25   
 26  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 27  for this contig in a list of instances of the Read class, e.g.: 
 28   
 29          contig3=acefilerecord.contigs[2] 
 30          read4=contig3.reads[3] 
 31          RD_of_read4=read4.rd 
 32          DS_of_read4=read4.ds 
 33   
 34  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 35  sorted into the right place. 
 36   
 37  see _RecordConsumer for details. 
 38   
 39  2) Or you can iterate over the contigs of an ace file one by one in the ususal way: 
 40   
 41          from Bio.Sequencing import Ace 
 42          contigs=Ace.parse(open('my_ace_file.ace')) 
 43          for contig in contigs: 
 44              print(contig.name) 
 45              ... 
 46   
 47  Please note that for memory efficiency, when using the iterator approach, only one 
 48  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 49  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 50  Because the parser doesn't see this data until the final record, it cannot be added to 
 51  the appropriate records.  Instead these tags will be returned with the last contig record. 
 52  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 53  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 54  """ 
 55   
 56  from __future__ import print_function 
 57  from Bio._py3k import zip 
 58   
59 -class rd(object):
60 """RD (reads), store a read with its name, sequence etc. 61 62 The location and strand each read is mapped to is held in the AF lines. 63 """
64 - def __init__(self):
65 self.name = '' 66 self.padded_bases = None 67 self.info_items = None 68 self.read_tags = None 69 self.sequence = ''
70 71
72 -class qa(object):
73 """QA (read quality), including which part if any was used as the consensus."""
74 - def __init__(self, line=None):
75 self.qual_clipping_start = None 76 self.qual_clipping_end = None 77 self.align_clipping_start = None 78 self.align_clipping_end = None 79 if line: 80 header = line.split() 81 self.qual_clipping_start = int(header[1]) 82 self.qual_clipping_end = int(header[2]) 83 self.align_clipping_start = int(header[3]) 84 self.align_clipping_end = int(header[4])
85 86
87 -class ds(object):
88 """DS lines, include file name of a read's chromatogram file."""
89 - def __init__(self, line=None):
90 self.chromat_file = '' 91 self.phd_file = '' 92 self.time = '' 93 self.chem = '' 94 self.dye = '' 95 self.template = '' 96 self.direction = '' 97 if line: 98 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION'] 99 poss = [line.find(x) for x in tags] 100 tagpos = dict(zip(poss, tags)) 101 if -1 in tagpos: 102 del tagpos[-1] 103 ps = sorted(tagpos) # the keys 104 for (p1, p2) in zip(ps, ps[1:]+[len(line)+1]): 105 setattr(self, tagpos[p1].lower(), line[p1+len(tagpos[p1])+1:p2].strip())
106 107
108 -class af(object):
109 """AF lines, define the location of the read within the contig. 110 111 Note attribute coru is short for complemented (C) or uncomplemented (U), 112 since the strand information is stored in an ACE file using either the 113 C or U character. 114 """
115 - def __init__(self, line=None):
116 self.name = '' 117 self.coru = None 118 self.padded_start = None 119 if line: 120 header = line.split() 121 self.name = header[1] 122 self.coru = header[2] 123 self.padded_start = int(header[3])
124 125
126 -class bs(object):
127 """BS (base segment), which read was chosen as the consensus at each position."""
128 - def __init__(self, line=None):
129 self.name = '' 130 self.padded_start = None 131 self.padded_end = None 132 if line: 133 header = line.split() 134 self.padded_start = int(header[1]) 135 self.padded_end = int(header[2]) 136 self.name = header[3]
137 138
139 -class rt(object):
140 """RT (transient read tags), generated by crossmatch and phrap."""
141 - def __init__(self, line=None):
142 self.name = '' 143 self.tag_type = '' 144 self.program = '' 145 self.padded_start = None 146 self.padded_end = None 147 self.date = '' 148 self.comment = [] 149 if line: 150 header = line.split() 151 self.name = header[0] 152 self.tag_type = header[1] 153 self.program = header[2] 154 self.padded_start = int(header[3]) 155 self.padded_end = int(header[4]) 156 self.date = header[5]
157 158
159 -class ct(object):
160 """CT (consensus tags)."""
161 - def __init__(self, line=None):
162 self.name = '' 163 self.tag_type = '' 164 self.program = '' 165 self.padded_start = None 166 self.padded_end = None 167 self.date = '' 168 self.notrans = '' 169 self.info = [] 170 self.comment = [] 171 if line: 172 header = line.split() 173 self.name = header[0] 174 self.tag_type = header[1] 175 self.program = header[2] 176 self.padded_start = int(header[3]) 177 self.padded_end = int(header[4]) 178 self.date = header[5] 179 if len(header) == 7: 180 self.notrans = header[6]
181 182
183 -class wa(object):
184 """WA (whole assembly tag), holds the assembly program name, version, etc."""
185 - def __init__(self, line=None):
186 self.tag_type = '' 187 self.program = '' 188 self.date = '' 189 self.info = [] 190 if line: 191 header = line.split() 192 self.tag_type = header[0] 193 self.program = header[1] 194 self.date = header[2]
195 196
197 -class wr(object):
198 """WR lines."""
199 - def __init__(self, line=None):
200 self.name = '' 201 self.aligned = '' 202 self.program = '' 203 self.date = [] 204 if line: 205 header = line.split() 206 self.name = header[0] 207 self.aligned = header[1] 208 self.program = header[2] 209 self.date = header[3]
210 211
212 -class Reads(object):
213 """Holds information about a read supporting an ACE contig."""
214 - def __init__(self, line=None):
215 self.rd = None # one per read 216 self.qa = None # one per read 217 self.ds = None # none or one per read 218 self.rt = None # none or many per read 219 self.wr = None # none or many per read 220 if line: 221 self.rd = rd() 222 header = line.split() 223 self.rd.name = header[1] 224 self.rd.padded_bases = int(header[2]) 225 self.rd.info_items = int(header[3]) 226 self.rd.read_tags = int(header[4])
227 228
229 -class Contig(object):
230 """Holds information about a contig from an ACE record."""
231 - def __init__(self, line=None):
232 self.name = '' 233 self.nbases = None 234 self.nreads = None 235 self.nsegments = None 236 self.uorc = None 237 self.sequence = "" 238 self.quality = [] 239 self.af = [] 240 self.bs = [] 241 self.reads = [] 242 self.ct = None # none or many 243 self.wa = None # none or many 244 if line: 245 header = line.split() 246 self.name = header[1] 247 self.nbases = int(header[2]) 248 self.nreads = int(header[3]) 249 self.nsegments = int(header[4]) 250 self.uorc = header[5]
251 252
253 -def parse(handle):
254 """parse(handle) 255 256 where handle is a file-like object. 257 258 This function returns an iterator that allows you to iterate 259 over the ACE file record by record: 260 261 records = parse(handle) 262 for record in records: 263 # do something with the record 264 265 where each record is a Contig object. 266 """ 267 268 handle = iter(handle) 269 270 line = "" 271 while True: 272 # at beginning, skip the AS and look for first CO command 273 try: 274 while True: 275 if line.startswith('CO'): 276 break 277 line = next(handle) 278 except StopIteration: 279 return 280 281 record = Contig(line) 282 283 for line in handle: 284 line = line.strip() 285 if not line: 286 break 287 record.sequence += line 288 289 for line in handle: 290 if line.strip(): 291 break 292 if not line.startswith("BQ"): 293 raise ValueError("Failed to find BQ line") 294 295 for line in handle: 296 if not line.strip(): 297 break 298 record.quality.extend(int(x) for x in line.split()) 299 300 for line in handle: 301 if line.strip(): 302 break 303 304 while True: 305 if not line.startswith("AF "): 306 break 307 record.af.append(af(line)) 308 try: 309 line = next(handle) 310 except StopIteration: 311 raise ValueError("Unexpected end of AF block") 312 313 while True: 314 if line.strip(): 315 break 316 try: 317 line = next(handle) 318 except StopIteration: 319 raise ValueError("Unexpected end of file") 320 321 while True: 322 if not line.startswith("BS "): 323 break 324 record.bs.append(bs(line)) 325 try: 326 line = next(handle) 327 except StopIteration: 328 raise ValueError("Failed to find end of BS block") 329 330 # now read all the read data 331 # it starts with a 'RD', and then a mandatory QA 332 # then follows an optional DS 333 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 334 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 335 # with that later. 336 while True: 337 338 # each read must have a rd and qa 339 try: 340 while True: 341 # If I've met the condition, then stop reading the line. 342 if line.startswith("RD "): 343 break 344 line = next(handle) 345 except StopIteration: 346 raise ValueError("Failed to find RD line") 347 348 record.reads.append(Reads(line)) 349 350 for line in handle: 351 line = line.strip() 352 if not line: 353 break 354 record.reads[-1].rd.sequence += line 355 356 for line in handle: 357 if line.strip(): 358 break 359 if not line.startswith("QA "): 360 raise ValueError("Failed to find QA line") 361 record.reads[-1].qa = qa(line) 362 363 # now one ds can follow 364 for line in handle: 365 if line.strip(): 366 break 367 else: 368 break 369 370 if line.startswith("DS "): 371 record.reads[-1].ds = ds(line) 372 line = "" 373 # the file could just end, or there's some more stuff. In ace files, anything can happen. 374 # the following tags are interspersed between reads and can appear multiple times. 375 while True: 376 # something left 377 try: 378 while True: 379 if line.strip(): 380 break 381 line = next(handle) 382 except StopIteration: 383 # file ends here 384 break 385 if line.startswith("RT{"): 386 # now if we're at the end of the file, this rt could 387 # belong to a previous read, not the actual one. 388 # we store it here were it appears, the user can sort later. 389 if record.reads[-1].rt is None: 390 record.reads[-1].rt = [] 391 for line in handle: 392 line = line.strip() 393 #if line=="COMMENT{": 394 if line.startswith("COMMENT{"): 395 if line[8:].strip(): 396 #MIRA 3.0.5 would miss the new line out :( 397 record.reads[-1].rt[-1].comment.append(line[8:]) 398 for line in handle: 399 line = line.strip() 400 if line.endswith("C}"): 401 break 402 record.reads[-1].rt[-1].comment.append(line) 403 elif line == '}': 404 break 405 else: 406 record.reads[-1].rt.append(rt(line)) 407 line = "" 408 elif line.startswith("WR{"): 409 if record.reads[-1].wr is None: 410 record.reads[-1].wr = [] 411 for line in handle: 412 line = line.strip() 413 if line == '}': 414 break 415 record.reads[-1].wr.append(wr(line)) 416 line = "" 417 elif line.startswith("WA{"): 418 if record.wa is None: 419 record.wa = [] 420 try: 421 line = next(handle) 422 except StopIteration: 423 raise ValueError("Failed to read WA block") 424 record.wa.append(wa(line)) 425 for line in handle: 426 line = line.strip() 427 if line == '}': 428 break 429 record.wa[-1].info.append(line) 430 line = "" 431 elif line.startswith("CT{"): 432 if record.ct is None: 433 record.ct = [] 434 try: 435 line = next(handle) 436 except StopIteration: 437 raise ValueError("Failed to read CT block") 438 record.ct.append(ct(line)) 439 for line in handle: 440 line = line.strip() 441 if line == "COMMENT{": 442 for line in handle: 443 line = line.strip() 444 if line.endswith("C}"): 445 break 446 record.ct[-1].comment.append(line) 447 elif line == '}': 448 break 449 else: 450 record.ct[-1].info.append(line) 451 line = "" 452 else: 453 break 454 455 if not line.startswith('RD'): # another read? 456 break 457 458 yield record
459 460
461 -class ACEFileRecord(object):
462 """Holds data of an ACE file. 463 """
464 - def __init__(self):
465 self.ncontigs = None 466 self.nreads = None 467 self.contigs = [] 468 self.wa = None # none or many
469
470 - def sort(self):
471 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.""" 472 473 ct = [] 474 rt = [] 475 wr = [] 476 # search for tags that aren't in the right position 477 for i in range(len(self.contigs)): 478 c = self.contigs[i] 479 if c.wa: 480 if not self.wa: 481 self.wa = [] 482 self.wa.extend(c.wa) 483 if c.ct: 484 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name] 485 for x in newcts: 486 self.contigs[i].ct.remove(x) 487 ct.extend(newcts) 488 for j in range(len(c.reads)): 489 r = c.reads[j] 490 if r.rt: 491 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name] 492 for x in newrts: 493 self.contigs[i].reads[j].rt.remove(x) 494 rt.extend(newrts) 495 if r.wr: 496 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name] 497 for x in newwrs: 498 self.contigs[i].reads[j].wr.remove(x) 499 wr.extend(newwrs) 500 # now sort them into their proper place 501 for i in range(len(self.contigs)): 502 c = self.contigs[i] 503 for ct_tag in ct: 504 if ct_tag.name == c.name: 505 if self.contigs[i].ct is None: 506 self.contigs[i].ct = [] 507 self.contigs[i].ct.append(ct_tag) 508 if rt or wr: 509 for j in range(len(c.reads)): 510 r = c.reads[j] 511 for rt_tag in rt: 512 if rt_tag.name == r.rd.name: 513 if self.contigs[i].reads[j].rt is None: 514 self.contigs[i].reads[j].rt = [] 515 self.contigs[i].reads[j].rt.append(rt_tag) 516 for wr_tag in wr: 517 if wr_tag.name == r.rd.name: 518 if self.contigs[i].reads[j].wr is None: 519 self.contigs[i].reads[j].wr = [] 520 self.contigs[i].reads[j].wr.append(wr_tag)
521 522
523 -def read(handle):
524 """Parses the full ACE file in list of contigs. 525 526 """ 527 528 handle = iter(handle) 529 530 record = ACEFileRecord() 531 532 try: 533 line = next(handle) 534 except StopIteration: 535 raise ValueError("Premature end of file") 536 537 # check if the file starts correctly 538 if not line.startswith('AS'): 539 raise ValueError("File does not start with 'AS'.") 540 541 words = line.split() 542 record.ncontigs = int(words[1]) 543 record.nreads = int(words[2]) 544 545 # now read all the records 546 record.contigs = list(parse(handle)) 547 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 548 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 549 # if all tags are at the end, they are read with the last contig. The concept of an 550 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 551 # them and put them into the appropriate contig/read instance. 552 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 553 record.sort() 554 return record
555