Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Parser for ACE files output by PHRAP. 
  6   
  7  Written by Frank Kauff (fkauff@duke.edu) and 
  8  Cymon J. Cox (cymon@duke.edu) 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: 
 13   
 14  1. The function 'read' reads the whole file at once; 
 15  2. The function 'parse' reads the file contig after contig. 
 16   
 17  First option, parse whole ace file at once:: 
 18   
 19          from Bio.Sequencing import Ace 
 20          acefilerecord = Ace.read(open('my_ace_file.ace')) 
 21   
 22  This gives you: 
 23   - acefilerecord.ncontigs (the number of contigs in the ace file) 
 24   - acefilerecord.nreads (the number of reads in the ace file) 
 25   - acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 26   
 27  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 28  for this contig in a list of instances of the Read class, e.g.:: 
 29   
 30          contig3 = acefilerecord.contigs[2] 
 31          read4 = contig3.reads[3] 
 32          RD_of_read4 = read4.rd 
 33          DS_of_read4 = read4.ds 
 34   
 35  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 36  sorted into the right place. 
 37   
 38  see _RecordConsumer for details. 
 39   
 40  The second option is to  iterate over the contigs of an ace file one by one 
 41  in the ususal way:: 
 42   
 43      from Bio.Sequencing import Ace 
 44      contigs = Ace.parse(open('my_ace_file.ace')) 
 45      for contig in contigs: 
 46          print(contig.name) 
 47          ... 
 48   
 49  Please note that for memory efficiency, when using the iterator approach, only one 
 50  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 51  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 52  Because the parser doesn't see this data until the final record, it cannot be added to 
 53  the appropriate records.  Instead these tags will be returned with the last contig record. 
 54  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 55  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 56  """ 
 57   
 58  from __future__ import print_function 
 59  from Bio._py3k import zip 
 60   
 61   
62 -class rd(object):
63 """RD (reads), store a read with its name, sequence etc. 64 65 The location and strand each read is mapped to is held in the AF lines. 66 """
67 - def __init__(self):
68 self.name = '' 69 self.padded_bases = None 70 self.info_items = None 71 self.read_tags = None 72 self.sequence = ''
73 74
75 -class qa(object):
76 """QA (read quality), including which part if any was used as the consensus."""
77 - def __init__(self, line=None):
78 self.qual_clipping_start = None 79 self.qual_clipping_end = None 80 self.align_clipping_start = None 81 self.align_clipping_end = None 82 if line: 83 header = line.split() 84 self.qual_clipping_start = int(header[1]) 85 self.qual_clipping_end = int(header[2]) 86 self.align_clipping_start = int(header[3]) 87 self.align_clipping_end = int(header[4])
88 89
90 -class ds(object):
91 """DS lines, include file name of a read's chromatogram file."""
92 - def __init__(self, line=None):
93 self.chromat_file = '' 94 self.phd_file = '' 95 self.time = '' 96 self.chem = '' 97 self.dye = '' 98 self.template = '' 99 self.direction = '' 100 if line: 101 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION'] 102 poss = [line.find(x) for x in tags] 103 tagpos = dict(zip(poss, tags)) 104 if -1 in tagpos: 105 del tagpos[-1] 106 ps = sorted(tagpos) # the keys 107 for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]): 108 setattr(self, tagpos[p1].lower(), line[p1 + len(tagpos[p1]) + 1:p2].strip())
109 110
111 -class af(object):
112 """AF lines, define the location of the read within the contig. 113 114 Note attribute coru is short for complemented (C) or uncomplemented (U), 115 since the strand information is stored in an ACE file using either the 116 C or U character. 117 """
118 - def __init__(self, line=None):
119 self.name = '' 120 self.coru = None 121 self.padded_start = None 122 if line: 123 header = line.split() 124 self.name = header[1] 125 self.coru = header[2] 126 self.padded_start = int(header[3])
127 128
129 -class bs(object):
130 """BS (base segment), which read was chosen as the consensus at each position."""
131 - def __init__(self, line=None):
132 self.name = '' 133 self.padded_start = None 134 self.padded_end = None 135 if line: 136 header = line.split() 137 self.padded_start = int(header[1]) 138 self.padded_end = int(header[2]) 139 self.name = header[3]
140 141
142 -class rt(object):
143 """RT (transient read tags), generated by crossmatch and phrap."""
144 - def __init__(self, line=None):
145 self.name = '' 146 self.tag_type = '' 147 self.program = '' 148 self.padded_start = None 149 self.padded_end = None 150 self.date = '' 151 self.comment = [] 152 if line: 153 header = line.split() 154 self.name = header[0] 155 self.tag_type = header[1] 156 self.program = header[2] 157 self.padded_start = int(header[3]) 158 self.padded_end = int(header[4]) 159 self.date = header[5]
160 161
162 -class ct(object):
163 """CT (consensus tags)."""
164 - def __init__(self, line=None):
165 self.name = '' 166 self.tag_type = '' 167 self.program = '' 168 self.padded_start = None 169 self.padded_end = None 170 self.date = '' 171 self.notrans = '' 172 self.info = [] 173 self.comment = [] 174 if line: 175 header = line.split() 176 self.name = header[0] 177 self.tag_type = header[1] 178 self.program = header[2] 179 self.padded_start = int(header[3]) 180 self.padded_end = int(header[4]) 181 self.date = header[5] 182 if len(header) == 7: 183 self.notrans = header[6]
184 185
186 -class wa(object):
187 """WA (whole assembly tag), holds the assembly program name, version, etc."""
188 - def __init__(self, line=None):
189 self.tag_type = '' 190 self.program = '' 191 self.date = '' 192 self.info = [] 193 if line: 194 header = line.split() 195 self.tag_type = header[0] 196 self.program = header[1] 197 self.date = header[2]
198 199
200 -class wr(object):
201 """WR lines."""
202 - def __init__(self, line=None):
203 self.name = '' 204 self.aligned = '' 205 self.program = '' 206 self.date = [] 207 if line: 208 header = line.split() 209 self.name = header[0] 210 self.aligned = header[1] 211 self.program = header[2] 212 self.date = header[3]
213 214
215 -class Reads(object):
216 """Holds information about a read supporting an ACE contig."""
217 - def __init__(self, line=None):
218 self.rd = None # one per read 219 self.qa = None # one per read 220 self.ds = None # none or one per read 221 self.rt = None # none or many per read 222 self.wr = None # none or many per read 223 if line: 224 self.rd = rd() 225 header = line.split() 226 self.rd.name = header[1] 227 self.rd.padded_bases = int(header[2]) 228 self.rd.info_items = int(header[3]) 229 self.rd.read_tags = int(header[4])
230 231
232 -class Contig(object):
233 """Holds information about a contig from an ACE record."""
234 - def __init__(self, line=None):
235 self.name = '' 236 self.nbases = None 237 self.nreads = None 238 self.nsegments = None 239 self.uorc = None 240 self.sequence = "" 241 self.quality = [] 242 self.af = [] 243 self.bs = [] 244 self.reads = [] 245 self.ct = None # none or many 246 self.wa = None # none or many 247 if line: 248 header = line.split() 249 self.name = header[1] 250 self.nbases = int(header[2]) 251 self.nreads = int(header[3]) 252 self.nsegments = int(header[4]) 253 self.uorc = header[5]
254 255
256 -def parse(handle):
257 """Iterate of ACE file contig by contig. 258 259 Argument handle is a file-like object. 260 261 This function returns an iterator that allows you to iterate 262 over the ACE file record by record:: 263 264 records = parse(handle) 265 for record in records: 266 # do something with the record 267 268 where each record is a Contig object. 269 """ 270 271 handle = iter(handle) 272 273 line = "" 274 while True: 275 # at beginning, skip the AS and look for first CO command 276 try: 277 while True: 278 if line.startswith('CO'): 279 break 280 line = next(handle) 281 except StopIteration: 282 return 283 284 record = Contig(line) 285 286 for line in handle: 287 line = line.strip() 288 if not line: 289 break 290 record.sequence += line 291 292 for line in handle: 293 if line.strip(): 294 break 295 if not line.startswith("BQ"): 296 raise ValueError("Failed to find BQ line") 297 298 for line in handle: 299 if not line.strip(): 300 break 301 record.quality.extend(int(x) for x in line.split()) 302 303 for line in handle: 304 if line.strip(): 305 break 306 307 while True: 308 if not line.startswith("AF "): 309 break 310 record.af.append(af(line)) 311 try: 312 line = next(handle) 313 except StopIteration: 314 raise ValueError("Unexpected end of AF block") 315 316 while True: 317 if line.strip(): 318 break 319 try: 320 line = next(handle) 321 except StopIteration: 322 raise ValueError("Unexpected end of file") 323 324 while True: 325 if not line.startswith("BS "): 326 break 327 record.bs.append(bs(line)) 328 try: 329 line = next(handle) 330 except StopIteration: 331 raise ValueError("Failed to find end of BS block") 332 333 # now read all the read data 334 # it starts with a 'RD', and then a mandatory QA 335 # then follows an optional DS 336 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 337 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 338 # with that later. 339 while True: 340 341 # each read must have a rd and qa 342 try: 343 while True: 344 # If I've met the condition, then stop reading the line. 345 if line.startswith("RD "): 346 break 347 line = next(handle) 348 except StopIteration: 349 raise ValueError("Failed to find RD line") 350 351 record.reads.append(Reads(line)) 352 353 for line in handle: 354 line = line.strip() 355 if not line: 356 break 357 record.reads[-1].rd.sequence += line 358 359 for line in handle: 360 if line.strip(): 361 break 362 if not line.startswith("QA "): 363 raise ValueError("Failed to find QA line") 364 record.reads[-1].qa = qa(line) 365 366 # now one ds can follow 367 for line in handle: 368 if line.strip(): 369 break 370 else: 371 break 372 373 if line.startswith("DS "): 374 record.reads[-1].ds = ds(line) 375 line = "" 376 # the file could just end, or there's some more stuff. In ace files, anything can happen. 377 # the following tags are interspersed between reads and can appear multiple times. 378 while True: 379 # something left 380 try: 381 while True: 382 if line.strip(): 383 break 384 line = next(handle) 385 except StopIteration: 386 # file ends here 387 break 388 if line.startswith("RT{"): 389 # now if we're at the end of the file, this rt could 390 # belong to a previous read, not the actual one. 391 # we store it here were it appears, the user can sort later. 392 if record.reads[-1].rt is None: 393 record.reads[-1].rt = [] 394 for line in handle: 395 line = line.strip() 396 # if line=="COMMENT{": 397 if line.startswith("COMMENT{"): 398 if line[8:].strip(): 399 # MIRA 3.0.5 would miss the new line out :( 400 record.reads[-1].rt[-1].comment.append(line[8:]) 401 for line in handle: 402 line = line.strip() 403 if line.endswith("C}"): 404 break 405 record.reads[-1].rt[-1].comment.append(line) 406 elif line == '}': 407 break 408 else: 409 record.reads[-1].rt.append(rt(line)) 410 line = "" 411 elif line.startswith("WR{"): 412 if record.reads[-1].wr is None: 413 record.reads[-1].wr = [] 414 for line in handle: 415 line = line.strip() 416 if line == '}': 417 break 418 record.reads[-1].wr.append(wr(line)) 419 line = "" 420 elif line.startswith("WA{"): 421 if record.wa is None: 422 record.wa = [] 423 try: 424 line = next(handle) 425 except StopIteration: 426 raise ValueError("Failed to read WA block") 427 record.wa.append(wa(line)) 428 for line in handle: 429 line = line.strip() 430 if line == '}': 431 break 432 record.wa[-1].info.append(line) 433 line = "" 434 elif line.startswith("CT{"): 435 if record.ct is None: 436 record.ct = [] 437 try: 438 line = next(handle) 439 except StopIteration: 440 raise ValueError("Failed to read CT block") 441 record.ct.append(ct(line)) 442 for line in handle: 443 line = line.strip() 444 if line == "COMMENT{": 445 for line in handle: 446 line = line.strip() 447 if line.endswith("C}"): 448 break 449 record.ct[-1].comment.append(line) 450 elif line == '}': 451 break 452 else: 453 record.ct[-1].info.append(line) 454 line = "" 455 else: 456 break 457 458 if not line.startswith('RD'): # another read? 459 break 460 461 yield record
462 463
464 -class ACEFileRecord(object):
465 """Holds data of an ACE file. 466 """
467 - def __init__(self):
468 self.ncontigs = None 469 self.nreads = None 470 self.contigs = [] 471 self.wa = None # none or many
472
473 - def sort(self):
474 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.""" 475 476 ct = [] 477 rt = [] 478 wr = [] 479 # search for tags that aren't in the right position 480 for i in range(len(self.contigs)): 481 c = self.contigs[i] 482 if c.wa: 483 if not self.wa: 484 self.wa = [] 485 self.wa.extend(c.wa) 486 if c.ct: 487 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name] 488 for x in newcts: 489 self.contigs[i].ct.remove(x) 490 ct.extend(newcts) 491 for j in range(len(c.reads)): 492 r = c.reads[j] 493 if r.rt: 494 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name] 495 for x in newrts: 496 self.contigs[i].reads[j].rt.remove(x) 497 rt.extend(newrts) 498 if r.wr: 499 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name] 500 for x in newwrs: 501 self.contigs[i].reads[j].wr.remove(x) 502 wr.extend(newwrs) 503 # now sort them into their proper place 504 for i in range(len(self.contigs)): 505 c = self.contigs[i] 506 for ct_tag in ct: 507 if ct_tag.name == c.name: 508 if self.contigs[i].ct is None: 509 self.contigs[i].ct = [] 510 self.contigs[i].ct.append(ct_tag) 511 if rt or wr: 512 for j in range(len(c.reads)): 513 r = c.reads[j] 514 for rt_tag in rt: 515 if rt_tag.name == r.rd.name: 516 if self.contigs[i].reads[j].rt is None: 517 self.contigs[i].reads[j].rt = [] 518 self.contigs[i].reads[j].rt.append(rt_tag) 519 for wr_tag in wr: 520 if wr_tag.name == r.rd.name: 521 if self.contigs[i].reads[j].wr is None: 522 self.contigs[i].reads[j].wr = [] 523 self.contigs[i].reads[j].wr.append(wr_tag)
524 525
526 -def read(handle):
527 """Parses the full ACE file in list of contigs. 528 529 """ 530 531 handle = iter(handle) 532 533 record = ACEFileRecord() 534 535 try: 536 line = next(handle) 537 except StopIteration: 538 raise ValueError("Premature end of file") 539 540 # check if the file starts correctly 541 if not line.startswith('AS'): 542 raise ValueError("File does not start with 'AS'.") 543 544 words = line.split() 545 record.ncontigs = int(words[1]) 546 record.nreads = int(words[2]) 547 548 # now read all the records 549 record.contigs = list(parse(handle)) 550 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 551 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 552 # if all tags are at the end, they are read with the last contig. The concept of an 553 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 554 # them and put them into the appropriate contig/read instance. 555 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 556 record.sort() 557 return record
558