Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Parser for ACE files output by PHRAP. 
  6   
  7  Written by Frank Kauff (fkauff@duke.edu) and 
  8  Cymon J. Cox (cymon@duke.edu) 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: 
 13   
 14  1. The function 'read' reads the whole file at once; 
 15  2. The function 'parse' reads the file contig after contig. 
 16   
 17  First option, parse whole ace file at once:: 
 18   
 19          from Bio.Sequencing import Ace 
 20          acefilerecord = Ace.read(open('my_ace_file.ace')) 
 21   
 22  This gives you: 
 23   - acefilerecord.ncontigs (the number of contigs in the ace file) 
 24   - acefilerecord.nreads (the number of reads in the ace file) 
 25   - acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 26   
 27  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 28  for this contig in a list of instances of the Read class, e.g.:: 
 29   
 30          contig3 = acefilerecord.contigs[2] 
 31          read4 = contig3.reads[3] 
 32          RD_of_read4 = read4.rd 
 33          DS_of_read4 = read4.ds 
 34   
 35  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 36  sorted into the right place. 
 37   
 38  see _RecordConsumer for details. 
 39   
 40  The second option is to  iterate over the contigs of an ace file one by one 
 41  in the ususal way:: 
 42   
 43      from Bio.Sequencing import Ace 
 44      contigs = Ace.parse(open('my_ace_file.ace')) 
 45      for contig in contigs: 
 46          print(contig.name) 
 47          ... 
 48   
 49  Please note that for memory efficiency, when using the iterator approach, only one 
 50  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 51  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 52  Because the parser doesn't see this data until the final record, it cannot be added to 
 53  the appropriate records.  Instead these tags will be returned with the last contig record. 
 54  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 55  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 56  """ 
 57   
 58  from __future__ import print_function 
 59  from Bio._py3k import zip 
 60   
 61   
62 -class rd(object):
63 """RD (reads), store a read with its name, sequence etc. 64 65 The location and strand each read is mapped to is held in the AF lines. 66 """
67 - def __init__(self):
68 self.name = '' 69 self.padded_bases = None 70 self.info_items = None 71 self.read_tags = None 72 self.sequence = ''
73 74
75 -class qa(object):
76 """QA (read quality), including which part if any was used as the consensus."""
77 - def __init__(self, line=None):
78 self.qual_clipping_start = None 79 self.qual_clipping_end = None 80 self.align_clipping_start = None 81 self.align_clipping_end = None 82 if line: 83 header = line.split() 84 self.qual_clipping_start = int(header[1]) 85 self.qual_clipping_end = int(header[2]) 86 self.align_clipping_start = int(header[3]) 87 self.align_clipping_end = int(header[4])
88 89
90 -class ds(object):
91 """DS lines, include file name of a read's chromatogram file."""
92 - def __init__(self, line=None):
93 self.chromat_file = '' 94 self.phd_file = '' 95 self.time = '' 96 self.chem = '' 97 self.dye = '' 98 self.template = '' 99 self.direction = '' 100 if line: 101 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION'] 102 poss = [line.find(x) for x in tags] 103 tagpos = dict(zip(poss, tags)) 104 if -1 in tagpos: 105 del tagpos[-1] 106 ps = sorted(tagpos) # the keys 107 for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]): 108 setattr(self, tagpos[p1].lower(), line[p1 + len(tagpos[p1]) + 1:p2].strip())
109 110
111 -class af(object):
112 """AF lines, define the location of the read within the contig. 113 114 Note attribute coru is short for complemented (C) or uncomplemented (U), 115 since the strand information is stored in an ACE file using either the 116 C or U character. 117 """
118 - def __init__(self, line=None):
119 self.name = '' 120 self.coru = None 121 self.padded_start = None 122 if line: 123 header = line.split() 124 self.name = header[1] 125 self.coru = header[2] 126 self.padded_start = int(header[3])
127 128
129 -class bs(object):
130 """BS (base segment), which read was chosen as the consensus at each position."""
131 - def __init__(self, line=None):
132 self.name = '' 133 self.padded_start = None 134 self.padded_end = None 135 if line: 136 header = line.split() 137 self.padded_start = int(header[1]) 138 self.padded_end = int(header[2]) 139 self.name = header[3]
140 141
142 -class rt(object):
143 """RT (transient read tags), generated by crossmatch and phrap."""
144 - def __init__(self, line=None):
145 self.name = '' 146 self.tag_type = '' 147 self.program = '' 148 self.padded_start = None 149 self.padded_end = None 150 self.date = '' 151 self.comment = [] 152 if line: 153 header = line.split() 154 self.name = header[0] 155 self.tag_type = header[1] 156 self.program = header[2] 157 self.padded_start = int(header[3]) 158 self.padded_end = int(header[4]) 159 self.date = header[5]
160 161
162 -class ct(object):
163 """CT (consensus tags)."""
164 - def __init__(self, line=None):
165 self.name = '' 166 self.tag_type = '' 167 self.program = '' 168 self.padded_start = None 169 self.padded_end = None 170 self.date = '' 171 self.notrans = '' 172 self.info = [] 173 self.comment = [] 174 if line: 175 header = line.split() 176 self.name = header[0] 177 self.tag_type = header[1] 178 self.program = header[2] 179 self.padded_start = int(header[3]) 180 self.padded_end = int(header[4]) 181 self.date = header[5] 182 if len(header) == 7: 183 self.notrans = header[6]
184 185
186 -class wa(object):
187 """WA (whole assembly tag), holds the assembly program name, version, etc."""
188 - def __init__(self, line=None):
189 self.tag_type = '' 190 self.program = '' 191 self.date = '' 192 self.info = [] 193 if line: 194 header = line.split() 195 self.tag_type = header[0] 196 self.program = header[1] 197 self.date = header[2]
198 199
200 -class wr(object):
201 """WR lines."""
202 - def __init__(self, line=None):
203 self.name = '' 204 self.aligned = '' 205 self.program = '' 206 self.date = [] 207 if line: 208 header = line.split() 209 self.name = header[0] 210 self.aligned = header[1] 211 self.program = header[2] 212 self.date = header[3]
213 214
215 -class Reads(object):
216 """Holds information about a read supporting an ACE contig."""
217 - def __init__(self, line=None):
218 self.rd = None # one per read 219 self.qa = None # one per read 220 self.ds = None # none or one per read 221 self.rt = None # none or many per read 222 self.wr = None # none or many per read 223 if line: 224 self.rd = rd() 225 header = line.split() 226 self.rd.name = header[1] 227 self.rd.padded_bases = int(header[2]) 228 self.rd.info_items = int(header[3]) 229 self.rd.read_tags = int(header[4])
230 231
232 -class Contig(object):
233 """Holds information about a contig from an ACE record."""
234 - def __init__(self, line=None):
235 self.name = '' 236 self.nbases = None 237 self.nreads = None 238 self.nsegments = None 239 self.uorc = None 240 self.sequence = "" 241 self.quality = [] 242 self.af = [] 243 self.bs = [] 244 self.reads = [] 245 self.ct = None # none or many 246 self.wa = None # none or many 247 if line: 248 header = line.split() 249 self.name = header[1] 250 self.nbases = int(header[2]) 251 self.nreads = int(header[3]) 252 self.nsegments = int(header[4]) 253 self.uorc = header[5]
254 255
256 -def parse(handle):
257 """Iterate of ACE file contig by contig. 258 259 Argument handle is a file-like object. 260 261 This function returns an iterator that allows you to iterate 262 over the ACE file record by record:: 263 264 records = parse(handle) 265 for record in records: 266 # do something with the record 267 268 where each record is a Contig object. 269 """ 270 handle = iter(handle) 271 272 line = "" 273 while True: 274 # at beginning, skip the AS and look for first CO command 275 try: 276 while True: 277 if line.startswith('CO'): 278 break 279 line = next(handle) 280 except StopIteration: 281 return 282 283 record = Contig(line) 284 285 for line in handle: 286 line = line.strip() 287 if not line: 288 break 289 record.sequence += line 290 291 for line in handle: 292 if line.strip(): 293 break 294 if not line.startswith("BQ"): 295 raise ValueError("Failed to find BQ line") 296 297 for line in handle: 298 if not line.strip(): 299 break 300 record.quality.extend(int(x) for x in line.split()) 301 302 for line in handle: 303 if line.strip(): 304 break 305 306 while True: 307 if not line.startswith("AF "): 308 break 309 record.af.append(af(line)) 310 try: 311 line = next(handle) 312 except StopIteration: 313 raise ValueError("Unexpected end of AF block") 314 315 while True: 316 if line.strip(): 317 break 318 try: 319 line = next(handle) 320 except StopIteration: 321 raise ValueError("Unexpected end of file") 322 323 while True: 324 if not line.startswith("BS "): 325 break 326 record.bs.append(bs(line)) 327 try: 328 line = next(handle) 329 except StopIteration: 330 raise ValueError("Failed to find end of BS block") 331 332 # now read all the read data 333 # it starts with a 'RD', and then a mandatory QA 334 # then follows an optional DS 335 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 336 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 337 # with that later. 338 while True: 339 340 # each read must have a rd and qa 341 try: 342 while True: 343 # If I've met the condition, then stop reading the line. 344 if line.startswith("RD "): 345 break 346 line = next(handle) 347 except StopIteration: 348 raise ValueError("Failed to find RD line") 349 350 record.reads.append(Reads(line)) 351 352 for line in handle: 353 line = line.strip() 354 if not line: 355 break 356 record.reads[-1].rd.sequence += line 357 358 for line in handle: 359 if line.strip(): 360 break 361 if not line.startswith("QA "): 362 raise ValueError("Failed to find QA line") 363 record.reads[-1].qa = qa(line) 364 365 # now one ds can follow 366 for line in handle: 367 if line.strip(): 368 break 369 else: 370 break 371 372 if line.startswith("DS "): 373 record.reads[-1].ds = ds(line) 374 line = "" 375 # the file could just end, or there's some more stuff. In ace files, anything can happen. 376 # the following tags are interspersed between reads and can appear multiple times. 377 while True: 378 # something left 379 try: 380 while True: 381 if line.strip(): 382 break 383 line = next(handle) 384 except StopIteration: 385 # file ends here 386 break 387 if line.startswith("RT{"): 388 # now if we're at the end of the file, this rt could 389 # belong to a previous read, not the actual one. 390 # we store it here were it appears, the user can sort later. 391 if record.reads[-1].rt is None: 392 record.reads[-1].rt = [] 393 for line in handle: 394 line = line.strip() 395 # if line=="COMMENT{": 396 if line.startswith("COMMENT{"): 397 if line[8:].strip(): 398 # MIRA 3.0.5 would miss the new line out :( 399 record.reads[-1].rt[-1].comment.append(line[8:]) 400 for line in handle: 401 line = line.strip() 402 if line.endswith("C}"): 403 break 404 record.reads[-1].rt[-1].comment.append(line) 405 elif line == '}': 406 break 407 else: 408 record.reads[-1].rt.append(rt(line)) 409 line = "" 410 elif line.startswith("WR{"): 411 if record.reads[-1].wr is None: 412 record.reads[-1].wr = [] 413 for line in handle: 414 line = line.strip() 415 if line == '}': 416 break 417 record.reads[-1].wr.append(wr(line)) 418 line = "" 419 elif line.startswith("WA{"): 420 if record.wa is None: 421 record.wa = [] 422 try: 423 line = next(handle) 424 except StopIteration: 425 raise ValueError("Failed to read WA block") 426 record.wa.append(wa(line)) 427 for line in handle: 428 line = line.strip() 429 if line == '}': 430 break 431 record.wa[-1].info.append(line) 432 line = "" 433 elif line.startswith("CT{"): 434 if record.ct is None: 435 record.ct = [] 436 try: 437 line = next(handle) 438 except StopIteration: 439 raise ValueError("Failed to read CT block") 440 record.ct.append(ct(line)) 441 for line in handle: 442 line = line.strip() 443 if line == "COMMENT{": 444 for line in handle: 445 line = line.strip() 446 if line.endswith("C}"): 447 break 448 record.ct[-1].comment.append(line) 449 elif line == '}': 450 break 451 else: 452 record.ct[-1].info.append(line) 453 line = "" 454 else: 455 break 456 457 if not line.startswith('RD'): # another read? 458 break 459 460 yield record
461 462
463 -class ACEFileRecord(object):
464 """Holds data of an ACE file.""" 465
466 - def __init__(self):
467 self.ncontigs = None 468 self.nreads = None 469 self.contigs = [] 470 self.wa = None # none or many
471
472 - def sort(self):
473 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.""" 474 ct = [] 475 rt = [] 476 wr = [] 477 # search for tags that aren't in the right position 478 for i in range(len(self.contigs)): 479 c = self.contigs[i] 480 if c.wa: 481 if not self.wa: 482 self.wa = [] 483 self.wa.extend(c.wa) 484 if c.ct: 485 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name] 486 for x in newcts: 487 self.contigs[i].ct.remove(x) 488 ct.extend(newcts) 489 for j in range(len(c.reads)): 490 r = c.reads[j] 491 if r.rt: 492 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name] 493 for x in newrts: 494 self.contigs[i].reads[j].rt.remove(x) 495 rt.extend(newrts) 496 if r.wr: 497 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name] 498 for x in newwrs: 499 self.contigs[i].reads[j].wr.remove(x) 500 wr.extend(newwrs) 501 # now sort them into their proper place 502 for i in range(len(self.contigs)): 503 c = self.contigs[i] 504 for ct_tag in ct: 505 if ct_tag.name == c.name: 506 if self.contigs[i].ct is None: 507 self.contigs[i].ct = [] 508 self.contigs[i].ct.append(ct_tag) 509 if rt or wr: 510 for j in range(len(c.reads)): 511 r = c.reads[j] 512 for rt_tag in rt: 513 if rt_tag.name == r.rd.name: 514 if self.contigs[i].reads[j].rt is None: 515 self.contigs[i].reads[j].rt = [] 516 self.contigs[i].reads[j].rt.append(rt_tag) 517 for wr_tag in wr: 518 if wr_tag.name == r.rd.name: 519 if self.contigs[i].reads[j].wr is None: 520 self.contigs[i].reads[j].wr = [] 521 self.contigs[i].reads[j].wr.append(wr_tag)
522 523
524 -def read(handle):
525 """Parses the full ACE file in list of contigs.""" 526 handle = iter(handle) 527 528 record = ACEFileRecord() 529 530 try: 531 line = next(handle) 532 except StopIteration: 533 raise ValueError("Premature end of file") 534 535 # check if the file starts correctly 536 if not line.startswith('AS'): 537 raise ValueError("File does not start with 'AS'.") 538 539 words = line.split() 540 record.ncontigs = int(words[1]) 541 record.nreads = int(words[2]) 542 543 # now read all the records 544 record.contigs = list(parse(handle)) 545 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 546 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 547 # if all tags are at the end, they are read with the last contig. The concept of an 548 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 549 # them and put them into the appropriate contig/read instance. 550 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 551 record.sort() 552 return record
553