Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Parser for ACE files output by PHRAP. 
  6   
  7  Written by Frank Kauff (fkauff@duke.edu) and 
  8  Cymon J. Cox (cymon@duke.edu) 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: 
 13   
 14  1. The function 'read' reads the whole file at once; 
 15  2. The function 'parse' reads the file contig after contig. 
 16   
 17  First option, parse whole ace file at once:: 
 18   
 19          from Bio.Sequencing import Ace 
 20          acefilerecord = Ace.read(open('my_ace_file.ace')) 
 21   
 22  This gives you: 
 23   - acefilerecord.ncontigs (the number of contigs in the ace file) 
 24   - acefilerecord.nreads (the number of reads in the ace file) 
 25   - acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 26   
 27  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 28  for this contig in a list of instances of the Read class, e.g.:: 
 29   
 30          contig3 = acefilerecord.contigs[2] 
 31          read4 = contig3.reads[3] 
 32          RD_of_read4 = read4.rd 
 33          DS_of_read4 = read4.ds 
 34   
 35  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 36  sorted into the right place. 
 37   
 38  see _RecordConsumer for details. 
 39   
 40  The second option is to  iterate over the contigs of an ace file one by one 
 41  in the ususal way:: 
 42   
 43      from Bio.Sequencing import Ace 
 44      contigs = Ace.parse(open('my_ace_file.ace')) 
 45      for contig in contigs: 
 46          print(contig.name) 
 47          ... 
 48   
 49  Please note that for memory efficiency, when using the iterator approach, only one 
 50  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 51  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 52  Because the parser doesn't see this data until the final record, it cannot be added to 
 53  the appropriate records.  Instead these tags will be returned with the last contig record. 
 54  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 55  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 56  """ 
 57   
 58  from __future__ import print_function 
 59  from Bio._py3k import zip 
 60   
 61   
62 -class rd(object):
63 """RD (reads), store a read with its name, sequence etc. 64 65 The location and strand each read is mapped to is held in the AF lines. 66 """ 67
68 - def __init__(self):
69 self.name = '' 70 self.padded_bases = None 71 self.info_items = None 72 self.read_tags = None 73 self.sequence = ''
74 75
76 -class qa(object):
77 """QA (read quality), including which part if any was used as the consensus.""" 78
79 - def __init__(self, line=None):
80 self.qual_clipping_start = None 81 self.qual_clipping_end = None 82 self.align_clipping_start = None 83 self.align_clipping_end = None 84 if line: 85 header = line.split() 86 self.qual_clipping_start = int(header[1]) 87 self.qual_clipping_end = int(header[2]) 88 self.align_clipping_start = int(header[3]) 89 self.align_clipping_end = int(header[4])
90 91
92 -class ds(object):
93 """DS lines, include file name of a read's chromatogram file.""" 94
95 - def __init__(self, line=None):
96 self.chromat_file = '' 97 self.phd_file = '' 98 self.time = '' 99 self.chem = '' 100 self.dye = '' 101 self.template = '' 102 self.direction = '' 103 if line: 104 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION'] 105 poss = [line.find(x) for x in tags] 106 tagpos = dict(zip(poss, tags)) 107 if -1 in tagpos: 108 del tagpos[-1] 109 ps = sorted(tagpos) # the keys 110 for (p1, p2) in zip(ps, ps[1:] + [len(line) + 1]): 111 setattr(self, tagpos[p1].lower(), line[p1 + len(tagpos[p1]) + 1:p2].strip())
112 113
114 -class af(object):
115 """AF lines, define the location of the read within the contig. 116 117 Note attribute coru is short for complemented (C) or uncomplemented (U), 118 since the strand information is stored in an ACE file using either the 119 C or U character. 120 """ 121
122 - def __init__(self, line=None):
123 self.name = '' 124 self.coru = None 125 self.padded_start = None 126 if line: 127 header = line.split() 128 self.name = header[1] 129 self.coru = header[2] 130 self.padded_start = int(header[3])
131 132
133 -class bs(object):
134 """BS (base segment), which read was chosen as the consensus at each position.""" 135
136 - def __init__(self, line=None):
137 self.name = '' 138 self.padded_start = None 139 self.padded_end = None 140 if line: 141 header = line.split() 142 self.padded_start = int(header[1]) 143 self.padded_end = int(header[2]) 144 self.name = header[3]
145 146
147 -class rt(object):
148 """RT (transient read tags), generated by crossmatch and phrap.""" 149
150 - def __init__(self, line=None):
151 self.name = '' 152 self.tag_type = '' 153 self.program = '' 154 self.padded_start = None 155 self.padded_end = None 156 self.date = '' 157 self.comment = [] 158 if line: 159 header = line.split() 160 self.name = header[0] 161 self.tag_type = header[1] 162 self.program = header[2] 163 self.padded_start = int(header[3]) 164 self.padded_end = int(header[4]) 165 self.date = header[5]
166 167
168 -class ct(object):
169 """CT (consensus tags).""" 170
171 - def __init__(self, line=None):
172 self.name = '' 173 self.tag_type = '' 174 self.program = '' 175 self.padded_start = None 176 self.padded_end = None 177 self.date = '' 178 self.notrans = '' 179 self.info = [] 180 self.comment = [] 181 if line: 182 header = line.split() 183 self.name = header[0] 184 self.tag_type = header[1] 185 self.program = header[2] 186 self.padded_start = int(header[3]) 187 self.padded_end = int(header[4]) 188 self.date = header[5] 189 if len(header) == 7: 190 self.notrans = header[6]
191 192
193 -class wa(object):
194 """WA (whole assembly tag), holds the assembly program name, version, etc.""" 195
196 - def __init__(self, line=None):
197 self.tag_type = '' 198 self.program = '' 199 self.date = '' 200 self.info = [] 201 if line: 202 header = line.split() 203 self.tag_type = header[0] 204 self.program = header[1] 205 self.date = header[2]
206 207
208 -class wr(object):
209 """WR lines.""" 210
211 - def __init__(self, line=None):
212 self.name = '' 213 self.aligned = '' 214 self.program = '' 215 self.date = [] 216 if line: 217 header = line.split() 218 self.name = header[0] 219 self.aligned = header[1] 220 self.program = header[2] 221 self.date = header[3]
222 223
224 -class Reads(object):
225 """Holds information about a read supporting an ACE contig.""" 226
227 - def __init__(self, line=None):
228 self.rd = None # one per read 229 self.qa = None # one per read 230 self.ds = None # none or one per read 231 self.rt = None # none or many per read 232 self.wr = None # none or many per read 233 if line: 234 self.rd = rd() 235 header = line.split() 236 self.rd.name = header[1] 237 self.rd.padded_bases = int(header[2]) 238 self.rd.info_items = int(header[3]) 239 self.rd.read_tags = int(header[4])
240 241
242 -class Contig(object):
243 """Holds information about a contig from an ACE record.""" 244
245 - def __init__(self, line=None):
246 self.name = '' 247 self.nbases = None 248 self.nreads = None 249 self.nsegments = None 250 self.uorc = None 251 self.sequence = "" 252 self.quality = [] 253 self.af = [] 254 self.bs = [] 255 self.reads = [] 256 self.ct = None # none or many 257 self.wa = None # none or many 258 if line: 259 header = line.split() 260 self.name = header[1] 261 self.nbases = int(header[2]) 262 self.nreads = int(header[3]) 263 self.nsegments = int(header[4]) 264 self.uorc = header[5]
265 266
267 -def parse(handle):
268 """Iterate of ACE file contig by contig. 269 270 Argument handle is a file-like object. 271 272 This function returns an iterator that allows you to iterate 273 over the ACE file record by record:: 274 275 records = parse(handle) 276 for record in records: 277 # do something with the record 278 279 where each record is a Contig object. 280 """ 281 handle = iter(handle) 282 283 line = "" 284 while True: 285 # at beginning, skip the AS and look for first CO command 286 try: 287 while True: 288 if line.startswith('CO'): 289 break 290 line = next(handle) 291 except StopIteration: 292 return 293 294 record = Contig(line) 295 296 for line in handle: 297 line = line.strip() 298 if not line: 299 break 300 record.sequence += line 301 302 for line in handle: 303 if line.strip(): 304 break 305 if not line.startswith("BQ"): 306 raise ValueError("Failed to find BQ line") 307 308 for line in handle: 309 if not line.strip(): 310 break 311 record.quality.extend(int(x) for x in line.split()) 312 313 for line in handle: 314 if line.strip(): 315 break 316 317 while True: 318 if not line.startswith("AF "): 319 break 320 record.af.append(af(line)) 321 try: 322 line = next(handle) 323 except StopIteration: 324 raise ValueError("Unexpected end of AF block") 325 326 while True: 327 if line.strip(): 328 break 329 try: 330 line = next(handle) 331 except StopIteration: 332 raise ValueError("Unexpected end of file") 333 334 while True: 335 if not line.startswith("BS "): 336 break 337 record.bs.append(bs(line)) 338 try: 339 line = next(handle) 340 except StopIteration: 341 raise ValueError("Failed to find end of BS block") 342 343 # now read all the read data 344 # it starts with a 'RD', and then a mandatory QA 345 # then follows an optional DS 346 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 347 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 348 # with that later. 349 while True: 350 351 # each read must have a rd and qa 352 try: 353 while True: 354 # If I've met the condition, then stop reading the line. 355 if line.startswith("RD "): 356 break 357 line = next(handle) 358 except StopIteration: 359 raise ValueError("Failed to find RD line") 360 361 record.reads.append(Reads(line)) 362 363 for line in handle: 364 line = line.strip() 365 if not line: 366 break 367 record.reads[-1].rd.sequence += line 368 369 for line in handle: 370 if line.strip(): 371 break 372 if not line.startswith("QA "): 373 raise ValueError("Failed to find QA line") 374 record.reads[-1].qa = qa(line) 375 376 # now one ds can follow 377 for line in handle: 378 if line.strip(): 379 break 380 else: 381 break 382 383 if line.startswith("DS "): 384 record.reads[-1].ds = ds(line) 385 line = "" 386 # the file could just end, or there's some more stuff. In ace files, anything can happen. 387 # the following tags are interspersed between reads and can appear multiple times. 388 while True: 389 # something left 390 try: 391 while True: 392 if line.strip(): 393 break 394 line = next(handle) 395 except StopIteration: 396 # file ends here 397 break 398 if line.startswith("RT{"): 399 # now if we're at the end of the file, this rt could 400 # belong to a previous read, not the actual one. 401 # we store it here were it appears, the user can sort later. 402 if record.reads[-1].rt is None: 403 record.reads[-1].rt = [] 404 for line in handle: 405 line = line.strip() 406 # if line=="COMMENT{": 407 if line.startswith("COMMENT{"): 408 if line[8:].strip(): 409 # MIRA 3.0.5 would miss the new line out :( 410 record.reads[-1].rt[-1].comment.append(line[8:]) 411 for line in handle: 412 line = line.strip() 413 if line.endswith("C}"): 414 break 415 record.reads[-1].rt[-1].comment.append(line) 416 elif line == '}': 417 break 418 else: 419 record.reads[-1].rt.append(rt(line)) 420 line = "" 421 elif line.startswith("WR{"): 422 if record.reads[-1].wr is None: 423 record.reads[-1].wr = [] 424 for line in handle: 425 line = line.strip() 426 if line == '}': 427 break 428 record.reads[-1].wr.append(wr(line)) 429 line = "" 430 elif line.startswith("WA{"): 431 if record.wa is None: 432 record.wa = [] 433 try: 434 line = next(handle) 435 except StopIteration: 436 raise ValueError("Failed to read WA block") 437 record.wa.append(wa(line)) 438 for line in handle: 439 line = line.strip() 440 if line == '}': 441 break 442 record.wa[-1].info.append(line) 443 line = "" 444 elif line.startswith("CT{"): 445 if record.ct is None: 446 record.ct = [] 447 try: 448 line = next(handle) 449 except StopIteration: 450 raise ValueError("Failed to read CT block") 451 record.ct.append(ct(line)) 452 for line in handle: 453 line = line.strip() 454 if line == "COMMENT{": 455 for line in handle: 456 line = line.strip() 457 if line.endswith("C}"): 458 break 459 record.ct[-1].comment.append(line) 460 elif line == '}': 461 break 462 else: 463 record.ct[-1].info.append(line) 464 line = "" 465 else: 466 break 467 468 if not line.startswith('RD'): # another read? 469 break 470 471 yield record
472 473
474 -class ACEFileRecord(object):
475 """Holds data of an ACE file.""" 476
477 - def __init__(self):
478 self.ncontigs = None 479 self.nreads = None 480 self.contigs = [] 481 self.wa = None # none or many
482
483 - def sort(self):
484 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.""" 485 ct = [] 486 rt = [] 487 wr = [] 488 # search for tags that aren't in the right position 489 for i in range(len(self.contigs)): 490 c = self.contigs[i] 491 if c.wa: 492 if not self.wa: 493 self.wa = [] 494 self.wa.extend(c.wa) 495 if c.ct: 496 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name] 497 for x in newcts: 498 self.contigs[i].ct.remove(x) 499 ct.extend(newcts) 500 for j in range(len(c.reads)): 501 r = c.reads[j] 502 if r.rt: 503 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name] 504 for x in newrts: 505 self.contigs[i].reads[j].rt.remove(x) 506 rt.extend(newrts) 507 if r.wr: 508 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name] 509 for x in newwrs: 510 self.contigs[i].reads[j].wr.remove(x) 511 wr.extend(newwrs) 512 # now sort them into their proper place 513 for i in range(len(self.contigs)): 514 c = self.contigs[i] 515 for ct_tag in ct: 516 if ct_tag.name == c.name: 517 if self.contigs[i].ct is None: 518 self.contigs[i].ct = [] 519 self.contigs[i].ct.append(ct_tag) 520 if rt or wr: 521 for j in range(len(c.reads)): 522 r = c.reads[j] 523 for rt_tag in rt: 524 if rt_tag.name == r.rd.name: 525 if self.contigs[i].reads[j].rt is None: 526 self.contigs[i].reads[j].rt = [] 527 self.contigs[i].reads[j].rt.append(rt_tag) 528 for wr_tag in wr: 529 if wr_tag.name == r.rd.name: 530 if self.contigs[i].reads[j].wr is None: 531 self.contigs[i].reads[j].wr = [] 532 self.contigs[i].reads[j].wr.append(wr_tag)
533 534
535 -def read(handle):
536 """Parses the full ACE file in list of contigs.""" 537 handle = iter(handle) 538 539 record = ACEFileRecord() 540 541 try: 542 line = next(handle) 543 except StopIteration: 544 raise ValueError("Premature end of file") 545 546 # check if the file starts correctly 547 if not line.startswith('AS'): 548 raise ValueError("File does not start with 'AS'.") 549 550 words = line.split() 551 record.ncontigs = int(words[1]) 552 record.nreads = int(words[2]) 553 554 # now read all the records 555 record.contigs = list(parse(handle)) 556 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 557 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 558 # if all tags are at the end, they are read with the last contig. The concept of an 559 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 560 # them and put them into the appropriate contig/read instance. 561 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 562 record.sort() 563 return record
564