Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Parser for ACE files output by PHRAP. 
  6   
  7  Written by Frank Kauff (fkauff@duke.edu) and 
  8  Cymon J. Cox (cymon@duke.edu) 
  9   
 10  Usage: 
 11   
 12  There are two ways of reading an ace file: 
 13   
 14  1. The function 'read' reads the whole file at once; 
 15  2. The function 'parse' reads the file contig after contig. 
 16   
 17  First option, parse whole ace file at once:: 
 18   
 19          from Bio.Sequencing import Ace 
 20          acefilerecord = Ace.read(open('my_ace_file.ace')) 
 21   
 22  This gives you: 
 23   - acefilerecord.ncontigs (the number of contigs in the ace file) 
 24   - acefilerecord.nreads (the number of reads in the ace file) 
 25   - acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 26   
 27  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 28  for this contig in a list of instances of the Read class, e.g.:: 
 29   
 30          contig3 = acefilerecord.contigs[2] 
 31          read4 = contig3.reads[3] 
 32          RD_of_read4 = read4.rd 
 33          DS_of_read4 = read4.ds 
 34   
 35  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 36  sorted into the right place. 
 37   
 38  see _RecordConsumer for details. 
 39   
 40  The second option is to  iterate over the contigs of an ace file one by one 
 41  in the ususal way:: 
 42   
 43      from Bio.Sequencing import Ace 
 44      contigs = Ace.parse(open('my_ace_file.ace')) 
 45      for contig in contigs: 
 46          print(contig.name) 
 47          ... 
 48   
 49  Please note that for memory efficiency, when using the iterator approach, only one 
 50  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 51  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 52  Because the parser doesn't see this data until the final record, it cannot be added to 
 53  the appropriate records.  Instead these tags will be returned with the last contig record. 
 54  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 55  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 56  """ 
 57   
 58  from __future__ import print_function 
 59  from Bio._py3k import zip 
 60   
 61  __docformat__ = "restructuredtext en" 
 62   
 63   
64 -class rd(object):
65 """RD (reads), store a read with its name, sequence etc. 66 67 The location and strand each read is mapped to is held in the AF lines. 68 """
69 - def __init__(self):
70 self.name = '' 71 self.padded_bases = None 72 self.info_items = None 73 self.read_tags = None 74 self.sequence = ''
75 76
77 -class qa(object):
78 """QA (read quality), including which part if any was used as the consensus."""
79 - def __init__(self, line=None):
80 self.qual_clipping_start = None 81 self.qual_clipping_end = None 82 self.align_clipping_start = None 83 self.align_clipping_end = None 84 if line: 85 header = line.split() 86 self.qual_clipping_start = int(header[1]) 87 self.qual_clipping_end = int(header[2]) 88 self.align_clipping_start = int(header[3]) 89 self.align_clipping_end = int(header[4])
90 91
92 -class ds(object):
93 """DS lines, include file name of a read's chromatogram file."""
94 - def __init__(self, line=None):
95 self.chromat_file = '' 96 self.phd_file = '' 97 self.time = '' 98 self.chem = '' 99 self.dye = '' 100 self.template = '' 101 self.direction = '' 102 if line: 103 tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION'] 104 poss = [line.find(x) for x in tags] 105 tagpos = dict(zip(poss, tags)) 106 if -1 in tagpos: 107 del tagpos[-1] 108 ps = sorted(tagpos) # the keys 109 for (p1, p2) in zip(ps, ps[1:]+[len(line)+1]): 110 setattr(self, tagpos[p1].lower(), line[p1+len(tagpos[p1])+1:p2].strip())
111 112
113 -class af(object):
114 """AF lines, define the location of the read within the contig. 115 116 Note attribute coru is short for complemented (C) or uncomplemented (U), 117 since the strand information is stored in an ACE file using either the 118 C or U character. 119 """
120 - def __init__(self, line=None):
121 self.name = '' 122 self.coru = None 123 self.padded_start = None 124 if line: 125 header = line.split() 126 self.name = header[1] 127 self.coru = header[2] 128 self.padded_start = int(header[3])
129 130
131 -class bs(object):
132 """BS (base segment), which read was chosen as the consensus at each position."""
133 - def __init__(self, line=None):
134 self.name = '' 135 self.padded_start = None 136 self.padded_end = None 137 if line: 138 header = line.split() 139 self.padded_start = int(header[1]) 140 self.padded_end = int(header[2]) 141 self.name = header[3]
142 143
144 -class rt(object):
145 """RT (transient read tags), generated by crossmatch and phrap."""
146 - def __init__(self, line=None):
147 self.name = '' 148 self.tag_type = '' 149 self.program = '' 150 self.padded_start = None 151 self.padded_end = None 152 self.date = '' 153 self.comment = [] 154 if line: 155 header = line.split() 156 self.name = header[0] 157 self.tag_type = header[1] 158 self.program = header[2] 159 self.padded_start = int(header[3]) 160 self.padded_end = int(header[4]) 161 self.date = header[5]
162 163
164 -class ct(object):
165 """CT (consensus tags)."""
166 - def __init__(self, line=None):
167 self.name = '' 168 self.tag_type = '' 169 self.program = '' 170 self.padded_start = None 171 self.padded_end = None 172 self.date = '' 173 self.notrans = '' 174 self.info = [] 175 self.comment = [] 176 if line: 177 header = line.split() 178 self.name = header[0] 179 self.tag_type = header[1] 180 self.program = header[2] 181 self.padded_start = int(header[3]) 182 self.padded_end = int(header[4]) 183 self.date = header[5] 184 if len(header) == 7: 185 self.notrans = header[6]
186 187
188 -class wa(object):
189 """WA (whole assembly tag), holds the assembly program name, version, etc."""
190 - def __init__(self, line=None):
191 self.tag_type = '' 192 self.program = '' 193 self.date = '' 194 self.info = [] 195 if line: 196 header = line.split() 197 self.tag_type = header[0] 198 self.program = header[1] 199 self.date = header[2]
200 201
202 -class wr(object):
203 """WR lines."""
204 - def __init__(self, line=None):
205 self.name = '' 206 self.aligned = '' 207 self.program = '' 208 self.date = [] 209 if line: 210 header = line.split() 211 self.name = header[0] 212 self.aligned = header[1] 213 self.program = header[2] 214 self.date = header[3]
215 216
217 -class Reads(object):
218 """Holds information about a read supporting an ACE contig."""
219 - def __init__(self, line=None):
220 self.rd = None # one per read 221 self.qa = None # one per read 222 self.ds = None # none or one per read 223 self.rt = None # none or many per read 224 self.wr = None # none or many per read 225 if line: 226 self.rd = rd() 227 header = line.split() 228 self.rd.name = header[1] 229 self.rd.padded_bases = int(header[2]) 230 self.rd.info_items = int(header[3]) 231 self.rd.read_tags = int(header[4])
232 233
234 -class Contig(object):
235 """Holds information about a contig from an ACE record."""
236 - def __init__(self, line=None):
237 self.name = '' 238 self.nbases = None 239 self.nreads = None 240 self.nsegments = None 241 self.uorc = None 242 self.sequence = "" 243 self.quality = [] 244 self.af = [] 245 self.bs = [] 246 self.reads = [] 247 self.ct = None # none or many 248 self.wa = None # none or many 249 if line: 250 header = line.split() 251 self.name = header[1] 252 self.nbases = int(header[2]) 253 self.nreads = int(header[3]) 254 self.nsegments = int(header[4]) 255 self.uorc = header[5]
256 257
258 -def parse(handle):
259 """Iterate of ACE file contig by contig. 260 261 Argument handle is a file-like object. 262 263 This function returns an iterator that allows you to iterate 264 over the ACE file record by record:: 265 266 records = parse(handle) 267 for record in records: 268 # do something with the record 269 270 where each record is a Contig object. 271 """ 272 273 handle = iter(handle) 274 275 line = "" 276 while True: 277 # at beginning, skip the AS and look for first CO command 278 try: 279 while True: 280 if line.startswith('CO'): 281 break 282 line = next(handle) 283 except StopIteration: 284 return 285 286 record = Contig(line) 287 288 for line in handle: 289 line = line.strip() 290 if not line: 291 break 292 record.sequence += line 293 294 for line in handle: 295 if line.strip(): 296 break 297 if not line.startswith("BQ"): 298 raise ValueError("Failed to find BQ line") 299 300 for line in handle: 301 if not line.strip(): 302 break 303 record.quality.extend(int(x) for x in line.split()) 304 305 for line in handle: 306 if line.strip(): 307 break 308 309 while True: 310 if not line.startswith("AF "): 311 break 312 record.af.append(af(line)) 313 try: 314 line = next(handle) 315 except StopIteration: 316 raise ValueError("Unexpected end of AF block") 317 318 while True: 319 if line.strip(): 320 break 321 try: 322 line = next(handle) 323 except StopIteration: 324 raise ValueError("Unexpected end of file") 325 326 while True: 327 if not line.startswith("BS "): 328 break 329 record.bs.append(bs(line)) 330 try: 331 line = next(handle) 332 except StopIteration: 333 raise ValueError("Failed to find end of BS block") 334 335 # now read all the read data 336 # it starts with a 'RD', and then a mandatory QA 337 # then follows an optional DS 338 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 339 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 340 # with that later. 341 while True: 342 343 # each read must have a rd and qa 344 try: 345 while True: 346 # If I've met the condition, then stop reading the line. 347 if line.startswith("RD "): 348 break 349 line = next(handle) 350 except StopIteration: 351 raise ValueError("Failed to find RD line") 352 353 record.reads.append(Reads(line)) 354 355 for line in handle: 356 line = line.strip() 357 if not line: 358 break 359 record.reads[-1].rd.sequence += line 360 361 for line in handle: 362 if line.strip(): 363 break 364 if not line.startswith("QA "): 365 raise ValueError("Failed to find QA line") 366 record.reads[-1].qa = qa(line) 367 368 # now one ds can follow 369 for line in handle: 370 if line.strip(): 371 break 372 else: 373 break 374 375 if line.startswith("DS "): 376 record.reads[-1].ds = ds(line) 377 line = "" 378 # the file could just end, or there's some more stuff. In ace files, anything can happen. 379 # the following tags are interspersed between reads and can appear multiple times. 380 while True: 381 # something left 382 try: 383 while True: 384 if line.strip(): 385 break 386 line = next(handle) 387 except StopIteration: 388 # file ends here 389 break 390 if line.startswith("RT{"): 391 # now if we're at the end of the file, this rt could 392 # belong to a previous read, not the actual one. 393 # we store it here were it appears, the user can sort later. 394 if record.reads[-1].rt is None: 395 record.reads[-1].rt = [] 396 for line in handle: 397 line = line.strip() 398 # if line=="COMMENT{": 399 if line.startswith("COMMENT{"): 400 if line[8:].strip(): 401 # MIRA 3.0.5 would miss the new line out :( 402 record.reads[-1].rt[-1].comment.append(line[8:]) 403 for line in handle: 404 line = line.strip() 405 if line.endswith("C}"): 406 break 407 record.reads[-1].rt[-1].comment.append(line) 408 elif line == '}': 409 break 410 else: 411 record.reads[-1].rt.append(rt(line)) 412 line = "" 413 elif line.startswith("WR{"): 414 if record.reads[-1].wr is None: 415 record.reads[-1].wr = [] 416 for line in handle: 417 line = line.strip() 418 if line == '}': 419 break 420 record.reads[-1].wr.append(wr(line)) 421 line = "" 422 elif line.startswith("WA{"): 423 if record.wa is None: 424 record.wa = [] 425 try: 426 line = next(handle) 427 except StopIteration: 428 raise ValueError("Failed to read WA block") 429 record.wa.append(wa(line)) 430 for line in handle: 431 line = line.strip() 432 if line == '}': 433 break 434 record.wa[-1].info.append(line) 435 line = "" 436 elif line.startswith("CT{"): 437 if record.ct is None: 438 record.ct = [] 439 try: 440 line = next(handle) 441 except StopIteration: 442 raise ValueError("Failed to read CT block") 443 record.ct.append(ct(line)) 444 for line in handle: 445 line = line.strip() 446 if line == "COMMENT{": 447 for line in handle: 448 line = line.strip() 449 if line.endswith("C}"): 450 break 451 record.ct[-1].comment.append(line) 452 elif line == '}': 453 break 454 else: 455 record.ct[-1].info.append(line) 456 line = "" 457 else: 458 break 459 460 if not line.startswith('RD'): # another read? 461 break 462 463 yield record
464 465
466 -class ACEFileRecord(object):
467 """Holds data of an ACE file. 468 """
469 - def __init__(self):
470 self.ncontigs = None 471 self.nreads = None 472 self.contigs = [] 473 self.wa = None # none or many
474
475 - def sort(self):
476 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible.""" 477 478 ct = [] 479 rt = [] 480 wr = [] 481 # search for tags that aren't in the right position 482 for i in range(len(self.contigs)): 483 c = self.contigs[i] 484 if c.wa: 485 if not self.wa: 486 self.wa = [] 487 self.wa.extend(c.wa) 488 if c.ct: 489 newcts = [ct_tag for ct_tag in c.ct if ct_tag.name != c.name] 490 for x in newcts: 491 self.contigs[i].ct.remove(x) 492 ct.extend(newcts) 493 for j in range(len(c.reads)): 494 r = c.reads[j] 495 if r.rt: 496 newrts = [rt_tag for rt_tag in r.rt if rt_tag.name != r.rd.name] 497 for x in newrts: 498 self.contigs[i].reads[j].rt.remove(x) 499 rt.extend(newrts) 500 if r.wr: 501 newwrs = [wr_tag for wr_tag in r.wr if wr_tag.name != r.rd.name] 502 for x in newwrs: 503 self.contigs[i].reads[j].wr.remove(x) 504 wr.extend(newwrs) 505 # now sort them into their proper place 506 for i in range(len(self.contigs)): 507 c = self.contigs[i] 508 for ct_tag in ct: 509 if ct_tag.name == c.name: 510 if self.contigs[i].ct is None: 511 self.contigs[i].ct = [] 512 self.contigs[i].ct.append(ct_tag) 513 if rt or wr: 514 for j in range(len(c.reads)): 515 r = c.reads[j] 516 for rt_tag in rt: 517 if rt_tag.name == r.rd.name: 518 if self.contigs[i].reads[j].rt is None: 519 self.contigs[i].reads[j].rt = [] 520 self.contigs[i].reads[j].rt.append(rt_tag) 521 for wr_tag in wr: 522 if wr_tag.name == r.rd.name: 523 if self.contigs[i].reads[j].wr is None: 524 self.contigs[i].reads[j].wr = [] 525 self.contigs[i].reads[j].wr.append(wr_tag)
526 527
528 -def read(handle):
529 """Parses the full ACE file in list of contigs. 530 531 """ 532 533 handle = iter(handle) 534 535 record = ACEFileRecord() 536 537 try: 538 line = next(handle) 539 except StopIteration: 540 raise ValueError("Premature end of file") 541 542 # check if the file starts correctly 543 if not line.startswith('AS'): 544 raise ValueError("File does not start with 'AS'.") 545 546 words = line.split() 547 record.ncontigs = int(words[1]) 548 record.nreads = int(words[2]) 549 550 # now read all the records 551 record.contigs = list(parse(handle)) 552 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 553 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 554 # if all tags are at the end, they are read with the last contig. The concept of an 555 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 556 # them and put them into the appropriate contig/read instance. 557 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 558 record.sort() 559 return record
560