Package Bio :: Package AlignIO :: Module MafIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.MafIO

  1  # Copyright 2011, 2012 by Andrew Sczesnak.  All rights reserved. 
  2  # Revisions Copyright 2011, 2017 by Peter Cock.  All rights reserved. 
  3  # Revisions Copyright 2014, 2015 by Adam Novak.  All rights reserved. 
  4  # Revisions Copyright 2015, 2017 by Blaise Li.  All rights reserved. 
  5  # 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9  """Bio.AlignIO support for the "maf" multiple alignment format. 
 10   
 11  The Multiple Alignment Format, described by UCSC, stores a series of 
 12  multiple alignments in a single file. It is suitable for whole-genome 
 13  to whole-genome alignments, metadata such as source chromosome, start 
 14  position, size, and strand can be stored. 
 15   
 16  See http://genome.ucsc.edu/FAQ/FAQformat.html#format5 
 17   
 18  You are expected to use this module via the Bio.AlignIO functions(or the 
 19  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 20   
 21  Coordinates in the MAF format are defined in terms of zero-based start 
 22  positions (like Python) and aligning region sizes. 
 23   
 24  A minimal aligned region of length one and starting at first position in the 
 25  source sequence would have ``start == 0`` and ``size == 1``. 
 26   
 27  As we can see on this example, ``start + size`` will give one more than the 
 28  zero-based end position. We can therefore manipulate ``start`` and 
 29  ``start + size`` as python list slice boundaries. 
 30   
 31  For an inclusive end coordinate, we need to use ``end = start + size - 1``. 
 32  A 1-column wide alignment would have ``start == end``. 
 33  """ 
 34  import os 
 35  from itertools import islice 
 36   
 37  try: 
 38      from sqlite3 import dbapi2 as _sqlite 
 39  except ImportError: 
 40      # Not present on Jython, but should be included in Python 2.5 
 41      # or later (unless compiled from source without its dependencies) 
 42      # Still want to offer simple parsing/output 
 43      _sqlite = None 
 44   
 45  from Bio.Alphabet import single_letter_alphabet 
 46  from Bio.Seq import Seq 
 47  from Bio.SeqRecord import SeqRecord 
 48  from Bio.Align import MultipleSeqAlignment 
 49  from .Interfaces import SequentialAlignmentWriter 
 50   
 51  MAFINDEX_VERSION = 1 
52 53 54 -class MafWriter(SequentialAlignmentWriter):
55 """Accepts a MultipleSeqAlignment object, writes a MAF file.""" 56
57 - def write_header(self):
58 """Write the MAF header.""" 59 self.handle.write("##maf version=1 scoring=none\n") 60 self.handle.write("# generated by Biopython\n\n")
61
62 - def _write_record(self, record):
63 """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE).""" 64 # convert biopython-style 1/-1 strand to MAF-style +/- strand 65 if record.annotations.get("strand") == 1: 66 strand = "+" 67 elif record.annotations.get("strand") == -1: 68 strand = "-" 69 else: 70 # TODO: issue warning? 71 strand = "+" 72 73 fields = ["s", 74 # In the MAF file format, spaces are not allowed in the id 75 "%-40s" % record.id.replace(" ", "_"), 76 "%15s" % record.annotations.get("start", 0), 77 "%5s" % record.annotations.get("size", len(str(record.seq).replace("-", ""))), 78 strand, 79 "%15s" % record.annotations.get("srcSize", 0), 80 str(record.seq)] 81 self.handle.write("%s\n" % " ".join(fields))
82
83 - def write_alignment(self, alignment):
84 """Write a complete alignment to a MAF block. 85 86 Writes every SeqRecord in a MultipleSeqAlignment object to its own 87 MAF block (beginning with an 'a' line, containing 's' lines). 88 """ 89 if not isinstance(alignment, MultipleSeqAlignment): 90 raise TypeError("Expected an alignment object") 91 92 if len(set([len(x) for x in alignment])) > 1: 93 raise ValueError("Sequences must all be the same length") 94 95 # We allow multiple sequences with the same IDs; for example, there may 96 # be a MAF aligning the + and - strands of the same sequence together. 97 98 # for now, use ._annotations private property, but restrict keys to those 99 # specifically supported by the MAF format, according to spec 100 try: 101 anno = " ".join(["%s=%s" % (x, y) 102 for x, y in alignment._annotations.items() 103 if x in ("score", "pass")]) 104 except AttributeError: 105 anno = "score=0.00" 106 107 self.handle.write("a %s\n" % (anno,)) 108 109 recs_out = 0 110 111 for record in alignment: 112 self._write_record(record) 113 114 recs_out += 1 115 116 self.handle.write("\n") 117 118 return recs_out
119
120 121 # Invalid function name according to pylint, but kept for compatibility 122 # with Bio* conventions. 123 -def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
124 """Iterate over a MAF file handle as MultipleSeqAlignment objects. 125 126 Iterates over lines in a MAF file-like object (handle), yielding 127 MultipleSeqAlignment objects. SeqRecord IDs generally correspond to 128 species names. 129 """ 130 in_a_bundle = False 131 132 annotations = [] 133 records = [] 134 135 while True: 136 # allows parsing of the last bundle without duplicating code 137 try: 138 line = next(handle) 139 except StopIteration: 140 line = "" 141 142 if in_a_bundle: 143 if line.startswith("s"): 144 # add a SeqRecord to the bundle 145 line_split = line.strip().split() 146 147 if len(line_split) != 7: 148 raise ValueError("Error parsing alignment - 's' line must have 7 fields") 149 150 # convert MAF-style +/- strand to biopython-type 1/-1 151 if line_split[4] == "+": 152 strand = 1 153 elif line_split[4] == "-": 154 strand = -1 155 else: 156 # TODO: issue warning, set to 0? 157 strand = 1 158 159 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 160 anno = {"start": int(line_split[2]), 161 "size": int(line_split[3]), 162 "strand": strand, 163 "srcSize": int(line_split[5])} 164 165 sequence = line_split[6] 166 167 # interpret a dot/period to mean the same as the first sequence 168 if "." in sequence: 169 if not records: 170 raise ValueError("Found dot/period in first sequence of alignment") 171 172 ref = str(records[0].seq) 173 new = [] 174 175 for (letter, ref_letter) in zip(sequence, ref): 176 new.append(ref_letter if letter == "." else letter) 177 178 sequence = "".join(new) 179 180 records.append(SeqRecord(Seq(sequence, alphabet), 181 id=line_split[1], 182 name=line_split[1], 183 description="", 184 annotations=anno)) 185 elif line.startswith("i"): 186 # TODO: information about what is in the aligned species DNA before 187 # and after the immediately preceding "s" line 188 pass 189 elif line.startswith("e"): 190 # TODO: information about the size of the gap between the alignments 191 # that span the current block 192 pass 193 elif line.startswith("q"): 194 # TODO: quality of each aligned base for the species. 195 # Need to find documentation on this, looks like ASCII 0-9 or gap? 196 # Can then store in each SeqRecord's .letter_annotations dictionary, 197 # perhaps as the raw string or turned into integers / None for gap? 198 pass 199 elif line.startswith("#"): 200 # ignore comments 201 # (not sure whether comments 202 # are in the maf specification, though) 203 pass 204 elif not line.strip(): 205 # end a bundle of records 206 if seq_count is not None: 207 assert len(records) == seq_count 208 209 alignment = MultipleSeqAlignment(records, alphabet) 210 # TODO - Introduce an annotated alignment class? 211 # See also Bio/AlignIO/FastaIO.py for same requirement. 212 # For now, store the annotation a new private property: 213 alignment._annotations = annotations 214 215 yield alignment 216 217 in_a_bundle = False 218 219 annotations = [] 220 records = [] 221 else: 222 raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,)) 223 elif line.startswith("a"): 224 # start a bundle of records 225 in_a_bundle = True 226 annot_strings = line.strip().split()[1:] 227 if len(annot_strings) != line.count("="): 228 raise ValueError("Error parsing alignment - invalid key in 'a' line") 229 annotations = dict([a_string.split("=") for a_string in annot_strings]) 230 elif line.startswith("#"): 231 # ignore comments 232 pass 233 elif not line: 234 break
235
236 237 -class MafIndex(object):
238 """Index for a MAF file. 239 240 The index is a sqlite3 database that is built upon creation of the object 241 if necessary, and queried when methods *search* or *get_spliced* are 242 used. 243 """ 244
245 - def __init__(self, sqlite_file, maf_file, target_seqname):
246 """Indexes or loads the index of a MAF file.""" 247 self._target_seqname = target_seqname 248 # example: Tests/MAF/ucsc_mm9_chr10.mafindex 249 self._index_filename = sqlite_file 250 # example: /home/bli/src/biopython/Tests/MAF 251 self._relative_path = os.path.abspath(os.path.dirname(sqlite_file)) 252 # example: Tests/MAF/ucsc_mm9_chr10.maf 253 self._maf_file = maf_file 254 255 self._maf_fp = open(self._maf_file, "r") 256 257 # if sqlite_file exists, use the existing db, otherwise index the file 258 if os.path.isfile(sqlite_file): 259 self._con = _sqlite.connect(sqlite_file) 260 self._record_count = self.__check_existing_db() 261 else: 262 self._con = _sqlite.connect(sqlite_file) 263 self._record_count = self.__make_new_index() 264 265 # lastly, setup a MafIterator pointing at the open maf_file 266 self._mafiter = MafIterator(self._maf_fp)
267
268 - def __check_existing_db(self):
269 """Perform basic sanity checks upon loading an existing index (PRIVATE).""" 270 try: 271 idx_version = int(self._con.execute( 272 "SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0]) 273 if idx_version != MAFINDEX_VERSION: 274 raise ValueError("Index version (%s) incompatible with " 275 "this version of MafIndex" % idx_version) 276 277 filename = self._con.execute( 278 "SELECT value FROM meta_data WHERE key = 'filename'").fetchone()[0] 279 # Compute absolute path of the original maf file 280 if os.path.isabs(filename): 281 # It was already stored as absolute 282 tmp_mafpath = filename 283 else: 284 # It should otherwise have been stored as relative to the index 285 # Would be stored with Unix / path separator, so convert 286 # it to the local OS path separator here: 287 tmp_mafpath = os.path.join( 288 self._relative_path, filename.replace("/", os.path.sep)) 289 if tmp_mafpath != os.path.abspath(self._maf_file): 290 # Original and given absolute paths differ. 291 raise ValueError("Index uses a different file (%s != %s)" 292 % (filename, self._maf_file)) 293 294 db_target = self._con.execute( 295 "SELECT value FROM meta_data WHERE key = 'target_seqname'").fetchone()[0] 296 if db_target != self._target_seqname: 297 raise ValueError("Provided database indexed for %s, expected %s" 298 % (db_target, self._target_seqname)) 299 300 record_count = int(self._con.execute( 301 "SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0]) 302 if record_count == -1: 303 raise ValueError("Unfinished/partial database provided") 304 305 records_found = int(self._con.execute( 306 "SELECT COUNT(*) FROM offset_data").fetchone()[0]) 307 if records_found != record_count: 308 raise ValueError("Expected %s records, found %s. Corrupt index?" 309 % (record_count, records_found)) 310 311 return records_found 312 313 except (_sqlite.OperationalError, _sqlite.DatabaseError) as err: 314 raise ValueError("Problem with SQLite database: %s" % err)
315
316 - def __make_new_index(self):
317 """Read MAF file and generate SQLite index (PRIVATE).""" 318 # make the tables 319 self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") 320 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', 1);") 321 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);") 322 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % 323 (self._target_seqname,)) 324 # Determine whether to store maf file as relative to the index or absolute 325 # See https://github.com/biopython/biopython/pull/381 326 if not os.path.isabs(self._maf_file) and not os.path.isabs(self._index_filename): 327 # Since the user gave both maf file and index as relative paths, 328 # we will store the maf file relative to the index. 329 # Note for cross platform use (e.g. shared drive over SAMBA), 330 # convert any Windows slash into Unix style for rel paths. 331 # example: ucsc_mm9_chr10.maf 332 mafpath = os.path.relpath( 333 self._maf_file, self._relative_path).replace(os.path.sep, "/") 334 elif (os.path.dirname(os.path.abspath(self._maf_file)) + 335 os.path.sep).startswith(self._relative_path + os.path.sep): 336 # Since maf file is in same directory or sub directory, 337 # might as well make this into a relative path: 338 mafpath = os.path.relpath( 339 self._maf_file, self._relative_path).replace(os.path.sep, "/") 340 else: 341 # Default to storing as an absolute path 342 # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf 343 mafpath = os.path.abspath(self._maf_file) 344 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % 345 (mafpath,)) 346 self._con.execute("CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);") 347 348 insert_count = 0 349 350 # iterate over the entire file and insert in batches 351 mafindex_func = self.__maf_indexer() 352 353 while True: 354 batch = list(islice(mafindex_func, 100)) 355 if not batch: 356 break 357 358 # batch is made from self.__maf_indexer(), 359 self._con.executemany( 360 "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch) 361 self._con.commit() 362 insert_count += len(batch) 363 364 # then make indexes on the relevant fields 365 self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);") 366 self._con.execute("CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);") 367 self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);") 368 369 self._con.execute( 370 "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" % (insert_count,)) 371 372 self._con.commit() 373 374 return insert_count
375
376 - def __maf_indexer(self):
377 """Return index information for each bundle (PRIVATE). 378 379 Yields index information for each bundle in the form of 380 (bin, start, end, offset) tuples where start and end are 381 0-based inclusive coordinates. 382 """ 383 line = self._maf_fp.readline() 384 385 while line: 386 if line.startswith("a"): 387 # note the offset 388 offset = self._maf_fp.tell() - len(line) 389 390 # search the following lines for a match to target_seqname 391 while True: 392 line = self._maf_fp.readline() 393 394 if not line.strip() or line.startswith("a"): 395 # Empty line or new alignment record 396 raise ValueError("Target for indexing (%s) not found in this bundle" 397 % (self._target_seqname,)) 398 elif line.startswith("s"): 399 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 400 line_split = line.strip().split() 401 402 if line_split[1] == self._target_seqname: 403 start = int(line_split[2]) 404 end = int(line_split[2]) + int(line_split[3]) 405 406 if end - start != len(line_split[6].replace("-", "")): 407 raise ValueError("Invalid length for target coordinates (expected %s, found %s)" % 408 (end - start, len(line_split[6].replace("-", "")))) 409 410 yield (self._ucscbin(start, end), start, end, offset) 411 412 break 413 414 line = self._maf_fp.readline()
415 416 # TODO: check coordinate correctness for the two bin-related static methods 417 @staticmethod
418 - def _region2bin(start, end):
419 """Find bins that a region may belong to (PRIVATE). 420 421 Converts a region to a list of bins that it may belong to, including largest 422 and smallest bins. 423 """ 424 bins = [0, 1] 425 426 bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26))) 427 bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23))) 428 bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20))) 429 bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17))) 430 431 return set(bins)
432 433 @staticmethod
434 - def _ucscbin(start, end):
435 """Return the smallest bin a given region will fit into (PRIVATE). 436 437 Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system 438 """ 439 bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] 440 441 _bin_first_shift = 17 442 _bin_next_shift = 3 443 444 start_bin = start 445 end_bin = end - 1 446 447 start_bin >>= _bin_first_shift 448 end_bin >>= _bin_first_shift 449 450 for bin_offset in bin_offsets: 451 if start_bin == end_bin: 452 return bin_offset + start_bin 453 start_bin >>= _bin_next_shift 454 end_bin >>= _bin_next_shift 455 456 return 0
457
458 - def _get_record(self, offset):
459 """Retrieve a single MAF record located at the offset provided (PRIVATE).""" 460 self._maf_fp.seek(offset) 461 return next(self._mafiter)
462
463 - def search(self, starts, ends):
464 """Search index database for MAF records overlapping ranges provided. 465 466 Returns *MultipleSeqAlignment* results in order by start, then end, then 467 internal offset field. 468 469 *starts* should be a list of 0-based start coordinates of segments in the reference. 470 *ends* should be the list of the corresponding segment ends 471 (in the half-open UCSC convention: 472 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 473 """ 474 # verify the provided exon coordinates 475 if len(starts) != len(ends): 476 raise ValueError("Every position in starts must have a match in ends") 477 478 for exonstart, exonend in zip(starts, ends): 479 if exonstart >= exonend: 480 raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend)) 481 con = self._con 482 483 # Keep track of what blocks have already been yielded 484 # in order to avoid duplicating them 485 # (see https://github.com/biopython/biopython/issues/1083) 486 yielded_rec_coords = set([]) 487 # search for every exon 488 for exonstart, exonend in zip(starts, ends): 489 try: 490 possible_bins = ", ".join(map(str, self._region2bin(exonstart, exonend))) 491 except TypeError: 492 raise TypeError("Exon coordinates must be integers " 493 "(start=%d, end=%d)" % (exonstart, exonend)) 494 495 # https://www.sqlite.org/lang_expr.html 496 # ----- 497 # The BETWEEN operator 498 # 499 # The BETWEEN operator is logically equivalent to a pair of 500 # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z" 501 # except that with BETWEEN, the x expression is only evaluated 502 # once. The precedence of the BETWEEN operator is the same as the 503 # precedence as operators == and != and LIKE and groups left to 504 # right. 505 # ----- 506 507 result = con.execute("SELECT DISTINCT start, end, offset FROM " 508 "offset_data WHERE bin IN (%s) AND (end BETWEEN %s AND %s " 509 "OR %s BETWEEN start AND end) ORDER BY start, end, " 510 "offset ASC;" 511 % (possible_bins, exonstart, exonend, exonend)) 512 513 rows = result.fetchall() 514 515 for rec_start, rec_end, offset in rows: 516 # Avoid yielding multiple time the same block 517 if (rec_start, rec_end) in yielded_rec_coords: 518 continue 519 else: 520 yielded_rec_coords.add((rec_start, rec_end)) 521 # Iterate through hits, fetching alignments from the MAF file 522 # and checking to be sure we've retrieved the expected record. 523 524 fetched = self._get_record(int(offset)) 525 526 for record in fetched: 527 if record.id == self._target_seqname: 528 # start and size come from the maf lines 529 start = record.annotations["start"] 530 end = record.annotations["start"] + record.annotations["size"] 531 532 if not (start == rec_start and end == rec_end): 533 raise ValueError("Expected %s-%s @ offset %s, found %s-%s" % 534 (rec_start, rec_end, offset, start, end)) 535 536 yield fetched
537
538 - def get_spliced(self, starts, ends, strand=1):
539 """Return a multiple alignment of the exact sequence range provided. 540 541 Accepts two lists of start and end positions on target_seqname, representing 542 exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the 543 desired sequences spliced together. 544 545 *starts* should be a list of 0-based start coordinates of segments in the reference. 546 *ends* should be the list of the corresponding segment ends 547 (in the half-open UCSC convention: 548 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 549 550 To ask for the alignment portion corresponding to the first 100 551 nucleotides of the reference sequence, you would use 552 ``search([0], [100])`` 553 """ 554 # validate strand 555 if strand not in (1, -1): 556 raise ValueError("Strand must be 1 or -1, got %s" % str(strand)) 557 558 # pull all alignments that span the desired intervals 559 fetched = [multiseq for multiseq in self.search(starts, ends)] 560 561 # keep track of the expected letter count 562 # (sum of lengths of [start, end) segments, 563 # where [start, end) half-open) 564 expected_letters = sum([end - start for start, end in zip(starts, ends)]) 565 566 # if there's no alignment, return filler for the assembly of the length given 567 if len(fetched) == 0: 568 return MultipleSeqAlignment([SeqRecord(Seq("N" * expected_letters), 569 id=self._target_seqname)]) 570 571 # find the union of all IDs in these alignments 572 all_seqnames = set( 573 [sequence.id for multiseq in fetched for sequence in multiseq]) 574 575 # split every record by base position 576 # key: sequence name 577 # value: dictionary 578 # key: position in the reference sequence 579 # value: letter(s) (including letters 580 # aligned to the "-" preceding the letter 581 # at the position in the reference, if any) 582 split_by_position = dict([(seq_name, {}) for seq_name in all_seqnames]) 583 584 # keep track of what the total number of (unspliced) letters should be 585 total_rec_length = 0 586 587 # track first strand encountered on the target seqname 588 ref_first_strand = None 589 590 for multiseq in fetched: 591 # find the target_seqname in this MultipleSeqAlignment and use it to 592 # set the parameters for the rest of this iteration 593 for seqrec in multiseq: 594 if seqrec.id == self._target_seqname: 595 try: 596 if ref_first_strand is None: 597 ref_first_strand = seqrec.annotations["strand"] 598 599 if ref_first_strand not in (1, -1): 600 raise ValueError("Strand must be 1 or -1") 601 elif ref_first_strand != seqrec.annotations["strand"]: 602 raise ValueError("Encountered strand='%s' on target seqname, " 603 "expected '%s'" % 604 (seqrec.annotations["strand"], ref_first_strand)) 605 except KeyError: 606 raise ValueError("No strand information for target seqname (%s)" % 607 self._target_seqname) 608 # length including gaps (i.e. alignment length) 609 rec_length = len(seqrec) 610 rec_start = seqrec.annotations["start"] 611 rec_end = seqrec.annotations["start"] + seqrec.annotations["size"] 612 613 total_rec_length += rec_end - rec_start 614 615 # blank out these positions for every seqname 616 for seqrec in multiseq: 617 for pos in range(rec_start, rec_end): 618 split_by_position[seqrec.id][pos] = "" 619 620 break 621 else: 622 raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,)) 623 624 # the true, chromosome/contig/etc position in the target seqname 625 real_pos = rec_start 626 627 # loop over the alignment to fill split_by_position 628 for gapped_pos in range(0, rec_length): 629 for seqrec in multiseq: 630 # keep track of this position's value for the target seqname 631 if seqrec.id == self._target_seqname: 632 track_val = seqrec.seq[gapped_pos] 633 634 # Here, a real_pos that corresponds to just after a series of "-" 635 # in the reference will "accumulate" the letters found in other sequences 636 # in front of the "-"s 637 split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos] 638 639 # increment the real_pos counter only when non-gaps are found in 640 # the target_seqname, and we haven't reached the end of the record 641 if track_val != "-" and real_pos < rec_end - 1: 642 real_pos += 1 643 644 # make sure the number of bp entries equals the sum of the record lengths 645 if len(split_by_position[self._target_seqname]) != total_rec_length: 646 raise ValueError("Target seqname (%s) has %s records, expected %s" % 647 (self._target_seqname, 648 len(split_by_position[self._target_seqname]), 649 total_rec_length)) 650 651 # translates a position in the target_seqname sequence to its gapped length 652 realpos_to_len = dict([(x, len(y)) 653 for x, y in split_by_position[self._target_seqname].items() 654 if len(y) > 1]) 655 656 # splice together the exons 657 subseq = {} 658 659 for seqid in all_seqnames: 660 seq_split = split_by_position[seqid] 661 seq_splice = [] 662 663 filler_char = "N" if seqid == self._target_seqname else "-" 664 665 # iterate from start to end, taking bases from split_by_position when 666 # they exist, using N or - for gaps when there is no alignment. 667 append = seq_splice.append 668 669 for exonstart, exonend in zip(starts, ends): 670 for real_pos in range(exonstart, exonend): 671 # if this seqname has this position, add it 672 if real_pos in seq_split: 673 append(seq_split[real_pos]) 674 # if not, but it's in the target_seqname, add length-matched filler 675 elif real_pos in realpos_to_len: 676 append(filler_char * realpos_to_len[real_pos]) 677 # it's not in either, so add a single filler character 678 else: 679 append(filler_char) 680 681 subseq[seqid] = "".join(seq_splice) 682 683 # make sure we're returning the right number of letters 684 if len(subseq[self._target_seqname].replace("-", "")) != expected_letters: 685 raise ValueError("Returning %s letters for target seqname (%s), expected %s" % 686 (len(subseq[self._target_seqname].replace("-", "")), 687 self._target_seqname, expected_letters)) 688 689 # check to make sure all sequences are the same length as the target seqname 690 ref_subseq_len = len(subseq[self._target_seqname]) 691 692 for seqid, seq in subseq.items(): 693 if len(seq) != ref_subseq_len: 694 raise ValueError("Returning length %s for %s, expected %s" % 695 (len(seq), seqid, ref_subseq_len)) 696 697 # finally, build a MultipleSeqAlignment object for our final sequences 698 result_multiseq = [] 699 700 for seqid, seq in subseq.items(): 701 seq = Seq(seq) 702 703 seq = seq if strand == ref_first_strand else seq.reverse_complement() 704 705 result_multiseq.append(SeqRecord(seq, 706 id=seqid, 707 name=seqid, 708 description="")) 709 710 return MultipleSeqAlignment(result_multiseq)
711
712 - def __repr__(self):
713 return "MafIO.MafIndex(%r, target_seqname=%r)" % (self._maf_fp.name, 714 self._target_seqname)
715
716 - def __len__(self):
717 """Return the number of records in the index.""" 718 return self._record_count
719