Package Bio :: Package AlignIO :: Module MafIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.MafIO

  1  # Copyright 2011, 2012 by Andrew Sczesnak.  All rights reserved. 
  2  # Revisions Copyright 2011, 2017 by Peter Cock.  All rights reserved. 
  3  # Revisions Copyright 2014, 2015 by Adam Novak.  All rights reserved. 
  4  # Revisions Copyright 2015, 2017 by Blaise Li.  All rights reserved. 
  5  # 
  6  # This file is part of the Biopython distribution and governed by your 
  7  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  8  # Please see the LICENSE file that should have been included as part of this 
  9  # package. 
 10  """Bio.AlignIO support for the "maf" multiple alignment format. 
 11   
 12  The Multiple Alignment Format, described by UCSC, stores a series of 
 13  multiple alignments in a single file. It is suitable for whole-genome 
 14  to whole-genome alignments, metadata such as source chromosome, start 
 15  position, size, and strand can be stored. 
 16   
 17  See http://genome.ucsc.edu/FAQ/FAQformat.html#format5 
 18   
 19  You are expected to use this module via the Bio.AlignIO functions(or the 
 20  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 21   
 22  Coordinates in the MAF format are defined in terms of zero-based start 
 23  positions (like Python) and aligning region sizes. 
 24   
 25  A minimal aligned region of length one and starting at first position in the 
 26  source sequence would have ``start == 0`` and ``size == 1``. 
 27   
 28  As we can see on this example, ``start + size`` will give one more than the 
 29  zero-based end position. We can therefore manipulate ``start`` and 
 30  ``start + size`` as python list slice boundaries. 
 31   
 32  For an inclusive end coordinate, we need to use ``end = start + size - 1``. 
 33  A 1-column wide alignment would have ``start == end``. 
 34  """ 
 35  import os 
 36  from itertools import islice 
 37   
 38  try: 
 39      from sqlite3 import dbapi2 as _sqlite 
 40  except ImportError: 
 41      # Not present on Jython, but should be included in Python 2.5 
 42      # or later (unless compiled from source without its dependencies) 
 43      # Still want to offer simple parsing/output 
 44      _sqlite = None 
 45   
 46  from Bio.Alphabet import single_letter_alphabet 
 47  from Bio.Seq import Seq 
 48  from Bio.SeqRecord import SeqRecord 
 49  from Bio.Align import MultipleSeqAlignment 
 50  from .Interfaces import SequentialAlignmentWriter 
 51   
 52  MAFINDEX_VERSION = 2 
53 54 55 -class MafWriter(SequentialAlignmentWriter):
56 """Accepts a MultipleSeqAlignment object, writes a MAF file.""" 57
58 - def write_header(self):
59 """Write the MAF header.""" 60 self.handle.write("##maf version=1 scoring=none\n") 61 self.handle.write("# generated by Biopython\n\n")
62
63 - def _write_record(self, record):
64 """Write a single SeqRecord object to an 's' line in a MAF block (PRIVATE).""" 65 # convert biopython-style 1/-1 strand to MAF-style +/- strand 66 if record.annotations.get("strand") == 1: 67 strand = "+" 68 elif record.annotations.get("strand") == -1: 69 strand = "-" 70 else: 71 # TODO: issue warning? 72 strand = "+" 73 74 fields = ["s", 75 # In the MAF file format, spaces are not allowed in the id 76 "%-40s" % record.id.replace(" ", "_"), 77 "%15s" % record.annotations.get("start", 0), 78 "%5s" % record.annotations.get("size", len(str(record.seq).replace("-", ""))), 79 strand, 80 "%15s" % record.annotations.get("srcSize", 0), 81 str(record.seq)] 82 self.handle.write("%s\n" % " ".join(fields))
83
84 - def write_alignment(self, alignment):
85 """Write a complete alignment to a MAF block. 86 87 Writes every SeqRecord in a MultipleSeqAlignment object to its own 88 MAF block (beginning with an 'a' line, containing 's' lines). 89 """ 90 if not isinstance(alignment, MultipleSeqAlignment): 91 raise TypeError("Expected an alignment object") 92 93 if len(set([len(x) for x in alignment])) > 1: 94 raise ValueError("Sequences must all be the same length") 95 96 # We allow multiple sequences with the same IDs; for example, there may 97 # be a MAF aligning the + and - strands of the same sequence together. 98 99 # for now, use ._annotations private property, but restrict keys to those 100 # specifically supported by the MAF format, according to spec 101 try: 102 anno = " ".join(["%s=%s" % (x, y) 103 for x, y in alignment._annotations.items() 104 if x in ("score", "pass")]) 105 except AttributeError: 106 anno = "score=0.00" 107 108 self.handle.write("a %s\n" % (anno,)) 109 110 recs_out = 0 111 112 for record in alignment: 113 self._write_record(record) 114 115 recs_out += 1 116 117 self.handle.write("\n") 118 119 return recs_out
120
121 122 # Invalid function name according to pylint, but kept for compatibility 123 # with Bio* conventions. 124 -def MafIterator(handle, seq_count=None, alphabet=single_letter_alphabet):
125 """Iterate over a MAF file handle as MultipleSeqAlignment objects. 126 127 Iterates over lines in a MAF file-like object (handle), yielding 128 MultipleSeqAlignment objects. SeqRecord IDs generally correspond to 129 species names. 130 """ 131 in_a_bundle = False 132 133 annotations = [] 134 records = [] 135 136 while True: 137 # allows parsing of the last bundle without duplicating code 138 try: 139 line = next(handle) 140 except StopIteration: 141 line = "" 142 143 if in_a_bundle: 144 if line.startswith("s"): 145 # add a SeqRecord to the bundle 146 line_split = line.strip().split() 147 148 if len(line_split) != 7: 149 raise ValueError("Error parsing alignment - 's' line must have 7 fields") 150 151 # convert MAF-style +/- strand to biopython-type 1/-1 152 if line_split[4] == "+": 153 strand = 1 154 elif line_split[4] == "-": 155 strand = -1 156 else: 157 # TODO: issue warning, set to 0? 158 strand = 1 159 160 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 161 anno = {"start": int(line_split[2]), 162 "size": int(line_split[3]), 163 "strand": strand, 164 "srcSize": int(line_split[5])} 165 166 sequence = line_split[6] 167 168 # interpret a dot/period to mean the same as the first sequence 169 if "." in sequence: 170 if not records: 171 raise ValueError("Found dot/period in first sequence of alignment") 172 173 ref = str(records[0].seq) 174 new = [] 175 176 for (letter, ref_letter) in zip(sequence, ref): 177 new.append(ref_letter if letter == "." else letter) 178 179 sequence = "".join(new) 180 181 records.append(SeqRecord(Seq(sequence, alphabet), 182 id=line_split[1], 183 name=line_split[1], 184 description="", 185 annotations=anno)) 186 elif line.startswith("i"): 187 # TODO: information about what is in the aligned species DNA before 188 # and after the immediately preceding "s" line 189 pass 190 elif line.startswith("e"): 191 # TODO: information about the size of the gap between the alignments 192 # that span the current block 193 pass 194 elif line.startswith("q"): 195 # TODO: quality of each aligned base for the species. 196 # Need to find documentation on this, looks like ASCII 0-9 or gap? 197 # Can then store in each SeqRecord's .letter_annotations dictionary, 198 # perhaps as the raw string or turned into integers / None for gap? 199 pass 200 elif line.startswith("#"): 201 # ignore comments 202 # (not sure whether comments 203 # are in the maf specification, though) 204 pass 205 elif not line.strip(): 206 # end a bundle of records 207 if seq_count is not None: 208 assert len(records) == seq_count 209 210 alignment = MultipleSeqAlignment(records, alphabet) 211 # TODO - Introduce an annotated alignment class? 212 # See also Bio/AlignIO/FastaIO.py for same requirement. 213 # For now, store the annotation a new private property: 214 alignment._annotations = annotations 215 216 yield alignment 217 218 in_a_bundle = False 219 220 annotations = [] 221 records = [] 222 else: 223 raise ValueError("Error parsing alignment - unexpected line:\n%s" % (line,)) 224 elif line.startswith("a"): 225 # start a bundle of records 226 in_a_bundle = True 227 annot_strings = line.strip().split()[1:] 228 if len(annot_strings) != line.count("="): 229 raise ValueError("Error parsing alignment - invalid key in 'a' line") 230 annotations = dict([a_string.split("=") for a_string in annot_strings]) 231 elif line.startswith("#"): 232 # ignore comments 233 pass 234 elif not line: 235 break
236
237 238 -class MafIndex(object):
239 """Index for a MAF file. 240 241 The index is a sqlite3 database that is built upon creation of the object 242 if necessary, and queried when methods *search* or *get_spliced* are 243 used. 244 """ 245
246 - def __init__(self, sqlite_file, maf_file, target_seqname):
247 """Indexes or loads the index of a MAF file.""" 248 self._target_seqname = target_seqname 249 # example: Tests/MAF/ucsc_mm9_chr10.mafindex 250 self._index_filename = sqlite_file 251 # example: /home/bli/src/biopython/Tests/MAF 252 self._relative_path = os.path.abspath(os.path.dirname(sqlite_file)) 253 # example: Tests/MAF/ucsc_mm9_chr10.maf 254 self._maf_file = maf_file 255 256 self._maf_fp = open(self._maf_file, "r") 257 258 # if sqlite_file exists, use the existing db, otherwise index the file 259 if os.path.isfile(sqlite_file): 260 self._con = _sqlite.connect(sqlite_file) 261 self._record_count = self.__check_existing_db() 262 else: 263 self._con = _sqlite.connect(sqlite_file) 264 self._record_count = self.__make_new_index() 265 266 # lastly, setup a MafIterator pointing at the open maf_file 267 self._mafiter = MafIterator(self._maf_fp)
268
269 - def __check_existing_db(self):
270 """Perform basic sanity checks upon loading an existing index (PRIVATE).""" 271 try: 272 idx_version = int(self._con.execute( 273 "SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0]) 274 if idx_version != MAFINDEX_VERSION: 275 msg = "\n".join([ 276 "Index version (%s) incompatible with this version " 277 "of MafIndex" % idx_version, 278 "You might erase the existing index %s " 279 "for it to be rebuilt." % self._index_filename]) 280 raise ValueError(msg) 281 282 filename = self._con.execute( 283 "SELECT value FROM meta_data WHERE key = 'filename'").fetchone()[0] 284 # Compute absolute path of the original maf file 285 if os.path.isabs(filename): 286 # It was already stored as absolute 287 tmp_mafpath = filename 288 else: 289 # It should otherwise have been stored as relative to the index 290 # Would be stored with Unix / path separator, so convert 291 # it to the local OS path separator here: 292 tmp_mafpath = os.path.join( 293 self._relative_path, filename.replace("/", os.path.sep)) 294 if tmp_mafpath != os.path.abspath(self._maf_file): 295 # Original and given absolute paths differ. 296 raise ValueError("Index uses a different file (%s != %s)" 297 % (filename, self._maf_file)) 298 299 db_target = self._con.execute( 300 "SELECT value FROM meta_data WHERE key = 'target_seqname'").fetchone()[0] 301 if db_target != self._target_seqname: 302 raise ValueError("Provided database indexed for %s, expected %s" 303 % (db_target, self._target_seqname)) 304 305 record_count = int(self._con.execute( 306 "SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0]) 307 if record_count == -1: 308 raise ValueError("Unfinished/partial database provided") 309 310 records_found = int(self._con.execute( 311 "SELECT COUNT(*) FROM offset_data").fetchone()[0]) 312 if records_found != record_count: 313 raise ValueError("Expected %s records, found %s. Corrupt index?" 314 % (record_count, records_found)) 315 316 return records_found 317 318 except (_sqlite.OperationalError, _sqlite.DatabaseError) as err: 319 raise ValueError("Problem with SQLite database: %s" % err)
320
321 - def __make_new_index(self):
322 """Read MAF file and generate SQLite index (PRIVATE).""" 323 # make the tables 324 self._con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);") 325 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('version', %s);" % MAFINDEX_VERSION) 326 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('record_count', -1);") 327 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % 328 (self._target_seqname,)) 329 # Determine whether to store maf file as relative to the index or absolute 330 # See https://github.com/biopython/biopython/pull/381 331 if not os.path.isabs(self._maf_file) and not os.path.isabs(self._index_filename): 332 # Since the user gave both maf file and index as relative paths, 333 # we will store the maf file relative to the index. 334 # Note for cross platform use (e.g. shared drive over SAMBA), 335 # convert any Windows slash into Unix style for rel paths. 336 # example: ucsc_mm9_chr10.maf 337 mafpath = os.path.relpath( 338 self._maf_file, self._relative_path).replace(os.path.sep, "/") 339 elif (os.path.dirname(os.path.abspath(self._maf_file)) + 340 os.path.sep).startswith(self._relative_path + os.path.sep): 341 # Since maf file is in same directory or sub directory, 342 # might as well make this into a relative path: 343 mafpath = os.path.relpath( 344 self._maf_file, self._relative_path).replace(os.path.sep, "/") 345 else: 346 # Default to storing as an absolute path 347 # example: /home/bli/src/biopython/Tests/MAF/ucsc_mm9_chr10.maf 348 mafpath = os.path.abspath(self._maf_file) 349 self._con.execute("INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % 350 (mafpath,)) 351 self._con.execute("CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);") 352 353 insert_count = 0 354 355 # iterate over the entire file and insert in batches 356 mafindex_func = self.__maf_indexer() 357 358 while True: 359 batch = list(islice(mafindex_func, 100)) 360 if not batch: 361 break 362 363 # batch is made from self.__maf_indexer(), 364 # which yields zero-based "inclusive" start and end coordinates 365 self._con.executemany( 366 "INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch) 367 self._con.commit() 368 insert_count += len(batch) 369 370 # then make indexes on the relevant fields 371 self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);") 372 self._con.execute("CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);") 373 self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);") 374 375 self._con.execute( 376 "UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" % (insert_count,)) 377 378 self._con.commit() 379 380 return insert_count
381
382 - def __maf_indexer(self):
383 """Return index information for each bundle (PRIVATE). 384 385 Yields index information for each bundle in the form of 386 (bin, start, end, offset) tuples where start and end are 387 0-based inclusive coordinates. 388 """ 389 line = self._maf_fp.readline() 390 391 while line: 392 if line.startswith("a"): 393 # note the offset 394 offset = self._maf_fp.tell() - len(line) 395 396 # search the following lines for a match to target_seqname 397 while True: 398 line = self._maf_fp.readline() 399 400 if not line.strip() or line.startswith("a"): 401 # Empty line or new alignment record 402 raise ValueError("Target for indexing (%s) not found in this bundle" 403 % (self._target_seqname,)) 404 elif line.startswith("s"): 405 # s (literal), src (ID), start, size, strand, srcSize, text (sequence) 406 line_split = line.strip().split() 407 408 if line_split[1] == self._target_seqname: 409 start = int(line_split[2]) 410 size = int(line_split[3]) 411 if size != len(line_split[6].replace("-", "")): 412 raise ValueError( 413 "Invalid length for target coordinates " 414 "(expected %s, found %s)" % ( 415 size, len(line_split[6].replace("-", "")))) 416 417 # "inclusive" end position is start + length - 1 418 end = start + size - 1 419 420 # _ucscbin takes end-exclusive coordinates 421 yield (self._ucscbin(start, end + 1), start, end, offset) 422 423 break 424 425 line = self._maf_fp.readline()
426 427 # TODO: check coordinate correctness for the two bin-related static methods 428 @staticmethod
429 - def _region2bin(start, end):
430 """Find bins that a region may belong to (PRIVATE). 431 432 Converts a region to a list of bins that it may belong to, including largest 433 and smallest bins. 434 """ 435 bins = [0, 1] 436 437 bins.extend(range(1 + (start >> 26), 2 + ((end - 1) >> 26))) 438 bins.extend(range(9 + (start >> 23), 10 + ((end - 1) >> 23))) 439 bins.extend(range(73 + (start >> 20), 74 + ((end - 1) >> 20))) 440 bins.extend(range(585 + (start >> 17), 586 + ((end - 1) >> 17))) 441 442 return set(bins)
443 444 @staticmethod
445 - def _ucscbin(start, end):
446 """Return the smallest bin a given region will fit into (PRIVATE). 447 448 Adapted from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system 449 """ 450 bin_offsets = [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0] 451 452 _bin_first_shift = 17 453 _bin_next_shift = 3 454 455 start_bin = start 456 end_bin = end - 1 457 458 start_bin >>= _bin_first_shift 459 end_bin >>= _bin_first_shift 460 461 for bin_offset in bin_offsets: 462 if start_bin == end_bin: 463 return bin_offset + start_bin 464 start_bin >>= _bin_next_shift 465 end_bin >>= _bin_next_shift 466 467 return 0
468
469 - def _get_record(self, offset):
470 """Retrieve a single MAF record located at the offset provided (PRIVATE).""" 471 self._maf_fp.seek(offset) 472 return next(self._mafiter)
473
474 - def search(self, starts, ends):
475 """Search index database for MAF records overlapping ranges provided. 476 477 Returns *MultipleSeqAlignment* results in order by start, then end, then 478 internal offset field. 479 480 *starts* should be a list of 0-based start coordinates of segments in the reference. 481 *ends* should be the list of the corresponding segment ends 482 (in the half-open UCSC convention: 483 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 484 """ 485 # verify the provided exon coordinates 486 if len(starts) != len(ends): 487 raise ValueError("Every position in starts must have a match in ends") 488 489 # Could it be safer to sort the (exonstart, exonend) pairs? 490 for exonstart, exonend in zip(starts, ends): 491 exonlen = exonend - exonstart 492 if exonlen < 1: 493 raise ValueError("Exon coordinates (%d, %d) invalid: exon length (%d) < 1" % ( 494 exonstart, exonend, exonlen)) 495 con = self._con 496 497 # Keep track of what blocks have already been yielded 498 # in order to avoid duplicating them 499 # (see https://github.com/biopython/biopython/issues/1083) 500 yielded_rec_coords = set([]) 501 # search for every exon 502 for exonstart, exonend in zip(starts, ends): 503 try: 504 possible_bins = ", ".join(map(str, self._region2bin(exonstart, exonend))) 505 except TypeError: 506 raise TypeError("Exon coordinates must be integers " 507 "(start=%d, end=%d)" % (exonstart, exonend)) 508 509 # https://www.sqlite.org/lang_expr.html 510 # ----- 511 # The BETWEEN operator 512 # 513 # The BETWEEN operator is logically equivalent to a pair of 514 # comparisons. "x BETWEEN y AND z" is equivalent to "x>=y AND x<=z" 515 # except that with BETWEEN, the x expression is only evaluated 516 # once. The precedence of the BETWEEN operator is the same as the 517 # precedence as operators == and != and LIKE and groups left to 518 # right. 519 # ----- 520 521 # We are testing overlap between the query segment and records in 522 # the index, using non-strict coordinates comparisons. 523 # The query segment end must be passed as end-inclusive 524 # The index should also have been build with end-inclusive 525 # end coordinates. 526 # See https://github.com/biopython/biopython/pull/1086#issuecomment-285069073 527 528 result = con.execute( 529 "SELECT DISTINCT start, end, offset FROM offset_data " 530 "WHERE bin IN (%s) " 531 "AND (end BETWEEN %s AND %s OR %s BETWEEN start AND end) " 532 "ORDER BY start, end, offset ASC;" % 533 (possible_bins, exonstart, exonend - 1, exonend - 1)) 534 535 rows = result.fetchall() 536 537 # rows come from the sqlite index, 538 # which should have been written using __make_new_index, 539 # so rec_start and rec_end should be zero-based "inclusive" coordinates 540 for rec_start, rec_end, offset in rows: 541 # Avoid yielding multiple time the same block 542 if (rec_start, rec_end) in yielded_rec_coords: 543 continue 544 else: 545 yielded_rec_coords.add((rec_start, rec_end)) 546 # Iterate through hits, fetching alignments from the MAF file 547 # and checking to be sure we've retrieved the expected record. 548 549 fetched = self._get_record(int(offset)) 550 551 for record in fetched: 552 if record.id == self._target_seqname: 553 # start and size come from the maf lines 554 start = record.annotations["start"] 555 # "inclusive" end is start + length - 1 556 end = start + record.annotations["size"] - 1 557 558 if not (start == rec_start and end == rec_end): 559 raise ValueError("Expected %s-%s @ offset %s, found %s-%s" % 560 (rec_start, rec_end, offset, start, end)) 561 562 yield fetched
563
564 - def get_spliced(self, starts, ends, strand=1):
565 """Return a multiple alignment of the exact sequence range provided. 566 567 Accepts two lists of start and end positions on target_seqname, representing 568 exons to be spliced in silico. Returns a *MultipleSeqAlignment* of the 569 desired sequences spliced together. 570 571 *starts* should be a list of 0-based start coordinates of segments in the reference. 572 *ends* should be the list of the corresponding segment ends 573 (in the half-open UCSC convention: 574 http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/). 575 576 To ask for the alignment portion corresponding to the first 100 577 nucleotides of the reference sequence, you would use 578 ``search([0], [100])`` 579 """ 580 # validate strand 581 if strand not in (1, -1): 582 raise ValueError("Strand must be 1 or -1, got %s" % str(strand)) 583 584 # pull all alignments that span the desired intervals 585 fetched = [multiseq for multiseq in self.search(starts, ends)] 586 587 # keep track of the expected letter count 588 # (sum of lengths of [start, end) segments, 589 # where [start, end) half-open) 590 expected_letters = sum([end - start for start, end in zip(starts, ends)]) 591 592 # if there's no alignment, return filler for the assembly of the length given 593 if len(fetched) == 0: 594 return MultipleSeqAlignment([SeqRecord(Seq("N" * expected_letters), 595 id=self._target_seqname)]) 596 597 # find the union of all IDs in these alignments 598 all_seqnames = set( 599 [sequence.id for multiseq in fetched for sequence in multiseq]) 600 601 # split every record by base position 602 # key: sequence name 603 # value: dictionary 604 # key: position in the reference sequence 605 # value: letter(s) (including letters 606 # aligned to the "-" preceding the letter 607 # at the position in the reference, if any) 608 split_by_position = dict([(seq_name, {}) for seq_name in all_seqnames]) 609 610 # keep track of what the total number of (unspliced) letters should be 611 total_rec_length = 0 612 613 # track first strand encountered on the target seqname 614 ref_first_strand = None 615 616 for multiseq in fetched: 617 # find the target_seqname in this MultipleSeqAlignment and use it to 618 # set the parameters for the rest of this iteration 619 for seqrec in multiseq: 620 if seqrec.id == self._target_seqname: 621 try: 622 if ref_first_strand is None: 623 ref_first_strand = seqrec.annotations["strand"] 624 625 if ref_first_strand not in (1, -1): 626 raise ValueError("Strand must be 1 or -1") 627 elif ref_first_strand != seqrec.annotations["strand"]: 628 raise ValueError("Encountered strand='%s' on target seqname, " 629 "expected '%s'" % 630 (seqrec.annotations["strand"], ref_first_strand)) 631 except KeyError: 632 raise ValueError("No strand information for target seqname (%s)" % 633 self._target_seqname) 634 # length including gaps (i.e. alignment length) 635 rec_length = len(seqrec) 636 rec_start = seqrec.annotations["start"] 637 ungapped_length = seqrec.annotations["size"] 638 # inclusive end in zero-based coordinates of the reference 639 rec_end = rec_start + ungapped_length - 1 640 # This is length in terms of actual letters in the reference 641 total_rec_length += ungapped_length 642 643 # blank out these positions for every seqname 644 for seqrec in multiseq: 645 for pos in range(rec_start, rec_end + 1): 646 split_by_position[seqrec.id][pos] = "" 647 648 break 649 # http://psung.blogspot.fr/2007/12/for-else-in-python.html 650 # https://docs.python.org/2/tutorial/controlflow.html#break-and-continue-statements-and-else-clauses-on-loops 651 else: 652 raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,)) 653 654 # the true, chromosome/contig/etc position in the target seqname 655 real_pos = rec_start 656 657 # loop over the alignment to fill split_by_position 658 for gapped_pos in range(0, rec_length): 659 for seqrec in multiseq: 660 # keep track of this position's value for the target seqname 661 if seqrec.id == self._target_seqname: 662 track_val = seqrec.seq[gapped_pos] 663 664 # Here, a real_pos that corresponds to just after a series of "-" 665 # in the reference will "accumulate" the letters found in other sequences 666 # in front of the "-"s 667 split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos] 668 669 # increment the real_pos counter only when non-gaps are found in 670 # the target_seqname, and we haven't reached the end of the record 671 if track_val != "-" and real_pos < rec_end: 672 real_pos += 1 673 674 # make sure the number of bp entries equals the sum of the record lengths 675 if len(split_by_position[self._target_seqname]) != total_rec_length: 676 raise ValueError("Target seqname (%s) has %s records, expected %s" % 677 (self._target_seqname, 678 len(split_by_position[self._target_seqname]), 679 total_rec_length)) 680 681 # translates a position in the target_seqname sequence to its gapped length 682 realpos_to_len = dict([(pos, len(gapped_fragment)) 683 for pos, gapped_fragment in split_by_position[self._target_seqname].items() 684 if len(gapped_fragment) > 1]) 685 686 # splice together the exons 687 subseq = {} 688 689 for seqid in all_seqnames: 690 seq_split = split_by_position[seqid] 691 seq_splice = [] 692 693 filler_char = "N" if seqid == self._target_seqname else "-" 694 695 # iterate from start to end, taking bases from split_by_position when 696 # they exist, using N or - for gaps when there is no alignment. 697 append = seq_splice.append 698 699 for exonstart, exonend in zip(starts, ends): 700 # exonend is exclusive 701 for real_pos in range(exonstart, exonend): 702 # if this seqname has this position, add it 703 if real_pos in seq_split: 704 append(seq_split[real_pos]) 705 # if not, but it's in the target_seqname, add length-matched filler 706 elif real_pos in realpos_to_len: 707 append(filler_char * realpos_to_len[real_pos]) 708 # it's not in either, so add a single filler character 709 else: 710 append(filler_char) 711 712 subseq[seqid] = "".join(seq_splice) 713 714 # make sure we're returning the right number of letters 715 if len(subseq[self._target_seqname].replace("-", "")) != expected_letters: 716 raise ValueError("Returning %s letters for target seqname (%s), expected %s" % 717 (len(subseq[self._target_seqname].replace("-", "")), 718 self._target_seqname, expected_letters)) 719 720 # check to make sure all sequences are the same length as the target seqname 721 ref_subseq_len = len(subseq[self._target_seqname]) 722 723 for seqid, seq in subseq.items(): 724 if len(seq) != ref_subseq_len: 725 raise ValueError("Returning length %s for %s, expected %s" % 726 (len(seq), seqid, ref_subseq_len)) 727 728 # finally, build a MultipleSeqAlignment object for our final sequences 729 result_multiseq = [] 730 731 for seqid, seq in subseq.items(): 732 seq = Seq(seq) 733 734 seq = seq if strand == ref_first_strand else seq.reverse_complement() 735 736 result_multiseq.append(SeqRecord(seq, 737 id=seqid, 738 name=seqid, 739 description="")) 740 741 return MultipleSeqAlignment(result_multiseq)
742
743 - def __repr__(self):
744 """Return a string representation of the index.""" 745 return "MafIO.MafIndex(%r, target_seqname=%r)" % (self._maf_fp.name, 746 self._target_seqname)
747
748 - def __len__(self):
749 """Return the number of records in the index.""" 750 return self._record_count
751