Package Bio :: Package AlignIO :: Module MauveIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.MauveIO

  1  # Copyright 2015-2015 by Eric Rasche.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  For example, consider a progressiveMauve alignment file containing the following:: 
 13   
 14      #FormatVersion Mauve1 
 15      #Sequence1File      a.fa 
 16      #Sequence1Entry     1 
 17      #Sequence1Format    FastA 
 18      #Sequence2File      b.fa 
 19      #Sequence2Entry     2 
 20      #Sequence2Format    FastA 
 21      #Sequence3File      c.fa 
 22      #Sequence3Entry     3 
 23      #Sequence3Format    FastA 
 24      #BackboneFile       three.xmfa.bbcols 
 25      > 1:0-0 + a.fa 
 26      -------------------------------------------------------------------------------- 
 27      -------------------------------------------------------------------------------- 
 28      -------------------------------------------------------------------------------- 
 29      > 2:5417-5968 + b.fa 
 30      TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC- 
 31      -------------------------------------------------------------------------------- 
 32      CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG 
 33      > 3:9476-10076 - c.fa 
 34      TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG 
 35      TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG 
 36      ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT 
 37      = 
 38      > 2:5969-6015 + b.fa 
 39      ----------------------- 
 40      GGGCGAACGTATAAACCATTCTG 
 41      > 3:9429-9476 - c.fa 
 42      TTCGGTACCCTCCATGACCCACG 
 43      AAATGAGGGCCCAGGGTATGCTT 
 44   
 45  This is a multiple sequence alignment with multiple aligned sections, so you 
 46  would probably load this using the Bio.AlignIO.parse() function: 
 47   
 48      >>> from Bio import AlignIO 
 49      >>> align = AlignIO.parse("Mauve/simple.xmfa", "mauve") 
 50      >>> alignments = list(align) 
 51      >>> for aln in alignments: 
 52      ...     print(align) 
 53      SingleLetterAlphabet() alignment with 3 rows and 240 columns 
 54      --------------------------------------------...--- 1 
 55      TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG 2 
 56      TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT 3 
 57      SingleLetterAlphabet() alignment with 3 rows and 46 columns 
 58      ---------------------------------------------- 1 
 59      -----------------------GGGCGAACGTATAAACCATTCTG 2 
 60      TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT 3 
 61   
 62  Additional information is extracted from the XMFA file and available through 
 63  the annotation attribute of each record:: 
 64   
 65      >>> for record in alignments[0]: 
 66      ...   print record.id, len(record), record.annotations 
 67      1 240 {'start': 0, 'end': 0, 'strand': 1} 
 68      2 240 {'start': 5417, 'end': 5968, 'strand': 1} 
 69      3 240 {'start': 9476, 'end': 10076, 'strand': -1} 
 70   
 71  """ 
 72   
 73  from __future__ import print_function 
 74   
 75  import re 
 76  from Bio.Seq import Seq 
 77  from Bio.SeqRecord import SeqRecord 
 78  from Bio.Align import MultipleSeqAlignment 
 79  from .Interfaces import AlignmentIterator 
 80  from .Interfaces import SequentialAlignmentWriter 
 81   
 82   
 83  XMFA_HEADER_REGEX = re.compile("> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)") 
 84  XMFA_HEADER_REGEX_BIOPYTHON = re.compile("> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)") 
 85  ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {file} # {ugly_hack}\n" 
 86   
 87   
88 -def _identifier_split(identifier):
89 """Return (name, start, end) string tuple from an identifier (PRIVATE).""" 90 id, loc, strand = identifier.split(':') 91 start, end = map(int, loc.split('-')) 92 start -= 1 93 return id, start, end, strand
94 95
96 -class MauveWriter(SequentialAlignmentWriter):
97 """Mauve/XMFA alignment writer.""" 98
99 - def __init__(self, *args, **kwargs):
100 """Initialize.""" 101 super(MauveWriter, self).__init__(*args, **kwargs) 102 self._wrote_header = False 103 self._wrote_first = False
104
105 - def write_alignment(self, alignment):
106 """Use this to write (another) single alignment to an open file. 107 108 Note that sequences and their annotation are recorded 109 together (rather than having a block of annotation followed 110 by a block of aligned sequences). 111 """ 112 count = len(alignment) 113 114 self._length_of_sequences = alignment.get_alignment_length() 115 116 # NOTE - For now, the alignment object does not hold any per column 117 # or per alignment annotation - only per sequence. 118 119 if count == 0: 120 raise ValueError("Must have at least one sequence") 121 if self._length_of_sequences == 0: 122 raise ValueError("Non-empty sequences are required") 123 124 if not self._wrote_header: 125 self._wrote_header = True 126 self.handle.write("#FormatVersion Mauve1\n") 127 # There are some more headers, but we ignore those for now. 128 # Sequence1File unknown.fa 129 # Sequence1Entry 1 130 # Sequence1Format FastA 131 for i in range(1, count + 1): 132 self.handle.write('#Sequence%sEntry\t%s\n' % (i, i)) 133 134 for idx, record in enumerate(alignment): 135 self._write_record(record, record_idx=idx) 136 self.handle.write('=\n')
137
138 - def _write_record(self, record, record_idx=0):
139 """Write a single SeqRecord to the file (PRIVATE).""" 140 if self._length_of_sequences != len(record.seq): 141 raise ValueError("Sequences must all be the same length") 142 143 seq_name = record.name 144 try: 145 seq_name = str(int(record.name)) 146 except ValueError: 147 seq_name = str(record_idx + 1) 148 149 # We remove the "/{start}-{end}" before writing, as it cannot be part 150 # of the produced XMFA file. 151 if "start" in record.annotations and "end" in record.annotations: 152 suffix0 = "/%s-%s" % (str(record.annotations["start"]), 153 str(record.annotations["end"])) 154 suffix1 = "/%s-%s" % (str(record.annotations["start"] + 1), 155 str(record.annotations["end"])) 156 if seq_name[-len(suffix0):] == suffix0: 157 seq_name = seq_name[:-len(suffix0)] 158 if seq_name[-len(suffix1):] == suffix1: 159 seq_name = seq_name[:-len(suffix1)] 160 161 if "start" in record.annotations \ 162 and "end" in record.annotations \ 163 and "strand" in record.annotations: 164 id_line = ID_LINE_FMT.format( 165 seq_name=seq_name, start=record.annotations["start"] + 1, end=record.annotations["end"], 166 strand=("+" if record.annotations["strand"] == 1 else "-"), file=record.name + '.fa', 167 ugly_hack=record.id 168 ) 169 lacking_annotations = False 170 else: 171 id_line = ID_LINE_FMT.format( 172 seq_name=seq_name, start=0, end=0, strand='+', 173 file=record.name + '.fa', ugly_hack=record.id 174 ) 175 lacking_annotations = True 176 177 # If the sequence is an empty one, skip writing it out 178 if (':0-0 ' in id_line or ':1-0 ' in id_line) and not lacking_annotations: 179 # Except in the first LCB 180 if not self._wrote_first: 181 self._wrote_first = True 182 # The first LCB we write out is special, and must list ALL 183 # sequences, for the Mauve GUI 184 # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui 185 id_line = ID_LINE_FMT.format( 186 seq_name=seq_name, start=0, end=0, strand='+', 187 file=record.name + '.fa', ugly_hack=record.id 188 ) 189 190 self.handle.write(id_line + '\n') 191 # Alignments lacking a start/stop/strand were generated by 192 # BioPython on load, and shouldn't exist according to XMFA 193 else: 194 # In other blocks, we only write sequences if they exist in a given 195 # alignment. 196 self.handle.write(id_line) 197 for i in range(0, len(record.seq), 80): 198 self.handle.write("%s\n" % str(record.seq[i:i + 80]))
199 200
201 -class MauveIterator(AlignmentIterator):
202 """Mauve xmfa alignment iterator.""" 203 204 _ids = [] # for caching IDs between __next__ calls 205
206 - def __next__(self):
207 """Parse the next alignment from the handle.""" 208 handle = self.handle 209 line = handle.readline() 210 211 if not line: 212 raise StopIteration 213 214 # Strip out header comments 215 while line and line.strip().startswith('#'): 216 line = handle.readline() 217 218 seqs = {} 219 seq_regions = {} 220 passed_end_alignment = False 221 222 latest_id = None 223 while True: 224 if not line: 225 break # end of file 226 line = line.strip() 227 228 if line.startswith('='): 229 # There may be more data, but we've reached the end of this 230 # alignment 231 break 232 elif line.startswith('>'): 233 m = XMFA_HEADER_REGEX_BIOPYTHON.match(line) 234 if not m: 235 m = XMFA_HEADER_REGEX.match(line) 236 if not m: 237 raise ValueError("Malformed header line: %s", line) 238 239 parsed_id = m.group('id') 240 parsed_data = {} 241 for key in ('start', 'end', 'id', 'strand', 'name', 'realname'): 242 try: 243 value = m.group(key) 244 if key == 'start': 245 value = int(value) 246 # Convert to zero based counting 247 if value > 0: 248 value -= 1 249 250 if key == 'end': 251 value = int(value) 252 parsed_data[key] = value 253 except IndexError: 254 # This will occur if we're asking for a group that 255 # doesn't exist. It's fine. 256 pass 257 seq_regions[parsed_id] = parsed_data 258 259 if parsed_id not in self._ids: 260 self._ids.append(parsed_id) 261 262 seqs.setdefault(parsed_id, '') 263 latest_id = parsed_id 264 else: 265 assert not passed_end_alignment 266 if latest_id is None: 267 raise ValueError("Saw sequence before definition line") 268 seqs[latest_id] += line 269 line = handle.readline() 270 271 assert len(seqs) <= len(self._ids) 272 273 self.ids = self._ids 274 self.sequences = seqs 275 276 if self._ids and seqs: 277 alignment_length = max(map(len, list(seqs.values()))) 278 records = [] 279 for id in self._ids: 280 if id not in seqs or len(seqs[id]) == 0 \ 281 or len(seqs[id]) == 0: 282 seq = '-' * alignment_length 283 else: 284 seq = seqs[id] 285 286 if alignment_length != len(seq): 287 raise ValueError("Sequences have different lengths, or repeated identifier") 288 289 # Sometimes we don't see a particular sequence in the 290 # alignment, so we skip that record since it isn't present in 291 # that LCB/alignment 292 if id not in seq_regions: 293 continue 294 295 if (seq_regions[id]['start'] != 0 or seq_regions[id]['end'] != 0): 296 suffix = '/{start}-{end}'.format(**seq_regions[id]) 297 if 'realname' in seq_regions[id]: 298 corrected_id = seq_regions[id]['realname'] 299 else: 300 corrected_id = seq_regions[id]['name'] 301 if corrected_id.count(suffix) == 0: 302 corrected_id += suffix 303 else: 304 if 'realname' in seq_regions[id]: 305 corrected_id = seq_regions[id]['realname'] 306 else: 307 corrected_id = seq_regions[id]['name'] 308 309 record = SeqRecord( 310 Seq(seq, self.alphabet), 311 id=corrected_id, 312 name=id 313 ) 314 315 record.annotations["start"] = seq_regions[id]['start'] 316 record.annotations["end"] = seq_regions[id]['end'] 317 record.annotations["strand"] = 1 if seq_regions[id]['strand'] == '+' else -1 318 319 records.append(record) 320 return MultipleSeqAlignment(records, self.alphabet) 321 else: 322 raise StopIteration
323