Package Bio :: Package AlignIO :: Module MauveIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.MauveIO

  1  # Copyright 2015-2015 by Eric Rasche.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license. Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  For example, consider a progressiveMauve alignment file containing the following:: 
 12   
 13      #FormatVersion Mauve1 
 14      #Sequence1File      a.fa 
 15      #Sequence1Entry     1 
 16      #Sequence1Format    FastA 
 17      #Sequence2File      b.fa 
 18      #Sequence2Entry     2 
 19      #Sequence2Format    FastA 
 20      #Sequence3File      c.fa 
 21      #Sequence3Entry     3 
 22      #Sequence3Format    FastA 
 23      #BackboneFile       three.xmfa.bbcols 
 24      > 1:0-0 + a.fa 
 25      -------------------------------------------------------------------------------- 
 26      -------------------------------------------------------------------------------- 
 27      -------------------------------------------------------------------------------- 
 28      > 2:5417-5968 + b.fa 
 29      TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC- 
 30      -------------------------------------------------------------------------------- 
 31      CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG 
 32      > 3:9476-10076 - c.fa 
 33      TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG 
 34      TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG 
 35      ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT 
 36      = 
 37      > 2:5969-6015 + b.fa 
 38      ----------------------- 
 39      GGGCGAACGTATAAACCATTCTG 
 40      > 3:9429-9476 - c.fa 
 41      TTCGGTACCCTCCATGACCCACG 
 42      AAATGAGGGCCCAGGGTATGCTT 
 43   
 44  This is a multiple sequence alignment with multiple aligned sections, so you 
 45  would probably load this using the Bio.AlignIO.parse() function: 
 46   
 47      >>> from Bio import AlignIO 
 48      >>> align = AlignIO.parse("Mauve/simple.xmfa", "mauve") 
 49      >>> alignments = list(align) 
 50      >>> for aln in alignments: 
 51      ...     print(align) 
 52      SingleLetterAlphabet() alignment with 3 rows and 240 columns 
 53      --------------------------------------------...--- 1 
 54      TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG 2 
 55      TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT 3 
 56      SingleLetterAlphabet() alignment with 3 rows and 46 columns 
 57      ---------------------------------------------- 1 
 58      -----------------------GGGCGAACGTATAAACCATTCTG 2 
 59      TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT 3 
 60   
 61  Additional information is extracted from the XMFA file and available through 
 62  the annotation attribute of each record:: 
 63   
 64      >>> for record in alignments[0]: 
 65      ...   print record.id, len(record), record.annotations 
 66      1 240 {'start': 0, 'end': 0, 'strand': 1} 
 67      2 240 {'start': 5417, 'end': 5968, 'strand': 1} 
 68      3 240 {'start': 9476, 'end': 10076, 'strand': -1} 
 69   
 70  """ 
 71   
 72  from __future__ import print_function 
 73   
 74  import re 
 75  from Bio.Seq import Seq 
 76  from Bio.SeqRecord import SeqRecord 
 77  from Bio.Align import MultipleSeqAlignment 
 78  from .Interfaces import AlignmentIterator 
 79  from .Interfaces import SequentialAlignmentWriter 
 80   
 81   
 82  XMFA_HEADER_REGEX = re.compile("> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)") 
 83  XMFA_HEADER_REGEX_BIOPYTHON = re.compile("> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)") 
 84  ID_LINE_FMT = "> {seq_name}:{start}-{end} {strand} {file} # {ugly_hack}\n" 
 85   
 86   
87 -def _identifier_split(identifier):
88 """Return (name, start, end) string tuple from an identifier (PRIVATE).""" 89 id, loc, strand = identifier.split(':') 90 start, end = map(int, loc.split('-')) 91 start -= 1 92 return id, start, end, strand
93 94
95 -class MauveWriter(SequentialAlignmentWriter):
96 """Mauve/XMFA alignment writer.""" 97
98 - def __init__(self, *args, **kwargs):
99 """Initialize.""" 100 super(MauveWriter, self).__init__(*args, **kwargs) 101 self._wrote_header = False 102 self._wrote_first = False
103
104 - def write_alignment(self, alignment):
105 """Use this to write (another) single alignment to an open file. 106 107 Note that sequences and their annotation are recorded 108 together (rather than having a block of annotation followed 109 by a block of aligned sequences). 110 """ 111 count = len(alignment) 112 113 self._length_of_sequences = alignment.get_alignment_length() 114 115 # NOTE - For now, the alignment object does not hold any per column 116 # or per alignment annotation - only per sequence. 117 118 if count == 0: 119 raise ValueError("Must have at least one sequence") 120 if self._length_of_sequences == 0: 121 raise ValueError("Non-empty sequences are required") 122 123 if not self._wrote_header: 124 self._wrote_header = True 125 self.handle.write("#FormatVersion Mauve1\n") 126 # There are some more headers, but we ignore those for now. 127 # Sequence1File unknown.fa 128 # Sequence1Entry 1 129 # Sequence1Format FastA 130 for i in range(1, count + 1): 131 self.handle.write('#Sequence%sEntry\t%s\n' % (i, i)) 132 133 for idx, record in enumerate(alignment): 134 self._write_record(record, record_idx=idx) 135 self.handle.write('=\n')
136
137 - def _write_record(self, record, record_idx=0):
138 """Write a single SeqRecord to the file.""" 139 if self._length_of_sequences != len(record.seq): 140 raise ValueError("Sequences must all be the same length") 141 142 seq_name = record.name 143 try: 144 seq_name = str(int(record.name)) 145 except ValueError: 146 seq_name = str(record_idx + 1) 147 148 # We remove the "/{start}-{end}" before writing, as it cannot be part 149 # of the produced XMFA file. 150 if "start" in record.annotations and "end" in record.annotations: 151 suffix0 = "/%s-%s" % (str(record.annotations["start"]), 152 str(record.annotations["end"])) 153 suffix1 = "/%s-%s" % (str(record.annotations["start"] + 1), 154 str(record.annotations["end"])) 155 if seq_name[-len(suffix0):] == suffix0: 156 seq_name = seq_name[:-len(suffix0)] 157 if seq_name[-len(suffix1):] == suffix1: 158 seq_name = seq_name[:-len(suffix1)] 159 160 if "start" in record.annotations \ 161 and "end" in record.annotations \ 162 and "strand" in record.annotations: 163 id_line = ID_LINE_FMT.format( 164 seq_name=seq_name, start=record.annotations["start"] + 1, end=record.annotations["end"], 165 strand=("+" if record.annotations["strand"] == 1 else "-"), file=record.name + '.fa', 166 ugly_hack=record.id 167 ) 168 lacking_annotations = False 169 else: 170 id_line = ID_LINE_FMT.format( 171 seq_name=seq_name, start=0, end=0, strand='+', 172 file=record.name + '.fa', ugly_hack=record.id 173 ) 174 lacking_annotations = True 175 176 # If the sequence is an empty one, skip writing it out 177 if (':0-0 ' in id_line or ':1-0 ' in id_line) and not lacking_annotations: 178 # Except in the first LCB 179 if not self._wrote_first: 180 self._wrote_first = True 181 # The first LCB we write out is special, and must list ALL 182 # sequences, for the Mauve GUI 183 # http://darlinglab.org/mauve/user-guide/files.html#non-standard-xmfa-formatting-used-by-the-mauve-gui 184 id_line = ID_LINE_FMT.format( 185 seq_name=seq_name, start=0, end=0, strand='+', 186 file=record.name + '.fa', ugly_hack=record.id 187 ) 188 189 self.handle.write(id_line + '\n') 190 # Alignments lacking a start/stop/strand were generated by 191 # BioPython on load, and shouldn't exist according to XMFA 192 else: 193 # In other blocks, we only write sequences if they exist in a given 194 # alignment. 195 self.handle.write(id_line) 196 for i in range(0, len(record.seq), 80): 197 self.handle.write("%s\n" % str(record.seq[i:i + 80]))
198 199
200 -class MauveIterator(AlignmentIterator):
201 """Mauve xmfa alignment iterator.""" 202 203 _ids = [] # for caching IDs between __next__ calls 204
205 - def __next__(self):
206 """Parse the next alignment from the handle.""" 207 handle = self.handle 208 line = handle.readline() 209 210 if not line: 211 raise StopIteration 212 213 # Strip out header comments 214 while line and line.strip().startswith('#'): 215 line = handle.readline() 216 217 seqs = {} 218 seq_regions = {} 219 passed_end_alignment = False 220 221 latest_id = None 222 while True: 223 if not line: 224 break # end of file 225 line = line.strip() 226 227 if line.startswith('='): 228 # There may be more data, but we've reached the end of this 229 # alignment 230 break 231 elif line.startswith('>'): 232 m = XMFA_HEADER_REGEX_BIOPYTHON.match(line) 233 if not m: 234 m = XMFA_HEADER_REGEX.match(line) 235 if not m: 236 raise ValueError("Malformed header line: %s", line) 237 238 parsed_id = m.group('id') 239 parsed_data = {} 240 for key in ('start', 'end', 'id', 'strand', 'name', 'realname'): 241 try: 242 value = m.group(key) 243 if key == 'start': 244 value = int(value) 245 # Convert to zero based counting 246 if value > 0: 247 value -= 1 248 249 if key == 'end': 250 value = int(value) 251 parsed_data[key] = value 252 except IndexError: 253 # This will occur if we're asking for a group that 254 # doesn't exist. It's fine. 255 pass 256 seq_regions[parsed_id] = parsed_data 257 258 if parsed_id not in self._ids: 259 self._ids.append(parsed_id) 260 261 seqs.setdefault(parsed_id, '') 262 latest_id = parsed_id 263 else: 264 assert not passed_end_alignment 265 if latest_id is None: 266 raise ValueError("Saw sequence before definition line") 267 seqs[latest_id] += line 268 line = handle.readline() 269 270 assert len(seqs) <= len(self._ids) 271 272 self.ids = self._ids 273 self.sequences = seqs 274 275 if self._ids and seqs: 276 alignment_length = max(map(len, list(seqs.values()))) 277 records = [] 278 for id in self._ids: 279 if id not in seqs or len(seqs[id]) == 0 \ 280 or len(seqs[id]) == 0: 281 seq = '-' * alignment_length 282 else: 283 seq = seqs[id] 284 285 if alignment_length != len(seq): 286 raise ValueError("Sequences have different lengths, or repeated identifier") 287 288 # Sometimes we don't see a particular sequence in the 289 # alignment, so we skip that record since it isn't present in 290 # that LCB/alignment 291 if id not in seq_regions: 292 continue 293 294 if (seq_regions[id]['start'] != 0 or seq_regions[id]['end'] != 0): 295 suffix = '/{start}-{end}'.format(**seq_regions[id]) 296 if 'realname' in seq_regions[id]: 297 corrected_id = seq_regions[id]['realname'] 298 else: 299 corrected_id = seq_regions[id]['name'] 300 if corrected_id.count(suffix) == 0: 301 corrected_id += suffix 302 else: 303 if 'realname' in seq_regions[id]: 304 corrected_id = seq_regions[id]['realname'] 305 else: 306 corrected_id = seq_regions[id]['name'] 307 308 record = SeqRecord( 309 Seq(seq, self.alphabet), 310 id=corrected_id, 311 name=id 312 ) 313 314 record.annotations["start"] = seq_regions[id]['start'] 315 record.annotations["end"] = seq_regions[id]['end'] 316 record.annotations["strand"] = 1 if seq_regions[id]['strand'] == '+' else -1 317 318 records.append(record) 319 return MultipleSeqAlignment(records, self.alphabet) 320 else: 321 raise StopIteration
322