1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10
11 For example, consider a Stockholm alignment file containing the following::
12
13 # STOCKHOLM 1.0
14 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
15 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
16 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
17 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
18 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
19
20 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
21 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
22 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
23 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
24 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
25 //
26
27 This is a single multiple sequence alignment, so you would probably load this
28 using the Bio.AlignIO.read() function:
29
30 >>> from Bio import AlignIO
31 >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm")
32 >>> print align
33 SingleLetterAlphabet() alignment with 2 rows and 104 columns
34 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
35 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
36 >>> for record in align:
37 ... print record.id, len(record)
38 AP001509.1 104
39 AE007476.1 104
40
41 This example file is clearly using RNA, so you might want the alignment object
42 (and the SeqRecord objects it holds) to reflect this, rather than simple using
43 the default single letter alphabet as shown above. You can do this with an
44 optional argument to the Bio.AlignIO.read() function:
45
46 >>> from Bio import AlignIO
47 >>> from Bio.Alphabet import generic_rna
48 >>> align = AlignIO.read("Stockholm/simple.sth", "stockholm",
49 ... alphabet=generic_rna)
50 >>> print align
51 RNAAlphabet() alignment with 2 rows and 104 columns
52 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-G...UGU AP001509.1
53 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-C...GAU AE007476.1
54
55 In addition to the sequences themselves, this example alignment also includes
56 some GR lines for the secondary structure of the sequences. These are
57 strings, with one character for each letter in the associated sequence:
58
59 >>> for record in align:
60 ... print record.id
61 ... print record.seq
62 ... print record.letter_annotations['secondary_structure']
63 AP001509.1
64 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
65 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
66 AE007476.1
67 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
68 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
69
70 Any general annotation for each row is recorded in the SeqRecord's annotations
71 dictionary. You can output this alignment in many different file formats
72 using Bio.AlignIO.write(), or the MultipleSeqAlignment object's format method:
73
74 >>> print align.format("fasta")
75 >AP001509.1
76 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-A
77 GGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
78 >AE007476.1
79 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAA
80 GGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
81 <BLANKLINE>
82
83 Most output formats won't be able to hold the annotation possible in a
84 Stockholm file:
85
86 >>> print align.format("stockholm")
87 # STOCKHOLM 1.0
88 #=GF SQ 2
89 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
90 #=GS AP001509.1 AC AP001509.1
91 #=GS AP001509.1 DE AP001509.1
92 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
93 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
94 #=GS AE007476.1 AC AE007476.1
95 #=GS AE007476.1 DE AE007476.1
96 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
97 //
98 <BLANKLINE>
99
100 Note that when writing Stockholm files, AlignIO does not break long sequences
101 up and interleave them (as in the input file shown above). The standard
102 allows this simpler layout, and it is more likely to be understood by other
103 tools.
104
105 Finally, as an aside, it can sometimes be useful to use Bio.SeqIO.parse() to
106 iterate over the alignment rows as SeqRecord objects - rather than working
107 with Alignnment objects. Again, if you want to you can specify this is RNA:
108
109 >>> from Bio import SeqIO
110 >>> from Bio.Alphabet import generic_rna
111 >>> for record in SeqIO.parse("Stockholm/simple.sth", "stockholm",
112 ... alphabet=generic_rna):
113 ... print record.id
114 ... print record.seq
115 ... print record.letter_annotations['secondary_structure']
116 AP001509.1
117 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGUCUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
118 -----------------<<<<<<<<---..<<-<<-------->>->>..---------<<<<<--------->>>>>--->>>>>>>>---------------
119 AE007476.1
120 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGUUUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
121 -----------------<<<<<<<<-----<<.<<-------->>.>>----------.<<<<<--------->>>>>.-->>>>>>>>---------------
122
123 Remember that if you slice a SeqRecord, the per-letter-annotions like the
124 secondary structure string here, are also sliced:
125
126 >>> sub_record = record[10:20]
127 >>> print sub_record.seq
128 AUCGUUUUAC
129 >>> print sub_record.letter_annotations['secondary_structure']
130 -------<<<
131 """
132 __docformat__ = "epytext en"
133 from Bio.Seq import Seq
134 from Bio.SeqRecord import SeqRecord
135 from Bio.Align import MultipleSeqAlignment
136 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
137
138
140 """Stockholm/PFAM alignment writer."""
141
142
143
144 pfam_gr_mapping = {"secondary_structure": "SS",
145 "surface_accessibility": "SA",
146 "transmembrane": "TM",
147 "posterior_probability": "PP",
148 "ligand_binding": "LI",
149 "active_site": "AS",
150 "intron": "IN"}
151
152 pfam_gs_mapping = {"organism": "OS",
153 "organism_classification": "OC",
154 "look": "LO"}
155
157 """Use this to write (another) single alignment to an open file.
158
159 Note that sequences and their annotation are recorded
160 together (rather than having a block of annotation followed
161 by a block of aligned sequences).
162 """
163 count = len(alignment)
164
165 self._length_of_sequences = alignment.get_alignment_length()
166 self._ids_written = []
167
168
169
170
171 if count == 0:
172 raise ValueError("Must have at least one sequence")
173 if self._length_of_sequences == 0:
174 raise ValueError("Non-empty sequences are required")
175
176 self.handle.write("# STOCKHOLM 1.0\n")
177 self.handle.write("#=GF SQ %i\n" % count)
178 for record in alignment:
179 self._write_record(record)
180 self.handle.write("//\n")
181
183 """Write a single SeqRecord to the file"""
184 if self._length_of_sequences != len(record.seq):
185 raise ValueError("Sequences must all be the same length")
186
187
188 seq_name = record.id
189 if record.name is not None:
190 if "accession" in record.annotations:
191 if record.id == record.annotations["accession"]:
192 seq_name = record.name
193
194
195 seq_name = seq_name.replace(" ", "_")
196
197 if "start" in record.annotations \
198 and "end" in record.annotations:
199 suffix = "/%s-%s" % (str(record.annotations["start"]),
200 str(record.annotations["end"]))
201 if seq_name[-len(suffix):] != suffix:
202 seq_name = "%s/%s-%s" % (seq_name,
203 str(record.annotations["start"]),
204 str(record.annotations["end"]))
205
206 if seq_name in self._ids_written:
207 raise ValueError("Duplicate record identifier: %s" % seq_name)
208 self._ids_written.append(seq_name)
209 self.handle.write("%s %s\n" % (seq_name, str(record.seq)))
210
211
212
213
214
215
216
217
218
219
220
221
222
223 if "accession" in record.annotations:
224 self.handle.write("#=GS %s AC %s\n"
225 % (seq_name, self.clean(record.annotations["accession"])))
226 elif record.id:
227 self.handle.write("#=GS %s AC %s\n"
228 % (seq_name, self.clean(record.id)))
229
230
231 if record.description:
232 self.handle.write("#=GS %s DE %s\n"
233 % (seq_name, self.clean(record.description)))
234
235
236 for xref in record.dbxrefs:
237 self.handle.write("#=GS %s DR %s\n"
238 % (seq_name, self.clean(xref)))
239
240
241 for key, value in record.annotations.iteritems():
242 if key in self.pfam_gs_mapping:
243 data = self.clean(str(value))
244 if data:
245 self.handle.write("#=GS %s %s %s\n"
246 % (seq_name,
247 self.clean(self.pfam_gs_mapping[key]),
248 data))
249 else:
250
251
252 pass
253
254
255 for key, value in record.letter_annotations.iteritems():
256 if key in self.pfam_gr_mapping and len(str(value)) == len(record.seq):
257 data = self.clean(str(value))
258 if data:
259 self.handle.write("#=GR %s %s %s\n"
260 % (seq_name,
261 self.clean(self.pfam_gr_mapping[key]),
262 data))
263 else:
264
265
266 pass
267
268
270 """Loads a Stockholm file from PFAM into MultipleSeqAlignment objects.
271
272 The file may contain multiple concatenated alignments, which are loaded
273 and returned incrementally.
274
275 This parser will detect if the Stockholm file follows the PFAM
276 conventions for sequence specific meta-data (lines starting #=GS
277 and #=GR) and populates the SeqRecord fields accordingly.
278
279 Any annotation which does not follow the PFAM conventions is currently
280 ignored.
281
282 If an accession is provided for an entry in the meta data, IT WILL NOT
283 be used as the record.id (it will be recorded in the record's
284 annotations). This is because some files have (sub) sequences from
285 different parts of the same accession (differentiated by different
286 start-end positions).
287
288 Wrap-around alignments are not supported - each sequences must be on
289 a single line. However, interlaced sequences should work.
290
291 For more information on the file format, please see:
292 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
293 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
294
295 For consistency with BioPerl and EMBOSS we call this the "stockholm"
296 format.
297 """
298
299
300
301 pfam_gr_mapping = {"SS": "secondary_structure",
302 "SA": "surface_accessibility",
303 "TM": "transmembrane",
304 "PP": "posterior_probability",
305 "LI": "ligand_binding",
306 "AS": "active_site",
307 "IN": "intron"}
308
309 pfam_gs_mapping = {"OS": "organism",
310 "OC": "organism_classification",
311 "LO": "look"}
312
314 try:
315 line = self._header
316 del self._header
317 except AttributeError:
318 line = self.handle.readline()
319 if not line:
320
321 raise StopIteration
322 if not line.strip() == '# STOCKHOLM 1.0':
323 raise ValueError("Did not find STOCKHOLM header")
324
325
326
327
328
329
330
331
332 seqs = {}
333 ids = []
334 gs = {}
335 gr = {}
336 gf = {}
337 passed_end_alignment = False
338 while 1:
339 line = self.handle.readline()
340 if not line:
341 break
342 line = line.strip()
343 if line == '# STOCKHOLM 1.0':
344 self._header = line
345 break
346 elif line == "//":
347
348
349 passed_end_alignment = True
350 elif line == "":
351
352 pass
353 elif line[0] != "#":
354
355
356 assert not passed_end_alignment
357 parts = [x.strip() for x in line.split(" ", 1)]
358 if len(parts) != 2:
359
360 raise ValueError("Could not split line into identifier "
361 + "and sequence:\n" + line)
362 id, seq = parts
363 if id not in ids:
364 ids.append(id)
365 seqs.setdefault(id, '')
366 seqs[id] += seq.replace(".", "-")
367 elif len(line) >= 5:
368
369 if line[:5] == "#=GF ":
370
371
372 feature, text = line[5:].strip().split(None, 1)
373
374
375 if feature not in gf:
376 gf[feature] = [text]
377 else:
378 gf[feature].append(text)
379 elif line[:5] == '#=GC ':
380
381
382 pass
383 elif line[:5] == '#=GS ':
384
385
386 id, feature, text = line[5:].strip().split(None, 2)
387
388
389 if id not in gs:
390 gs[id] = {}
391 if feature not in gs[id]:
392 gs[id][feature] = [text]
393 else:
394 gs[id][feature].append(text)
395 elif line[:5] == "#=GR ":
396
397
398 id, feature, text = line[5:].strip().split(None, 2)
399
400
401 if id not in gr:
402 gr[id] = {}
403 if feature not in gr[id]:
404 gr[id][feature] = ""
405 gr[id][feature] += text.strip()
406
407
408
409
410
411 assert len(seqs) <= len(ids)
412
413
414
415 self.ids = ids
416 self.sequences = seqs
417 self.seq_annotation = gs
418 self.seq_col_annotation = gr
419
420 if ids and seqs:
421
422 if self.records_per_alignment is not None \
423 and self.records_per_alignment != len(ids):
424 raise ValueError("Found %i records in this alignment, told to expect %i"
425 % (len(ids), self.records_per_alignment))
426
427 alignment_length = len(seqs.values()[0])
428 records = []
429 for id in ids:
430 seq = seqs[id]
431 if alignment_length != len(seq):
432 raise ValueError("Sequences have different lengths, or repeated identifier")
433 name, start, end = self._identifier_split(id)
434 record = SeqRecord(Seq(seq, self.alphabet),
435 id=id, name=name, description=id,
436 annotations={"accession": name})
437
438
439 record.annotations["accession"] = name
440
441 if start is not None:
442 record.annotations["start"] = start
443 if end is not None:
444 record.annotations["end"] = end
445
446 self._populate_meta_data(id, record)
447 records.append(record)
448 alignment = MultipleSeqAlignment(records, self.alphabet)
449
450
451
452 alignment._annotations = gr
453
454 return alignment
455 else:
456 raise StopIteration
457
459 """Returns (name,start,end) string tuple from an identier."""
460 if '/' in identifier:
461 name, start_end = identifier.rsplit("/", 1)
462 if start_end.count("-") == 1:
463 try:
464 start, end = map(int, start_end.split("-"))
465 return (name, start, end)
466 except ValueError:
467
468 pass
469 return (identifier, None, None)
470
503
535
536
537 if __name__ == "__main__":
538 from Bio._utils import run_doctest
539 run_doctest()
540