1
2
3
4
5
6 """
7 Code to deal with COMPASS output, a program for profile/profile comparison.
8
9 Compass is described in:
10
11 Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
12 alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
13 7;326(1):317-36.
14
15 Tested with COMPASS 1.24.
16
17 Functions:
18 read Reads a COMPASS file containing one COMPASS record
19 parse Iterates over records in a COMPASS file.
20
21 Classes:
22 Record One result of a COMPASS file
23 """
24 import re
25
26
58
59
97
98
100 """
101 Hold information from one compass hit.
102 Ali1 is the query, Ali2 the hit.
103 """
104
106 self.query=''
107 self.hit=''
108 self.gap_threshold=0
109 self.query_length=0
110 self.query_filtered_length=0
111 self.query_nseqs=0
112 self.query_neffseqs=0
113 self.hit_length=0
114 self.hit_filtered_length=0
115 self.hit_nseqs=0
116 self.hit_neffseqs=0
117 self.sw_score=0
118 self.evalue=-1
119 self.query_start=-1
120 self.hit_start=-1
121 self.query_aln=''
122 self.hit_aln=''
123 self.positives=''
124
126 """Return the length of the query covered in the alignment."""
127 s = self.query_aln.replace("=", "")
128 return len(s)
129
131 """Return the length of the hit covered in the alignment."""
132 s = self.hit_aln.replace("=", "")
133 return len(s)
134
135
136
137 __regex = {"names": re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+"),
138 "threshold": re.compile("Threshold of effective gap content in columns: (\S+)"),
139 "lengths": re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)\s+filtered_length2=(\S+)"),
140 "profilewidth": re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)"),
141 "scores": re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)"),
142 "start": re.compile("(\d+)"),
143 "align": re.compile("^.{15}(\S+)"),
144 "positive_alignment": re.compile("^.{15}(.+)"),
145 }
146
147
149 """
150 Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
151 ------query----- -------hit-------------
152 """
153 if not "Ali1:" in line:
154 raise ValueError("Line does not contain 'Ali1:':\n%s" % line)
155 m = __regex["names"].search(line)
156 record.query = m.group(1)
157 record.hit = m.group(2)
158
159
161 if not line.startswith("Threshold"):
162 raise ValueError("Line does not start with 'Threshold':\n%s" % line)
163 m = __regex["threshold"].search(line)
164 record.gap_threshold = float(m.group(1))
165
166
168 if not line.startswith("length1="):
169 raise ValueError("Line does not start with 'length1=':\n%s" % line)
170 m = __regex["lengths"].search(line)
171 record.query_length = int(m.group(1))
172 record.query_filtered_length = float(m.group(2))
173 record.hit_length = int(m.group(3))
174 record.hit_filtered_length = float(m.group(4))
175
176
178 if not "Nseqs1" in line:
179 raise ValueError("Line does not contain 'Nseqs1':\n%s" % line)
180 m = __regex["profilewidth"].search(line)
181 record.query_nseqs = int(m.group(1))
182 record.query_neffseqs = float(m.group(2))
183 record.hit_nseqs = int(m.group(3))
184 record.hit_neffseqs = float(m.group(4))
185
186
188 if not line.startswith("Smith-Waterman"):
189 raise ValueError("Line does not start with 'Smith-Waterman':\n%s" % line)
190 m = __regex["scores"].search(line)
191 if m:
192 record.sw_score = int(m.group(1))
193 record.evalue = float(m.group(2))
194 else:
195 record.sw_score = 0
196 record.evalue = -1.0
197
198
200 m = __regex["start"].search(line)
201 if m:
202 record.query_start = int(m.group(1))
203 m = __regex["align"].match(line)
204 assert m is not None, "invalid match"
205 record.query_aln += m.group(1)
206
207
209 m = __regex["positive_alignment"].match(line)
210 assert m is not None, "invalid match"
211 record.positives += m.group(1)
212
213
215 m = __regex["start"].search(line)
216 if m:
217 record.hit_start = int(m.group(1))
218 m = __regex["align"].match(line)
219 assert m is not None, "invalid match"
220 record.hit_aln += m.group(1)
221