1 """Find and deal with signatures in biological sequence data.
2
3 In addition to representing sequences according to motifs (see Motif.py
4 for more information), we can also use Signatures, which are conserved
5 regions that are not necessarily consecutive. This may be useful in
6 the case of very diverged sequences, where signatures may pick out
7 important conservation that can't be found by motifs (hopefully!).
8 """
9
10 from Bio.Alphabet import _verify_alphabet
11 from Bio.Seq import Seq
12
13
14 from Pattern import PatternRepository
15
16
18 """Find Signatures in a group of sequence records.
19
20 In this simple implementation, signatures are just defined as a
21 two motifs separated by a gap. We need something a lot smarter than
22 this to find more complicated signatures.
23 """
25 """Initialize a finder to get signatures.
26
27 Arguments:
28
29 o alphabet_strict - Specify whether signatures should be required
30 to have all letters in the signature be consistent with the
31 alphabet of the original sequence. This requires that all Seqs
32 used have a consistent alphabet. This helps protect against getting
33 useless signatures full of ambiguity signals.
34 """
35 self._alphabet_strict = alphabet_strict
36
37 - def find(self, seq_records, signature_size, max_gap):
38 """Find all signatures in a group of sequences.
39
40 Arguments:
41
42 o seq_records - A list of SeqRecord objects we'll use the sequences
43 from to find signatures.
44
45 o signature_size - The size of each half of a signature (ie. if this
46 is set at 3, then the signature could be AGC-----GAC)
47
48 o max_gap - The maximum gap size between two parts of a signature.
49 """
50 sig_info = self._get_signature_dict(seq_records, signature_size,
51 max_gap)
52
53 return PatternRepository(sig_info)
54
56 """Return a dictionary with all signatures and their counts.
57
58 This internal function does all of the hard work for the
59 find_signatures function.
60 """
61 if self._alphabet_strict:
62 alphabet = seq_records[0].seq.alphabet
63 else:
64 alphabet = None
65
66
67 all_sigs = {}
68 for seq_record in seq_records:
69
70 if alphabet is not None:
71 assert seq_record.seq.alphabet == alphabet, \
72 "Working with alphabet %s and got %s" % \
73 (alphabet, seq_record.seq.alphabet)
74
75
76 largest_sig_size = sig_size * 2 + max_gap
77 for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
78
79 first_sig = str(seq_record.seq[start:start + sig_size])
80
81
82 for second in range(start + 1, (start + 1) + max_gap):
83 second_sig = str(seq_record.seq[second: second + sig_size])
84
85
86
87 if alphabet is not None:
88 first_seq = Seq(first_sig, alphabet)
89 second_seq = Seq(second_sig, alphabet)
90 if _verify_alphabet(first_seq) \
91 and _verify_alphabet(second_seq):
92 all_sigs = self._add_sig(all_sigs,
93 (first_sig, second_sig))
94
95
96 else:
97 all_sigs = self._add_sig(all_sigs,
98 (first_sig, second_sig))
99
100 return all_sigs
101
102 - def _add_sig(self, sig_dict, sig_to_add):
103 """Add a signature to the given dictionary.
104 """
105
106 if sig_to_add in sig_dict:
107 sig_dict[sig_to_add] += 1
108
109 else:
110 sig_dict[sig_to_add] = 1
111
112 return sig_dict
113
114
116 """Convert a Sequence into its signature representatives.
117
118 This takes a sequence and a set of signatures, and converts the
119 sequence into a list of numbers representing the relative amounts
120 each signature is seen in the sequence. This allows a sequence to
121 serve as input into a neural network.
122 """
123 - def __init__(self, signatures, max_gap):
124 """Initialize with the signatures to look for.
125
126 Arguments:
127
128 o signatures - A complete list of signatures, in order, that
129 are to be searched for in the sequences. The signatures should
130 be represented as a tuple of (first part of the signature,
131 second_part of the signature) -- ('GATC', 'GATC').
132
133 o max_gap - The maximum gap we can have between the two
134 elements of the signature.
135 """
136 self._signatures = signatures
137 self._max_gap = max_gap
138
139
140
141 if len(self._signatures) > 0:
142 first_sig_size = len(self._signatures[0][0])
143 second_sig_size = len(self._signatures[0][1])
144
145 assert first_sig_size == second_sig_size, \
146 "Ends of the signature do not match: %s" \
147 % self._signatures[0]
148
149 for sig in self._signatures:
150 assert len(sig[0]) == first_sig_size, \
151 "Got first part of signature %s, expected size %s" % \
152 (sig[0], first_sig_size)
153 assert len(sig[1]) == second_sig_size, \
154 "Got second part of signature %s, expected size %s" % \
155 (sig[1], second_sig_size)
156
158 """Convert a sequence into a representation of its signatures.
159
160 Arguments:
161
162 o sequence - A Seq object we are going to convert into a set of
163 signatures.
164
165 Returns:
166 A list of relative signature representations. Each item in the
167 list corresponds to the signature passed in to the initializer and
168 is the number of times that the signature was found, divided by the
169 total number of signatures found in the sequence.
170 """
171
172
173 if len(self._signatures) == 0:
174 return []
175
176
177 sequence_sigs = {}
178 for sig in self._signatures:
179 sequence_sigs[sig] = 0
180
181
182 all_first_sigs = []
183 for sig_start, sig_end in self._signatures:
184 all_first_sigs.append(sig_start)
185
186
187 sig_size = len(self._signatures[0][0])
188 smallest_sig_size = sig_size * 2
189
190 for start in range(len(sequence) - (smallest_sig_size - 1)):
191
192
193 first_sig = str(sequence[start:start + sig_size])
194 if first_sig in all_first_sigs:
195 for second in range(start + sig_size,
196 (start + sig_size + 1) + self._max_gap):
197 second_sig = str(sequence[second:second + sig_size])
198
199
200 if (first_sig, second_sig) in sequence_sigs:
201 sequence_sigs[(first_sig, second_sig)] += 1
202
203
204 min_count = min(sequence_sigs.values())
205 max_count = max(sequence_sigs.values())
206
207
208
209 if max_count > 0:
210 for sig in sequence_sigs:
211 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count)
212 / float(max_count))
213
214
215 sig_amounts = []
216 for sig in self._signatures:
217 sig_amounts.append(sequence_sigs[sig])
218
219 return sig_amounts
220