1
2
3
4
5
6 """Find and deal with signatures in biological sequence data.
7
8 In addition to representing sequences according to motifs (see Motif.py
9 for more information), we can also use Signatures, which are conserved
10 regions that are not necessarily consecutive. This may be useful in
11 the case of very diverged sequences, where signatures may pick out
12 important conservation that can't be found by motifs (hopefully!).
13 """
14
15 from Bio.Alphabet import _verify_alphabet
16 from Bio.Seq import Seq
17
18
19 from .Pattern import PatternRepository
20
21
23 """Find Signatures in a group of sequence records.
24
25 In this simple implementation, signatures are just defined as a
26 two motifs separated by a gap. We need something a lot smarter than
27 this to find more complicated signatures.
28 """
29
31 """Initialize a finder to get signatures.
32
33 Arguments:
34 - alphabet_strict - Specify whether signatures should be required
35 to have all letters in the signature be consistent with the
36 alphabet of the original sequence. This requires that all Seqs
37 used have a consistent alphabet. This helps protect against getting
38 useless signatures full of ambiguity signals.
39
40 """
41 self._alphabet_strict = alphabet_strict
42
43 - def find(self, seq_records, signature_size, max_gap):
44 """Find all signatures in a group of sequences.
45
46 Arguments:
47 - seq_records - A list of SeqRecord objects we'll use the sequences
48 from to find signatures.
49 - signature_size - The size of each half of a signature (ie. if this
50 is set at 3, then the signature could be AGC-----GAC)
51 - max_gap - The maximum gap size between two parts of a signature.
52
53 """
54 sig_info = self._get_signature_dict(seq_records, signature_size,
55 max_gap)
56
57 return PatternRepository(sig_info)
58
60 """Return a dictionary with all signatures and their counts.
61
62 This internal function does all of the hard work for the
63 find_signatures function.
64 """
65 if self._alphabet_strict:
66 alphabet = seq_records[0].seq.alphabet
67 else:
68 alphabet = None
69
70
71 all_sigs = {}
72 for seq_record in seq_records:
73
74 if alphabet is not None:
75 assert seq_record.seq.alphabet == alphabet, \
76 "Working with alphabet %s and got %s" % \
77 (alphabet, seq_record.seq.alphabet)
78
79
80 largest_sig_size = sig_size * 2 + max_gap
81 for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
82
83 first_sig = str(seq_record.seq[start:start + sig_size])
84
85
86 for second in range(start + 1, (start + 1) + max_gap):
87 second_sig = str(seq_record.seq[second: second + sig_size])
88
89
90
91 if alphabet is not None:
92 first_seq = Seq(first_sig, alphabet)
93 second_seq = Seq(second_sig, alphabet)
94 if _verify_alphabet(first_seq) \
95 and _verify_alphabet(second_seq):
96 all_sigs = self._add_sig(all_sigs,
97 (first_sig, second_sig))
98
99
100 else:
101 all_sigs = self._add_sig(all_sigs,
102 (first_sig, second_sig))
103
104 return all_sigs
105
106 - def _add_sig(self, sig_dict, sig_to_add):
107 """Add a signature to the given dictionary."""
108
109 if sig_to_add in sig_dict:
110 sig_dict[sig_to_add] += 1
111
112 else:
113 sig_dict[sig_to_add] = 1
114
115 return sig_dict
116
117
119 """Convert a Sequence into its signature representatives.
120
121 This takes a sequence and a set of signatures, and converts the
122 sequence into a list of numbers representing the relative amounts
123 each signature is seen in the sequence. This allows a sequence to
124 serve as input into a neural network.
125 """
126
127 - def __init__(self, signatures, max_gap):
128 """Initialize with the signatures to look for.
129
130 Arguments:
131 - signatures - A complete list of signatures, in order, that
132 are to be searched for in the sequences. The signatures should
133 be represented as a tuple of (first part of the signature,
134 second_part of the signature) -- ('GATC', 'GATC').
135 - max_gap - The maximum gap we can have between the two
136 elements of the signature.
137
138 """
139 self._signatures = signatures
140 self._max_gap = max_gap
141
142
143
144 if len(self._signatures) > 0:
145 first_sig_size = len(self._signatures[0][0])
146 second_sig_size = len(self._signatures[0][1])
147
148 assert first_sig_size == second_sig_size, \
149 "Ends of the signature do not match: %s" \
150 % self._signatures[0]
151
152 for sig in self._signatures:
153 assert len(sig[0]) == first_sig_size, \
154 "Got first part of signature %s, expected size %s" % \
155 (sig[0], first_sig_size)
156 assert len(sig[1]) == second_sig_size, \
157 "Got second part of signature %s, expected size %s" % \
158 (sig[1], second_sig_size)
159
161 """Convert a sequence into a representation of its signatures.
162
163 Arguments:
164 - sequence - A Seq object we are going to convert into a set of
165 signatures.
166
167 Returns:
168 A list of relative signature representations. Each item in the
169 list corresponds to the signature passed in to the initializer and
170 is the number of times that the signature was found, divided by the
171 total number of signatures found in the sequence.
172
173 """
174
175
176 if len(self._signatures) == 0:
177 return []
178
179
180 sequence_sigs = {}
181 for sig in self._signatures:
182 sequence_sigs[sig] = 0
183
184
185 all_first_sigs = []
186 for sig_start, sig_end in self._signatures:
187 all_first_sigs.append(sig_start)
188
189
190 sig_size = len(self._signatures[0][0])
191 smallest_sig_size = sig_size * 2
192
193 for start in range(len(sequence) - (smallest_sig_size - 1)):
194
195
196 first_sig = str(sequence[start:start + sig_size])
197 if first_sig in all_first_sigs:
198 for second in range(start + sig_size,
199 (start + sig_size + 1) + self._max_gap):
200 second_sig = str(sequence[second:second + sig_size])
201
202
203 if (first_sig, second_sig) in sequence_sigs:
204 sequence_sigs[(first_sig, second_sig)] += 1
205
206
207 min_count = min(sequence_sigs.values())
208 max_count = max(sequence_sigs.values())
209
210
211
212 if max_count > 0:
213 for sig in sequence_sigs:
214 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count) /
215 float(max_count))
216
217
218 sig_amounts = []
219 for sig in self._signatures:
220 sig_amounts.append(sequence_sigs[sig])
221
222 return sig_amounts
223