1 """Find and deal with motifs in biological sequence data.
2
3 Representing DNA (or RNA or proteins) in a neural network can be difficult
4 since input sequences can have different lengths. One way to get around
5 this problem is to deal with sequences by finding common motifs, and counting
6 the number of times those motifs occur in a sequence. This information can
7 then be used for creating the neural networks, with occurances of motifs
8 going into the network instead of raw sequence data.
9 """
10
11 from Bio.Alphabet import _verify_alphabet
12 from Bio.Seq import Seq
13
14
15 from Pattern import PatternRepository
16
17
19 """Find motifs in a set of Sequence Records.
20 """
22 """Initialize a finder to get motifs.
23
24 Arguments:
25
26 o alphabet_strict - Whether or not motifs should be
27 restricted to having all of there elements within the alphabet
28 of the sequences. This requires that the Sequences have a real
29 alphabet, and that all sequences have the same alphabet.
30 """
31 self.alphabet_strict = alphabet_strict
32
33 - def find(self, seq_records, motif_size):
34 """Find all motifs of the given size in the passed SeqRecords.
35
36 Arguments:
37
38 o seq_records - A list of SeqRecord objects which the motifs
39 will be found from.
40
41 o motif_size - The size of the motifs we want to look for.
42
43 Returns:
44 A PatternRepository object that contains all of the motifs (and their
45 counts) found in the training sequences).
46 """
47 motif_info = self._get_motif_dict(seq_records, motif_size)
48
49 return PatternRepository(motif_info)
50
52 """Return a dictionary with information on motifs.
53
54 This internal function essentially does all of the hard work for
55 finding motifs, and returns a dictionary containing the found motifs
56 and their counts. This is internal so it can be reused by
57 find_motif_differences.
58 """
59 if self.alphabet_strict:
60 alphabet = seq_records[0].seq.alphabet
61 else:
62 alphabet = None
63
64
65 all_motifs = {}
66 for seq_record in seq_records:
67
68 if alphabet is not None:
69 assert seq_record.seq.alphabet == alphabet, \
70 "Working with alphabet %s and got %s" % \
71 (alphabet, seq_record.seq.alphabet)
72
73
74 for start in range(len(seq_record.seq) - (motif_size - 1)):
75 motif = str(seq_record.seq[start:start + motif_size])
76
77
78
79 if alphabet is not None:
80 motif_seq = Seq(motif, alphabet)
81 if _verify_alphabet(motif_seq):
82 all_motifs = self._add_motif(all_motifs, motif)
83
84
85 else:
86 all_motifs = self._add_motif(all_motifs, motif)
87
88 return all_motifs
89
91 """Find motifs in two sets of records and return the differences.
92
93 This is used for finding motifs, but instead of just counting up all
94 of the motifs in a set of records, this returns the differences
95 between two listings of seq_records.
96
97 o first_records, second_records - Two listings of SeqRecord objects
98 to have their motifs compared.
99
100 o motif_size - The size of the motifs we are looking for.
101
102 Returns:
103 A PatternRepository object that has motifs, but instead of their
104 raw counts, this has the counts in the first set of records
105 subtracted from the counts in the second set.
106 """
107 first_motifs = self._get_motif_dict(first_records, motif_size)
108 second_motifs = self._get_motif_dict(second_records, motif_size)
109
110 motif_diffs = {}
111
112
113 for cur_key in first_motifs:
114 if cur_key in second_motifs:
115 motif_diffs[cur_key] = first_motifs[cur_key] - \
116 second_motifs[cur_key]
117 else:
118 motif_diffs[cur_key] = first_motifs[cur_key]
119
120
121
122 missing_motifs = list(second_motifs)
123
124
125 for added_motif in motif_diffs:
126 if added_motif in missing_motifs:
127 missing_motifs.remove(added_motif)
128
129
130 for cur_key in missing_motifs:
131 motif_diffs[cur_key] = 0 - second_motifs[cur_key]
132
133 return PatternRepository(motif_diffs)
134
136 """Add a motif to the given dictionary.
137 """
138
139 if motif_to_add in motif_dict:
140 motif_dict[motif_to_add] += 1
141
142 else:
143 motif_dict[motif_to_add] = 1
144
145 return motif_dict
146
147
149 """Convert motifs and a sequence into neural network representations.
150
151 This is designed to convert a sequence into a representation that
152 can be fed as an input into a neural network. It does this by
153 representing a sequence based the motifs present.
154 """
156 """Initialize an input producer with motifs to look for.
157
158 Arguments:
159
160 o motifs - A complete list of motifs, in order, that are to be
161 searched for in a sequence.
162 """
163 self._motifs = motifs
164
165
166 self._motif_size = len(self._motifs[0])
167 for motif in self._motifs:
168 if len(motif) != self._motif_size:
169 raise ValueError("Motif %s given, expected motif size %s"
170 % (motif, self._motif_size))
171
173 """Represent a sequence as a set of motifs.
174
175 Arguments:
176
177 o sequence - A Bio.Seq object to represent as a motif.
178
179 This converts a sequence into a representation based on the motifs.
180 The representation is returned as a list of the relative amount of
181 each motif (number of times a motif occurred divided by the total
182 number of motifs in the sequence). The values in the list correspond
183 to the input order of the motifs specified in the initializer.
184 """
185
186 seq_motifs = {}
187 for motif in self._motifs:
188 seq_motifs[motif] = 0
189
190
191 for start in range(len(sequence) - (self._motif_size - 1)):
192 motif = str(sequence[start:start + self._motif_size])
193
194 if motif in seq_motifs:
195 seq_motifs[motif] += 1
196
197
198 min_count = min(seq_motifs.values())
199 max_count = max(seq_motifs.values())
200
201
202
203 if max_count > 0:
204 for motif in seq_motifs.keys():
205 seq_motifs[motif] = (float(seq_motifs[motif] - min_count)
206 / float(max_count))
207
208
209 motif_amounts = []
210 for motif in self._motifs:
211 motif_amounts.append(seq_motifs[motif])
212
213 return motif_amounts
214