Package Bio :: Package NeuralNetwork :: Package Gene :: Module Motif
[hide private]
[frames | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Motif

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Find and deal with motifs in biological sequence data. 
  7   
  8  Representing DNA (or RNA or proteins) in a neural network can be difficult 
  9  since input sequences can have different lengths. One way to get around 
 10  this problem is to deal with sequences by finding common motifs, and counting 
 11  the number of times those motifs occur in a sequence. This information can 
 12  then be used for creating the neural networks, with occurrences of motifs 
 13  going into the network instead of raw sequence data. 
 14  """ 
 15  # biopython 
 16  from Bio.Alphabet import _verify_alphabet 
 17  from Bio.Seq import Seq 
 18   
 19  # local modules 
 20  from .Pattern import PatternRepository 
 21   
 22   
class MotifFinder(object):
    """Find motifs in a set of Sequence Records.

    A motif here is every contiguous window of a fixed size taken from the
    ``seq`` attribute of each record; the finder counts how often each
    distinct window occurs.
    """

    def __init__(self, alphabet_strict=1):
        """Initialize a finder to get motifs.

        Arguments:

        o alphabet_strict - Whether or not motifs should be
        restricted to having all of their elements within the alphabet
        of the sequences. This requires that the Sequences have a real
        alphabet, and that all sequences have the same alphabet.
        """
        self.alphabet_strict = alphabet_strict

    def find(self, seq_records, motif_size):
        """Find all motifs of the given size in the passed SeqRecords.

        Arguments:

        o seq_records - A list of SeqRecord objects which the motifs
        will be found from.

        o motif_size - The size of the motifs we want to look for.

        Returns:
        A PatternRepository object built from a dictionary mapping each
        motif found in the records to its count.
        """
        motif_info = self._get_motif_dict(seq_records, motif_size)

        return PatternRepository(motif_info)

    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary mapping each motif to its count.

        This internal function does all of the hard work for finding
        motifs. It is internal so it can be shared by find and
        find_differences.

        Arguments:

        o seq_records - The records to scan; each must have a ``seq``
        attribute supporting len() and slicing.

        o motif_size - The size of the windows to count.

        Returns:
        A dictionary of motif string -> number of occurrences. An empty
        list of records gives an empty dictionary, whether or not the
        finder is alphabet strict.

        Raises:
        AssertionError if the finder is alphabet strict and a record's
        alphabet differs from the alphabet of the first record.
        """
        # Only look at the first record when there is one, so an empty
        # input gives an empty result instead of an IndexError.
        alphabet = None
        if self.alphabet_strict and seq_records:
            alphabet = seq_records[0].seq.alphabet

        # loop through all records to find the motifs in the sequences
        all_motifs = {}
        for seq_record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            # (kept as an assert so the exception type seen by existing
            # callers does not change)
            if alphabet is not None:
                assert seq_record.seq.alphabet == alphabet, \
                    "Working with alphabet %s and got %s" % \
                    (alphabet, seq_record.seq.alphabet)

            # now slide a window of motif_size along the sequence; a
            # sequence shorter than motif_size gives an empty range
            for start in range(len(seq_record.seq) - (motif_size - 1)):
                motif = str(seq_record.seq[start:start + motif_size])

                # if we are being alphabet strict, skip any motif that
                # does not pass the alphabet check
                if alphabet is not None:
                    motif_seq = Seq(motif, alphabet)
                    if not _verify_alphabet(motif_seq):
                        continue

                all_motifs = self._add_motif(all_motifs, motif)

        return all_motifs

    def find_differences(self, first_records, second_records, motif_size):
        """Find motifs in two sets of records and return the differences.

        This is used for finding motifs, but instead of just counting up all
        of the motifs in a set of records, this returns the differences
        between two listings of seq_records.

        o first_records, second_records - Two listings of SeqRecord objects
        to have their motifs compared.

        o motif_size - The size of the motifs we are looking for.

        Returns:
        A PatternRepository object that has motifs, but instead of their
        raw counts, this has the counts in the second set of records
        subtracted from the counts in the first set (first minus second).
        A motif missing from one set counts as zero there.
        """
        first_motifs = self._get_motif_dict(first_records, motif_size)
        second_motifs = self._get_motif_dict(second_records, motif_size)

        motif_diffs = {}

        # every motif seen in the first set: first count minus second count
        for cur_key, first_count in first_motifs.items():
            motif_diffs[cur_key] = first_count - second_motifs.get(cur_key, 0)

        # motifs seen only in the second set get a negative count; the
        # dictionary lookup avoids scanning a list for every key
        for cur_key, second_count in second_motifs.items():
            if cur_key not in first_motifs:
                motif_diffs[cur_key] = 0 - second_count

        return PatternRepository(motif_diffs)

    def _add_motif(self, motif_dict, motif_to_add):
        """Add a motif to the given dictionary.

        The dictionary is updated in place (count incremented, or set to 1
        for a motif not seen before) and is also returned.
        """
        motif_dict[motif_to_add] = motif_dict.get(motif_to_add, 0) + 1

        return motif_dict
151 152
class MotifCoder(object):
    """Turn a sequence into a list of numbers based on the motifs it holds.

    The coder is given a fixed, ordered list of equally sized motifs and
    reports, for any sequence, how strongly each of those motifs is
    represented. The resulting list can be used as neural network input.
    """

    def __init__(self, motifs):
        """Store the motifs to look for and check they share one size.

        Arguments:

        o motifs - A complete list of motifs, in order, that are to be
        searched for in a sequence.

        Raises:
        ValueError if any motif differs in length from the first one.
        """
        self._motifs = motifs

        # the first motif sets the size every other motif must match
        self._motif_size = len(self._motifs[0])
        no_mismatch = object()
        wrong_size = next(
            (item for item in self._motifs
             if len(item) != self._motif_size),
            no_mismatch)
        if wrong_size is not no_mismatch:
            raise ValueError("Motif %s given, expected motif size %s"
                             % (wrong_size, self._motif_size))

    def representation(self, sequence):
        """Represent a sequence as scaled counts of the known motifs.

        Arguments:

        o sequence - The sequence to encode; it must support len() and
        slicing, and its slices are converted with str().

        Every window of the motif size in the sequence is compared with
        the motifs given to the initializer and the matches are counted.
        If at least one motif was found, each count is scaled as
        (count - smallest count) / largest count and returned as a float.
        If none were found, the unscaled zero counts are returned.

        Returns:
        A list with one value per motif, in the order the motifs were
        given to the initializer.
        """
        width = self._motif_size

        # start every known motif at zero occurrences
        tallies = dict.fromkeys(self._motifs, 0)

        # walk each window of the sequence, counting only known motifs
        for offset in range(len(sequence) - (width - 1)):
            window = str(sequence[offset:offset + width])
            if window in tallies:
                tallies[window] += 1

        lowest = min(tallies.values())
        highest = max(tallies.values())

        # nothing found: hand back the raw zero counts in motif order
        if highest <= 0:
            return [tallies[item] for item in self._motifs]

        # otherwise scale every count and report it in motif order
        return [float(tallies[item] - lowest) / float(highest)
                for item in self._motifs]
219