Package Bio :: Package NeuralNetwork :: Package Gene :: Module Motif
[hide private]
[frames | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Motif

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Find and deal with motifs in biological sequence data. 
  7   
  8  Representing DNA (or RNA or proteins) in a neural network can be difficult 
  9  since input sequences can have different lengths. One way to get around 
 10  this problem is to deal with sequences by finding common motifs, and counting 
 11  the number of times those motifs occur in a sequence. This information can 
 12  then be used for creating the neural networks, with occurrences of motifs 
 13  going into the network instead of raw sequence data. 
 14  """ 
 15  # biopython 
 16  from Bio.Alphabet import _verify_alphabet 
 17  from Bio.Seq import Seq 
 18   
 19  # local modules 
 20  from .Pattern import PatternRepository 
 21   
 22   
class MotifFinder(object):
    """Find motifs in a set of Sequence Records."""

    def __init__(self, alphabet_strict=1):
        """Initialize a finder to get motifs.

        Arguments:
         - alphabet_strict - Whether or not motifs should be restricted
           to having all of their elements within the alphabet of the
           sequences. This requires that the Sequences have a real
           alphabet, and that all sequences have the same alphabet.

        """
        self.alphabet_strict = alphabet_strict

    def find(self, seq_records, motif_size):
        """Find all motifs of the given size in the passed SeqRecords.

        Arguments:
         - seq_records - A list of SeqRecord objects which the motifs
           will be found from.
         - motif_size - The size of the motifs we want to look for.

        Returns:
        A PatternRepository object built from a dictionary mapping every
        motif found in the records to the number of times it was seen.

        """
        motif_info = self._get_motif_dict(seq_records, motif_size)

        return PatternRepository(motif_info)

    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary mapping each motif to its count.

        This internal function essentially does all of the hard work for
        finding motifs: it slides a window of motif_size over every
        record's sequence and counts each window's string. It is internal
        so it can be shared by find and find_differences.

        Arguments:
         - seq_records - A list of objects with a seq attribute. When
           alphabet_strict is set, each seq must also have an alphabet
           attribute.
         - motif_size - The width of the sliding window.

        An empty list of records gives an empty dictionary. Sequences
        shorter than motif_size contribute nothing.
        """
        # The reference alphabet comes from the first record; with no
        # records at all there is nothing to check (or count), so do not
        # index into the empty list.
        if self.alphabet_strict and seq_records:
            alphabet = seq_records[0].seq.alphabet
        else:
            alphabet = None

        # loop through all records to find the motifs in the sequences
        all_motifs = {}
        for seq_record in seq_records:
            # if we are working with alphabets, make sure we are consistent
            # (kept as an assert so the exception type seen by existing
            # callers does not change; note it is skipped under python -O)
            if alphabet is not None:
                assert seq_record.seq.alphabet == alphabet, \
                    "Working with alphabet %s and got %s" % \
                    (alphabet, seq_record.seq.alphabet)

            # now start finding motifs in the sequence
            for start in range(len(seq_record.seq) - (motif_size - 1)):
                motif = str(seq_record.seq[start:start + motif_size])

                # when not strict every motif is counted; when strict only
                # motifs that pass the alphabet verification are counted
                if alphabet is None or _verify_alphabet(Seq(motif, alphabet)):
                    all_motifs = self._add_motif(all_motifs, motif)

        return all_motifs

    def find_differences(self, first_records, second_records, motif_size):
        """Find motifs in two sets of records and return the differences.

        This is used for finding motifs, but instead of just counting up all
        of the motifs in a set of records, this returns the differences
        between two listings of seq_records.

        Arguments:
         - first_records, second_records - Two listings of SeqRecord objects
           to have their motifs compared.
         - motif_size - The size of the motifs we are looking for.

        Returns:
        A PatternRepository object that has motifs, but instead of their
        raw counts, this has the counts in the second set of records
        subtracted from the counts in the first set. A motif seen only in
        the second set therefore gets a negative value.

        """
        first_motifs = self._get_motif_dict(first_records, motif_size)
        second_motifs = self._get_motif_dict(second_records, motif_size)

        return PatternRepository(
            self._subtract_counts(first_motifs, second_motifs))

    @staticmethod
    def _subtract_counts(first_motifs, second_motifs):
        """Return a new dictionary of first_motifs minus second_motifs.

        Keys missing from one of the dictionaries are treated as having a
        count of zero there. Neither input dictionary is modified.
        """
        # start from a copy of the first counts, then subtract the second
        # counts in one pass over the second dictionary
        motif_diffs = dict(first_motifs)
        for motif, count in second_motifs.items():
            motif_diffs[motif] = motif_diffs.get(motif, 0) - count

        return motif_diffs

    def _add_motif(self, motif_dict, motif_to_add):
        """Add a motif to the given dictionary.

        The dictionary is updated in place (the count is incremented, or
        set to one for a new motif) and also returned.
        """
        motif_dict[motif_to_add] = motif_dict.get(motif_to_add, 0) + 1

        return motif_dict
150 151
class MotifCoder(object):
    """Convert motifs and a sequence into neural network representations.

    This is designed to convert a sequence into a representation that
    can be fed as an input into a neural network. It does this by
    representing a sequence based on the motifs present.
    """

    def __init__(self, motifs):
        """Initialize an input producer with motifs to look for.

        Arguments:
         - motifs - A complete list of motifs, in order, that are to be
           searched for in a sequence. All motifs must have the same
           length, and there must be at least one.

        Raises ValueError if the list is empty or if the motifs are not
        all the same size.
        """
        # the motif size is taken from the first motif, so an empty list
        # cannot be used; report that clearly instead of an IndexError
        if len(motifs) == 0:
            raise ValueError("At least one motif is required")

        self._motifs = motifs

        # check to be sure the motifs make sense (all the same size)
        self._motif_size = len(self._motifs[0])
        for motif in self._motifs:
            if len(motif) != self._motif_size:
                raise ValueError("Motif %s given, expected motif size %s"
                                 % (motif, self._motif_size))

    def representation(self, sequence):
        """Represent a sequence as a set of motifs.

        Arguments:
         - sequence - A Bio.Seq object to represent as a motif.

        This converts a sequence into a representation based on the motifs.
        Every window of the motif size in the sequence is compared against
        the motifs given to the initializer, and matches are counted.

        Returns a list with one value per motif, in the input order of the
        motifs specified in the initializer. If at least one motif was
        found, each value is (count - smallest count) / largest count, so
        values fall between zero and one. If no motif was found at all,
        every value is the unscaled count 0.
        """
        # start every motif we are looking for at a count of zero
        seq_motifs = dict.fromkeys(self._motifs, 0)

        # count all of the motifs we are looking for in the sequence
        for start in range(len(sequence) - (self._motif_size - 1)):
            motif = str(sequence[start:start + self._motif_size])

            if motif in seq_motifs:
                seq_motifs[motif] += 1

        # normalize the motifs to go between zero and one
        min_count = min(seq_motifs.values())
        max_count = max(seq_motifs.values())

        # as long as we have some motifs present, normalize them
        # otherwise we'll just return 0 for everything
        if max_count > 0:
            # only values are replaced, so iterating the keys is safe
            for motif in seq_motifs:
                seq_motifs[motif] = (float(seq_motifs[motif] - min_count) /
                                     float(max_count))

        # return the relative motif counts in the specified order
        return [seq_motifs[motif] for motif in self._motifs]
218