Package Bio :: Package NeuralNetwork :: Package Gene :: Module Motif
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Motif

  1  """Find and deal with motifs in biological sequence data. 
  2   
  3  Representing DNA (or RNA or proteins) in a neural network can be difficult 
  4  since input sequences can have different lengths. One way to get around 
  5  this problem is to deal with sequences by finding common motifs, and counting 
  6  the number of times those motifs occur in a sequence. This information can 
  7  then be used for creating the neural networks, with occurances of motifs 
  8  going into the network instead of raw sequence data. 
  9  """ 
 10  # biopython 
 11  from Bio.Alphabet import _verify_alphabet 
 12  from Bio.Seq import Seq 
 13   
 14  # local modules 
 15  from Pattern import PatternRepository 
 16   
 17   
18 -class MotifFinder(object):
19 """Find motifs in a set of Sequence Records. 20 """
21 - def __init__(self, alphabet_strict=1):
22 """Initialize a finder to get motifs. 23 24 Arguments: 25 26 o alphabet_strict - Whether or not motifs should be 27 restricted to having all of there elements within the alphabet 28 of the sequences. This requires that the Sequences have a real 29 alphabet, and that all sequences have the same alphabet. 30 """ 31 self.alphabet_strict = alphabet_strict
32
33 - def find(self, seq_records, motif_size):
34 """Find all motifs of the given size in the passed SeqRecords. 35 36 Arguments: 37 38 o seq_records - A list of SeqRecord objects which the motifs 39 will be found from. 40 41 o motif_size - The size of the motifs we want to look for. 42 43 Returns: 44 A PatternRepository object that contains all of the motifs (and their 45 counts) found in the training sequences). 46 """ 47 motif_info = self._get_motif_dict(seq_records, motif_size) 48 49 return PatternRepository(motif_info)
50
51 - def _get_motif_dict(self, seq_records, motif_size):
52 """Return a dictionary with information on motifs. 53 54 This internal function essentially does all of the hard work for 55 finding motifs, and returns a dictionary containing the found motifs 56 and their counts. This is internal so it can be reused by 57 find_motif_differences. 58 """ 59 if self.alphabet_strict: 60 alphabet = seq_records[0].seq.alphabet 61 else: 62 alphabet = None 63 64 # loop through all records to find the motifs in the sequences 65 all_motifs = {} 66 for seq_record in seq_records: 67 # if we are working with alphabets, make sure we are consistent 68 if alphabet is not None: 69 assert seq_record.seq.alphabet == alphabet, \ 70 "Working with alphabet %s and got %s" % \ 71 (alphabet, seq_record.seq.alphabet) 72 73 # now start finding motifs in the sequence 74 for start in range(len(seq_record.seq) - (motif_size - 1)): 75 motif = str(seq_record.seq[start:start + motif_size]) 76 77 # if we are being alphabet strict, make sure the motif 78 # falls within the specified alphabet 79 if alphabet is not None: 80 motif_seq = Seq(motif, alphabet) 81 if _verify_alphabet(motif_seq): 82 all_motifs = self._add_motif(all_motifs, motif) 83 84 # if we are not being strict, just add the motif 85 else: 86 all_motifs = self._add_motif(all_motifs, motif) 87 88 return all_motifs
89
90 - def find_differences(self, first_records, second_records, motif_size):
91 """Find motifs in two sets of records and return the differences. 92 93 This is used for finding motifs, but instead of just counting up all 94 of the motifs in a set of records, this returns the differences 95 between two listings of seq_records. 96 97 o first_records, second_records - Two listings of SeqRecord objects 98 to have their motifs compared. 99 100 o motif_size - The size of the motifs we are looking for. 101 102 Returns: 103 A PatternRepository object that has motifs, but instead of their 104 raw counts, this has the counts in the first set of records 105 subtracted from the counts in the second set. 106 """ 107 first_motifs = self._get_motif_dict(first_records, motif_size) 108 second_motifs = self._get_motif_dict(second_records, motif_size) 109 110 motif_diffs = {} 111 112 # first deal with all of the keys from the first motif 113 for cur_key in first_motifs: 114 if cur_key in second_motifs: 115 motif_diffs[cur_key] = first_motifs[cur_key] - \ 116 second_motifs[cur_key] 117 else: 118 motif_diffs[cur_key] = first_motifs[cur_key] 119 120 # now see if there are any keys from the second motif 121 # that we haven't got yet. 122 missing_motifs = list(second_motifs) 123 124 # remove all of the motifs we've already added 125 for added_motif in motif_diffs: 126 if added_motif in missing_motifs: 127 missing_motifs.remove(added_motif) 128 129 # now put in all of the motifs we didn't get 130 for cur_key in missing_motifs: 131 motif_diffs[cur_key] = 0 - second_motifs[cur_key] 132 133 return PatternRepository(motif_diffs)
134
135 - def _add_motif(self, motif_dict, motif_to_add):
136 """Add a motif to the given dictionary. 137 """ 138 # incrememt the count of the motif if it is already present 139 if motif_to_add in motif_dict: 140 motif_dict[motif_to_add] += 1 141 # otherwise add it to the dictionary 142 else: 143 motif_dict[motif_to_add] = 1 144 145 return motif_dict
146 147
148 -class MotifCoder(object):
149 """Convert motifs and a sequence into neural network representations. 150 151 This is designed to convert a sequence into a representation that 152 can be fed as an input into a neural network. It does this by 153 representing a sequence based the motifs present. 154 """
155 - def __init__(self, motifs):
156 """Initialize an input producer with motifs to look for. 157 158 Arguments: 159 160 o motifs - A complete list of motifs, in order, that are to be 161 searched for in a sequence. 162 """ 163 self._motifs = motifs 164 165 # check to be sure the motifs make sense (all the same size) 166 self._motif_size = len(self._motifs[0]) 167 for motif in self._motifs: 168 if len(motif) != self._motif_size: 169 raise ValueError("Motif %s given, expected motif size %s" 170 % (motif, self._motif_size))
171
172 - def representation(self, sequence):
173 """Represent a sequence as a set of motifs. 174 175 Arguments: 176 177 o sequence - A Bio.Seq object to represent as a motif. 178 179 This converts a sequence into a representation based on the motifs. 180 The representation is returned as a list of the relative amount of 181 each motif (number of times a motif occurred divided by the total 182 number of motifs in the sequence). The values in the list correspond 183 to the input order of the motifs specified in the initializer. 184 """ 185 # initialize a dictionary to hold the motifs in this sequence 186 seq_motifs = {} 187 for motif in self._motifs: 188 seq_motifs[motif] = 0 189 190 # count all of the motifs we are looking for in the sequence 191 for start in range(len(sequence) - (self._motif_size - 1)): 192 motif = str(sequence[start:start + self._motif_size]) 193 194 if motif in seq_motifs: 195 seq_motifs[motif] += 1 196 197 # normalize the motifs to go between zero and one 198 min_count = min(seq_motifs.values()) 199 max_count = max(seq_motifs.values()) 200 201 # as long as we have some motifs present, normalize them 202 # otherwise we'll just return 0 for everything 203 if max_count > 0: 204 for motif in seq_motifs.keys(): 205 seq_motifs[motif] = (float(seq_motifs[motif] - min_count) 206 / float(max_count)) 207 208 # return the relative motif counts in the specified order 209 motif_amounts = [] 210 for motif in self._motifs: 211 motif_amounts.append(seq_motifs[motif]) 212 213 return motif_amounts
214