Package Bio :: Package NeuralNetwork :: Package Gene :: Module Signature
[hide private]
[frames | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Signature

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Find and deal with signatures in biological sequence data. 
  7   
  8  In addition to representing sequences according to motifs (see Motif.py 
  9  for more information), we can also use Signatures, which are conserved 
 10  regions that are not necessarily consecutive. This may be useful in 
 11  the case of very diverged sequences, where signatures may pick out 
 12  important conservation that can't be found by motifs (hopefully!). 
 13  """ 
 14  # biopython 
 15  from Bio.Alphabet import _verify_alphabet 
 16  from Bio.Seq import Seq 
 17   
 18  # local stuff 
 19  from .Pattern import PatternRepository 
 20   
 21   
class SignatureFinder(object):
    """Find Signatures in a group of sequence records.

    In this simple implementation, a signature is just a pair of
    equally sized motifs found close to each other in a sequence. We need
    something a lot smarter than this to find more complicated signatures.
    """

    def __init__(self, alphabet_strict=1):
        """Initialize a finder to get signatures.

        Arguments:

        o alphabet_strict - Specify whether signatures should be required
        to have all letters in the signature be consistent with the
        alphabet of the original sequence. This requires that all Seqs
        used have a consistent alphabet. This helps protect against getting
        useless signatures full of ambiguity signals.
        """
        self._alphabet_strict = alphabet_strict

    def find(self, seq_records, signature_size, max_gap):
        """Find all signatures in a group of sequences.

        Arguments:

        o seq_records - A list of SeqRecord objects we'll use the sequences
        from to find signatures.

        o signature_size - The size of each half of a signature (ie. if this
        is set at 3, then each half is three letters long).

        o max_gap - Controls how far apart the two halves may be: the
        second half is taken starting 1 through max_gap positions after
        the start of the first half.

        Returns a PatternRepository built from the dictionary of
        signatures and their counts.
        """
        sig_info = self._get_signature_dict(seq_records, signature_size,
                                            max_gap)

        return PatternRepository(sig_info)

    def _get_signature_dict(self, seq_records, sig_size, max_gap):
        """Return a dictionary with all signatures and their counts.

        This internal function does all of the hard work for find().

        The keys are (first part, second part) tuples of strings and the
        values are the number of times that pair was seen over all records.
        An empty list of records gives an empty dictionary.

        Raises AssertionError if we are being alphabet strict and a record
        has a different alphabet than the first record.
        """
        # the first record decides which alphabet everything must match;
        # with no records at all there is nothing to be strict about
        alphabet = None
        if self._alphabet_strict and len(seq_records) > 0:
            alphabet = seq_records[0].seq.alphabet

        # a window must be able to hold both halves plus the largest gap;
        # this does not depend on the record, so work it out only once
        largest_sig_size = sig_size * 2 + max_gap

        # loop through all records to find signatures
        all_sigs = {}
        for seq_record in seq_records:
            seq = seq_record.seq

            # if we are working with alphabets, make sure we are consistent
            # (raised explicitly so the check still runs under python -O)
            if alphabet is not None and not (seq.alphabet == alphabet):
                raise AssertionError("Working with alphabet %s and got %s"
                                     % (alphabet, seq.alphabet))

            # now start finding signatures in the sequence
            for start in range(len(seq) - (largest_sig_size - 1)):
                # find the first part of the signature
                first_sig = str(seq[start:start + sig_size])

                # now find all of the second parts of the signature
                # NOTE: the offsets are counted from the start of the first
                # part, so the two parts overlap whenever the offset is
                # smaller than sig_size
                for second in range(start + 1, (start + 1) + max_gap):
                    second_sig = str(seq[second:second + sig_size])

                    # if we are being alphabet strict, make sure both parts
                    # of the sig fall within the specified alphabet
                    if alphabet is not None:
                        first_seq = Seq(first_sig, alphabet)
                        second_seq = Seq(second_sig, alphabet)
                        if not (_verify_alphabet(first_seq) and
                                _verify_alphabet(second_seq)):
                            continue

                    all_sigs = self._add_sig(all_sigs,
                                             (first_sig, second_sig))

        return all_sigs

    def _add_sig(self, sig_dict, sig_to_add):
        """Add a signature to the given dictionary.

        The count of the signature is incremented if it is already
        present, otherwise it is added with a count of one. The dictionary
        is changed in place and also returned.
        """
        sig_dict[sig_to_add] = sig_dict.get(sig_to_add, 0) + 1

        return sig_dict
118 119
class SignatureCoder(object):
    """Convert a Sequence into its signature representatives.

    This takes a sequence and a set of signatures, and converts the
    sequence into a list of numbers representing the relative amounts
    each signature is seen in the sequence. This allows a sequence to
    serve as input into a neural network.
    """

    def __init__(self, signatures, max_gap):
        """Initialize with the signatures to look for.

        Arguments:

        o signatures - A complete list of signatures, in order, that
        are to be searched for in the sequences. The signatures should
        be represented as a tuple of (first part of the signature,
        second_part of the signature) -- ('GATC', 'GATC').

        o max_gap - The maximum gap we can have between the two
        elements of the signature.

        Raises AssertionError if the parts of the signatures are not all
        the same size.
        """
        self._signatures = signatures
        self._max_gap = max_gap

        # check to be sure the signatures are all the same size
        # only do this if we actually have signatures
        # (the checks raise explicitly so they still run under python -O)
        if len(self._signatures) > 0:
            first_sig_size = len(self._signatures[0][0])
            second_sig_size = len(self._signatures[0][1])

            if first_sig_size != second_sig_size:
                # the signature is itself a tuple, so it has to be wrapped
                # to be formatted as a single value
                raise AssertionError(
                    "Ends of the signature do not match: %s"
                    % (self._signatures[0],))

            for sig in self._signatures:
                if len(sig[0]) != first_sig_size:
                    raise AssertionError(
                        "Got first part of signature %s, expected size %s"
                        % (sig[0], first_sig_size))
                if len(sig[1]) != second_sig_size:
                    raise AssertionError(
                        "Got second part of signature %s, expected size %s"
                        % (sig[1], second_sig_size))

    def representation(self, sequence):
        """Convert a sequence into a representation of its signatures.

        Arguments:

        o sequence - A Seq object we are going to convert into a set of
        signatures.

        Returns:
        A list of relative signature representations. Each item in the
        list corresponds to the signature passed in to the initializer.
        If none of the signatures are found, every item is 0. Otherwise
        each item is the number of times that signature was found, minus
        the smallest count of any signature, divided by the largest count
        of any signature. With no signatures to look for, the list is
        empty.
        """
        # check to be sure we have signatures to deal with,
        # otherwise just return an empty list
        if len(self._signatures) == 0:
            return []

        # initialize a dictionary to hold the signature counts
        sequence_sigs = dict.fromkeys(self._signatures, 0)

        # collect the first parts of the signatures; a set keeps the
        # membership test in the loop below cheap
        all_first_sigs = set(sig_start for sig_start, sig_end
                             in self._signatures)

        # count all of the signatures we are looking for in the sequence
        sig_size = len(self._signatures[0][0])
        smallest_sig_size = sig_size * 2

        for start in range(len(sequence) - (smallest_sig_size - 1)):
            # if the first part matches any of the signatures we are looking
            # for, then expand out to look for the second part
            first_sig = str(sequence[start:start + sig_size])
            if first_sig not in all_first_sigs:
                continue

            # the second part may begin right after the first part, or up
            # to max_gap letters later
            second_begin = start + sig_size
            for second in range(second_begin,
                                (second_begin + 1) + self._max_gap):
                found = (first_sig, str(sequence[second:second + sig_size]))

                # if we find the motif, increase the counts for it
                if found in sequence_sigs:
                    sequence_sigs[found] += 1

        # -- scale the signature info
        min_count = min(sequence_sigs.values())
        max_count = max(sequence_sigs.values())

        # as long as we have some signatures present, scale them
        # otherwise we'll just return 0 for everything
        if max_count > 0:
            largest = float(max_count)
            return [float(sequence_sigs[sig] - min_count) / largest
                    for sig in self._signatures]

        # return the (all zero) counts in the specified order
        return [sequence_sigs[sig] for sig in self._signatures]
225