Package Bio :: Package NeuralNetwork :: Package Gene :: Module Signature
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Signature

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Find and deal with signatures in biological sequence data. 
  7   
  8  In addition to representing sequences according to motifs (see Motif.py 
  9  for more information), we can also use Signatures, which are conserved 
 10  regions that are not necessarily consecutive. This may be useful in 
 11  the case of very diverged sequences, where signatures may pick out 
 12  important conservation that can't be found by motifs (hopefully!). 
 13  """ 
 14  # biopython 
 15  from Bio.Alphabet import _verify_alphabet 
 16  from Bio.Seq import Seq 
 17   
 18  # local stuff 
 19  from .Pattern import PatternRepository 
 20   
 21   
22 -class SignatureFinder(object):
23 """Find Signatures in a group of sequence records. 24 25 In this simple implementation, signatures are just defined as a 26 two motifs separated by a gap. We need something a lot smarter than 27 this to find more complicated signatures. 28 """ 29
30 - def __init__(self, alphabet_strict=1):
31 """Initialize a finder to get signatures. 32 33 Arguments: 34 - alphabet_strict - Specify whether signatures should be required 35 to have all letters in the signature be consistent with the 36 alphabet of the original sequence. This requires that all Seqs 37 used have a consistent alphabet. This helps protect against getting 38 useless signatures full of ambiguity signals. 39 40 """ 41 self._alphabet_strict = alphabet_strict
42
43 - def find(self, seq_records, signature_size, max_gap):
44 """Find all signatures in a group of sequences. 45 46 Arguments: 47 - seq_records - A list of SeqRecord objects we'll use the sequences 48 from to find signatures. 49 - signature_size - The size of each half of a signature (ie. if this 50 is set at 3, then the signature could be AGC-----GAC) 51 - max_gap - The maximum gap size between two parts of a signature. 52 53 """ 54 sig_info = self._get_signature_dict(seq_records, signature_size, 55 max_gap) 56 57 return PatternRepository(sig_info)
58
59 - def _get_signature_dict(self, seq_records, sig_size, max_gap):
60 """Return a dictionary with all signatures and their counts. 61 62 This internal function does all of the hard work for the 63 find_signatures function. 64 """ 65 if self._alphabet_strict: 66 alphabet = seq_records[0].seq.alphabet 67 else: 68 alphabet = None 69 70 # loop through all records to find signatures 71 all_sigs = {} 72 for seq_record in seq_records: 73 # if we are working with alphabets, make sure we are consistent 74 if alphabet is not None: 75 assert seq_record.seq.alphabet == alphabet, \ 76 "Working with alphabet %s and got %s" % \ 77 (alphabet, seq_record.seq.alphabet) 78 79 # now start finding signatures in the sequence 80 largest_sig_size = sig_size * 2 + max_gap 81 for start in range(len(seq_record.seq) - (largest_sig_size - 1)): 82 # find the first part of the signature 83 first_sig = str(seq_record.seq[start:start + sig_size]) 84 85 # now find all of the second parts of the signature 86 for second in range(start + 1, (start + 1) + max_gap): 87 second_sig = str(seq_record.seq[second: second + sig_size]) 88 89 # if we are being alphabet strict, make sure both parts 90 # of the sig fall within the specified alphabet 91 if alphabet is not None: 92 first_seq = Seq(first_sig, alphabet) 93 second_seq = Seq(second_sig, alphabet) 94 if _verify_alphabet(first_seq) \ 95 and _verify_alphabet(second_seq): 96 all_sigs = self._add_sig(all_sigs, 97 (first_sig, second_sig)) 98 99 # if we are not being strict, just add the motif 100 else: 101 all_sigs = self._add_sig(all_sigs, 102 (first_sig, second_sig)) 103 104 return all_sigs
105
106 - def _add_sig(self, sig_dict, sig_to_add):
107 """Add a signature to the given dictionary.""" 108 # incrememt the count of the signature if it is already present 109 if sig_to_add in sig_dict: 110 sig_dict[sig_to_add] += 1 111 # otherwise add it to the dictionary 112 else: 113 sig_dict[sig_to_add] = 1 114 115 return sig_dict
116 117
118 -class SignatureCoder(object):
119 """Convert a Sequence into its signature representatives. 120 121 This takes a sequence and a set of signatures, and converts the 122 sequence into a list of numbers representing the relative amounts 123 each signature is seen in the sequence. This allows a sequence to 124 serve as input into a neural network. 125 """ 126
127 - def __init__(self, signatures, max_gap):
128 """Initialize with the signatures to look for. 129 130 Arguments: 131 - signatures - A complete list of signatures, in order, that 132 are to be searched for in the sequences. The signatures should 133 be represented as a tuple of (first part of the signature, 134 second_part of the signature) -- ('GATC', 'GATC'). 135 - max_gap - The maximum gap we can have between the two 136 elements of the signature. 137 138 """ 139 self._signatures = signatures 140 self._max_gap = max_gap 141 142 # check to be sure the signatures are all the same size 143 # only do this if we actually have signatures 144 if len(self._signatures) > 0: 145 first_sig_size = len(self._signatures[0][0]) 146 second_sig_size = len(self._signatures[0][1]) 147 148 assert first_sig_size == second_sig_size, \ 149 "Ends of the signature do not match: %s" \ 150 % self._signatures[0] 151 152 for sig in self._signatures: 153 assert len(sig[0]) == first_sig_size, \ 154 "Got first part of signature %s, expected size %s" % \ 155 (sig[0], first_sig_size) 156 assert len(sig[1]) == second_sig_size, \ 157 "Got second part of signature %s, expected size %s" % \ 158 (sig[1], second_sig_size)
159
160 - def representation(self, sequence):
161 """Convert a sequence into a representation of its signatures. 162 163 Arguments: 164 - sequence - A Seq object we are going to convert into a set of 165 signatures. 166 167 Returns: 168 A list of relative signature representations. Each item in the 169 list corresponds to the signature passed in to the initializer and 170 is the number of times that the signature was found, divided by the 171 total number of signatures found in the sequence. 172 173 """ 174 # check to be sure we have signatures to deal with, 175 # otherwise just return an empty list 176 if len(self._signatures) == 0: 177 return [] 178 179 # initialize a dictionary to hold the signature counts 180 sequence_sigs = {} 181 for sig in self._signatures: 182 sequence_sigs[sig] = 0 183 184 # get a list of all of the first parts of the signatures 185 all_first_sigs = [] 186 for sig_start, sig_end in self._signatures: 187 all_first_sigs.append(sig_start) 188 189 # count all of the signatures we are looking for in the sequence 190 sig_size = len(self._signatures[0][0]) 191 smallest_sig_size = sig_size * 2 192 193 for start in range(len(sequence) - (smallest_sig_size - 1)): 194 # if the first part matches any of the signatures we are looking 195 # for, then expand out to look for the second part 196 first_sig = str(sequence[start:start + sig_size]) 197 if first_sig in all_first_sigs: 198 for second in range(start + sig_size, 199 (start + sig_size + 1) + self._max_gap): 200 second_sig = str(sequence[second:second + sig_size]) 201 202 # if we find the motif, increase the counts for it 203 if (first_sig, second_sig) in sequence_sigs: 204 sequence_sigs[(first_sig, second_sig)] += 1 205 206 # -- normalize the signature info to go between zero and one 207 min_count = min(sequence_sigs.values()) 208 max_count = max(sequence_sigs.values()) 209 210 # as long as we have some signatures present, normalize them 211 # otherwise we'll just return 0 for everything 212 if max_count > 0: 213 for sig in sequence_sigs: 214 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count) / 215 float(max_count)) 216 217 # return the relative signature info in the specified order 218 sig_amounts = [] 219 for sig in self._signatures: 220 sig_amounts.append(sequence_sigs[sig]) 221 222 return sig_amounts
223