Package Bio :: Package NeuralNetwork :: Package Gene :: Module Signature
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Signature

  1  """Find and deal with signatures in biological sequence data. 
  2   
  3  In addition to representing sequences according to motifs (see Motif.py 
  4  for more information), we can also use Signatures, which are conserved 
  5  regions that are not necessarily consecutive. This may be useful in 
  6  the case of very diverged sequences, where signatures may pick out 
  7  important conservation that can't be found by motifs (hopefully!). 
  8  """ 
  9  # biopython 
 10  from Bio.Alphabet import _verify_alphabet 
 11  from Bio.Seq import Seq 
 12   
 13  # local stuff 
 14  from Pattern import PatternRepository 
 15   
 16   
17 -class SignatureFinder(object):
18 """Find Signatures in a group of sequence records. 19 20 In this simple implementation, signatures are just defined as a 21 two motifs separated by a gap. We need something a lot smarter than 22 this to find more complicated signatures. 23 """
24 - def __init__(self, alphabet_strict=1):
25 """Initialize a finder to get signatures. 26 27 Arguments: 28 29 o alphabet_strict - Specify whether signatures should be required 30 to have all letters in the signature be consistent with the 31 alphabet of the original sequence. This requires that all Seqs 32 used have a consistent alphabet. This helps protect against getting 33 useless signatures full of ambiguity signals. 34 """ 35 self._alphabet_strict = alphabet_strict
36
37 - def find(self, seq_records, signature_size, max_gap):
38 """Find all signatures in a group of sequences. 39 40 Arguments: 41 42 o seq_records - A list of SeqRecord objects we'll use the sequences 43 from to find signatures. 44 45 o signature_size - The size of each half of a signature (ie. if this 46 is set at 3, then the signature could be AGC-----GAC) 47 48 o max_gap - The maximum gap size between two parts of a signature. 49 """ 50 sig_info = self._get_signature_dict(seq_records, signature_size, 51 max_gap) 52 53 return PatternRepository(sig_info)
54
55 - def _get_signature_dict(self, seq_records, sig_size, max_gap):
56 """Return a dictionary with all signatures and their counts. 57 58 This internal function does all of the hard work for the 59 find_signatures function. 60 """ 61 if self._alphabet_strict: 62 alphabet = seq_records[0].seq.alphabet 63 else: 64 alphabet = None 65 66 # loop through all records to find signatures 67 all_sigs = {} 68 for seq_record in seq_records: 69 # if we are working with alphabets, make sure we are consistent 70 if alphabet is not None: 71 assert seq_record.seq.alphabet == alphabet, \ 72 "Working with alphabet %s and got %s" % \ 73 (alphabet, seq_record.seq.alphabet) 74 75 # now start finding signatures in the sequence 76 largest_sig_size = sig_size * 2 + max_gap 77 for start in range(len(seq_record.seq) - (largest_sig_size - 1)): 78 # find the first part of the signature 79 first_sig = str(seq_record.seq[start:start + sig_size]) 80 81 # now find all of the second parts of the signature 82 for second in range(start + 1, (start + 1) + max_gap): 83 second_sig = str(seq_record.seq[second: second + sig_size]) 84 85 # if we are being alphabet strict, make sure both parts 86 # of the sig fall within the specified alphabet 87 if alphabet is not None: 88 first_seq = Seq(first_sig, alphabet) 89 second_seq = Seq(second_sig, alphabet) 90 if _verify_alphabet(first_seq) \ 91 and _verify_alphabet(second_seq): 92 all_sigs = self._add_sig(all_sigs, 93 (first_sig, second_sig)) 94 95 # if we are not being strict, just add the motif 96 else: 97 all_sigs = self._add_sig(all_sigs, 98 (first_sig, second_sig)) 99 100 return all_sigs
101
102 - def _add_sig(self, sig_dict, sig_to_add):
103 """Add a signature to the given dictionary. 104 """ 105 # incrememt the count of the signature if it is already present 106 if sig_to_add in sig_dict: 107 sig_dict[sig_to_add] += 1 108 # otherwise add it to the dictionary 109 else: 110 sig_dict[sig_to_add] = 1 111 112 return sig_dict
113 114
115 -class SignatureCoder(object):
116 """Convert a Sequence into its signature representatives. 117 118 This takes a sequence and a set of signatures, and converts the 119 sequence into a list of numbers representing the relative amounts 120 each signature is seen in the sequence. This allows a sequence to 121 serve as input into a neural network. 122 """
123 - def __init__(self, signatures, max_gap):
124 """Initialize with the signatures to look for. 125 126 Arguments: 127 128 o signatures - A complete list of signatures, in order, that 129 are to be searched for in the sequences. The signatures should 130 be represented as a tuple of (first part of the signature, 131 second_part of the signature) -- ('GATC', 'GATC'). 132 133 o max_gap - The maximum gap we can have between the two 134 elements of the signature. 135 """ 136 self._signatures = signatures 137 self._max_gap = max_gap 138 139 # check to be sure the signatures are all the same size 140 # only do this if we actually have signatures 141 if len(self._signatures) > 0: 142 first_sig_size = len(self._signatures[0][0]) 143 second_sig_size = len(self._signatures[0][1]) 144 145 assert first_sig_size == second_sig_size, \ 146 "Ends of the signature do not match: %s" \ 147 % self._signatures[0] 148 149 for sig in self._signatures: 150 assert len(sig[0]) == first_sig_size, \ 151 "Got first part of signature %s, expected size %s" % \ 152 (sig[0], first_sig_size) 153 assert len(sig[1]) == second_sig_size, \ 154 "Got second part of signature %s, expected size %s" % \ 155 (sig[1], second_sig_size)
156
157 - def representation(self, sequence):
158 """Convert a sequence into a representation of its signatures. 159 160 Arguments: 161 162 o sequence - A Seq object we are going to convert into a set of 163 signatures. 164 165 Returns: 166 A list of relative signature representations. Each item in the 167 list corresponds to the signature passed in to the initializer and 168 is the number of times that the signature was found, divided by the 169 total number of signatures found in the sequence. 170 """ 171 # check to be sure we have signatures to deal with, 172 # otherwise just return an empty list 173 if len(self._signatures) == 0: 174 return [] 175 176 # initialize a dictionary to hold the signature counts 177 sequence_sigs = {} 178 for sig in self._signatures: 179 sequence_sigs[sig] = 0 180 181 # get a list of all of the first parts of the signatures 182 all_first_sigs = [] 183 for sig_start, sig_end in self._signatures: 184 all_first_sigs.append(sig_start) 185 186 # count all of the signatures we are looking for in the sequence 187 sig_size = len(self._signatures[0][0]) 188 smallest_sig_size = sig_size * 2 189 190 for start in range(len(sequence) - (smallest_sig_size - 1)): 191 # if the first part matches any of the signatures we are looking 192 # for, then expand out to look for the second part 193 first_sig = str(sequence[start:start + sig_size]) 194 if first_sig in all_first_sigs: 195 for second in range(start + sig_size, 196 (start + sig_size + 1) + self._max_gap): 197 second_sig = str(sequence[second:second + sig_size]) 198 199 # if we find the motif, increase the counts for it 200 if (first_sig, second_sig) in sequence_sigs: 201 sequence_sigs[(first_sig, second_sig)] += 1 202 203 # -- normalize the signature info to go between zero and one 204 min_count = min(sequence_sigs.values()) 205 max_count = max(sequence_sigs.values()) 206 207 # as long as we have some signatures present, normalize them 208 # otherwise we'll just return 0 for everything 209 if max_count > 0: 210 for sig in sequence_sigs: 211 sequence_sigs[sig] = (float(sequence_sigs[sig] - min_count) 212 / float(max_count)) 213 214 # return the relative signature info in the specified order 215 sig_amounts = [] 216 for sig in self._signatures: 217 sig_amounts.append(sequence_sigs[sig]) 218 219 return sig_amounts
220