Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  7   
  8  This class contains Schema which deal with Motifs and Signatures at 
  9  a higher level, by introducing "don't care" (ambiguity) symbols into 
 10  the sequences. For instance, you could combine the following Motifs: 
 11   
 12  'GATC', 'GATG', 'GATG', 'GATT' 
 13   
 14  as all falling under a schema like 'GAT*', where the star indicates a 
 15  character can be anything. This helps us condense a whole ton of 
 16  motifs or signatures. 
 17  """ 
 18  # standard modules 
 19  from __future__ import print_function 
 20   
 21  import random 
 22  import re 
 23   
 24  from Bio._py3k import range 
 25   
 26  from Bio import Alphabet 
 27  from Bio.Seq import MutableSeq 
 28   
 29  # neural network libraries 
 30  from .Pattern import PatternRepository 
 31   
 32  # genetic algorithm libraries 
 33  from Bio.GA import Organism 
 34  from Bio.GA.Evolver import GenerationEvolver 
 35  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 36  from Bio.GA.Crossover.Point import SinglePointCrossover 
 37  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 38  from Bio.GA.Selection.Tournament import TournamentSelection 
 39  from Bio.GA.Selection.Diversity import DiversitySelection 
 40   
 41   
42 -class Schema(object):
43 """Deal with motifs that have ambiguity characters in it. 44 45 This motif class allows specific ambiguity characters and tries to 46 speed up finding motifs using regular expressions. 47 48 This is likely to be a replacement for the Schema representation, 49 since it allows multiple ambiguity characters to be used. 50 """ 51
52 - def __init__(self, ambiguity_info):
53 """Initialize with ambiguity information. 54 55 Arguments: 56 - ambiguity_info - A dictionary which maps letters in the motifs to 57 the ambiguous characters which they might represent. For example, 58 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G. 59 All letters in the motif must be represented in the ambiguity_info 60 dictionary. 61 62 """ 63 self._ambiguity_info = ambiguity_info 64 65 # a cache of all encoded motifs 66 self._motif_cache = {}
67
68 - def encode_motif(self, motif):
69 """Encode the passed motif as a regular expression pattern object. 70 71 Arguments: 72 - motif - The motif we want to encode. This should be a string. 73 74 Returns: 75 A compiled regular expression pattern object that can be used 76 for searching strings. 77 78 """ 79 regexp_string = "" 80 81 for motif_letter in motif: 82 try: 83 letter_matches = self._ambiguity_info[motif_letter] 84 except KeyError: 85 raise KeyError("No match information for letter %s" 86 % motif_letter) 87 88 if len(letter_matches) > 1: 89 regexp_match = "[" + letter_matches + "]" 90 elif len(letter_matches) == 1: 91 regexp_match = letter_matches 92 else: 93 raise ValueError("Unexpected match information %s" 94 % letter_matches) 95 96 regexp_string += regexp_match 97 98 return re.compile(regexp_string)
99
100 - def find_ambiguous(self, motif):
101 """Return the location of ambiguous items in the motif. 102 103 This just checks through the motif and compares each letter 104 against the ambiguity information. If a letter stands for multiple 105 items, it is ambiguous. 106 """ 107 ambig_positions = [] 108 for motif_letter_pos in range(len(motif)): 109 motif_letter = motif[motif_letter_pos] 110 try: 111 letter_matches = self._ambiguity_info[motif_letter] 112 except KeyError: 113 raise KeyError("No match information for letter %s" 114 % motif_letter) 115 116 if len(letter_matches) > 1: 117 ambig_positions.append(motif_letter_pos) 118 119 return ambig_positions
120
121 - def num_ambiguous(self, motif):
122 """Return the number of ambiguous letters in a given motif.""" 123 ambig_positions = self.find_ambiguous(motif) 124 return len(ambig_positions)
125
126 - def find_matches(self, motif, query):
127 """Return all non-overlapping motif matches in the query string. 128 129 This utilizes the regular expression findall function, and will 130 return a list of all non-overlapping occurrences in query that 131 match the ambiguous motif. 132 """ 133 try: 134 motif_pattern = self._motif_cache[motif] 135 except KeyError: 136 motif_pattern = self.encode_motif(motif) 137 self._motif_cache[motif] = motif_pattern 138 139 return motif_pattern.findall(query)
140
141 - def num_matches(self, motif, query):
142 """Find the number of non-overlapping times motif occurs in query.""" 143 all_matches = self.find_matches(motif, query) 144 return len(all_matches)
145
146 - def all_unambiguous(self):
147 """Return a listing of all unambiguous letters allowed in motifs.""" 148 all_letters = sorted(self._ambiguity_info) 149 unambig_letters = [] 150 151 for letter in all_letters: 152 possible_matches = self._ambiguity_info[letter] 153 if len(possible_matches) == 1: 154 unambig_letters.append(letter) 155 156 return unambig_letters
157 158 # --- helper classes and functions for the default SchemaFinder 159 160 # -- Alphabets 161 162
163 -class SchemaDNAAlphabet(Alphabet.Alphabet):
164 """Alphabet of a simple Schema for DNA sequences. 165 166 This defines a simple alphabet for DNA sequences that has a single 167 character which can match any other character: 168 169 - `G`, `A`, `T`, `C` - The standard unambiguous DNA alphabet. 170 - `*` - Any letter 171 172 """ 173 174 letters = ["G", "A", "T", "C", "*"] 175 176 alphabet_matches = {"G": "G", 177 "A": "A", 178 "T": "T", 179 "C": "C", 180 "*": "GATC"}
181 182 # -- GA schema finder 183 184
185 -class GeneticAlgorithmFinder(object):
186 """Find schemas using a genetic algorithm approach. 187 188 This approach to finding schema uses Genetic Algorithms to evolve 189 a set of schema and find the best schema for a specific set of 190 records. 191 192 The 'default' finder searches for ambiguous DNA elements. This 193 can be overridden easily by creating a GeneticAlgorithmFinder 194 with a different alphabet. 195 """ 196
197 - def __init__(self, alphabet=SchemaDNAAlphabet()):
198 """Initialize a finder to get schemas using Genetic Algorithms. 199 200 Arguments: 201 - alphabet -- The alphabet which specifies the contents of the 202 schemas we'll be generating. This alphabet must contain the 203 attribute 'alphabet_matches', which is a dictionary specifying 204 the potential ambiguities of each letter in the alphabet. These 205 ambiguities will be used in building up the schema. 206 207 """ 208 self.alphabet = alphabet 209 210 self.initial_population = 500 211 self.min_generations = 10 212 213 self._set_up_genetic_algorithm()
214
216 """Set up the genetic algorithm parameters (PRIVATE). 217 218 This functions sole job is to set up the different genetic 219 algorithm functionality. Since this can be quite complicated, this 220 allows cusotmizablity of all of the parameters. If you want to 221 customize specially, you can inherit from this class and override 222 this function. 223 """ 224 self.motif_generator = RandomMotifGenerator(self.alphabet) 225 226 self.mutator = SinglePositionMutation(mutation_rate=0.1) 227 self.crossover = SinglePointCrossover(crossover_prob=0.25) 228 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 229 4) 230 self.base_selector = TournamentSelection(self.mutator, self.crossover, 231 self.repair, 2) 232 self.selector = DiversitySelection(self.base_selector, 233 self.motif_generator.random_motif)
234
235 - def find_schemas(self, fitness, num_schemas):
236 """Find the given number of unique schemas using a genetic algorithm. 237 238 Arguments: 239 - fitness - A callable object (ie. function) which will evaluate 240 the fitness of a motif. 241 - num_schemas - The number of unique schemas with good fitness 242 that we want to generate. 243 244 """ 245 start_population = \ 246 Organism.function_population(self.motif_generator.random_motif, 247 self.initial_population, 248 fitness) 249 finisher = SimpleFinisher(num_schemas, self.min_generations) 250 251 # set up the evolver and do the evolution 252 evolver = GenerationEvolver(start_population, self.selector) 253 evolved_pop = evolver.evolve(finisher.is_finished) 254 255 # convert the evolved population into a PatternRepository 256 schema_info = {} 257 for org in evolved_pop: 258 # convert the Genome from a MutableSeq to a Seq so that 259 # the schemas are just strings (and not array("c")s) 260 seq_genome = org.genome.toseq() 261 schema_info[str(seq_genome)] = org.fitness 262 263 return PatternRepository(schema_info)
264 265 # -- fitness classes 266 267
268 -class DifferentialSchemaFitness(object):
269 """Calculate fitness for schemas that differentiate between sequences.""" 270
271 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
272 """Initialize with different sequences to evaluate. 273 274 Arguments: 275 - positive_seq - A list of SeqRecord objects which are the 'positive' 276 sequences -- the ones we want to select for. 277 - negative_seq - A list of SeqRecord objects which are the 'negative' 278 sequences that we want to avoid selecting. 279 - schema_evaluator - An Schema class which can be used to 280 evaluate find motif matches in sequences. 281 282 """ 283 self._pos_seqs = positive_seqs 284 self._neg_seqs = negative_seqs 285 self._schema_eval = schema_evaluator
286
287 - def calculate_fitness(self, genome):
288 """Calculate the fitness for a given schema. 289 290 Fitness is specified by the number of occurrences of the schema in 291 the positive sequences minus the number of occurrences in the 292 negative examples. 293 294 This fitness is then modified by multiplying by the length of the 295 schema and then dividing by the number of ambiguous characters in 296 the schema. This helps select for schema which are longer and have 297 less redundancy. 298 """ 299 # convert the genome into a string 300 seq_motif = genome.toseq() 301 motif = str(seq_motif) 302 303 # get the counts in the positive examples 304 num_pos = 0 305 for seq_record in self._pos_seqs: 306 cur_counts = self._schema_eval.num_matches(motif, 307 str(seq_record.seq)) 308 num_pos += cur_counts 309 310 # get the counts in the negative examples 311 num_neg = 0 312 for seq_record in self._neg_seqs: 313 cur_counts = self._schema_eval.num_matches(motif, 314 str(seq_record.seq)) 315 316 num_neg += cur_counts 317 318 num_ambiguous = self._schema_eval.num_ambiguous(motif) 319 # weight the ambiguous stuff more highly 320 num_ambiguous = pow(2.0, num_ambiguous) 321 # increment num ambiguous to prevent division by zero errors. 322 num_ambiguous += 1 323 324 motif_size = len(motif) 325 motif_size = motif_size * 4.0 326 327 discerning_power = num_pos - num_neg 328 329 diff = (discerning_power * motif_size) / float(num_ambiguous) 330 return diff
331 332
333 -class MostCountSchemaFitness(object):
334 """Calculate a fitness giving weight to schemas that match many times. 335 336 This fitness function tries to maximize schemas which are found many 337 times in a group of sequences. 338 """ 339
340 - def __init__(self, seq_records, schema_evaluator):
341 """Initialize with sequences to evaluate. 342 343 Arguments: 344 - seq_records -- A set of SeqRecord objects which we use to 345 calculate the fitness. 346 - schema_evaluator - An Schema class which can be used to 347 evaluate find motif matches in sequences. 348 349 """ 350 self._records = seq_records 351 self._evaluator = schema_evaluator
352
353 - def calculate_fitness(self, genome):
354 """Calculate the fitness of a genome based on schema matches. 355 356 This bases the fitness of a genome completely on the number of times 357 it matches in the set of seq_records. Matching more times gives a 358 better fitness 359 """ 360 # convert the genome into a string 361 seq_motif = genome.toseq() 362 motif = str(seq_motif) 363 364 # find the number of times the genome matches 365 num_times = 0 366 for seq_record in self._records: 367 cur_counts = self._evaluator.num_matches(motif, 368 str(seq_record.seq)) 369 num_times += cur_counts 370 371 return num_times
372 373 374 # -- Helper classes
375 -class RandomMotifGenerator(object):
376 """Generate a random motif within given parameters.""" 377
378 - def __init__(self, alphabet, min_size=12, max_size=17):
379 """Initialize with the motif parameters. 380 381 Arguments: 382 - alphabet - An alphabet specifying what letters can be inserted in 383 a motif. 384 - min_size, max_size - Specify the range of sizes for motifs. 385 386 """ 387 self._alphabet = alphabet 388 self._min_size = min_size 389 self._max_size = max_size
390
391 - def random_motif(self):
392 """Create a random motif within the given parameters. 393 394 This returns a single motif string with letters from the given 395 alphabet. The size of the motif will be randomly chosen between 396 max_size and min_size. 397 """ 398 motif_size = random.randrange(self._min_size, self._max_size) 399 400 motif = "" 401 for letter_num in range(motif_size): 402 cur_letter = random.choice(self._alphabet.letters) 403 motif += cur_letter 404 405 return MutableSeq(motif, self._alphabet)
406 407
408 -class SimpleFinisher(object):
409 """Determine when we are done evolving motifs. 410 411 This takes the very simple approach of halting evolution when the 412 GA has proceeded for a specified number of generations and has 413 a given number of unique schema with positive fitness. 414 """ 415
416 - def __init__(self, num_schemas, min_generations=100):
417 """Initialize the finisher with its parameters. 418 419 Arguments: 420 - num_schemas -- the number of useful (positive fitness) schemas 421 we want to generate. 422 - min_generations -- The minimum number of generations to allow 423 the GA to proceed. 424 425 """ 426 self.num_generations = 0 427 428 self.num_schemas = num_schemas 429 self.min_generations = min_generations
430
431 - def is_finished(self, organisms):
432 """Determine when we can stop evolving the population.""" 433 self.num_generations += 1 434 # print "generation %s" % self.num_generations 435 436 if self.num_generations >= self.min_generations: 437 all_seqs = [] 438 for org in organisms: 439 if org.fitness > 0: 440 if org.genome not in all_seqs: 441 all_seqs.append(org.genome) 442 443 if len(all_seqs) >= self.num_schemas: 444 return 1 445 446 return 0
447 # --- 448 449
450 -class SchemaFinder(object):
451 """Find schema in a set of sequences using a genetic algorithm approach. 452 453 Finding good schemas is very difficult because it takes forever to 454 enumerate all of the potential schemas. This finder using a genetic 455 algorithm approach to evolve good schema which match many times in 456 a set of sequences. 457 458 The default implementation of the finder is ready to find schemas 459 in a set of DNA sequences, but the finder can be customized to deal 460 with any type of data. 461 """ 462
463 - def __init__(self, num_schemas=100, 464 schema_finder=GeneticAlgorithmFinder()):
465 """Initialize the Schema Finder with its parameters. 466 467 Arguments: 468 - num_schemas -- the number of useful (positive fitness) schemas 469 we want to generate. 470 471 """ 472 self.num_schemas = num_schemas 473 self._finder = schema_finder 474 475 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
476
477 - def find(self, seq_records):
478 """Find well-represented schemas in the given set of SeqRecords.""" 479 fitness_evaluator = MostCountSchemaFitness(seq_records, 480 self.evaluator) 481 482 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 483 self.num_schemas)
484
485 - def find_differences(self, first_records, second_records):
486 """Find schemas which differentiate between the two sets of SeqRecords.""" 487 fitness_evaluator = DifferentialSchemaFitness(first_records, 488 second_records, 489 self.evaluator) 490 491 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 492 self.num_schemas)
493 494
495 -class SchemaCoder(object):
496 """Convert a sequence into a representation of ambiguous motifs (schemas). 497 498 This takes a sequence, and returns the number of times specified 499 motifs are found in the sequence. This lets you represent a sequence 500 as just a count of (possibly ambiguous) motifs. 501 """ 502
503 - def __init__(self, schemas, ambiguous_converter):
504 """Initialize the coder to convert sequences. 505 506 Arguments: 507 - schema - A list of all of the schemas we want to search for 508 in input sequences. 509 - ambiguous_converter - An Schema class which can be 510 used to convert motifs into regular expressions for searching. 511 512 """ 513 self._schemas = schemas 514 self._converter = ambiguous_converter
515
516 - def representation(self, sequence):
517 """Represent the given input sequence as a bunch of motif counts. 518 519 Arguments: 520 - sequence - A Bio.Seq object we are going to represent as schemas. 521 522 This takes the sequence, searches for the motifs within it, and then 523 returns counts specifying the relative number of times each motifs 524 was found. The frequencies are in the order the original motifs were 525 passed into the initializer. 526 """ 527 schema_counts = [] 528 529 for schema in self._schemas: 530 num_counts = self._converter.num_matches(schema, str(sequence)) 531 schema_counts.append(num_counts) 532 533 # normalize the counts to go between zero and one 534 min_count = 0 535 max_count = max(schema_counts) 536 537 # only normalize if we've actually found something, otherwise 538 # we'll just return 0 for everything 539 if max_count > 0: 540 for count_num in range(len(schema_counts)): 541 schema_counts[count_num] = (float(schema_counts[count_num]) - 542 float(min_count)) / float(max_count) 543 544 return schema_counts
545 546
547 -def matches_schema(pattern, schema, ambiguity_character='*'):
548 """Determine whether or not the given pattern matches the schema. 549 550 Arguments: 551 - pattern - A string representing the pattern we want to check for 552 matching. This pattern can contain ambiguity characters (which are 553 assumed to be the same as those in the schema). 554 - schema - A string schema with ambiguity characters. 555 - ambiguity_character - The character used for ambiguity in the schema. 556 557 """ 558 if len(pattern) != len(schema): 559 return 0 560 561 # check each position, and return a non match if the schema and pattern 562 # are non ambiguous and don't match 563 for pos in range(len(pattern)): 564 if schema[pos] != ambiguity_character and \ 565 pattern[pos] != ambiguity_character and \ 566 pattern[pos] != schema[pos]: 567 568 return 0 569 570 return 1
571 572
573 -class SchemaFactory(object):
574 """Generate Schema from inputs of Motifs or Signatures.""" 575
576 - def __init__(self, ambiguity_symbol='*'):
577 """Initialize the SchemaFactory. 578 579 Arguments: 580 - ambiguity_symbol -- The symbol to use when specifying that 581 a position is arbitrary. 582 583 """ 584 self._ambiguity_symbol = ambiguity_symbol
585
586 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
587 """Generate schema from a list of motifs. 588 589 Arguments: 590 - motif_repository - A MotifRepository class that has all of the 591 motifs we want to convert to Schema. 592 - motif_percent - The percentage of motifs in the motif bank which 593 should be matches. We'll try to create schema that match this 594 percentage of motifs. 595 - num_ambiguous - The number of ambiguous characters to include 596 in each schema. The positions of these ambiguous characters will 597 be randomly selected. 598 599 """ 600 # get all of the motifs we can deal with 601 all_motifs = motif_repository.get_top_percentage(motif_percent) 602 603 # start building up schemas 604 schema_info = {} 605 # continue until we've built schema matching the desired percentage 606 # of motifs 607 total_count = self._get_num_motifs(motif_repository, all_motifs) 608 matched_count = 0 609 assert total_count > 0, "Expected to have motifs to match" 610 while (float(matched_count) / float(total_count)) < motif_percent: 611 new_schema, matching_motifs = \ 612 self._get_unique_schema(list(schema_info.keys()), 613 all_motifs, num_ambiguous) 614 615 # get the number of counts for the new schema and clean up 616 # the motif list 617 schema_counts = 0 618 for motif in matching_motifs: 619 # get the counts for the motif 620 schema_counts += motif_repository.count(motif) 621 622 # remove the motif from the motif list since it is already 623 # represented by this schema 624 all_motifs.remove(motif) 625 626 # all the schema info 627 schema_info[new_schema] = schema_counts 628 629 matched_count += schema_counts 630 631 # print "percentage:", float(matched_count) / float(total_count) 632 633 return PatternRepository(schema_info)
634
635 - def _get_num_motifs(self, repository, motif_list):
636 """Return the number of motif counts for the list of motifs.""" 637 motif_count = 0 638 for motif in motif_list: 639 motif_count += repository.count(motif) 640 641 return motif_count
642
643 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
644 """Retrieve a unique schema from a motif. 645 646 We don't want to end up with schema that match the same thing, 647 since this could lead to ambiguous results, and be messy. This 648 tries to create schema, and checks that they do not match any 649 currently existing schema. 650 """ 651 # create a schema starting with a random motif 652 # we'll keep doing this until we get a completely new schema that 653 # doesn't match any old schema 654 num_tries = 0 655 656 while True: 657 # pick a motif to work from and make a schema from it 658 cur_motif = random.choice(motif_list) 659 660 num_tries += 1 661 662 new_schema, matching_motifs = \ 663 self._schema_from_motif(cur_motif, motif_list, 664 num_ambiguous) 665 666 has_match = 0 667 for old_schema in cur_schemas: 668 if matches_schema(new_schema, old_schema, 669 self._ambiguity_symbol): 670 has_match = 1 671 672 # if the schema doesn't match any other schema we've got 673 # a good one 674 if not(has_match): 675 break 676 677 # check for big loops in which we can't find a new schema 678 assert num_tries < 150, \ 679 "Could not generate schema in %s tries from %s with %s" \ 680 % (num_tries, motif_list, cur_schemas) 681 682 return new_schema, matching_motifs
683
684 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
685 """Create a schema from a given starting motif. 686 687 Arguments: 688 - motif - A motif with the pattern we will start from. 689 - motif_list - The total motifs we have.to match to. 690 - num_ambiguous - The number of ambiguous characters that should 691 be present in the schema. 692 693 Returns: 694 - A string representing the newly generated schema. 695 - A list of all of the motifs in motif_list that match the schema. 696 697 """ 698 assert motif in motif_list, \ 699 "Expected starting motif present in remaining motifs." 700 701 # convert random positions in the motif to ambiguous characters 702 # convert the motif into a list of characters so we can manipulate it 703 new_schema_list = list(motif) 704 for add_ambiguous in range(num_ambiguous): 705 # add an ambiguous position in a new place in the motif 706 while True: 707 ambig_pos = random.choice(list(range(len(new_schema_list)))) 708 709 # only add a position if it isn't already ambiguous 710 # otherwise, we'll try again 711 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 712 new_schema_list[ambig_pos] = self._ambiguity_symbol 713 break 714 715 # convert the schema back to a string 716 new_schema = ''.join(new_schema_list) 717 718 # get the motifs that the schema matches 719 matched_motifs = [] 720 for motif in motif_list: 721 if matches_schema(motif, new_schema, self._ambiguity_symbol): 722 matched_motifs.append(motif) 723 724 return new_schema, matched_motifs
725
726 - def from_signatures(self, signature_repository, num_ambiguous):
727 """Initialize from a signature repository (not implemented yet).""" 728 raise NotImplementedError("Still need to code this.")
729