Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  7   
  8  This class contains Schema which deal with Motifs and Signatures at 
  9  a higher level, by introducing \`don't care\` (ambiguity) symbols into 
 10  the sequences. For instance, you could combine the following Motifs: 
 11   
 12  'GATC', 'GATG', 'GATG', 'GATT' 
 13   
 14  as all falling under a schema like 'GAT*', where the star indicates a 
 15  character can be anything. This helps us condense a whole ton of 
 16  motifs or signatures. 
 17  """ 
 18  # standard modules 
 19  from __future__ import print_function 
 20   
 21  import random 
 22  import re 
 23   
 24  from Bio._py3k import range 
 25   
 26  from Bio import Alphabet 
 27  from Bio.Seq import MutableSeq 
 28   
 29  # neural network libraries 
 30  from .Pattern import PatternRepository 
 31   
 32  # genetic algorithm libraries 
 33  from Bio.GA import Organism 
 34  from Bio.GA.Evolver import GenerationEvolver 
 35  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 36  from Bio.GA.Crossover.Point import SinglePointCrossover 
 37  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 38  from Bio.GA.Selection.Tournament import TournamentSelection 
 39  from Bio.GA.Selection.Diversity import DiversitySelection 
 40   
 41  __docformat__ = "restructuredtext en" 
 42   
 43   
44 -class Schema(object):
45 """Deal with motifs that have ambiguity characters in it. 46 47 This motif class allows specific ambiguity characters and tries to 48 speed up finding motifs using regular expressions. 49 50 This is likely to be a replacement for the Schema representation, 51 since it allows multiple ambiguity characters to be used. 52 """
53 - def __init__(self, ambiguity_info):
54 """Initialize with ambiguity information. 55 56 Arguments: 57 58 o ambiguity_info - A dictionary which maps letters in the motifs to 59 the ambiguous characters which they might represent. For example, 60 {'R' : 'AG'} specifies that Rs in the motif can match an A or a G. 61 All letters in the motif must be represented in the ambiguity_info 62 dictionary. 63 """ 64 self._ambiguity_info = ambiguity_info 65 66 # a cache of all encoded motifs 67 self._motif_cache = {}
68
69 - def encode_motif(self, motif):
70 """Encode the passed motif as a regular expression pattern object. 71 72 Arguments: 73 74 o motif - The motif we want to encode. This should be a string. 75 76 Returns: 77 A compiled regular expression pattern object that can be used 78 for searching strings. 79 """ 80 regexp_string = "" 81 82 for motif_letter in motif: 83 try: 84 letter_matches = self._ambiguity_info[motif_letter] 85 except KeyError: 86 raise KeyError("No match information for letter %s" 87 % motif_letter) 88 89 if len(letter_matches) > 1: 90 regexp_match = "[" + letter_matches + "]" 91 elif len(letter_matches) == 1: 92 regexp_match = letter_matches 93 else: 94 raise ValueError("Unexpected match information %s" 95 % letter_matches) 96 97 regexp_string += regexp_match 98 99 return re.compile(regexp_string)
100
101 - def find_ambiguous(self, motif):
102 """Return the location of ambiguous items in the motif. 103 104 This just checks through the motif and compares each letter 105 against the ambiguity information. If a letter stands for multiple 106 items, it is ambiguous. 107 """ 108 ambig_positions = [] 109 for motif_letter_pos in range(len(motif)): 110 motif_letter = motif[motif_letter_pos] 111 try: 112 letter_matches = self._ambiguity_info[motif_letter] 113 except KeyError: 114 raise KeyError("No match information for letter %s" 115 % motif_letter) 116 117 if len(letter_matches) > 1: 118 ambig_positions.append(motif_letter_pos) 119 120 return ambig_positions
121
122 - def num_ambiguous(self, motif):
123 """Return the number of ambiguous letters in a given motif. 124 """ 125 ambig_positions = self.find_ambiguous(motif) 126 return len(ambig_positions)
127
128 - def find_matches(self, motif, query):
129 """Return all non-overlapping motif matches in the query string. 130 131 This utilizes the regular expression findall function, and will 132 return a list of all non-overlapping occurances in query that 133 match the ambiguous motif. 134 """ 135 try: 136 motif_pattern = self._motif_cache[motif] 137 except KeyError: 138 motif_pattern = self.encode_motif(motif) 139 self._motif_cache[motif] = motif_pattern 140 141 return motif_pattern.findall(query)
142
143 - def num_matches(self, motif, query):
144 """Find the number of non-overlapping times motif occurs in query. 145 """ 146 all_matches = self.find_matches(motif, query) 147 return len(all_matches)
148
149 - def all_unambiguous(self):
150 """Return a listing of all unambiguous letters allowed in motifs. 151 """ 152 all_letters = sorted(self._ambiguity_info) 153 unambig_letters = [] 154 155 for letter in all_letters: 156 possible_matches = self._ambiguity_info[letter] 157 if len(possible_matches) == 1: 158 unambig_letters.append(letter) 159 160 return unambig_letters
161 162 # --- helper classes and functions for the default SchemaFinder 163 164 # -- Alphabets 165 166
167 -class SchemaDNAAlphabet(Alphabet.Alphabet):
168 """Alphabet of a simple Schema for DNA sequences. 169 170 This defines a simple alphabet for DNA sequences that has a single 171 character which can match any other character. 172 173 o G,A,T,C - The standard unambiguous DNA alphabet. 174 175 o * - Any letter 176 """ 177 letters = ["G", "A", "T", "C", "*"] 178 179 alphabet_matches = {"G": "G", 180 "A": "A", 181 "T": "T", 182 "C": "C", 183 "*": "GATC"}
184 185 # -- GA schema finder 186 187
188 -class GeneticAlgorithmFinder(object):
189 """Find schemas using a genetic algorithm approach. 190 191 This approach to finding schema uses Genetic Algorithms to evolve 192 a set of schema and find the best schema for a specific set of 193 records. 194 195 The 'default' finder searches for ambiguous DNA elements. This 196 can be overridden easily by creating a GeneticAlgorithmFinder 197 with a different alphabet. 198 """
199 - def __init__(self, alphabet=SchemaDNAAlphabet()):
200 """Initialize a finder to get schemas using Genetic Algorithms. 201 202 Arguments: 203 204 o alphabet -- The alphabet which specifies the contents of the 205 schemas we'll be generating. This alphabet must contain the 206 attribute 'alphabet_matches', which is a dictionary specifying 207 the potential ambiguities of each letter in the alphabet. These 208 ambiguities will be used in building up the schema. 209 """ 210 self.alphabet = alphabet 211 212 self.initial_population = 500 213 self.min_generations = 10 214 215 self._set_up_genetic_algorithm()
216
218 """Overrideable function to set up the genetic algorithm parameters. 219 220 This functions sole job is to set up the different genetic 221 algorithm functionality. Since this can be quite complicated, this 222 allows cusotmizablity of all of the parameters. If you want to 223 customize specially, you can inherit from this class and override 224 this function. 225 """ 226 self.motif_generator = RandomMotifGenerator(self.alphabet) 227 228 self.mutator = SinglePositionMutation(mutation_rate=0.1) 229 self.crossover = SinglePointCrossover(crossover_prob=0.25) 230 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 231 4) 232 self.base_selector = TournamentSelection(self.mutator, self.crossover, 233 self.repair, 2) 234 self.selector = DiversitySelection(self.base_selector, 235 self.motif_generator.random_motif)
236
237 - def find_schemas(self, fitness, num_schemas):
238 """Find the given number of unique schemas using a genetic algorithm 239 240 Arguments: 241 242 o fitness - A callable object (ie. function) which will evaluate 243 the fitness of a motif. 244 245 o num_schemas - The number of unique schemas with good fitness 246 that we want to generate. 247 """ 248 start_population = \ 249 Organism.function_population(self.motif_generator.random_motif, 250 self.initial_population, 251 fitness) 252 finisher = SimpleFinisher(num_schemas, self.min_generations) 253 254 # set up the evolver and do the evolution 255 evolver = GenerationEvolver(start_population, self.selector) 256 evolved_pop = evolver.evolve(finisher.is_finished) 257 258 # convert the evolved population into a PatternRepository 259 schema_info = {} 260 for org in evolved_pop: 261 # convert the Genome from a MutableSeq to a Seq so that 262 # the schemas are just strings (and not array("c")s) 263 seq_genome = org.genome.toseq() 264 schema_info[str(seq_genome)] = org.fitness 265 266 return PatternRepository(schema_info)
267 268 # -- fitness classes 269 270
271 -class DifferentialSchemaFitness(object):
272 """Calculate fitness for schemas that differentiate between sequences. 273 """
274 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
275 """Initialize with different sequences to evaluate 276 277 Arguments: 278 279 o positive_seq - A list of SeqRecord objects which are the 'positive' 280 sequences -- the ones we want to select for. 281 282 o negative_seq - A list of SeqRecord objects which are the 'negative' 283 sequences that we want to avoid selecting. 284 285 o schema_evaluator - An Schema class which can be used to 286 evaluate find motif matches in sequences. 287 """ 288 self._pos_seqs = positive_seqs 289 self._neg_seqs = negative_seqs 290 self._schema_eval = schema_evaluator
291
292 - def calculate_fitness(self, genome):
293 """Calculate the fitness for a given schema. 294 295 Fitness is specified by the number of occurances of the schema in 296 the positive sequences minus the number of occurances in the 297 negative examples. 298 299 This fitness is then modified by multiplying by the length of the 300 schema and then dividing by the number of ambiguous characters in 301 the schema. This helps select for schema which are longer and have 302 less redundancy. 303 """ 304 # convert the genome into a string 305 seq_motif = genome.toseq() 306 motif = str(seq_motif) 307 308 # get the counts in the positive examples 309 num_pos = 0 310 for seq_record in self._pos_seqs: 311 cur_counts = self._schema_eval.num_matches(motif, 312 str(seq_record.seq)) 313 num_pos += cur_counts 314 315 # get the counts in the negative examples 316 num_neg = 0 317 for seq_record in self._neg_seqs: 318 cur_counts = self._schema_eval.num_matches(motif, 319 str(seq_record.seq)) 320 321 num_neg += cur_counts 322 323 num_ambiguous = self._schema_eval.num_ambiguous(motif) 324 # weight the ambiguous stuff more highly 325 num_ambiguous = pow(2.0, num_ambiguous) 326 # increment num ambiguous to prevent division by zero errors. 327 num_ambiguous += 1 328 329 motif_size = len(motif) 330 motif_size = motif_size * 4.0 331 332 discerning_power = num_pos - num_neg 333 334 diff = (discerning_power * motif_size) / float(num_ambiguous) 335 return diff
336 337
338 -class MostCountSchemaFitness(object):
339 """Calculate a fitness giving weight to schemas that match many times. 340 341 This fitness function tries to maximize schemas which are found many 342 times in a group of sequences. 343 """
344 - def __init__(self, seq_records, schema_evaluator):
345 """Initialize with sequences to evaluate. 346 347 Arguments: 348 349 o seq_records -- A set of SeqRecord objects which we use to 350 calculate the fitness. 351 352 o schema_evaluator - An Schema class which can be used to 353 evaluate find motif matches in sequences. 354 """ 355 self._records = seq_records 356 self._evaluator = schema_evaluator
357
358 - def calculate_fitness(self, genome):
359 """Calculate the fitness of a genome based on schema matches. 360 361 This bases the fitness of a genome completely on the number of times 362 it matches in the set of seq_records. Matching more times gives a 363 better fitness 364 """ 365 # convert the genome into a string 366 seq_motif = genome.toseq() 367 motif = str(seq_motif) 368 369 # find the number of times the genome matches 370 num_times = 0 371 for seq_record in self._records: 372 cur_counts = self._evaluator.num_matches(motif, 373 str(seq_record.seq)) 374 num_times += cur_counts 375 376 return num_times
377 378 379 # -- Helper classes
380 -class RandomMotifGenerator(object):
381 """Generate a random motif within given parameters. 382 """
383 - def __init__(self, alphabet, min_size=12, max_size=17):
384 """Initialize with the motif parameters. 385 386 Arguments: 387 388 o alphabet - An alphabet specifying what letters can be inserted in 389 a motif. 390 391 o min_size, max_size - Specify the range of sizes for motifs. 392 """ 393 self._alphabet = alphabet 394 self._min_size = min_size 395 self._max_size = max_size
396
397 - def random_motif(self):
398 """Create a random motif within the given parameters. 399 400 This returns a single motif string with letters from the given 401 alphabet. The size of the motif will be randomly chosen between 402 max_size and min_size. 403 """ 404 motif_size = random.randrange(self._min_size, self._max_size) 405 406 motif = "" 407 for letter_num in range(motif_size): 408 cur_letter = random.choice(self._alphabet.letters) 409 motif += cur_letter 410 411 return MutableSeq(motif, self._alphabet)
412 413
414 -class SimpleFinisher(object):
415 """Determine when we are done evolving motifs. 416 417 This takes the very simple approach of halting evolution when the 418 GA has proceeded for a specified number of generations and has 419 a given number of unique schema with positive fitness. 420 """
421 - def __init__(self, num_schemas, min_generations=100):
422 """Initialize the finisher with its parameters. 423 424 Arguments: 425 426 o num_schemas -- the number of useful (positive fitness) schemas 427 we want to generation 428 429 o min_generations -- The minimum number of generations to allow 430 the GA to proceed. 431 """ 432 self.num_generations = 0 433 434 self.num_schemas = num_schemas 435 self.min_generations = min_generations
436
437 - def is_finished(self, organisms):
438 """Determine when we can stop evolving the population. 439 """ 440 self.num_generations += 1 441 # print "generation %s" % self.num_generations 442 443 if self.num_generations >= self.min_generations: 444 all_seqs = [] 445 for org in organisms: 446 if org.fitness > 0: 447 if org.genome not in all_seqs: 448 all_seqs.append(org.genome) 449 450 if len(all_seqs) >= self.num_schemas: 451 return 1 452 453 return 0
454 # --- 455 456
457 -class SchemaFinder(object):
458 """Find schema in a set of sequences using a genetic algorithm approach. 459 460 Finding good schemas is very difficult because it takes forever to 461 enumerate all of the potential schemas. This finder using a genetic 462 algorithm approach to evolve good schema which match many times in 463 a set of sequences. 464 465 The default implementation of the finder is ready to find schemas 466 in a set of DNA sequences, but the finder can be customized to deal 467 with any type of data. 468 """
469 - def __init__(self, num_schemas=100, 470 schema_finder=GeneticAlgorithmFinder()):
471 self.num_schemas = num_schemas 472 self._finder = schema_finder 473 474 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
475
476 - def find(self, seq_records):
477 """Find well-represented schemas in the given set of SeqRecords. 478 """ 479 fitness_evaluator = MostCountSchemaFitness(seq_records, 480 self.evaluator) 481 482 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 483 self.num_schemas)
484
485 - def find_differences(self, first_records, second_records):
486 """Find schemas which differentiate between the two sets of SeqRecords. 487 """ 488 fitness_evaluator = DifferentialSchemaFitness(first_records, 489 second_records, 490 self.evaluator) 491 492 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 493 self.num_schemas)
494 495
496 -class SchemaCoder(object):
497 """Convert a sequence into a representation of ambiguous motifs (schemas). 498 499 This takes a sequence, and returns the number of times specified 500 motifs are found in the sequence. This lets you represent a sequence 501 as just a count of (possibly ambiguous) motifs. 502 """
503 - def __init__(self, schemas, ambiguous_converter):
504 """Initialize the coder to convert sequences 505 506 Arguments: 507 508 o schema - A list of all of the schemas we want to search for 509 in input sequences. 510 511 o ambiguous_converter - An Schema class which can be 512 used to convert motifs into regular expressions for searching. 513 """ 514 self._schemas = schemas 515 self._converter = ambiguous_converter
516
517 - def representation(self, sequence):
518 """Represent the given input sequence as a bunch of motif counts. 519 520 Arguments: 521 522 o sequence - A Bio.Seq object we are going to represent as schemas. 523 524 This takes the sequence, searches for the motifs within it, and then 525 returns counts specifying the relative number of times each motifs 526 was found. The frequencies are in the order the original motifs were 527 passed into the initializer. 528 """ 529 schema_counts = [] 530 531 for schema in self._schemas: 532 num_counts = self._converter.num_matches(schema, str(sequence)) 533 schema_counts.append(num_counts) 534 535 # normalize the counts to go between zero and one 536 min_count = 0 537 max_count = max(schema_counts) 538 539 # only normalize if we've actually found something, otherwise 540 # we'll just return 0 for everything 541 if max_count > 0: 542 for count_num in range(len(schema_counts)): 543 schema_counts[count_num] = (float(schema_counts[count_num]) - 544 float(min_count)) / float(max_count) 545 546 return schema_counts
547 548
549 -def matches_schema(pattern, schema, ambiguity_character='*'):
550 """Determine whether or not the given pattern matches the schema. 551 552 Arguments: 553 554 o pattern - A string representing the pattern we want to check for 555 matching. This pattern can contain ambiguity characters (which are 556 assumed to be the same as those in the schema). 557 558 o schema - A string schema with ambiguity characters. 559 560 o ambiguity_character - The character used for ambiguity in the schema. 561 """ 562 if len(pattern) != len(schema): 563 return 0 564 565 # check each position, and return a non match if the schema and pattern 566 # are non ambiguous and don't match 567 for pos in range(len(pattern)): 568 if schema[pos] != ambiguity_character and \ 569 pattern[pos] != ambiguity_character and \ 570 pattern[pos] != schema[pos]: 571 572 return 0 573 574 return 1
575 576
577 -class SchemaFactory(object):
578 """Generate Schema from inputs of Motifs or Signatures. 579 """
580 - def __init__(self, ambiguity_symbol='*'):
581 """Initialize the SchemaFactory 582 583 Arguments: 584 585 o ambiguity_symbol -- The symbol to use when specifying that 586 a position is arbitrary. 587 """ 588 self._ambiguity_symbol = ambiguity_symbol
589
590 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
591 """Generate schema from a list of motifs. 592 593 Arguments: 594 595 o motif_repository - A MotifRepository class that has all of the 596 motifs we want to convert to Schema. 597 598 o motif_percent - The percentage of motifs in the motif bank which 599 should be matches. We'll try to create schema that match this 600 percentage of motifs. 601 602 o num_ambiguous - The number of ambiguous characters to include 603 in each schema. The positions of these ambiguous characters will 604 be randomly selected. 605 """ 606 # get all of the motifs we can deal with 607 all_motifs = motif_repository.get_top_percentage(motif_percent) 608 609 # start building up schemas 610 schema_info = {} 611 # continue until we've built schema matching the desired percentage 612 # of motifs 613 total_count = self._get_num_motifs(motif_repository, all_motifs) 614 matched_count = 0 615 assert total_count > 0, "Expected to have motifs to match" 616 while (float(matched_count) / float(total_count)) < motif_percent: 617 new_schema, matching_motifs = \ 618 self._get_unique_schema(list(schema_info.keys()), 619 all_motifs, num_ambiguous) 620 621 # get the number of counts for the new schema and clean up 622 # the motif list 623 schema_counts = 0 624 for motif in matching_motifs: 625 # get the counts for the motif 626 schema_counts += motif_repository.count(motif) 627 628 # remove the motif from the motif list since it is already 629 # represented by this schema 630 all_motifs.remove(motif) 631 632 # all the schema info 633 schema_info[new_schema] = schema_counts 634 635 matched_count += schema_counts 636 637 # print "percentage:", float(matched_count) / float(total_count) 638 639 return PatternRepository(schema_info)
640
641 - def _get_num_motifs(self, repository, motif_list):
642 """Return the number of motif counts for the list of motifs. 643 """ 644 motif_count = 0 645 for motif in motif_list: 646 motif_count += repository.count(motif) 647 648 return motif_count
649
650 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
651 """Retrieve a unique schema from a motif. 652 653 We don't want to end up with schema that match the same thing, 654 since this could lead to ambiguous results, and be messy. This 655 tries to create schema, and checks that they do not match any 656 currently existing schema. 657 """ 658 # create a schema starting with a random motif 659 # we'll keep doing this until we get a completely new schema that 660 # doesn't match any old schema 661 num_tries = 0 662 663 while True: 664 # pick a motif to work from and make a schema from it 665 cur_motif = random.choice(motif_list) 666 667 num_tries += 1 668 669 new_schema, matching_motifs = \ 670 self._schema_from_motif(cur_motif, motif_list, 671 num_ambiguous) 672 673 has_match = 0 674 for old_schema in cur_schemas: 675 if matches_schema(new_schema, old_schema, 676 self._ambiguity_symbol): 677 has_match = 1 678 679 # if the schema doesn't match any other schema we've got 680 # a good one 681 if not(has_match): 682 break 683 684 # check for big loops in which we can't find a new schema 685 assert num_tries < 150, \ 686 "Could not generate schema in %s tries from %s with %s" \ 687 % (num_tries, motif_list, cur_schemas) 688 689 return new_schema, matching_motifs
690
691 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
692 """Create a schema from a given starting motif. 693 694 Arguments: 695 696 o motif - A motif with the pattern we will start from. 697 698 o motif_list - The total motifs we have.to match to. 699 700 o num_ambiguous - The number of ambiguous characters that should 701 be present in the schema. 702 703 Returns: 704 705 o A string representing the newly generated schema. 706 707 o A list of all of the motifs in motif_list that match the schema. 708 """ 709 assert motif in motif_list, \ 710 "Expected starting motif present in remaining motifs." 711 712 # convert random positions in the motif to ambiguous characters 713 # convert the motif into a list of characters so we can manipulate it 714 new_schema_list = list(motif) 715 for add_ambiguous in range(num_ambiguous): 716 # add an ambiguous position in a new place in the motif 717 while True: 718 ambig_pos = random.choice(list(range(len(new_schema_list)))) 719 720 # only add a position if it isn't already ambiguous 721 # otherwise, we'll try again 722 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 723 new_schema_list[ambig_pos] = self._ambiguity_symbol 724 break 725 726 # convert the schema back to a string 727 new_schema = ''.join(new_schema_list) 728 729 # get the motifs that the schema matches 730 matched_motifs = [] 731 for motif in motif_list: 732 if matches_schema(motif, new_schema, self._ambiguity_symbol): 733 matched_motifs.append(motif) 734 735 return new_schema, matched_motifs
736
737 - def from_signatures(self, signature_repository, num_ambiguous):
738 raise NotImplementedError("Still need to code this.")
739