Package Bio :: Module triefind
[hide private]
[frames] | no frames]

Source Code for Module Bio.triefind

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """ 
  7  Given a trie, find all occurrences of a word in the trie in a string. 
  8   
  9  Like searching a string for a substring, except that the substring is 
 10  any word in a trie. 
 11   
 12  Functions: 
 13  match         Find longest key in a trie matching the beginning of the string. 
 14  match_all     Find all keys in a trie matching the beginning of the string. 
 15  find          Find keys in a trie matching anywhere in a string. 
 16  find_words    Find keys in a trie matching whole words in a string. 
 17   
 18  """ 
 19   
 20  import string 
 21  import re 
 22   
 23   
24 -def match(string, trie):
25 """match(string, trie) -> longest key or None 26 27 Find the longest key in the trie that matches the beginning of the 28 string. 29 30 """ 31 longest = None 32 for i in range(len(string)): 33 substr = string[:i + 1] 34 if not trie.has_prefix(substr): 35 break 36 if substr in trie: 37 longest = substr 38 return longest
39 40
41 -def match_all(string, trie):
42 """match_all(string, trie) -> list of keys 43 44 Find all the keys in the trie that matches the beginning of the 45 string. 46 47 """ 48 matches = [] 49 for i in range(len(string)): 50 substr = string[:i + 1] 51 if not trie.has_prefix(substr): 52 break 53 if substr in trie: 54 matches.append(substr) 55 return matches
56 57
58 -def find(string, trie):
59 """find(string, trie) -> list of tuples (key, start, end) 60 61 Find all the keys in the trie that match anywhere in the string. 62 63 """ 64 results = [] 65 start = 0 # index to start the search 66 while start < len(string): 67 # Look for a match. 68 keys = match_all(string[start:], trie) 69 for key in keys: 70 results.append((key, start, start + len(key))) 71 start += 1 72 return results
73 74 DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace 75 76
77 -def find_words(string, trie):
78 """find_words(string, trie) -> list of tuples (key, start, end) 79 80 Find all the keys in the trie that match full words in the string. 81 Word boundaries are defined as any punctuation or whitespace. 82 83 """ 84 _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS)) 85 86 results = [] 87 start = 0 # index of word boundary 88 while start < len(string): 89 # Look for a match. 90 keys = match_all(string[start:], trie) 91 for key in keys: 92 l = len(key) 93 # Make sure it ends at a boundary. 94 if start + l == len(string) or \ 95 _boundary_re.match(string[start + l]): 96 results.append((key, start, start + l)) 97 # Move forward to the next boundary. 98 m = _boundary_re.search(string, start) 99 if m is None: 100 break 101 start = m.end() 102 return results
103