Package Bio :: Module triefind
[hide private]
[frames] | no frames]

Source Code for Module Bio.triefind

 1  """ 
 2  Given a trie, find all occurrences of a word in the trie in a string. 
 3   
 4  Like searching a string for a substring, except that the substring is 
 5  any word in a trie. 
 6   
 7  Functions: 
 8  match         Find longest key in a trie matching the beginning of the string. 
 9  match_all     Find all keys in a trie matching the beginning of the string. 
10  find          Find keys in a trie matching anywhere in a string. 
11  find_words    Find keys in a trie matching whole words in a string. 
12   
13  """ 
14  import string 
15  import re 
16   
17   
18 -def match(string, trie):
19 """match(string, trie) -> longest key or None 20 21 Find the longest key in the trie that matches the beginning of the 22 string. 23 24 """ 25 longest = None 26 for i in range(len(string)): 27 substr = string[:i+1] 28 if not trie.has_prefix(substr): 29 break 30 if trie.has_key(substr): 31 longest = substr 32 return longest
33 34
35 -def match_all(string, trie):
36 """match_all(string, trie) -> list of keys 37 38 Find all the keys in the trie that matches the beginning of the 39 string. 40 41 """ 42 matches = [] 43 for i in range(len(string)): 44 substr = string[:i+1] 45 if not trie.has_prefix(substr): 46 break 47 if trie.has_key(substr): 48 matches.append(substr) 49 return matches
50 51
52 -def find(string, trie):
53 """find(string, trie) -> list of tuples (key, start, end) 54 55 Find all the keys in the trie that match anywhere in the string. 56 57 """ 58 results = [] 59 start = 0 # index to start the search 60 while start < len(string): 61 # Look for a match. 62 keys = match_all(string[start:], trie) 63 for key in keys: 64 results.append((key, start, start+len(key))) 65 start += 1 66 return results
67 68 DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace 69 70
71 -def find_words(string, trie):
72 """find_words(string, trie) -> list of tuples (key, start, end) 73 74 Find all the keys in the trie that match full words in the string. 75 Word boundaries are defined as any punctuation or whitespace. 76 77 """ 78 _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS)) 79 80 results = [] 81 start = 0 # index of word boundary 82 while start < len(string): 83 # Look for a match. 84 keys = match_all(string[start:], trie) 85 for key in keys: 86 l = len(key) 87 # Make sure it ends at a boundary. 88 if start+l == len(string) or \ 89 _boundary_re.match(string[start+l]): 90 results.append((key, start, start+l)) 91 # Move forward to the next boundary. 92 m = _boundary_re.search(string, start) 93 if m is None: 94 break 95 start = m.end() 96 return results
97