1 """
2 Given a trie, find all occurrences of a word in the trie in a string.
3
4 Like searching a string for a substring, except that the substring is
5 any word in a trie.
6
7 Functions:
8 match Find longest key in a trie matching the beginning of the string.
9 match_all Find all keys in a trie matching the beginning of the string.
10 find Find keys in a trie matching anywhere in a string.
11 find_words Find keys in a trie matching whole words in a string.
12
13 """
14 import string
15 import re
16
17
19 """match(string, trie) -> longest key or None
20
21 Find the longest key in the trie that matches the beginning of the
22 string.
23
24 """
25 longest = None
26 for i in range(len(string)):
27 substr = string[:i+1]
28 if not trie.has_prefix(substr):
29 break
30 if trie.has_key(substr):
31 longest = substr
32 return longest
33
34
36 """match_all(string, trie) -> list of keys
37
38 Find all the keys in the trie that matches the beginning of the
39 string.
40
41 """
42 matches = []
43 for i in range(len(string)):
44 substr = string[:i+1]
45 if not trie.has_prefix(substr):
46 break
47 if trie.has_key(substr):
48 matches.append(substr)
49 return matches
50
51
52 -def find(string, trie):
67
68 DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
69
70
72 """find_words(string, trie) -> list of tuples (key, start, end)
73
74 Find all the keys in the trie that match full words in the string.
75 Word boundaries are defined as any punctuation or whitespace.
76
77 """
78 _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
79
80 results = []
81 start = 0
82 while start < len(string):
83
84 keys = match_all(string[start:], trie)
85 for key in keys:
86 l = len(key)
87
88 if start+l == len(string) or \
89 _boundary_re.match(string[start+l]):
90 results.append((key, start, start+l))
91
92 m = _boundary_re.search(string, start)
93 if m is None:
94 break
95 start = m.end()
96 return results
97