home · contact · privacy
In Markov text generation, lowercase earlier.
[plomlombot-irc.git] / plomsearch.py
1 class CompoundStatement:
2     def __init__(self, or_list, negneg=True):
3         self.or_list = or_list
4         self.neg = False if negneg else True
5     #def __repr__(self):
6     #    return "<" + str(not self.neg) + ": OR'd " + str(self.or_list) + ">"
7
8 class LogicParserError(Exception):
9     pass
10
11 def parseToCompoundStatement(string):
12     parenthesis_in = "("
13     parenthesis_out = ")"
14     quotes = "'\""
15     escape = '\\'
16     space = " "
17     meta_marker = "\0"
18     not_words = ["NOT"]
19     and_words = ["AND"]
20     or_words = ["OR"]
21
22     def tokenize(string):
23         tokens = []
24         string = string.replace(meta_marker, "")
25         quote = ""
26         token = ""
27         parentheses = parenthesis_in + parenthesis_out
28         escaped = False
29         in_token = False
30         for char in string:
31             if in_token and quote == "" and char in quotes + parentheses \
32                     and not escaped:
33                 in_token = False
34                 tokens += [token]
35                 token = ""
36             if not in_token:
37                 if char in quotes:
38                     in_token = True
39                     quote = char
40                     token = meta_marker 
41                     continue
42                 elif char in parentheses:
43                     tokens += [char]
44                     continue
45                 elif char == space:
46                     continue
47                 else:
48                     in_token = True
49             if in_token:
50                 if not escaped:
51                     if char == escape:
52                         escaped = True
53                         continue
54                     if char == quote or (quote == "" and char == space):
55                         if char == quote:
56                             quote = ""
57                         in_token = False
58                         tokens += [token]
59                         token = ""
60                         continue
61                 else:
62                     escaped = False
63                 token += char
64         if quote:
65             raise LogicParserError("Token not properly closed.")
66         if in_token:
67             tokens += [token]
68         return tokens
69
70     def parenthesize(tokens):
71         open_parentheses = 0
72         compounds = []
73         def group_by_parentheses(i):
74             nonlocal open_parentheses
75             compound = []
76             while i < len(tokens):
77                 if tokens[i] == parenthesis_in:
78                     open_parentheses += 1
79                     i, token = group_by_parentheses(i + 1)
80                     compound += [token]
81                 elif tokens[i] == parenthesis_out:
82                     open_parentheses -= 1
83                     if open_parentheses < 0:
84                         raise LogicParserError("Improper parentheses.")
85                     return i + 1, compound
86                 else:
87                     compound += [tokens[i]]
88                     i += 1
89             return i, compound
90         _, compounds = group_by_parentheses(0)
91         if open_parentheses > 0:
92             raise LogicParserError("Improper parentheses.")
93         return compounds
94
95     def group_by_negation(tree):
96         i = 0
97         while i < len(tree):
98             if type(tree[i]) == str and tree[i] in not_words:
99                 if i > len(tree) - 2:
100                     raise LogicParserError("Improper negation.")
101                 # NOT A = [False, A]
102                 tree[i] = [False, tree[i + 1]]
103                 tree.pop(i + 1)
104                 if type(tree[i][1]) == list:
105                     group_by_negation(tree[i][1])
106             elif type(tree[i]) == list:
107                 group_by_negation(tree[i])
108             i += 1
109
110     def group_by_and(tree):
111         i = 0
112         if type(tree[i]) == bool:
113             i += 1
114         if type(tree[i]) == list:
115             group_by_and(tree[i])
116         if tree[i] in or_words + and_words:
117             raise LogicParserError("Improper AND/OR placement.")
118         while len(tree[i:]) > 1:
119             if tree[i + 1] not in or_words + and_words:
120                 raise LogicParserError("Improper token grouping.")
121             elif len(tree[i:]) < 3 or \
122                     tree[i + 2] in or_words + and_words:
123                 raise LogicParserError("Improper AND/OR placement.")
124             if type(tree[i + 2]) == list:
125                 group_by_and(tree[i + 2])
126             if tree[i + 1] in and_words:
127                 # A AND B = NOT (NOT A OR NOT B)
128                 tree[i] = [False, [[False, tree[i]], [False, tree[i + 2]]]]
129                 tree.pop(i + 2)
130                 tree.pop(i + 1)
131             else:
132                 i += 2
133
134     def group_by_or(tree):
135         i = 0
136         if type(tree[i]) == bool:
137             i += 1
138         if type(tree[i]) == list:
139             group_by_or(tree[i])
140         if tree[i] in or_words:
141             raise LogicParserError("Improper OR placement.")
142         while len(tree[i:]) > 1:
143             if tree[i + 1] in or_words:
144                 if type(tree[i + 2]) == list:
145                     group_by_or(tree[i + 2])
146                 tree[i + 1] = tree[i + 2]
147                 tree.pop(i + 2)
148             else:
149                 if type(tree[i + 1]) == list:
150                     group_by_or(tree[i + 1])
151                 i += 1
152
153     def flatten(tree):
154         i = 0
155         while i < len(tree):
156             if type(tree[i]) == list:
157                 tree[i] = flatten(tree[i])
158             i += 1
159         if len(tree) == 1 and type(tree[0]) == list:
160             # ( A ) = A
161             tree = tree[0]
162         if len(tree) == 2 and tree[0] == False and type(tree[1]) == list \
163                 and len(tree[1]) == 2 and tree[1][0] == False:
164             # NOT NOT A = A
165             tree = tree[1][1]
166         return tree
167
168     def strip_meta_marker(tree):
169         i = 0
170         while i < len(tree):
171             if type(tree[i]) == list:
172                 strip_meta_marker(tree[i])
173             elif type(tree[i]) == str:
174                 tree[i] = tree[i].replace(meta_marker, "")
175             i += 1
176
177     def toCompoundStatement(compounds):
178         def transform(tree):
179             negneg = True
180             i = 0
181             or_group = []
182             if tree[0] == False:
183                 negneg = False
184                 i = 1
185             while i < len(tree):
186                 if type(tree[i]) == list:
187                     or_group += [transform(tree[i])]
188                 else:
189                     or_group += [tree[i]]
190                 i += 1
191             return CompoundStatement(or_group, negneg)
192         return transform(compounds)
193
194     tokens = tokenize(string)
195     compounds = parenthesize(tokens) 
196     group_by_negation(compounds)
197     group_by_and(compounds)
198     group_by_or(compounds)
199     flatten(compounds)
200     strip_meta_marker(compounds)
201     return toCompoundStatement(compounds)
202
203 def search(query, string_list):
204
205     def testStringMatchLogic(statement, compare_value):
206         if type(statement) == str:
207             statement_true = statement in compare_value 
208         elif type(statement) == CompoundStatement:
209             or_list_true = False
210             if len(statement.or_list) > 1:
211                 for i_statement in statement.or_list:
212                     if testStringMatchLogic(i_statement, compare_value):
213                         or_list_true = True
214                         break
215             else:
216                 or_list_true = testStringMatchLogic(statement.or_list[0],
217                         compare_value)
218             if statement.neg:
219                 statement_true = not or_list_true
220             else:
221                 statement_true = or_list_true
222         return statement_true 
223
224     results = []
225     statement = parseToCompoundStatement(query)
226     for i in range(len(string_list)):
227         if testStringMatchLogic(statement, string_list[i]):
228             results += [[i, string_list[i]]]
229     return results
230
231 #TEST:
232 #lines = [
233 #"Hallo Welt,",
234 #"wie geht es dir,",
235 #"ist heut nicht ein schöner Tag?"
236 #]
237 #query = "NOT (geht OR 'ö')"
238 #for line in search(query, lines):
239 #    print(line)