1 module dtrie;
2 import std.typecons : tuple, Tuple;
3 import std.algorithm : reverse, sort, SwapStrategy;
4 import std.conv : to;
5 
6 import queue : Queue;
7 import bitarray : SuccinctBitVector;
8 import lib.exception : ValueError, KeyError;
9 
10 /**
11 The node of the tree.
12 Each node has one character as its member.
13 */
14 class Node {
15     wchar label;
16     private Node[] children;
17     bool visited;
18 
19     this(wchar label) {
20         this.label = label;
21         this.children = [];
22         this.visited = false;
23     }
24 
25     void addChild(Node child) {
26         this.children ~= child;
27     }
28 
29     ulong getNChildren() {
30         return this.children.length;
31     }
32 }
33 
34 
35 /**
36 This class has:<br>
37     A function which constructs a tree by inserted words.<br>
38     A function which dumps the tree as a LOUDS bit-string.<br>
39 */
40 class LoudsBitStringBuilder {
41     private Node tree;
42     this(string[] words) {
43         import std.conv : to;
44         import std.algorithm.iteration : map;
45         import std.array : array;
46 
47         this.tree = new Node(' ');  //make root
48 
49         //copy given string array into newly allocated string array
50         //to avoid influencing behavior of words at the outside of
51         //this class
52         string[] words_ = new string[words.length];
53         words_[0..$] = words[0..$];
54 
55         //sort words in alphabetical order.
56         sort!("a < b", SwapStrategy.stable)(words_);
57         foreach(string word; words_) {
58             wchar[] w = array(map!(to!wchar)(word));
59             this.build(this.tree, w, 0);
60         }
61     }
62 
63     /**
64     Build a tree.
65     */
66     private void build(Node node, wchar[] word, uint depth) {
67         if(depth == word.length) {
68             return;
69         }
70 
71         foreach(Node child; node.children) {
72             if(child.label == word[depth]) {
73                 this.build(child, word, depth+1);
74                 return;
75             }
76         }
77 
78         Node child = new Node(word[depth]);
79         node.addChild(child);
80         this.build(child, word, depth+1);
81         return;
82     }
83 
84     /**
85     Dumps a LOUDS bit-string.
86     */
87     Tuple!(SuccinctBitVector, wchar[]) dump() {
88         //construct a bit vector by Breadth-first search
89         SuccinctBitVector bitvector = new SuccinctBitVector();
90         wchar[] labels;
91 
92         //set the root node
93         bitvector.push(1);
94         bitvector.push(0);
95         labels ~= cast(wchar)(' ');
96 
97         Queue!Node queue = new Queue!Node();
98         queue.append(this.tree);
99 
100         while(!queue.isempty) {
101             Node node = queue.pop();
102             labels ~= node.label;
103 
104             // append N ones and 0
105             // N is the number of children of the current node
106             for(auto i = 0; i < node.getNChildren(); i++) {
107                 bitvector.push(1);
108             }
109             bitvector.push(0);
110 
111             foreach(Node child; node.children) {
112                 if(child.visited) {
113                     continue;
114                 }
115 
116                 child.visited = true;
117                 queue.append(child);
118             }
119         }
120 
121         bitvector.build();
122         return tuple(bitvector, labels);
123     }
124 }
125 
126 
127 //smoke test
128 unittest {
129     string[] words = ["an", "i", "of", "one", "our", "out"];
130 
131     auto constructor = new LoudsBitStringBuilder(words);
132     auto t = constructor.dump();
133     SuccinctBitVector bitvector = t[0];
134 
135     //the length of the bitvector should be a multiple of 8
136     assert(bitvector.toString() == "101110100111000101100000");
137 }
138 
139 
140 //Ensure the same bit string generated if the word order is randomized.
141 unittest {
142     string[] words = ["our", "out", "i", "an", "of", "one"];
143 
144     auto constructor = new LoudsBitStringBuilder(words);
145     auto t = constructor.dump();
146     SuccinctBitVector bitvector = t[0];
147 
148     //the length of the bitvector should be a multiple of 8
149     assert(bitvector.toString() == "101110100111000101100000");
150 }
151 
152 
153 unittest {
154     string[] words = ["the", "then", "they"];
155 
156     auto constructor = new LoudsBitStringBuilder(words);
157     auto t = constructor.dump();
158     SuccinctBitVector bitvector = t[0];
159     assert(bitvector.toString() == "1010101011000000");
160 }
161 
162 
163 /**
164 Map of words and node numbers with a trie.
165 */
166 class WordNodeNumberMap {
167     private SuccinctBitVector bitvector;
168     private wchar[] labels;
169 
170     /**
171     Build the dictionary.
172     Raise ValueError if the empty string is contained in the words.
173     */
174     this(string[] words)
175     in {
176         foreach(string word; words) {
177             if(word.length == 0) {
178                 throw new ValueError("Empty string recieved");
179             }
180         }
181     }
182     body {
183         //convert words to a LOUDS bit-string
184         auto t = new LoudsBitStringBuilder(words).dump();
185         this.bitvector = t[0];  //LOUDS bit-string
186         this.labels = t[1]; //labels of each node
187     }
188     //should raise ValueError if the empty string recieved
189     unittest {
190         bool error_thrown = false;
191         try {
192             WordNodeNumberMap map = new WordNodeNumberMap([""]);
193         } catch(ValueError e) {
194             error_thrown = true;
195         }
196         assert(error_thrown);
197     }
198 
199     /**
200     Return the node number of the child if exists.
201     */
202     private uint traceChildren(uint current_node_number, wchar character) {
203         uint node_number;
204 
205         //get the corresponding index of the node number
206         uint index = this.bitvector.select0(current_node_number-1);
207 
208         //numbers next to the index of the node mean the node's children
209         //search brothers
210         index += 1;
211         while(this.bitvector.get(index) == 1) {
212             node_number = this.bitvector.rank1(index);
213             if(this.labels[node_number] == character) {
214                 return node_number;
215             }
216             index += 1;
217         }
218 
219         throw new ValueError("Child not found.");
220     }
221 
222     /**
223     Return the leaf node number if the query exists in the tree and
224     throws KeyError if not exist.
225     */
226     uint getNodeNumber(string word)
227     in {
228         if(word.length == 0) {
229             throw new KeyError("Empty word.");
230         }
231     }
232     body {
233         uint node_number = 1;
234         foreach(wchar character; word) {
235             try {
236                 node_number = this.traceChildren(node_number, character);
237             } catch(ValueError e) {
238                 throw new KeyError(word);
239             }
240         }
241         return node_number;
242     }
243 
244     /**
245     Return the word the associated node number.
246     */
247     string getWord(uint node_number)
248     in {
249         if(node_number >= this.labels.length)  {
250             throw new ValueError("Node number is too large.");
251         }
252     }
253     body {
254         wchar[] word;
255         //search from the leaf node
256         while(node_number != 1) {
257             word ~= this.labels[node_number];
258 
259             //get the parent node
260             uint index = this.bitvector.select1(node_number-1);
261             node_number = this.bitvector.rank0(index);
262         }
263 
264         //reverse the word because searched from the leaf node
265         reverse(word);
266         return word.to!string;
267     }
268 }
269 
270 
271 //smoke test
272 unittest {
273     void testGetNodeNumber(string[] keys, uint[] answers,
274                            string[] words_not_in_keys) {
275         WordNodeNumberMap map = new WordNodeNumberMap(keys);
276         foreach(uint i, string key; keys) {
277             //the set of keys in the dictionary should be found
278             uint result = map.getNodeNumber(key);
279             assert(result == answers[i]);
280         }
281 
282         //KeyError should be thrown if the key is not found
283         foreach(string word; words_not_in_keys) {
284             bool error_thrown = false;
285             try {
286                 map.getNodeNumber(word);
287             } catch(KeyError e) {
288                 error_thrown = true;
289             }
290             if(!error_thrown) {
291                 import std.string : format;
292                 import core.exception : AssertError;
293                 throw new AssertError(
294                         format("KeyError is not thrown at key '%s'", word));
295             }
296         }
297     }
298 
299     testGetNodeNumber(["an", "i", "of", "one", "our", "out"],
300                       [5, 3, 6, 9, 10, 11],
301                       ["hello", "ant", "ones", "ours", ""]);
302     testGetNodeNumber(["the", "then", "they"], [4, 5, 6],
303                       ["hi", "thus", "that", "them", ""]);
304 
305     void testGetWord(uint[] node_numbers, string[] keys) {
306         import std.algorithm : max, reduce;
307         WordNodeNumberMap map = new WordNodeNumberMap(keys);
308         sort!("a < b", SwapStrategy.stable)(keys);
309         foreach(uint i, uint node_number; node_numbers) {
310             string key = map.getWord(node_number);
311             assert(key == keys[i]);
312         }
313 
314         bool error_thrown = false;
315         uint node_number = reduce!max(node_numbers) + 1;
316         try {
317             map.getWord(node_number);
318         } catch(ValueError e) {
319             error_thrown = true;
320         }
321         assert(error_thrown);
322     }
323 
324     testGetWord([5, 3, 6, 9, 10, 11], ["an", "i", "of", "one", "our", "out"]);
325     testGetWord([4, 5, 6], ["the", "then", "they"]);
326 }
327 
328 
329 //TODO try to make only this class public
330 //TODO consider the name
331 /**
332 Interface of kana-kanji dictionary.
333 */
334 class Dictionary {
335     /*
336     This is the implementation of a string-to-string dictionary by
337     using a copule of trie trees.
338     Trie provides a function which returns the node number given the key, and
339     also has a function which returns the key given the node number.
340     Algorithm:
341         Construction:
342             1. Build trie trees. One is with keys, and another one is with
343             values.
344             2. Associate node numbers given by each trie tree using an
345             associative array.
346         Indexing access:
347             1. Extract the node number associated with the key.
348             2. Get the node number corresponding to the node number given by
349             the key using the associative array.
350             3. Extract the value with the node number obtained at 2.
351     */
352 
353     private WordNodeNumberMap key_to_node_number;
354     private WordNodeNumberMap node_number_to_value;
355     //The initial size of the associative array is 0, but will be getting
356     //expanded while building.
357     private uint[][] node_number_map;
358 
359     this(string[] keys, string[] values)
360     in {
361         if(keys.length != values.length) {
362             throw new ValueError(
363                 "The number of keys doesn't match the number of values.");
364         }
365     }
366     body {
367         this.key_to_node_number = new WordNodeNumberMap(keys);
368         this.node_number_to_value = new WordNodeNumberMap(values);
369 
370         for(uint i = 0; i < keys.length; i++) {
371             this.associate_node_numbers(keys[i], values[i]);
372         }
373     }
374 
375     private void associate_node_numbers(string key, string value) {
376         uint key_node_number = this.key_to_node_number.getNodeNumber(key);
377         uint value_node_number = this.node_number_to_value.getNodeNumber(value);
378         //expand the associative array
379         if(key_node_number >= this.node_number_map.length) {
380             ulong size = key_node_number+1-this.node_number_map.length;
381             this.node_number_map ~= new uint[][](size, 0); //additional space
382         }
383         this.node_number_map[key_node_number] ~= value_node_number;
384     }
385 
386     /**
387     Return the value given the key.
388     Content of default_ will be returned if the key doesn't exist among the
389     key set.
390     */
391     string[] get(string key) {
392         string[] values;
393         try {
394             uint key_node_number = this.key_to_node_number.getNodeNumber(key);
395             uint[] value_node_numbers = this.node_number_map[key_node_number];
396             foreach(v; value_node_numbers) {
397                 values ~= this.node_number_to_value.getWord(v);
398             }
399         } catch(KeyError e) {
400             return cast(string[])[];
401         }
402         return values;
403     }
404 }
405 
406 
407 ///
408 unittest {
409     auto dictionary = new Dictionary(["Win", "hot"], ["Lose", "cold"]);
410     assert(dictionary.get("Win") == ["Lose"]);
411     assert(dictionary.get("hot") == ["cold"]);
412     assert(dictionary.get("won") == []);
413 }
414 
415 
416 //smoke test
417 unittest {
418     string[] keys = [
419         "accept", "affirm", "include", "arrive", "invest",
420         "begin", "offer", "conceal", "discharge", "recognize",
421         "enrich", "rise", "expose", "remember", "sleep",
422         "hide", "sink", "hurt", "wax", "accept",
423         "affirm", "include", "arrive", "invest", "begin",
424         "offer", "conceal", "discharge", "recognize", "enrich",
425         "rise", "expose", "remember", "sleep", "hide",
426         "sink", "hurt", "wax"
427     ];
428 
429     string[] values = [
430         "reject", "deny", "exclude", "depart", "divest",
431         "end", "refuse", "reveal", "convict", "ignore",
432         "impoverish", "fall, set", "conceal", "forget",
433         "wake", "seek", "swim", "heal", "wane",
434         "reject", "deny", "exclude", "depart", "divest",
435         "end", "refuse", "reveal", "convict", "ignore",
436         "impoverish", "fall, set", "conceal", "forget",
437         "wake", "seek", "swim", "heal", "wane"
438     ];
439 
440     auto dictionary = new Dictionary(keys, values);
441     foreach(uint i, string key; keys) {
442         assert(dictionary.get(key)[0] == values[i]);
443     }
444 }
445 
446 
447 //test for multibyte strings
448 unittest {
449     string[] keys = [
450         "すもーくちーず"
451     ];
452 
453     string[] values = [
454         "スモークチーズ"
455     ];
456 
457     auto dictionary = new Dictionary(keys, values);
458     foreach(uint i, string key; keys) {
459         assert(dictionary.get(key)[0] == values[i]);
460     }
461 }
462 
463 //test for duplicate values
464 unittest {
465     string[] keys = [
466         "あけます", "あけます", "あけます",
467         "あけました", "あけました", "あけました"
468     ];
469 
470     string[] values = [
471         "開けます", "明けます", "空けます",
472         "開けました", "明けました", "空けました"
473     ];
474 
475     auto dictionary = new Dictionary(keys, values);
476     assert(dictionary.get("あけます") == ["開けます", "明けます", "空けます"]);
477     assert(dictionary.get("あけました") ==
478            ["開けました", "明けました", "空けました"]);
479 }