1 module dtrie; 2 import std.typecons : tuple, Tuple; 3 import std.algorithm : reverse, sort, SwapStrategy; 4 import std.conv : to; 5 6 import queue : Queue; 7 import bitarray : SuccinctBitVector; 8 import lib.exception : ValueError, KeyError; 9 10 /** 11 The node of the tree. 12 Each node has one character as its member. 13 */ 14 class Node { 15 wchar label; 16 private Node[] children; 17 bool visited; 18 19 this(wchar label) { 20 this.label = label; 21 this.children = []; 22 this.visited = false; 23 } 24 25 void addChild(Node child) { 26 this.children ~= child; 27 } 28 29 ulong getNChildren() { 30 return this.children.length; 31 } 32 } 33 34 35 /** 36 This class has:<br> 37 A function which constructs a tree by inserted words.<br> 38 A function which dumps the tree as a LOUDS bit-string.<br> 39 */ 40 class LoudsBitStringBuilder { 41 private Node tree; 42 this(string[] words) { 43 import std.conv : to; 44 import std.algorithm.iteration : map; 45 import std.array : array; 46 47 this.tree = new Node(' '); //make root 48 49 //copy given string array into newly allocated string array 50 //to avoid influencing behavior of words at the outside of 51 //this class 52 string[] words_ = new string[words.length]; 53 words_[0..$] = words[0..$]; 54 55 //sort words in alphabetical order. 56 sort!("a < b", SwapStrategy.stable)(words_); 57 foreach(string word; words_) { 58 wchar[] w = array(map!(to!wchar)(word)); 59 this.build(this.tree, w, 0); 60 } 61 } 62 63 /** 64 Build a tree. 65 */ 66 private void build(Node node, wchar[] word, uint depth) { 67 if(depth == word.length) { 68 return; 69 } 70 71 foreach(Node child; node.children) { 72 if(child.label == word[depth]) { 73 this.build(child, word, depth+1); 74 return; 75 } 76 } 77 78 Node child = new Node(word[depth]); 79 node.addChild(child); 80 this.build(child, word, depth+1); 81 return; 82 } 83 84 /** 85 Dumps a LOUDS bit-string. 86 */ 87 Tuple!(SuccinctBitVector, wchar[]) dump() { 88 //construct a bit vector by Breadth-first search 89 SuccinctBitVector bitvector = new SuccinctBitVector(); 90 wchar[] labels; 91 92 //set the root node 93 bitvector.push(1); 94 bitvector.push(0); 95 labels ~= cast(wchar)(' '); 96 97 Queue!Node queue = new Queue!Node(); 98 queue.append(this.tree); 99 100 while(!queue.isempty) { 101 Node node = queue.pop(); 102 labels ~= node.label; 103 104 // append N ones and 0 105 // N is the number of children of the current node 106 for(auto i = 0; i < node.getNChildren(); i++) { 107 bitvector.push(1); 108 } 109 bitvector.push(0); 110 111 foreach(Node child; node.children) { 112 if(child.visited) { 113 continue; 114 } 115 116 child.visited = true; 117 queue.append(child); 118 } 119 } 120 121 bitvector.build(); 122 return tuple(bitvector, labels); 123 } 124 } 125 126 127 //smoke test 128 unittest { 129 string[] words = ["an", "i", "of", "one", "our", "out"]; 130 131 auto constructor = new LoudsBitStringBuilder(words); 132 auto t = constructor.dump(); 133 SuccinctBitVector bitvector = t[0]; 134 135 //the length of the bitvector should be a multiple of 8 136 assert(bitvector.toString() == "101110100111000101100000"); 137 } 138 139 140 //Ensure the same bit string generated if the word order is randomized. 141 unittest { 142 string[] words = ["our", "out", "i", "an", "of", "one"]; 143 144 auto constructor = new LoudsBitStringBuilder(words); 145 auto t = constructor.dump(); 146 SuccinctBitVector bitvector = t[0]; 147 148 //the length of the bitvector should be a multiple of 8 149 assert(bitvector.toString() == "101110100111000101100000"); 150 } 151 152 153 unittest { 154 string[] words = ["the", "then", "they"]; 155 156 auto constructor = new LoudsBitStringBuilder(words); 157 auto t = constructor.dump(); 158 SuccinctBitVector bitvector = t[0]; 159 assert(bitvector.toString() == "1010101011000000"); 160 } 161 162 163 /** 164 Map of words and node numbers with a trie. 165 */ 166 class WordNodeNumberMap { 167 private SuccinctBitVector bitvector; 168 private wchar[] labels; 169 170 /** 171 Build the dictionary. 172 Raise ValueError if the empty string is contained in the words. 173 */ 174 this(string[] words) 175 in { 176 foreach(string word; words) { 177 if(word.length == 0) { 178 throw new ValueError("Empty string recieved"); 179 } 180 } 181 } 182 body { 183 //convert words to a LOUDS bit-string 184 auto t = new LoudsBitStringBuilder(words).dump(); 185 this.bitvector = t[0]; //LOUDS bit-string 186 this.labels = t[1]; //labels of each node 187 } 188 //should raise ValueError if the empty string recieved 189 unittest { 190 bool error_thrown = false; 191 try { 192 WordNodeNumberMap map = new WordNodeNumberMap([""]); 193 } catch(ValueError e) { 194 error_thrown = true; 195 } 196 assert(error_thrown); 197 } 198 199 /** 200 Return the node number of the child if exists. 201 */ 202 private uint traceChildren(uint current_node_number, wchar character) { 203 uint node_number; 204 205 //get the corresponding index of the node number 206 uint index = this.bitvector.select0(current_node_number-1); 207 208 //numbers next to the index of the node mean the node's children 209 //search brothers 210 index += 1; 211 while(this.bitvector.get(index) == 1) { 212 node_number = this.bitvector.rank1(index); 213 if(this.labels[node_number] == character) { 214 return node_number; 215 } 216 index += 1; 217 } 218 219 throw new ValueError("Child not found."); 220 } 221 222 /** 223 Return the leaf node number if the query exists in the tree and 224 throws KeyError if not exist. 225 */ 226 uint getNodeNumber(string word) 227 in { 228 if(word.length == 0) { 229 throw new KeyError("Empty word."); 230 } 231 } 232 body { 233 uint node_number = 1; 234 foreach(wchar character; word) { 235 try { 236 node_number = this.traceChildren(node_number, character); 237 } catch(ValueError e) { 238 throw new KeyError(word); 239 } 240 } 241 return node_number; 242 } 243 244 /** 245 Return the word the associated node number. 246 */ 247 string getWord(uint node_number) 248 in { 249 if(node_number >= this.labels.length) { 250 throw new ValueError("Node number is too large."); 251 } 252 } 253 body { 254 wchar[] word; 255 //search from the leaf node 256 while(node_number != 1) { 257 word ~= this.labels[node_number]; 258 259 //get the parent node 260 uint index = this.bitvector.select1(node_number-1); 261 node_number = this.bitvector.rank0(index); 262 } 263 264 //reverse the word because searched from the leaf node 265 reverse(word); 266 return word.to!string; 267 } 268 } 269 270 271 //smoke test 272 unittest { 273 void testGetNodeNumber(string[] keys, uint[] answers, 274 string[] words_not_in_keys) { 275 WordNodeNumberMap map = new WordNodeNumberMap(keys); 276 foreach(uint i, string key; keys) { 277 //the set of keys in the dictionary should be found 278 uint result = map.getNodeNumber(key); 279 assert(result == answers[i]); 280 } 281 282 //KeyError should be thrown if the key is not found 283 foreach(string word; words_not_in_keys) { 284 bool error_thrown = false; 285 try { 286 map.getNodeNumber(word); 287 } catch(KeyError e) { 288 error_thrown = true; 289 } 290 if(!error_thrown) { 291 import std.string : format; 292 import core.exception : AssertError; 293 throw new AssertError( 294 format("KeyError is not thrown at key '%s'", word)); 295 } 296 } 297 } 298 299 testGetNodeNumber(["an", "i", "of", "one", "our", "out"], 300 [5, 3, 6, 9, 10, 11], 301 ["hello", "ant", "ones", "ours", ""]); 302 testGetNodeNumber(["the", "then", "they"], [4, 5, 6], 303 ["hi", "thus", "that", "them", ""]); 304 305 void testGetWord(uint[] node_numbers, string[] keys) { 306 import std.algorithm : max, reduce; 307 WordNodeNumberMap map = new WordNodeNumberMap(keys); 308 sort!("a < b", SwapStrategy.stable)(keys); 309 foreach(uint i, uint node_number; node_numbers) { 310 string key = map.getWord(node_number); 311 assert(key == keys[i]); 312 } 313 314 bool error_thrown = false; 315 uint node_number = reduce!max(node_numbers) + 1; 316 try { 317 map.getWord(node_number); 318 } catch(ValueError e) { 319 error_thrown = true; 320 } 321 assert(error_thrown); 322 } 323 324 testGetWord([5, 3, 6, 9, 10, 11], ["an", "i", "of", "one", "our", "out"]); 325 testGetWord([4, 5, 6], ["the", "then", "they"]); 326 } 327 328 329 //TODO try to make only this class public 330 //TODO consider the name 331 /** 332 Interface of kana-kanji dictionary. 333 */ 334 class Dictionary { 335 /* 336 This is the implementation of a string-to-string dictionary by 337 using a copule of trie trees. 338 Trie provides a function which returns the node number given the key, and 339 also has a function which returns the key given the node number. 340 Algorithm: 341 Construction: 342 1. Build trie trees. One is with keys, and another one is with 343 values. 344 2. Associate node numbers given by each trie tree using an 345 associative array. 346 Indexing access: 347 1. Extract the node number associated with the key. 348 2. Get the node number corresponding to the node number given by 349 the key using the associative array. 350 3. Extract the value with the node number obtained at 2. 351 */ 352 353 private WordNodeNumberMap key_to_node_number; 354 private WordNodeNumberMap node_number_to_value; 355 //The initial size of the associative array is 0, but will be getting 356 //expanded while building. 357 private uint[][] node_number_map; 358 359 this(string[] keys, string[] values) 360 in { 361 if(keys.length != values.length) { 362 throw new ValueError( 363 "The number of keys doesn't match the number of values."); 364 } 365 } 366 body { 367 this.key_to_node_number = new WordNodeNumberMap(keys); 368 this.node_number_to_value = new WordNodeNumberMap(values); 369 370 for(uint i = 0; i < keys.length; i++) { 371 this.associate_node_numbers(keys[i], values[i]); 372 } 373 } 374 375 private void associate_node_numbers(string key, string value) { 376 uint key_node_number = this.key_to_node_number.getNodeNumber(key); 377 uint value_node_number = this.node_number_to_value.getNodeNumber(value); 378 //expand the associative array 379 if(key_node_number >= this.node_number_map.length) { 380 ulong size = key_node_number+1-this.node_number_map.length; 381 this.node_number_map ~= new uint[][](size, 0); //additional space 382 } 383 this.node_number_map[key_node_number] ~= value_node_number; 384 } 385 386 /** 387 Return the value given the key. 388 Content of default_ will be returned if the key doesn't exist among the 389 key set. 390 */ 391 string[] get(string key) { 392 string[] values; 393 try { 394 uint key_node_number = this.key_to_node_number.getNodeNumber(key); 395 uint[] value_node_numbers = this.node_number_map[key_node_number]; 396 foreach(v; value_node_numbers) { 397 values ~= this.node_number_to_value.getWord(v); 398 } 399 } catch(KeyError e) { 400 return cast(string[])[]; 401 } 402 return values; 403 } 404 } 405 406 407 /// 408 unittest { 409 auto dictionary = new Dictionary(["Win", "hot"], ["Lose", "cold"]); 410 assert(dictionary.get("Win") == ["Lose"]); 411 assert(dictionary.get("hot") == ["cold"]); 412 assert(dictionary.get("won") == []); 413 } 414 415 416 //smoke test 417 unittest { 418 string[] keys = [ 419 "accept", "affirm", "include", "arrive", "invest", 420 "begin", "offer", "conceal", "discharge", "recognize", 421 "enrich", "rise", "expose", "remember", "sleep", 422 "hide", "sink", "hurt", "wax", "accept", 423 "affirm", "include", "arrive", "invest", "begin", 424 "offer", "conceal", "discharge", "recognize", "enrich", 425 "rise", "expose", "remember", "sleep", "hide", 426 "sink", "hurt", "wax" 427 ]; 428 429 string[] values = [ 430 "reject", "deny", "exclude", "depart", "divest", 431 "end", "refuse", "reveal", "convict", "ignore", 432 "impoverish", "fall, set", "conceal", "forget", 433 "wake", "seek", "swim", "heal", "wane", 434 "reject", "deny", "exclude", "depart", "divest", 435 "end", "refuse", "reveal", "convict", "ignore", 436 "impoverish", "fall, set", "conceal", "forget", 437 "wake", "seek", "swim", "heal", "wane" 438 ]; 439 440 auto dictionary = new Dictionary(keys, values); 441 foreach(uint i, string key; keys) { 442 assert(dictionary.get(key)[0] == values[i]); 443 } 444 } 445 446 447 //test for multibyte strings 448 unittest { 449 string[] keys = [ 450 "すもーくちーず" 451 ]; 452 453 string[] values = [ 454 "スモークチーズ" 455 ]; 456 457 auto dictionary = new Dictionary(keys, values); 458 foreach(uint i, string key; keys) { 459 assert(dictionary.get(key)[0] == values[i]); 460 } 461 } 462 463 //test for duplicate values 464 unittest { 465 string[] keys = [ 466 "あけます", "あけます", "あけます", 467 "あけました", "あけました", "あけました" 468 ]; 469 470 string[] values = [ 471 "開けます", "明けます", "空けます", 472 "開けました", "明けました", "空けました" 473 ]; 474 475 auto dictionary = new Dictionary(keys, values); 476 assert(dictionary.get("あけます") == ["開けます", "明けます", "空けます"]); 477 assert(dictionary.get("あけました") == 478 ["開けました", "明けました", "空けました"]); 479 }