ICU 57.1 57.1
stringtriebuilder.h
Go to the documentation of this file.
1/*
2*******************************************************************************
3* Copyright (C) 2010-2012,2014, International Business Machines
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: stringtriebuilder.h
7* encoding: US-ASCII
8* tab size: 8 (not used)
9* indentation:4
10*
11* created on: 2010dec24
12* created by: Markus W. Scherer
13*/
14
15#ifndef __STRINGTRIEBUILDER_H__
16#define __STRINGTRIEBUILDER_H__
17
18#include "unicode/utypes.h"
19#include "unicode/uobject.h"
20
26// Forward declaration.
27struct UHashtable;
28typedef struct UHashtable UHashtable;
29
51};
52
54
62public:
63#ifndef U_HIDE_INTERNAL_API
// Static hash and equality functions over untyped (const void *) node pointers.
// Presumably these are the callbacks installed on the `nodes` UHashtable -- confirm
// against the implementation file.
// NOTE(review): hashNode() is declared to return UBool (an int8_t), whereas
// Node::hashCode() yields an int32_t. If the implementation forwards the node's
// hash code, the value is narrowed to 8 bits, which would greatly increase hash
// collisions. Consider changing the return type to int32_t; this must be done
// together with the out-of-line definition.
65 static UBool hashNode(const void *node);
// Returns whether the two nodes are considered equal (non-zero UBool means equal).
67 static UBool equalNodes(const void *left, const void *right);
68#endif /* U_HIDE_INTERNAL_API */
69
70protected:
71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72 // or else the compiler will create a public default constructor.
77
78#ifndef U_HIDE_INTERNAL_API
80 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
83
85 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86
88 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
90 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91#endif /* U_HIDE_INTERNAL_API */
92
93 class Node;
94
95#ifndef U_HIDE_INTERNAL_API
97 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
99 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100 int32_t length, UErrorCode &errorCode);
101#endif /* U_HIDE_INTERNAL_API */
102
104 virtual int32_t getElementStringLength(int32_t i) const = 0;
106 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
108 virtual int32_t getElementValue(int32_t i) const = 0;
109
110 // Finds the first unit index after this one where
111 // the first and last element have different units again.
113 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114
115 // Number of different units at unitIndex.
117 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
119 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
121 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122
124 virtual UBool matchNodesCanHaveValues() const = 0;
125
127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
129 virtual int32_t getMinLinearMatch() const = 0;
131 virtual int32_t getMaxLinearMatchLength() const = 0;
132
133#ifndef U_HIDE_INTERNAL_API
134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
136 static const int32_t kMaxBranchLinearSubNodeLength=5;
137
138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
141 static const int32_t kMaxSplitBranchLevels=14;
142
// Makes sure that there is only one unique node registered that is
// equivalent to newNode.
// Takes ownership of newNode and returns only an owned node.
// If errorCode already indicates a failure, newNode is deleted; if newNode is NULL,
// a U_MEMORY_ALLOCATION_ERROR is recorded. Returns NULL on any failure.
153 Node *registerNode(Node *newNode, UErrorCode &errorCode);
// Makes sure that there is only one unique FinalValueNode registered
// with this value. Returns NULL if there is a failure.
164 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165#endif /* U_HIDE_INTERNAL_API */
166
167 /*
168 * C++ note:
169 * registerNode() and registerFinalValue() take ownership of their input nodes,
170 * and only return owned nodes.
171 * If they see a failure UErrorCode, they will delete the input node.
172 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
173 * If there is a failure, they return NULL.
174 *
175 * NULL Node pointers can be safely passed into other Nodes because
176 * they call the static Node::hashCode() which checks for a NULL pointer first.
177 *
178 * Therefore, as long as builder functions register a new node,
179 * they need to check for failures only before explicitly dereferencing
180 * a Node pointer, or before setting a new UErrorCode.
181 */
182
183 // Hash set of nodes, maps from nodes to integer 1.
185 UHashtable *nodes;
186
187#ifndef U_HIDE_INTERNAL_API
189 class Node : public UObject {
190 public:
191 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
192 inline int32_t hashCode() const { return hash; }
193 // Handles node==NULL.
194 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
195 // Base class operator==() compares the actual class types.
196 virtual UBool operator==(const Node &other) const;
197 inline UBool operator!=(const Node &other) const { return !operator==(other); }
225 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
226 // write() must set the offset to a positive value.
227 virtual void write(StringTrieBuilder &builder) = 0;
228 // See markRightEdgesFirst.
229 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
230 StringTrieBuilder &builder) {
231 // Note: Edge numbers are negative, lastRight<=firstRight.
232 // If offset>0 then this node and its sub-nodes have been written already
233 // and we need not write them again.
234 // If this node is part of the unwritten right branch edge,
235 // then we wait until that is written.
236 if(offset<0 && (offset<lastRight || firstRight<offset)) {
237 write(builder);
238 }
239 }
240 inline int32_t getOffset() const { return offset; }
241 protected:
242 int32_t hash;
243 int32_t offset;
244 };
245
246 // This class should not be overridden because
247 // registerFinalValue() compares a stack-allocated FinalValueNode
248 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
249 // with the input node, and the
250 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
251 // will be false if the typeid's are different.
253 class FinalValueNode : public Node {
254 public:
255 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
256 virtual UBool operator==(const Node &other) const;
257 virtual void write(StringTrieBuilder &builder);
258 protected:
259 int32_t value;
260 };
261
265 class ValueNode : public Node {
266 public:
267 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
268 virtual UBool operator==(const Node &other) const;
269 void setValue(int32_t v) {
270 hasValue=TRUE;
271 value=v;
272 hash=hash*37+v;
273 }
274 protected:
275 UBool hasValue;
276 int32_t value;
277 };
278
283 public:
284 IntermediateValueNode(int32_t v, Node *nextNode)
285 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
286 virtual UBool operator==(const Node &other) const;
287 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
288 virtual void write(StringTrieBuilder &builder);
289 protected:
290 Node *next;
291 };
292
296 class LinearMatchNode : public ValueNode {
297 public:
298 LinearMatchNode(int32_t len, Node *nextNode)
299 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
300 length(len), next(nextNode) {}
301 virtual UBool operator==(const Node &other) const;
302 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
303 protected:
304 int32_t length;
305 Node *next;
306 };
307
311 class BranchNode : public Node {
312 public:
313 BranchNode(int32_t initialHash) : Node(initialHash) {}
314 protected:
315 int32_t firstEdgeNumber;
316 };
317
321 class ListBranchNode : public BranchNode {
322 public:
323 ListBranchNode() : BranchNode(0x444444), length(0) {}
324 virtual UBool operator==(const Node &other) const;
325 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
326 virtual void write(StringTrieBuilder &builder);
327 // Adds a unit with a final value.
328 void add(int32_t c, int32_t value) {
329 units[length]=(UChar)c;
330 equal[length]=NULL;
331 values[length]=value;
332 ++length;
333 hash=(hash*37+c)*37+value;
334 }
335 // Adds a unit which leads to another match node.
336 void add(int32_t c, Node *node) {
337 units[length]=(UChar)c;
338 equal[length]=node;
339 values[length]=0;
340 ++length;
341 hash=(hash*37+c)*37+hashCode(node);
342 }
343 protected:
344 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
345 int32_t length;
346 int32_t values[kMaxBranchLinearSubNodeLength];
347 UChar units[kMaxBranchLinearSubNodeLength];
348 };
349
354 public:
355 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
356 : BranchNode(((0x555555*37+middleUnit)*37+
357 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
358 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
359 virtual UBool operator==(const Node &other) const;
360 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
361 virtual void write(StringTrieBuilder &builder);
362 protected:
363 UChar unit;
364 Node *lessThan;
365 Node *greaterOrEqual;
366 };
367
368 // Branch head node, for writing the actual node lead unit.
370 class BranchHeadNode : public ValueNode {
371 public:
372 BranchHeadNode(int32_t len, Node *subNode)
373 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
374 length(len), next(subNode) {}
375 virtual UBool operator==(const Node &other) const;
376 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
377 virtual void write(StringTrieBuilder &builder);
378 protected:
379 int32_t length;
380 Node *next; // A branch sub-node.
381 };
382#endif /* U_HIDE_INTERNAL_API */
383
385 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
386 Node *nextNode) const = 0;
387
389 virtual int32_t write(int32_t unit) = 0;
391 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
393 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
395 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
397 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
398};
399
401
402#endif // __STRINGTRIEBUILDER_H__
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
Base class for string trie builder classes.
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const =0
static UBool equalNodes(const void *left, const void *right)
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const =0
virtual int32_t getMaxBranchLinearSubNodeLength() const =0
Node * makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length, UErrorCode &errorCode)
static UBool hashNode(const void *node)
virtual int32_t write(int32_t unit)=0
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode)
virtual Node * createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, Node *nextNode) const =0
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const =0
virtual int32_t getMinLinearMatch() const =0
virtual int32_t getElementValue(int32_t i) const =0
virtual int32_t getElementStringLength(int32_t i) const =0
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const =0
Node * registerFinalValue(int32_t value, UErrorCode &errorCode)
Makes sure that there is only one unique FinalValueNode registered with this value.
virtual UBool matchNodesCanHaveValues() const =0
virtual ~StringTrieBuilder()
virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal)=0
int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex)
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const =0
int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length)
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length)=0
void deleteCompactBuilder()
virtual int32_t writeDeltaTo(int32_t jumpTarget)=0
void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode)
Node * makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode)
virtual int32_t getMaxLinearMatchLength() const =0
Node * registerNode(Node *newNode, UErrorCode &errorCode)
Makes sure that there is only one unique node registered that is equivalent to newNode.
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node)=0
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
UStringTrieBuildOption
Build options for BytesTrieBuilder and UCharsTrieBuilder.
@ USTRINGTRIE_BUILD_SMALL
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
@ USTRINGTRIE_BUILD_FAST
Builds a trie quickly.
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:238
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:242
C++ API: Common ICU base class UObject.
Basic definitions for ICU, for both C and C++ APIs.
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition: utypes.h:476
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:357
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129