00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017 #ifndef __BYTESTRIE_H__
00018 #define __BYTESTRIE_H__
00019
00025 #include "unicode/utypes.h"
00026
00027 #if U_SHOW_CPLUSPLUS_API
00028
00029 #include "unicode/stringpiece.h"
00030 #include "unicode/uobject.h"
00031 #include "unicode/ustringtrie.h"
00032
00033 class BytesTrieTest;
00034
00035 U_NAMESPACE_BEGIN
00036
00037 class ByteSink;
00038 class BytesTrieBuilder;
00039 class CharString;
00040 class UVector32;
00041
00055 class U_COMMON_API BytesTrie : public UMemory {
00056 public:
00071 BytesTrie(const void *trieBytes)
00072 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
00073 pos_(bytes_), remainingMatchLength_(-1) {}
00074
00079 ~BytesTrie();
00080
00087 BytesTrie(const BytesTrie &other)
00088 : ownedArray_(NULL), bytes_(other.bytes_),
00089 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
00090
00096 BytesTrie &reset() {
00097 pos_=bytes_;
00098 remainingMatchLength_=-1;
00099 return *this;
00100 }
00101
00110 uint64_t getState64() const {
00111 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
00112 (uint64_t)(pos_ - bytes_);
00113 }
00114
00129 BytesTrie &resetToState64(uint64_t state) {
00130 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
00131 pos_ = bytes_ + (state & kState64PosMask);
00132 return *this;
00133 }
00134
00140 class State : public UMemory {
00141 public:
00146 State() { bytes=NULL; }
00147 private:
00148 friend class BytesTrie;
00149
00150 const uint8_t *bytes;
00151 const uint8_t *pos;
00152 int32_t remainingMatchLength;
00153 };
00154
00162 const BytesTrie &saveState(State &state) const {
00163 state.bytes=bytes_;
00164 state.pos=pos_;
00165 state.remainingMatchLength=remainingMatchLength_;
00166 return *this;
00167 }
00168
00179 BytesTrie &resetToState(const State &state) {
00180 if(bytes_==state.bytes && bytes_!=NULL) {
00181 pos_=state.pos;
00182 remainingMatchLength_=state.remainingMatchLength;
00183 }
00184 return *this;
00185 }
00186
00193 UStringTrieResult current() const;
00194
00203 inline UStringTrieResult first(int32_t inByte) {
00204 remainingMatchLength_=-1;
00205 if(inByte<0) {
00206 inByte+=0x100;
00207 }
00208 return nextImpl(bytes_, inByte);
00209 }
00210
00218 UStringTrieResult next(int32_t inByte);
00219
00235 UStringTrieResult next(const char *s, int32_t length);
00236
00246 inline int32_t getValue() const {
00247 const uint8_t *pos=pos_;
00248 int32_t leadByte=*pos++;
00249
00250 return readValue(pos, leadByte>>1);
00251 }
00252
00262 inline UBool hasUniqueValue(int32_t &uniqueValue) const {
00263 const uint8_t *pos=pos_;
00264
00265 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue);
00266 }
00267
00276 int32_t getNextBytes(ByteSink &out) const;
00277
00282 class U_COMMON_API Iterator : public UMemory {
00283 public:
00295 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
00296
00308 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
00309
00314 ~Iterator();
00315
00321 Iterator &reset();
00322
00327 UBool hasNext() const;
00328
00343 UBool next(UErrorCode &errorCode);
00344
00349 StringPiece getString() const;
00354 int32_t getValue() const { return value_; }
00355
00356 private:
00357 UBool truncateAndStop();
00358
00359 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
00360
00361 const uint8_t *bytes_;
00362 const uint8_t *pos_;
00363 const uint8_t *initialPos_;
00364 int32_t remainingMatchLength_;
00365 int32_t initialRemainingMatchLength_;
00366
00367 CharString *str_;
00368 int32_t maxLength_;
00369 int32_t value_;
00370
00371
00372
00373
00374
00375
00376
00377
00378 UVector32 *stack_;
00379 };
00380
00381 private:
00382 friend class BytesTrieBuilder;
00383 friend class ::BytesTrieTest;
00384
00391 BytesTrie(void *adoptBytes, const void *trieBytes)
00392 : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
00393 bytes_(static_cast<const uint8_t *>(trieBytes)),
00394 pos_(bytes_), remainingMatchLength_(-1) {}
00395
00396
00397 BytesTrie &operator=(const BytesTrie &other);
00398
00399 inline void stop() {
00400 pos_=NULL;
00401 }
00402
00403
00404
00405 static int32_t readValue(const uint8_t *pos, int32_t leadByte);
00406 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
00407
00408 if(leadByte>=(kMinTwoByteValueLead<<1)) {
00409 if(leadByte<(kMinThreeByteValueLead<<1)) {
00410 ++pos;
00411 } else if(leadByte<(kFourByteValueLead<<1)) {
00412 pos+=2;
00413 } else {
00414 pos+=3+((leadByte>>1)&1);
00415 }
00416 }
00417 return pos;
00418 }
00419 static inline const uint8_t *skipValue(const uint8_t *pos) {
00420 int32_t leadByte=*pos++;
00421 return skipValue(pos, leadByte);
00422 }
00423
00424
00425 static const uint8_t *jumpByDelta(const uint8_t *pos);
00426
00427 static inline const uint8_t *skipDelta(const uint8_t *pos) {
00428 int32_t delta=*pos++;
00429 if(delta>=kMinTwoByteDeltaLead) {
00430 if(delta<kMinThreeByteDeltaLead) {
00431 ++pos;
00432 } else if(delta<kFourByteDeltaLead) {
00433 pos+=2;
00434 } else {
00435 pos+=3+(delta&1);
00436 }
00437 }
00438 return pos;
00439 }
00440
00441 static inline UStringTrieResult valueResult(int32_t node) {
00442 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
00443 }
00444
00445
00446 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
00447
00448
00449 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
00450
00451
00452
00453
00454 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
00455 UBool haveUniqueValue, int32_t &uniqueValue);
00456
00457
00458 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
00459
00460
00461
00462 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
00463 static void append(ByteSink &out, int c);
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504 static const int32_t kMaxBranchLinearSubNodeLength=5;
00505
00506
00507 static const int32_t kMinLinearMatch=0x10;
00508 static const int32_t kMaxLinearMatchLength=0x10;
00509
00510
00511
00512
00513
00514
00515 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;
00516
00517 static const int32_t kValueIsFinal=1;
00518
00519
00520 static const int32_t kMinOneByteValueLead=kMinValueLead/2;
00521 static const int32_t kMaxOneByteValue=0x40;
00522
00523 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;
00524 static const int32_t kMaxTwoByteValue=0x1aff;
00525
00526 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;
00527 static const int32_t kFourByteValueLead=0x7e;
00528
00529
00530 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
00531
00532 static const int32_t kFiveByteValueLead=0x7f;
00533
00534
00535 static const int32_t kMaxOneByteDelta=0xbf;
00536 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;
00537 static const int32_t kMinThreeByteDeltaLead=0xf0;
00538 static const int32_t kFourByteDeltaLead=0xfe;
00539 static const int32_t kFiveByteDeltaLead=0xff;
00540
00541 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;
00542 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;
00543
00544
00545
00546
00547
00548 static constexpr int32_t kState64RemainingShift = 59;
00549 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
00550
00551 uint8_t *ownedArray_;
00552
00553
00554 const uint8_t *bytes_;
00555
00556
00557
00558
00559 const uint8_t *pos_;
00560
00561 int32_t remainingMatchLength_;
00562 };
00563
00564 U_NAMESPACE_END
00565
00566 #endif
00567
00568 #endif // __BYTESTRIE_H__