00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef UNICODESET_H
00014 #define UNICODESET_H
00015
00016 #include "unicode/utypes.h"
00017
00018 #if U_SHOW_CPLUSPLUS_API
00019
00020 #include "unicode/ucpmap.h"
00021 #include "unicode/unifilt.h"
00022 #include "unicode/unistr.h"
00023 #include "unicode/uset.h"
00024
00030 U_NAMESPACE_BEGIN
00031
00032
00033 class BMPSet;
00034 class ParsePosition;
00035 class RBBIRuleScanner;
00036 class SymbolTable;
00037 class UnicodeSetStringSpan;
00038 class UVector;
00039 class RuleCharacterIterator;
00040
00285 class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
00286 private:
00291 static constexpr int32_t INITIAL_CAPACITY = 25;
00292
00293 static constexpr uint8_t kIsBogus = 1;
00294
00295 UChar32* list = stackList;
00296 int32_t capacity = INITIAL_CAPACITY;
00297 int32_t len = 1;
00298 uint8_t fFlags = 0;
00299
00300 BMPSet *bmpSet = nullptr;
00301 UChar32* buffer = nullptr;
00302 int32_t bufferCapacity = 0;
00303
00313 char16_t *pat = nullptr;
00314 int32_t patLen = 0;
00315
00316 UVector* strings = nullptr;
00317 UnicodeSetStringSpan *stringSpan = nullptr;
00318
00324 UChar32 stackList[INITIAL_CAPACITY];
00325
00326 public:
00336 inline UBool isBogus(void) const;
00337
00354 void setToBogus();
00355
00356 public:
00357
00358 enum {
00363 MIN_VALUE = 0,
00364
00369 MAX_VALUE = 0x10ffff
00370 };
00371
00372
00373
00374
00375
00376 public:
00377
00382 UnicodeSet();
00383
00392 UnicodeSet(UChar32 start, UChar32 end);
00393
00394 #ifndef U_HIDE_INTERNAL_API
00395
00398 enum ESerialization {
00399 kSerialized
00400 };
00401
00412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
00413 ESerialization serialization, UErrorCode &status);
00414 #endif
00415
00424 UnicodeSet(const UnicodeString& pattern,
00425 UErrorCode& status);
00426
00427 #ifndef U_HIDE_INTERNAL_API
00428
00440 UnicodeSet(const UnicodeString& pattern,
00441 uint32_t options,
00442 const SymbolTable* symbols,
00443 UErrorCode& status);
00444 #endif
00445
00459 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
00460 uint32_t options,
00461 const SymbolTable* symbols,
00462 UErrorCode& status);
00463
00468 UnicodeSet(const UnicodeSet& o);
00469
00474 virtual ~UnicodeSet();
00475
00481 UnicodeSet& operator=(const UnicodeSet& o);
00482
00494 virtual bool operator==(const UnicodeSet& o) const;
00495
00501 inline bool operator!=(const UnicodeSet& o) const;
00502
00512 virtual UnicodeSet* clone() const override;
00513
00521 virtual int32_t hashCode(void) const;
00522
00531 inline static UnicodeSet *fromUSet(USet *uset);
00532
00541 inline static const UnicodeSet *fromUSet(const USet *uset);
00542
00550 inline USet *toUSet();
00551
00552
00560 inline const USet * toUSet() const;
00561
00562
00563
00564
00565
00566
00575 inline UBool isFrozen() const;
00576
00590 UnicodeSet *freeze();
00591
00600 UnicodeSet *cloneAsThawed() const;
00601
00602
00603
00604
00605
00615 UnicodeSet& set(UChar32 start, UChar32 end);
00616
00622 static UBool resemblesPattern(const UnicodeString& pattern,
00623 int32_t pos);
00624
00637 UnicodeSet& applyPattern(const UnicodeString& pattern,
00638 UErrorCode& status);
00639
00640 #ifndef U_HIDE_INTERNAL_API
00641
00657 UnicodeSet& applyPattern(const UnicodeString& pattern,
00658 uint32_t options,
00659 const SymbolTable* symbols,
00660 UErrorCode& status);
00661 #endif
00662
00694 UnicodeSet& applyPattern(const UnicodeString& pattern,
00695 ParsePosition& pos,
00696 uint32_t options,
00697 const SymbolTable* symbols,
00698 UErrorCode& status);
00699
00713 virtual UnicodeString& toPattern(UnicodeString& result,
00714 UBool escapeUnprintable = false) const override;
00715
00738 UnicodeSet& applyIntPropertyValue(UProperty prop,
00739 int32_t value,
00740 UErrorCode& ec);
00741
00771 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
00772 const UnicodeString& value,
00773 UErrorCode& ec);
00774
00787 virtual int32_t size(void) const;
00788
00795 virtual UBool isEmpty(void) const;
00796
00797 #ifndef U_HIDE_DRAFT_API
00798
00802 UBool hasStrings() const;
00803 #endif // U_HIDE_DRAFT_API
00804
00812 virtual UBool contains(UChar32 c) const override;
00813
00822 virtual UBool contains(UChar32 start, UChar32 end) const;
00823
00831 UBool contains(const UnicodeString& s) const;
00832
00840 virtual UBool containsAll(const UnicodeSet& c) const;
00841
00849 UBool containsAll(const UnicodeString& s) const;
00850
00859 UBool containsNone(UChar32 start, UChar32 end) const;
00860
00868 UBool containsNone(const UnicodeSet& c) const;
00869
00877 UBool containsNone(const UnicodeString& s) const;
00878
00887 inline UBool containsSome(UChar32 start, UChar32 end) const;
00888
00896 inline UBool containsSome(const UnicodeSet& s) const;
00897
00905 inline UBool containsSome(const UnicodeString& s) const;
00906
00925 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
00926
00939 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
00940
00958 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
00959
00973 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
00974
00993 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
00994
01012 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
01013
01018 virtual UMatchDegree matches(const Replaceable& text,
01019 int32_t& offset,
01020 int32_t limit,
01021 UBool incremental) override;
01022
01023 private:
01046 static int32_t matchRest(const Replaceable& text,
01047 int32_t start, int32_t limit,
01048 const UnicodeString& s);
01049
01059 int32_t findCodePoint(UChar32 c) const;
01060
01061 public:
01062
01070 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
01071
01080 int32_t indexOf(UChar32 c) const;
01081
01097 UChar32 charAt(int32_t index) const;
01098
01113 virtual UnicodeSet& add(UChar32 start, UChar32 end);
01114
01125 UnicodeSet& add(UChar32 c);
01126
01138 UnicodeSet& add(const UnicodeString& s);
01139
01140 private:
01146 static int32_t getSingleCP(const UnicodeString& s);
01147
01148 void _add(const UnicodeString& s);
01149
01150 public:
01159 UnicodeSet& addAll(const UnicodeString& s);
01160
01168 UnicodeSet& retainAll(const UnicodeString& s);
01169
01177 UnicodeSet& complementAll(const UnicodeString& s);
01178
01186 UnicodeSet& removeAll(const UnicodeString& s);
01187
01196 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
01197
01198
01206 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
01207
01219 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
01220
01221
01230 UnicodeSet& retain(UChar32 c);
01231
01242 UnicodeSet& retain(const UnicodeString &s);
01243
01257 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
01258
01269 UnicodeSet& remove(UChar32 c);
01270
01280 UnicodeSet& remove(const UnicodeString& s);
01281
01294 virtual UnicodeSet& complement();
01295
01308 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
01309
01320 UnicodeSet& complement(UChar32 c);
01321
01331 UnicodeSet& complement(const UnicodeString& s);
01332
01345 virtual UnicodeSet& addAll(const UnicodeSet& c);
01346
01358 virtual UnicodeSet& retainAll(const UnicodeSet& c);
01359
01371 virtual UnicodeSet& removeAll(const UnicodeSet& c);
01372
01383 virtual UnicodeSet& complementAll(const UnicodeSet& c);
01384
01391 virtual UnicodeSet& clear(void);
01392
01418 UnicodeSet& closeOver(int32_t attribute);
01419
01426 virtual UnicodeSet &removeAllStrings();
01427
01435 virtual int32_t getRangeCount(void) const;
01436
01444 virtual UChar32 getRangeStart(int32_t index) const;
01445
01453 virtual UChar32 getRangeEnd(int32_t index) const;
01454
01503 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
01504
01511 virtual UnicodeSet& compact();
01512
01524 static UClassID U_EXPORT2 getStaticClassID(void);
01525
01534 virtual UClassID getDynamicClassID(void) const override;
01535
01536 private:
01537
01538
01539
01540 friend class USetAccess;
01541
01542 const UnicodeString* getString(int32_t index) const;
01543
01544
01545
01546
01547
01548 private:
01549
01555 virtual UBool matchesIndexValue(uint8_t v) const override;
01556
01557 private:
01558 friend class RBBIRuleScanner;
01559
01560
01561
01562
01563
01564 UnicodeSet(const UnicodeSet& o, UBool );
01565 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
01566
01567
01568
01569
01570
01571 void applyPatternIgnoreSpace(const UnicodeString& pattern,
01572 ParsePosition& pos,
01573 const SymbolTable* symbols,
01574 UErrorCode& status);
01575
01576 void applyPattern(RuleCharacterIterator& chars,
01577 const SymbolTable* symbols,
01578 UnicodeString& rebuiltPat,
01579 uint32_t options,
01580 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
01581 int32_t depth,
01582 UErrorCode& ec);
01583
01584
01585
01586
01587
01588 static int32_t nextCapacity(int32_t minCapacity);
01589
01590 bool ensureCapacity(int32_t newLen);
01591
01592 bool ensureBufferCapacity(int32_t newLen);
01593
01594 void swapBuffers(void);
01595
01596 UBool allocateStrings(UErrorCode &status);
01597 int32_t stringsSize() const;
01598 UBool stringsContains(const UnicodeString &s) const;
01599
01600 UnicodeString& _toPattern(UnicodeString& result,
01601 UBool escapeUnprintable) const;
01602
01603 UnicodeString& _generatePattern(UnicodeString& result,
01604 UBool escapeUnprintable) const;
01605
01606 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
01607
01608 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
01609
01610 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
01611 UBool escapeUnprintable);
01612
01613
01614
01615
01616
01617 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
01618
01619 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
01620
01621 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
01622
01628 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
01629 int32_t pos);
01630
01631 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
01632 int32_t iterOpts);
01633
01673 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
01674 ParsePosition& ppos,
01675 UErrorCode &ec);
01676
01677 void applyPropertyPattern(RuleCharacterIterator& chars,
01678 UnicodeString& rebuiltPat,
01679 UErrorCode& ec);
01680
01681 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
01682
01687 typedef UBool (*Filter)(UChar32 codePoint, void* context);
01688
01698 void applyFilter(Filter filter,
01699 void* context,
01700 const UnicodeSet* inclusions,
01701 UErrorCode &status);
01702
01703
01704 void applyIntPropertyValue(const UCPMap *map,
01705 UCPMapValueFilter *filter, const void *context,
01706 UErrorCode &errorCode);
01707
01711 void setPattern(const UnicodeString& newPat) {
01712 setPattern(newPat.getBuffer(), newPat.length());
01713 }
01714 void setPattern(const char16_t *newPat, int32_t newPatLen);
01718 void releasePattern();
01719
01720 friend class UnicodeSetIterator;
01721 };
01722
01723
01724
01725 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
01726 return !operator==(o);
01727 }
01728
01729 inline UBool UnicodeSet::isFrozen() const {
01730 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
01731 }
01732
01733 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
01734 return !containsNone(start, end);
01735 }
01736
01737 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
01738 return !containsNone(s);
01739 }
01740
01741 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
01742 return !containsNone(s);
01743 }
01744
01745 inline UBool UnicodeSet::isBogus() const {
01746 return (UBool)(fFlags & kIsBogus);
01747 }
01748
01749 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
01750 return reinterpret_cast<UnicodeSet *>(uset);
01751 }
01752
01753 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
01754 return reinterpret_cast<const UnicodeSet *>(uset);
01755 }
01756
01757 inline USet *UnicodeSet::toUSet() {
01758 return reinterpret_cast<USet *>(this);
01759 }
01760
01761 inline const USet *UnicodeSet::toUSet() const {
01762 return reinterpret_cast<const USet *>(this);
01763 }
01764
01765 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
01766 int32_t sLength=s.length();
01767 if(start<0) {
01768 start=0;
01769 } else if(start>sLength) {
01770 start=sLength;
01771 }
01772 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
01773 }
01774
01775 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
01776 int32_t sLength=s.length();
01777 if(limit<0) {
01778 limit=0;
01779 } else if(limit>sLength) {
01780 limit=sLength;
01781 }
01782 return spanBack(s.getBuffer(), limit, spanCondition);
01783 }
01784
01785 U_NAMESPACE_END
01786
01787 #endif
01788
01789 #endif