ICU 57.1 57.1
uniset.h
Go to the documentation of this file.
1/*
2***************************************************************************
3* Copyright (C) 1999-2016, International Business Machines Corporation
4* and others. All Rights Reserved.
5***************************************************************************
6* Date Name Description
7* 10/20/99 alan Creation.
8***************************************************************************
9*/
10
11#ifndef UNICODESET_H
12#define UNICODESET_H
13
14#include "unicode/unifilt.h"
15#include "unicode/unistr.h"
16#include "unicode/uset.h"
17
24
25// Forward Declarations.
28class BMPSet;
29class ParsePosition;
30class RBBIRuleScanner;
31class SymbolTable;
32class UnicodeSetStringSpan;
33class UVector;
34class RuleCharacterIterator;
35
276class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter {
277
278 int32_t len; // length of list used; 0 <= len <= capacity
279 int32_t capacity; // capacity of list
280 UChar32* list; // MUST be terminated with HIGH
281 BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
282 UChar32* buffer; // internal buffer, may be NULL
283 int32_t bufferCapacity; // capacity of buffer
284 int32_t patLen;
285
295 UChar *pat;
296 UVector* strings; // maintained in sorted order
297 UnicodeSetStringSpan *stringSpan;
298
299private:
300 enum { // constants
301 kIsBogus = 1 // This set is bogus (i.e. not valid)
302 };
303 uint8_t fFlags; // Bit flag (see constants above)
304public:
314 inline UBool isBogus(void) const;
315
333
334public:
335
336 enum {
341 MIN_VALUE = 0,
342
347 MAX_VALUE = 0x10ffff
348 };
349
350 //----------------------------------------------------------------
351 // Constructors &c
352 //----------------------------------------------------------------
353
354public:
355
361
371
372#ifndef U_HIDE_INTERNAL_API
377 kSerialized /* result of serialize() */
378 };
379
390 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
391 ESerialization serialization, UErrorCode &status);
392#endif /* U_HIDE_INTERNAL_API */
393
402 UnicodeSet(const UnicodeString& pattern,
403 UErrorCode& status);
404
405#ifndef U_HIDE_INTERNAL_API
418 UnicodeSet(const UnicodeString& pattern,
419 uint32_t options,
420 const SymbolTable* symbols,
421 UErrorCode& status);
422#endif /* U_HIDE_INTERNAL_API */
423
438 uint32_t options,
439 const SymbolTable* symbols,
440 UErrorCode& status);
441
447
452 virtual ~UnicodeSet();
453
460
472 virtual UBool operator==(const UnicodeSet& o) const;
473
479 UBool operator!=(const UnicodeSet& o) const;
480
490 virtual UnicodeFunctor* clone() const;
491
499 virtual int32_t hashCode(void) const;
500
509 inline static UnicodeSet *fromUSet(USet *uset);
510
519 inline static const UnicodeSet *fromUSet(const USet *uset);
520
528 inline USet *toUSet();
529
530
538 inline const USet * toUSet() const;
539
540
541 //----------------------------------------------------------------
542 // Freezable API
543 //----------------------------------------------------------------
544
553 inline UBool isFrozen() const;
554
569
579
580 //----------------------------------------------------------------
581 // Public API
582 //----------------------------------------------------------------
583
595
601 static UBool resemblesPattern(const UnicodeString& pattern,
602 int32_t pos);
603
617 UErrorCode& status);
618
619#ifndef U_HIDE_INTERNAL_API
637 uint32_t options,
638 const SymbolTable* symbols,
639 UErrorCode& status);
640#endif /* U_HIDE_INTERNAL_API */
641
674 ParsePosition& pos,
675 uint32_t options,
676 const SymbolTable* symbols,
677 UErrorCode& status);
678
693 UBool escapeUnprintable = FALSE) const;
694
718 int32_t value,
719 UErrorCode& ec);
720
751 const UnicodeString& value,
752 UErrorCode& ec);
753
762 virtual int32_t size(void) const;
763
770 virtual UBool isEmpty(void) const;
771
779 virtual UBool contains(UChar32 c) const;
780
789 virtual UBool contains(UChar32 start, UChar32 end) const;
790
798 UBool contains(const UnicodeString& s) const;
799
807 virtual UBool containsAll(const UnicodeSet& c) const;
808
817
827
836
845
854 inline UBool containsSome(UChar32 start, UChar32 end) const;
855
863 inline UBool containsSome(const UnicodeSet& s) const;
864
872 inline UBool containsSome(const UnicodeString& s) const;
873
892 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
893
906 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
907
925 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
926
940 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
941
960 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
961
979 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
980
985 virtual UMatchDegree matches(const Replaceable& text,
986 int32_t& offset,
987 int32_t limit,
988 UBool incremental);
989
990private:
1013 static int32_t matchRest(const Replaceable& text,
1014 int32_t start, int32_t limit,
1015 const UnicodeString& s);
1016
1026 int32_t findCodePoint(UChar32 c) const;
1027
1028public:
1029
1037 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1038
1047 int32_t indexOf(UChar32 c) const;
1048
1058 UChar32 charAt(int32_t index) const;
1059
1074 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1075
1084
1097
1098 private:
1104 static int32_t getSingleCP(const UnicodeString& s);
1105
1106 void _add(const UnicodeString& s);
1107
1108 public:
1118
1128
1138
1148
1157 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1158
1159
1167 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1168
1182 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1183
1184
1191
1205 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1206
1215
1226
1234 virtual UnicodeSet& complement(void);
1235
1250 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1251
1260
1272
1285 virtual UnicodeSet& addAll(const UnicodeSet& c);
1286
1298 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1299
1311 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1312
1324
1331 virtual UnicodeSet& clear(void);
1332
1358 UnicodeSet& closeOver(int32_t attribute);
1359
1367
1375 virtual int32_t getRangeCount(void) const;
1376
1384 virtual UChar32 getRangeStart(int32_t index) const;
1385
1393 virtual UChar32 getRangeEnd(int32_t index) const;
1394
1443 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1444
1452
1464 static UClassID U_EXPORT2 getStaticClassID(void);
1465
1474 virtual UClassID getDynamicClassID(void) const;
1475
1476private:
1477
1478 // Private API for the USet API
1479
1480 friend class USetAccess;
1481
1482 int32_t getStringCount() const;
1483
1484 const UnicodeString* getString(int32_t index) const;
1485
1486 //----------------------------------------------------------------
1487 // RuleBasedTransliterator support
1488 //----------------------------------------------------------------
1489
1490private:
1491
1497 virtual UBool matchesIndexValue(uint8_t v) const;
1498
1499private:
1500 friend class RBBIRuleScanner;
1501
1502 //----------------------------------------------------------------
1503 // Implementation: Clone as thawed (see ICU4J Freezable)
1504 //----------------------------------------------------------------
1505
1506 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1507
1508 //----------------------------------------------------------------
1509 // Implementation: Pattern parsing
1510 //----------------------------------------------------------------
1511
1512 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1513 ParsePosition& pos,
1514 const SymbolTable* symbols,
1515 UErrorCode& status);
1516
1517 void applyPattern(RuleCharacterIterator& chars,
1518 const SymbolTable* symbols,
1519 UnicodeString& rebuiltPat,
1520 uint32_t options,
1521 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1522 UErrorCode& ec);
1523
1524 //----------------------------------------------------------------
1525 // Implementation: Utility methods
1526 //----------------------------------------------------------------
1527
1528 void ensureCapacity(int32_t newLen, UErrorCode& ec);
1529
1530 void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1531
1532 void swapBuffers(void);
1533
1534 UBool allocateStrings(UErrorCode &status);
1535
1536 UnicodeString& _toPattern(UnicodeString& result,
1537 UBool escapeUnprintable) const;
1538
1539 UnicodeString& _generatePattern(UnicodeString& result,
1540 UBool escapeUnprintable) const;
1541
1542 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1543
1544 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1545
1546 //----------------------------------------------------------------
1547 // Implementation: Fundamental operators
1548 //----------------------------------------------------------------
1549
1550 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1551
1552 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1553
1554 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1555
1561 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1562 int32_t pos);
1563
1564 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1565 int32_t iterOpts);
1566
1606 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1607 ParsePosition& ppos,
1608 UErrorCode &ec);
1609
1610 void applyPropertyPattern(RuleCharacterIterator& chars,
1611 UnicodeString& rebuiltPat,
1612 UErrorCode& ec);
1613
1614 friend void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
1615 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1616
1621 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1622
1632 void applyFilter(Filter filter,
1633 void* context,
1634 int32_t src,
1635 UErrorCode &status);
1636
1640 void setPattern(const UnicodeString& newPat);
1644 void releasePattern();
1645
1646 friend class UnicodeSetIterator;
1647};
1648
1649
1650
1652 return !operator==(o);
1653}
1654
1656 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1657}
1658
1660 return !containsNone(start, end);
1661}
1662
1664 return !containsNone(s);
1665}
1666
1668 return !containsNone(s);
1669}
1670
1672 return (UBool)(fFlags & kIsBogus);
1673}
1674
1676 return reinterpret_cast<UnicodeSet *>(uset);
1677}
1678
1679inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1680 return reinterpret_cast<const UnicodeSet *>(uset);
1681}
1682
1684 return reinterpret_cast<USet *>(this);
1685}
1686
1687inline const USet *UnicodeSet::toUSet() const {
1688 return reinterpret_cast<const USet *>(this);
1689}
1690
1691inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1692 int32_t sLength=s.length();
1693 if(start<0) {
1694 start=0;
1695 } else if(start>sLength) {
1696 start=sLength;
1697 }
1698 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1699}
1700
1701inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1702 int32_t sLength=s.length();
1703 if(limit<0) {
1704 limit=0;
1705 } else if(limit>sLength) {
1706 limit=sLength;
1707 }
1708 return spanBack(s.getBuffer(), limit, spanCondition);
1709}
1710
1712
1713#endif
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:47
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:71
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:54
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:59
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition: unifunct.h:33
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset,...
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:61
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
UnicodeFunctor * freeze()
Freeze the set (make it immutable).
virtual UnicodeSet & clear(void)
Removes all of the elements from this set.
static UnicodeSet * fromUSet(USet *uset)
Get a UnicodeSet pointer from a USet.
Definition: uniset.h:1675
UnicodeSet()
Constructs an empty set.
virtual int32_t getRangeCount(void) const
Iteration method that returns the number of ranges contained in this set.
UBool isFrozen() const
Determines whether the set has been frozen (made immutable) or not.
Definition: uniset.h:1655
virtual int32_t size(void) const
Returns the number of elements in this set (its cardinality).
static UnicodeSet * createFrom(const UnicodeString &s)
Makes a set from a multicharacter string.
UnicodeSet & addAll(const UnicodeString &s)
Adds each of the characters in this string to the set.
virtual UChar32 getRangeEnd(int32_t index) const
Iteration method that returns the last character in the specified range of this set.
static UClassID getStaticClassID(void)
Return the class ID for this class.
UnicodeSet & remove(UChar32 c)
Removes the specified character from this set if it is present.
UBool containsNone(const UnicodeSet &c) const
Returns true if this set contains none of the characters and strings of the given set.
virtual ~UnicodeSet()
Destructs the set.
UBool operator!=(const UnicodeSet &o) const
Compares the specified object with this set for inequality.
Definition: uniset.h:1651
UnicodeSet & add(UChar32 c)
Adds the specified character to this set if it is not already present.
virtual UnicodeSet & compact()
Reallocate this object's internal structures to take up the least possible space, without changing thi...
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher::matches()
void setToBogus()
Make this UnicodeSet object invalid.
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const
Implementation of UnicodeMatcher API.
UnicodeSet(const UnicodeSet &o)
Constructs a set that is identical to the given UnicodeSet.
UnicodeSet & remove(const UnicodeString &s)
Removes the specified string from this set if it is present.
UBool containsNone(UChar32 start, UChar32 end) const
Returns true if this set contains none of the characters of the given range.
virtual UnicodeSet & removeAllStrings()
Remove all strings from this set.
virtual UChar32 getRangeStart(int32_t index) const
Iteration method that returns the first character in the specified range of this set.
UnicodeSet & complement(UChar32 c)
Complements the specified character in this set.
virtual UnicodeSet & add(UChar32 start, UChar32 end)
Adds the specified range to this set if it is not already present.
static UBool resemblesPattern(const UnicodeString &pattern, int32_t pos)
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet patt...
UBool isBogus(void) const
Determine if this object contains a valid set.
Definition: uniset.h:1671
UnicodeSet & closeOver(int32_t attribute)
Close this set over the given attribute.
UnicodeSet & operator=(const UnicodeSet &o)
Assigns this object to be a copy of another.
int32_t indexOf(UChar32 c) const
Returns the index of the given character within this set, where the set is ordered by ascending code ...
UChar32 charAt(int32_t index) const
Returns the character at the given index within this set, where the set is ordered by ascending code ...
virtual UBool contains(UChar32 start, UChar32 end) const
Returns true if this set contains every character of the given range.
int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const
Serializes this set into an array of 16-bit integers.
virtual UBool contains(UChar32 c) const
Returns true if this set contains the given character.
UBool containsSome(UChar32 start, UChar32 end) const
Returns true if this set contains one or more of the characters in the given range.
Definition: uniset.h:1659
virtual UnicodeSet & remove(UChar32 start, UChar32 end)
Removes the specified range from this set if it is present.
virtual int32_t hashCode(void) const
Returns the hash code value for this set.
virtual UBool isEmpty(void) const
Returns true if this set contains no elements.
UnicodeSet & removeAll(const UnicodeString &s)
Remove EACH of the characters in this string.
UBool contains(const UnicodeString &s) const
Returns true if this set contains the given multicharacter string.
UnicodeSet & complementAll(const UnicodeString &s)
Complement EACH of the characters in this string.
UnicodeSet & applyPattern(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Parses the given pattern, starting at the given position.
UnicodeSet & add(const UnicodeString &s)
Adds the specified multicharacter string to this set if it is not already present.
virtual UBool operator==(const UnicodeSet &o) const
Compares the specified object with this set for equality.
ESerialization
Definition: uniset.h:376
UnicodeSet(UChar32 start, UChar32 end)
Constructs a set containing the given range.
UnicodeSet & retainAll(const UnicodeString &s)
Retains EACH of the characters in this string.
virtual UnicodeSet & retain(UChar32 start, UChar32 end)
Retain only the elements in this set that are contained in the specified range.
UnicodeSet(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const
Returns a string representation of this set.
USet * toUSet()
Produce a USet * pointer for this UnicodeSet.
Definition: uniset.h:1683
virtual UnicodeSet & complement(void)
Inverts this set.
UnicodeSet & retain(UChar32 c)
Retain the specified character from this set if it is present.
UnicodeSet & applyPattern(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, optionally ignoring Unicode Pa...
UBool containsNone(const UnicodeString &s) const
Returns true if this set contains none of the characters of the given string.
virtual UnicodeSet & complement(UChar32 start, UChar32 end)
Complements the specified range in this set.
virtual UnicodeSet & removeAll(const UnicodeSet &c)
Removes from this set all of its elements that are contained in the specified set.
virtual UnicodeFunctor * clone() const
Returns a copy of this object.
UnicodeSet & applyPattern(const UnicodeString &pattern, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, ignoring Unicode Pattern_White...
UnicodeSet & applyPropertyAlias(const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given property.
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UBool containsAll(const UnicodeString &s) const
Returns true if this set contains all the characters of the given string.
virtual UBool containsAll(const UnicodeSet &c) const
Returns true if this set contains all the characters and strings of the given set.
virtual UnicodeSet & retainAll(const UnicodeSet &c)
Retains only the elements in this set that are contained in the specified set.
virtual UClassID getDynamicClassID(void) const
Implement UnicodeFunctor API.
UnicodeSet & complement(const UnicodeString &s)
Complement the specified string in this set.
UnicodeSet(const UnicodeString &pattern, UErrorCode &status)
Constructs a set from the given pattern.
friend void UnicodeSet_initInclusion(int32_t src, UErrorCode &status)
UnicodeSet & set(UChar32 start, UChar32 end)
Make this object represent the range start - end.
UnicodeSet(const uint16_t buffer[], int32_t bufferLen, ESerialization serialization, UErrorCode &status)
Constructs a set from the output of serialize().
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
virtual UnicodeSet & addAll(const UnicodeSet &c)
Adds all of the elements in the specified set to this set if they're not already present.
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
virtual UnicodeSet & complementAll(const UnicodeSet &c)
Complements in this set all elements contained in the specified set.
static UnicodeSet * createFromAll(const UnicodeString &s)
Makes a set from each of the characters in the string.
UnicodeSet & applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given binary or enu...
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeFunctor * cloneAsThawed() const
Clone the set and make the clone mutable.
UnicodeSet(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:294
UChar * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3794
#define U_CALLCONV
Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary in callback function typedefs to ma...
Definition: platform.h:849
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:161
struct USet USet
Definition: ucnv.h:67
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:242
C++ API: Unicode Filter.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:30
U_NAMESPACE_BEGIN void UnicodeSet_initInclusion(int32_t src, UErrorCode &status)
C++ API: Unicode String.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
C API: Unicode Set.
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:150
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:186
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition: utypes.h:476
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition: utypes.h:357
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129