ICU 57.1  57.1
rbbi.h
Go to the documentation of this file.
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2014 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 
7 **********************************************************************
8 * Date Name Description
9 * 10/22/99 alan Creation.
10 * 11/11/99 rgillam Complete port from Java.
11 **********************************************************************
12 */
13 
14 #ifndef RBBI_H
15 #define RBBI_H
16 
17 #include "unicode/utypes.h"
18 
24 #if !UCONFIG_NO_BREAK_ITERATION
25 
26 #include "unicode/brkiter.h"
27 #include "unicode/udata.h"
28 #include "unicode/parseerr.h"
29 #include "unicode/schriter.h"
30 #include "unicode/uchriter.h"
31 
32 
/* Forward declaration. UTrie is not referenced by any declaration visible in this listing. */
33 struct UTrie;
34 
36 
/*
 * Forward declarations of types this header names without needing their
 * definitions; the visible members use them through pointers or references
 * (e.g. RBBIDataWrapper *fData, const RBBIStateTable * parameters).
 */
38 struct RBBIDataHeader;
39 class RuleBasedBreakIteratorTables;
40 class BreakIterator;
41 class RBBIDataWrapper;
42 class UStack;
43 class LanguageBreakEngine;
44 class UnhandledEngine;
45 struct RBBIStateTable;
46 
47 
48 
49 
66 
67 protected:
73 
80 
87 
94 
/** The rule data for this BreakIterator instance. */
99  RBBIDataWrapper *fData;
100 
105 
113 
120 
129 
135 
142 
151 
/**
 * If present, the special LanguageBreakEngine used for handling certain
 * dictionary characters. NOTE(review): the member description is truncated in
 * this listing; see the ICU API reference for the exact condition.
 */
159  UnhandledEngine *fUnhandledBreakEngine;
160 
/** The type of the break iterator, or -1 if it has not been set. */
166  int32_t fBreakType;
167 
168 protected:
169  //=======================================================================
170  // constructors
171  //=======================================================================
172 
173 #ifndef U_HIDE_INTERNAL_API
174 
/**
 * Constant to be used in the constructor
 * RuleBasedBreakIterator(const RBBIDataHeader*, EDontAdopt, UErrorCode&).
 * The enum has a single enumerator, so it only selects that overload.
 * NOTE(review): presumably it signals that the iterator must not take
 * ownership of the data header -- confirm against the implementation.
 */
182  enum EDontAdopt {
183  kDontAdopt
184  };
185 
/**
 * Protected, internal-API constructor (guarded by U_HIDE_INTERNAL_API) taking
 * a non-const rule data header and a UErrorCode reference.
 * NOTE(review): with no EDontAdopt tag this overload presumably adopts
 * `data` -- confirm against the implementation.
 */
196  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
197 
/**
 * Protected, internal-API constructor (guarded by U_HIDE_INTERNAL_API) taking
 * a const rule data header, the EDontAdopt tag and a UErrorCode reference.
 * NOTE(review): presumably the caller keeps ownership of `data` -- confirm
 * against the implementation.
 */
206  RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status);
207 #endif /* U_HIDE_INTERNAL_API */
208 
209 
/* Friend classes: RBBIRuleBuilder and BreakIterator may access this class's non-public members. */
210  friend class RBBIRuleBuilder;
212  friend class BreakIterator;
213 
214 
215 
216 public:
217 
223 
231 
241  UParseError &parseError,
242  UErrorCode &status);
243 
/**
 * Public constructor taking a pointer to const bytes (`compiledRules`), an
 * unsigned length and a UErrorCode reference.
 * NOTE(review): presumably `ruleLength` is the size of `compiledRules` in
 * bytes and failures are reported through `status` -- confirm.
 */
267  RuleBasedBreakIterator(const uint8_t *compiledRules,
268  uint32_t ruleLength,
269  UErrorCode &status);
270 
284 
/** Virtual destructor. */
289  virtual ~RuleBasedBreakIterator();
290 
/**
 * Copy-assignment from another RuleBasedBreakIterator; returns a
 * RuleBasedBreakIterator reference (presumably *this -- confirm).
 */
298  RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
299 
/**
 * Overrides BreakIterator::operator==, which the base class documents as
 * returning true if another object is semantically equal to this one.
 */
308  virtual UBool operator==(const BreakIterator& that) const;
309 
/**
 * Inequality against any BreakIterator. BreakIterator::operator!= is
 * documented as returning the complement of the result of operator==.
 */
317  UBool operator!=(const BreakIterator& that) const;
318 
/**
 * Overrides BreakIterator::clone, which the base class documents as returning
 * a polymorphic copy of this object.
 */
329  virtual BreakIterator* clone() const;
330 
/**
 * Const virtual member returning an int32_t.
 * NOTE(review): presumably a hash value for this iterator -- confirm.
 */
336  virtual int32_t hashCode(void) const;
337 
/**
 * Const virtual member returning a reference to a const UnicodeString.
 * NOTE(review): presumably the rule source this iterator was built from --
 * confirm.
 */
343  virtual const UnicodeString& getRules(void) const;
344 
345  //=======================================================================
346  // BreakIterator overrides
347  //=======================================================================
348 
/**
 * Overrides BreakIterator::getText, which the base class documents as
 * returning a CharacterIterator over the text being analyzed.
 */
374  virtual CharacterIterator& getText(void) const;
375 
376 
/**
 * Overrides BreakIterator::getUText, which the base class documents as
 * getting a UText for the text being analyzed.
 */
391  virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
392 
/**
 * Overrides BreakIterator::adoptText, which the base class documents as
 * changing the text over which this operates.
 */
400  virtual void adoptText(CharacterIterator* newText);
401 
/**
 * Overrides BreakIterator::setText(const UnicodeString&), which the base class
 * documents as changing the text over which this operates.
 */
408  virtual void setText(const UnicodeString& newText);
409 
/**
 * setText overload taking a UText pointer and a UErrorCode reference.
 * NOTE(review): presumably the UText counterpart of
 * setText(const UnicodeString&), reporting failures via `status` -- confirm.
 */
423  virtual void setText(UText *text, UErrorCode &status);
424 
/**
 * Overrides BreakIterator::first, which the base class documents as setting
 * the current iteration position to the beginning of the text, position zero.
 */
430  virtual int32_t first(void);
431 
/**
 * Overrides BreakIterator::last, which the base class documents as setting the
 * iterator position to the index immediately beyond the last character in the
 * text.
 */
437  virtual int32_t last(void);
438 
/**
 * next overload taking an int32_t count.
 * NOTE(review): presumably moves the iterator by `n` boundaries and returns
 * the resulting position -- confirm.
 */
449  virtual int32_t next(int32_t n);
450 
/**
 * Overrides BreakIterator::next(void), which the base class documents as
 * advancing the iterator to the boundary following the current boundary.
 */
456  virtual int32_t next(void);
457 
/**
 * Overrides BreakIterator::previous, which the base class documents as setting
 * the iterator position to the boundary preceding the current boundary.
 */
463  virtual int32_t previous(void);
464 
/**
 * Overrides BreakIterator::following, which the base class documents as
 * advancing the iterator to the first boundary following the specified offset.
 */
472  virtual int32_t following(int32_t offset);
473 
/**
 * Overrides BreakIterator::preceding, which the base class documents as
 * setting the iterator position to the first boundary preceding the specified
 * offset.
 */
481  virtual int32_t preceding(int32_t offset);
482 
/**
 * Overrides BreakIterator::isBoundary, which the base class documents as
 * returning true if the specified position is a boundary position.
 */
491  virtual UBool isBoundary(int32_t offset);
492 
/**
 * Overrides BreakIterator::current, which the base class documents as
 * returning the character index of the current iterator position within the
 * text.
 */
498  virtual int32_t current(void) const;
499 
500 
/**
 * Overrides BreakIterator::getRuleStatus, which the base class documents as
 * returning, for RuleBasedBreakIterators, the status tag from the break rule
 * that determined the most recent result.
 * NOTE(review): the base-class summary is truncated in this listing; see the
 * ICU API reference for the exact wording.
 */
533  virtual int32_t getRuleStatus() const;
534 
/**
 * Overrides BreakIterator::getRuleStatusVec, which the base class documents as
 * getting, for RuleBasedBreakIterators, the status (tag) values from the break
 * rule(s) that determined the most recent result.
 * NOTE(review): the base-class summary is truncated in this listing;
 * presumably `fillInVec` receives up to `capacity` values -- confirm.
 */
558  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
559 
/**
 * Overrides BreakIterator::getDynamicClassID, which the base class documents
 * as returning a polymorphic class ID for this object.
 */
571  virtual UClassID getDynamicClassID(void) const;
572 
/**
 * Static member returning a UClassID (the type ICU uses to identify classes
 * without the compiler's RTTI).
 * NOTE(review): presumably the ID that getDynamicClassID() reports for
 * instances of exactly this class -- confirm.
 */
584  static UClassID U_EXPORT2 getStaticClassID(void);
585 
/**
 * Overrides BreakIterator::createBufferClone, which the base class documents
 * as deprecated functionality.
 */
612  virtual BreakIterator * createBufferClone(void *stackBuffer,
613  int32_t &BufferSize,
614  UErrorCode &status);
615 
616 
/**
 * Virtual member returning a pointer to const bytes and taking `length` by
 * non-const reference.
 * NOTE(review): presumably returns the compiled rule data and stores its
 * size in bytes in `length` -- confirm.
 */
634  virtual const uint8_t *getBinaryRules(uint32_t &length);
635 
/**
 * Overrides BreakIterator::refreshInputText with a covariant return type. The
 * base class documents it as setting the subject text string upon which the
 * break iterator is operating without changing other aspects of its state.
 * NOTE(review): the base-class summary is truncated in this listing.
 */
661  virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
662 
663 
664 protected:
665  //=======================================================================
666  // implementation
667  //=======================================================================
/**
 * Protected virtual helper taking no arguments and returning nothing.
 * NOTE(review): presumably restores the iterator's internal state to its
 * initial condition -- confirm.
 */
673  virtual void reset(void);
674 
675 #if 0
676 
/* Compiled out by the enclosing `#if 0`; this declaration is not part of the class as built. */
684  virtual UBool isDictionaryChar(UChar32);
685 
/* Compiled out by the enclosing `#if 0`; this declaration is not part of the class as built. */
690  virtual int32_t getBreakType() const;
691 #endif
692 
/**
 * Protected virtual setter taking an int32_t break type.
 * NOTE(review): presumably stores `type` in fBreakType -- confirm.
 */
697  virtual void setBreakType(int32_t type);
698 
699 #ifndef U_HIDE_INTERNAL_API
700 
/**
 * Protected, internal-API (guarded by U_HIDE_INTERNAL_API), non-virtual
 * helper with no parameters.
 * NOTE(review): presumably common member initialization shared by the
 * constructors -- confirm.
 */
704  void init();
705 #endif /* U_HIDE_INTERNAL_API */
706 
707 private:
708 
/**
 * Private helper taking a const state table and returning an int32_t.
 * NOTE(review): presumably runs `statetable` backwards over the text and
 * returns the boundary position found -- confirm.
 */
718  int32_t handlePrevious(const RBBIStateTable *statetable);
719 
/**
 * Private helper taking a const state table and returning an int32_t.
 * NOTE(review): presumably runs `statetable` forwards over the text and
 * returns the boundary position found -- confirm.
 */
729  int32_t handleNext(const RBBIStateTable *statetable);
730 
731 protected:
732 
733 #ifndef U_HIDE_INTERNAL_API
734 
/**
 * Protected, internal-API (guarded by U_HIDE_INTERNAL_API) helper taking a
 * start position, an end position and a direction flag.
 * NOTE(review): presumably applies dictionary-based breaking to the range
 * [startPos, endPos) and returns the resulting boundary -- confirm.
 */
748  int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
749 #endif /* U_HIDE_INTERNAL_API */
750 
751 private:
752 
/**
 * Private helper returning a pointer to a const LanguageBreakEngine for a
 * code point.
 * NOTE(review): presumably looks up an engine able to handle `c` (see
 * fLanguageBreakEngines / fUnhandledBreakEngine) -- confirm.
 */
759  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
760 
/**
 * Private helper with no parameters and no return value.
 * NOTE(review): presumably brings fLastRuleStatusIndex up to date when
 * fLastStatusIndexValid is false -- confirm.
 */
764  void makeRuleStatusValid();
765 
766 };
767 
768 //------------------------------------------------------------------------------
769 //
770 // Inline Functions Definitions ...
771 //
772 //------------------------------------------------------------------------------
773 
775  return !operator==(that);
776 }
777 
779 
780 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
781 
782 #endif
icu::RuleBasedBreakIterator::fLanguageBreakEngines
UStack * fLanguageBreakEngines
If present, UStack of LanguageBreakEngine objects that might handle dictionary characters.
Definition: rbbi.h:150
icu::RuleBasedBreakIterator::fSCharIter
StringCharacterIterator * fSCharIter
When the input text is provided by a UnicodeString, this will point to a characterIterator that wraps...
Definition: rbbi.h:86
icu::BreakIterator
The BreakIterator class implements methods for finding the location of boundaries in text.
Definition: brkiter.h:100
parseerr.h
C API: Parse Error Information.
icu::RuleBasedBreakIterator::fCachedBreakPositions
int32_t * fCachedBreakPositions
When a range of characters is divided up using the dictionary, the break positions that are discovere...
Definition: rbbi.h:128
utypes.h
Basic definitions for ICU, for both C and C++ APIs.
icu::BreakIterator::current
virtual int32_t current(void) const =0
Return character index of the current iterator position within the text.
UBool
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234
icu::RuleBasedBreakIterator::fData
RBBIDataWrapper * fData
The rule data for this BreakIterator instance.
Definition: rbbi.h:99
icu::BreakIterator::getRuleStatus
virtual int32_t getRuleStatus() const
For RuleBasedBreakIterators, return the status tag from the break rule that determined the most recen...
icu::operator==
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
icu::RuleBasedBreakIterator::fPositionInCache
int32_t fPositionInCache
if fCachedBreakPositions is not null, this indicates which item in the cache the current iteration po...
Definition: rbbi.h:141
U_COMMON_API
#define U_COMMON_API
Definition: utypes.h:357
icu::BreakIterator::createBufferClone
virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Deprecated functionality.
icu::BreakIterator::setText
virtual void setText(const UnicodeString &text)=0
Change the text over which this operates.
icu::RuleBasedBreakIterator::EDontAdopt
EDontAdopt
Constant to be used in the constructor RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt,...
Definition: rbbi.h:182
icu::BreakIterator::isBoundary
virtual UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
brkiter.h
C++ API: Break Iterator.
UParseError
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
icu::BreakIterator::getDynamicClassID
virtual UClassID getDynamicClassID(void) const =0
Return a polymorphic class ID for this object.
icu::BreakIterator::preceding
virtual int32_t preceding(int32_t offset)=0
Set the iterator position to the first boundary preceding the specified offset.
icu::UnicodeString
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:293
schriter.h
C++ API: String Character Iterator.
icu::RuleBasedBreakIterator::fLastRuleStatusIndex
int32_t fLastRuleStatusIndex
Index of the Rule {tag} values for the most recent match.
Definition: rbbi.h:104
icu::StringCharacterIterator
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: schriter.h:43
UChar32
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
UClassID
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
icu::BreakIterator::refreshInputText
virtual BreakIterator & refreshInputText(UText *input, UErrorCode &status)=0
Set the subject text string upon which the break iterator is operating without changing any other asp...
UErrorCode
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition: utypes.h:476
icu::RuleBasedBreakIterator::fBreakType
int32_t fBreakType
The type of the break iterator, or -1 if it has not been set.
Definition: rbbi.h:166
icu::RuleBasedBreakIterator::fNumCachedBreakPositions
int32_t fNumCachedBreakPositions
The number of elements in fCachedBreakPositions.
Definition: rbbi.h:134
icu::RuleBasedBreakIterator
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
udata.h
C API: Data loading interface.
icu::BreakIterator::following
virtual int32_t following(int32_t offset)=0
Advance the iterator to the first boundary following the specified offset.
icu::operator!=
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
uchriter.h
C++ API: UChar Character Iterator.
UText
UText struct.
Definition: utext.h:1343
icu::BreakIterator::adoptText
virtual void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
icu::BreakIterator::operator!=
UBool operator!=(const BreakIterator &rhs) const
Returns the complement of the result of operator==.
Definition: brkiter.h:129
icu::RuleBasedBreakIterator::fUnhandledBreakEngine
UnhandledEngine * fUnhandledBreakEngine
If present, the special LanguageBreakEngine used for handling characters that are in the dictionary s...
Definition: rbbi.h:159
UDataMemory
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:156
icu::BreakIterator::last
virtual int32_t last(void)=0
Set the iterator position to the index immediately BEYOND the last character in the text being scanne...
icu::RuleBasedBreakIterator::fDictionaryCharCount
uint32_t fDictionaryCharCount
Counter for the number of characters encountered with the "dictionary" flag set.
Definition: rbbi.h:119
icu::BreakIterator::getRuleStatusVec
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status)
For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) that determined the m...
icu::CharacterIterator
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:356
icu::RuleBasedBreakIterator::fText
UText * fText
The UText through which this BreakIterator accesses the text.
Definition: rbbi.h:72
icu::RuleBasedBreakIterator::fLastStatusIndexValid
UBool fLastStatusIndexValid
Rule tag value valid flag.
Definition: rbbi.h:112
icu::BreakIterator::clone
virtual BreakIterator * clone(void) const =0
Return a polymorphic copy of this object.
icu::BreakIterator::operator==
virtual UBool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
icu::BreakIterator::first
virtual int32_t first(void)=0
Sets the current iteration position to the beginning of the text, position zero.
icu::BreakIterator::getUText
virtual UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
icu::BreakIterator::next
virtual int32_t next(void)=0
Advance the iterator to the boundary following the current boundary.
icu::UCharCharacterIterator
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:33
icu::BreakIterator::previous
virtual int32_t previous(void)=0
Set the iterator position to the boundary preceding the current boundary.
icu::BreakIterator::getText
virtual CharacterIterator & getText(void) const =0
Return a CharacterIterator over the text being analyzed.
U_NAMESPACE_END
#define U_NAMESPACE_END
Definition: uversion.h:130
U_NAMESPACE_BEGIN
#define U_NAMESPACE_BEGIN
Definition: uversion.h:129
icu::RuleBasedBreakIterator::fDCharIter
UCharCharacterIterator * fDCharIter
When the input text is provided by a UText, this dummy CharacterIterator over an empty string will be...
Definition: rbbi.h:93
icu::RuleBasedBreakIterator::fCharIter
CharacterIterator * fCharIter
A character iterator that refers to the same text as the UText, above.
Definition: rbbi.h:79