7#ifndef __UTFITERATOR_H__
8#define __UTFITERATOR_H__
12#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
15#if defined(__cpp_lib_ranges)
135#ifndef U_HIDE_DRAFT_API
170namespace U_HEADER_ONLY_NAMESPACE {
173#if U_CPLUSPLUS_VERSION >= 20
176template<
typename Iter>
180template<
typename Iter>
184template<
typename Iter>
188template<
typename Iter>
192template<
typename Range>
193constexpr bool range = std::ranges::range<Range>;
198template<
typename Iter>
202template<
typename Iter>
206template<
typename Iter>
209 std::forward_iterator_tag,
210 typename std::iterator_traits<Iter>::iterator_category>;
213template<
typename Iter>
216 std::bidirectional_iterator_tag,
217 typename std::iterator_traits<Iter>::iterator_category>;
220template<
typename Range,
typename =
void>
224template<
typename Range>
227 std::void_t<decltype(std::declval<Range>().begin()),
228 decltype(std::declval<Range>().end())>> : std::true_type {};
231template<
typename Range>
240template <
typename... Args>
247template<
typename CP32,
bool skipSurrogates>
249 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
273 if (skipSurrogates && c_ == 0xd800) {
301template<
typename CP32>
303 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
332template<
typename CP32>
334 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
366template<
typename CP32,
typename UnitIter,
typename =
void>
368 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
394 UnitIter
begin()
const {
return start_; }
401 UnitIter
end()
const {
return limit_; }
409#if U_CPLUSPLUS_VERSION >= 20
415 template<std::contiguous_iterator Iter = UnitIter>
416 std::basic_string_view<Unit>
stringView()
const {
417 return std::basic_string_view<Unit>(
begin(),
end());
425 template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
426 std::enable_if_t<std::is_pointer_v<Iter> ||
427 std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
428 std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
429 std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
430 std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
431 std::basic_string_view<Unit>>
433 return std::basic_string_view<Unit>(&*start_, len_);
448template<
typename CP32,
typename UnitIter>
449class UnsafeCodeUnits<
452 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
453 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
462 uint8_t
length()
const {
return len_; }
486template<
typename CP32,
typename UnitIter,
typename =
void>
511template<
typename CP32,
typename UnitIter>
515 std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
516 public UnsafeCodeUnits<CP32, UnitIter> {
521 CodeUnits(
const CodeUnits &other) =
default;
535 typename UnitIter,
typename LimitIter = UnitIter,
typename =
void>
549template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
553 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
554 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
556 "For 8-bit strings, the SURROGATE option does not have an equivalent.");
568 U_FORCE_INLINE static void inc(UnitIter &p,
const LimitIter &limit) {
574 if ((0xe0 <= b && b < 0xf0)) {
579 }
else if (b < 0xe0) {
609 if (0xe0 <= b2 && b2 <= 0xf4) {
628 UnitIter &p0, UnitIter &p,
const LimitIter &limit) {
629 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
631 CP32 c = uint8_t(*p);
634 if constexpr (isMultiPass) {
635 return {c, 1,
true, p0, p};
651 (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
652 (t = *p - 0x80) <= 0x3f) &&
654 (c = (c << 6) | t, ++length, ++p != limit)
656 c >= 0xc2 && (c &= 0x1f, 1)) &&
658 (t = *p - 0x80) <= 0x3f) {
662 if constexpr (isMultiPass) {
663 return {c, length,
true, p0, p};
665 return {c, length,
true};
668 if constexpr (isMultiPass) {
669 return {sub(), length,
false, p0, p};
671 return {sub(), length,
false};
675 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
678 CP32 c = uint8_t(*--p);
680 return {c, 1,
true, p, p0};
688 c = ((b1 - 0xc0) << 6) | (c & 0x3f);
689 return {c, 2,
true, p, p0};
690 }
else if (b1 < 0xf0 ?
695 return {sub(), 2,
false, p, p0};
701 if (0xe0 <= b2 && b2 <= 0xf4) {
706 c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
707 return {c, 3,
true, p, p0};
712 return {sub(), 3,
false, p, p0};
716 if (0xf0 <= b3 && b3 <= 0xf4) {
720 c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
721 return {c, 4,
true, p, p0};
727 return {sub(), 1,
false, p, p0};
732template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
736 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
737 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
751 U_FORCE_INLINE static void inc(UnitIter &p,
const LimitIter &limit) {
769 UnitIter &p0, UnitIter &p,
const LimitIter &limit) {
770 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
772 CP32 c =
static_cast<CP32
>(*p);
775 if constexpr (isMultiPass) {
776 return {c, 1,
true, p0, p};
785 if constexpr (isMultiPass) {
786 return {c, 2,
true, p0, p};
791 if constexpr (isMultiPass) {
792 return {sub(c), 1,
false, p0, p};
794 return {sub(c), 1,
false};
800 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
803 CP32 c =
static_cast<CP32
>(*--p);
805 return {c, 1,
true, p, p0};
812 return {c, 2,
true, p, p0};
814 return {sub(c), 1,
false, p, p0};
821template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
825 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
826 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
829 U_FORCE_INLINE static CP32 sub(
bool forSurrogate, CP32 surrogate) {
836 return forSurrogate ? surrogate : 0xfffd;
849 UnitIter &p0, UnitIter &p,
const LimitIter &) {
850 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
854 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
855 if constexpr (isMultiPass) {
856 return {c, 1,
true, p0, p};
861 if constexpr (isMultiPass) {
862 return {sub(uc < 0xe000, c), 1,
false, p0, p};
864 return {sub(uc < 0xe000, c), 1,
false};
869 U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter , UnitIter &p) {
873 if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
874 return {c, 1,
true, p, p0};
876 return {sub(uc < 0xe000, c), 1,
false, p, p0};
883template<
typename CP32,
typename UnitIter,
typename =
void>
887template<
typename CP32,
typename UnitIter>
891 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
892 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
905 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
906 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
908 CP32 c = uint8_t(*p);
911 if constexpr (isMultiPass) {
912 return {c, 1, p0, p};
916 }
else if (c < 0xe0) {
917 c = ((c & 0x1f) << 6) | (*p & 0x3f);
919 if constexpr (isMultiPass) {
920 return {c, 2, p0, p};
924 }
else if (c < 0xf0) {
927 c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
931 if constexpr (isMultiPass) {
932 return {c, 3, p0, p};
937 c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
939 c |= (*p & 0x3f) << 6;
943 if constexpr (isMultiPass) {
944 return {c, 4, p0, p};
951 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
954 CP32 c = uint8_t(*--p);
956 return {c, 1, p, p0};
961 for (uint8_t shift = 6;;) {
965 c |= uint32_t{b} << shift;
968 c |= (uint32_t{b} & 0x3f) << shift;
974 return {c, count, p, p0};
979template<
typename CP32,
typename UnitIter>
983 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
984 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1002 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1003 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1005 CP32 c =
static_cast<CP32
>(*p);
1008 if constexpr (isMultiPass) {
1009 return {c, 1, p0, p};
1017 if constexpr (isMultiPass) {
1018 return {c, 2, p0, p};
1025 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1028 CP32 c =
static_cast<CP32
>(*--p);
1030 return {c, 1, p, p0};
1034 return {c, 2, p, p0};
1040template<
typename CP32,
typename UnitIter>
1044 std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1045 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1055 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1056 constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1059 if constexpr (isMultiPass) {
1060 return {c, 1, p0, p};
1066 U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1069 return {c, 1, p, p0};
1101 typename UnitIter,
typename LimitIter = UnitIter,
typename =
void>
1103 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1104 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1128 prv::bidirectional_iterator<UnitIter>,
1129 std::bidirectional_iterator_tag,
1130 std::forward_iterator_tag>;
1146 p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1159 p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1195 return getLogicalPosition() == other.getLogicalPosition();
1214 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1217 return iter.getLogicalPosition() == s;
1220#if U_CPLUSPLUS_VERSION < 20
1233 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1236 return iter.getLogicalPosition() == s;
1246 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1257 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1271 units_ = Impl::readAndInc(p0, p_, limit_);
1288 units_ = Impl::readAndInc(p0, p_, limit_);
1291 return Proxy(units_);
1304 }
else if (state_ == 0) {
1305 Impl::inc(p_, limit_);
1328 }
else if (state_ == 0) {
1330 units_ = Impl::readAndInc(p0, p_, limit_);
1351 template<
typename Iter = UnitIter>
1353 std::enable_if_t<prv::bidirectional_iterator<Iter>,
UTFIterator &>
1357 p_ = units_.begin();
1359 units_ = Impl::decAndRead(start_, p_);
1371 template<
typename Iter = UnitIter>
1373 std::enable_if_t<prv::bidirectional_iterator<Iter>,
UTFIterator>
1381 friend class std::reverse_iterator<
UTFIterator<CP32, behavior, UnitIter>>;
1384 return state_ <= 0 ? p_ : units_.begin();
1388 mutable UnitIter p_;
1395 mutable CodeUnits<CP32, UnitIter> units_;
1400 mutable int8_t state_ = 0;
1405template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter,
typename LimitIter>
1408 UnitIter, LimitIter,
1409 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1410 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1411 using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1418 explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1419 CodeUnits<CP32, UnitIter> &operator*() {
return units_; }
1420 CodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
1422 CodeUnits<CP32, UnitIter> units_;
1426 using value_type = CodeUnits<CP32, UnitIter>;
1445 return p_ == other.p_ && ahead_ == other.ahead_;
1453 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1456 return !iter.ahead_ && iter.p_ == s;
1459#if U_CPLUSPLUS_VERSION < 20
1462 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1465 return !iter.ahead_ && iter.p_ == s;
1470 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1476 !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1483 units_ = Impl::readAndInc(p_, p_, limit_);
1491 units_ = Impl::readAndInc(p_, p_, limit_);
1494 return Proxy(units_);
1502 Impl::inc(p_, limit_);
1512 units_ = Impl::readAndInc(p_, p_, limit_);
1515 return Proxy(units_);
1520 mutable UnitIter p_;
1526 mutable CodeUnits<CP32, UnitIter> units_ = {0, 0,
false};
1530 mutable bool ahead_ =
false;
1540template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter>
1541class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1542 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1543 using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1550 explicit Proxy(CodeUnits_ units) : units_(units) {}
1551 CodeUnits_ &operator*() {
return units_; }
1552 CodeUnits_ *operator->() {
return &units_; }
1558 using value_type = CodeUnits_;
1559 using reference = value_type;
1560 using pointer = Proxy;
1562 using iterator_category = std::bidirectional_iterator_tag;
1565 p_(iter.getLogicalPosition()), start_(iter.start_), limit_(iter.limit_),
1566 units_(0, 0, false, p_, p_) {}
1567 U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1569 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src)
noexcept =
default;
1570 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src)
noexcept =
default;
1572 U_FORCE_INLINE reverse_iterator(
const reverse_iterator &other) =
default;
1573 U_FORCE_INLINE reverse_iterator &operator=(
const reverse_iterator &other) =
default;
1576 return getLogicalPosition() == other.getLogicalPosition();
1582 units_ = Impl::decAndRead(start_, p_);
1590 units_ = Impl::decAndRead(start_, p_);
1593 return Proxy(units_);
1600 }
else if (state_ == 0) {
1601 Impl::dec(start_, p_);
1604 p_ = units_.begin();
1613 reverse_iterator result(*
this);
1616 }
else if (state_ == 0) {
1617 units_ = Impl::decAndRead(start_, p_);
1618 reverse_iterator result(*
this);
1623 reverse_iterator result(*
this);
1625 p_ = units_.begin();
1637 units_ = Impl::readAndInc(p0, p_, limit_);
1643 reverse_iterator result(*
this);
1650 return state_ >= 0 ? p_ : units_.end();
1654 mutable UnitIter p_;
1661 mutable CodeUnits_ units_;
1666 mutable int8_t state_ = 0;
1670namespace U_HEADER_ONLY_NAMESPACE {
1695 typename UnitIter,
typename LimitIter = UnitIter>
1696auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1697 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1698 std::move(start), std::move(p), std::move(limit));
1722 typename UnitIter,
typename LimitIter = UnitIter>
1723auto utfIterator(UnitIter p, LimitIter limit) {
1724 return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1725 std::move(p), std::move(limit));
1752template<
typename CP32, UTFIllFormedBehavior behavior,
typename UnitIter>
1753auto utfIterator(UnitIter p) {
1754 return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1784template<
typename CP32, UTFIllFormedBehavior behavior,
typename Range>
1785class UTFStringCodePoints {
1786 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1792 UTFStringCodePoints() =
default;
1799 template<
typename R = Range,
typename = std::enable_if_t<!std::is_reference_v<R>>>
1800 explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
1809 template<
typename R = Range,
typename = std::enable_if_t<std::is_reference_v<R>>,
typename =
void>
1810 explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1813 UTFStringCodePoints(
const UTFStringCodePoints &other) =
default;
1816 UTFStringCodePoints &operator=(
const UTFStringCodePoints &other) =
default;
1823 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1830 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
1831 auto begin()
const {
1832 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1840 using UnitIter =
decltype(unitRange.begin());
1841 using LimitIter =
decltype(unitRange.end());
1842 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1844 return unitRange.end();
1845 }
else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1846 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1849 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1857 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
1859 using UnitIter =
decltype(unitRange.begin());
1860 using LimitIter =
decltype(unitRange.end());
1861 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1863 return unitRange.end();
1864 }
else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1865 return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1868 return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1876 auto rbegin()
const {
1877 return std::make_reverse_iterator(end());
1885 return std::make_reverse_iterator(begin());
1893template<
typename CP32, UTFIllFormedBehavior behavior>
1894struct UTFStringCodePointsAdaptor
1896 __cpp_lib_bind_back >= 2022'02
1897 : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1901 template<
typename Range>
1902 auto operator()(Range &&unitRange)
const {
1903#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10
1904 return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
1905 std::forward<Range>(unitRange));
1907 if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
1910 return UTFStringCodePoints<CP32, behavior, std::decay_t<Range>>(
1911 std::forward<Range>(unitRange));
1913 return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1933template<
typename CP32, UTFIllFormedBehavior behavior>
1934constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
1959template<
typename CP32,
typename UnitIter,
typename =
void>
1960class UnsafeUTFIterator {
1961 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
1962 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
1968 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
1969 UnsafeCodeUnits<CP32, UnitIter> &operator*() {
return units_; }
1970 UnsafeCodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
1972 UnsafeCodeUnits<CP32, UnitIter> units_;
1977 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
1979 using reference = value_type;
1981 using pointer = Proxy;
1983 using difference_type = prv::iter_difference_t<UnitIter>;
1985 using iterator_category = std::conditional_t<
1986 prv::bidirectional_iterator<UnitIter>,
1987 std::bidirectional_iterator_tag,
1988 std::forward_iterator_tag>;
1999 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
2005 U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
2008 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src)
noexcept =
default;
2010 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src)
noexcept =
default;
2013 U_FORCE_INLINE UnsafeUTFIterator(
const UnsafeUTFIterator &other) =
default;
2015 U_FORCE_INLINE UnsafeUTFIterator &operator=(
const UnsafeUTFIterator &other) =
default;
2023 return getLogicalPosition() == other.getLogicalPosition();
2040 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2042 operator==(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
2043 return iter.getLogicalPosition() == s;
2046#if U_CPLUSPLUS_VERSION < 20
2055 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2057 operator==(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
2058 return iter.getLogicalPosition() == s;
2068 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2070 operator!=(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
return !(iter == s); }
2079 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2081 operator!=(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
return !(iter == s); }
2090 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*()
const {
2093 units_ = Impl::readAndInc(p0, p_);
2110 units_ = Impl::readAndInc(p0, p_);
2113 return Proxy(units_);
2126 }
else if (state_ == 0) {
2147 UnsafeUTFIterator result(*
this);
2150 }
else if (state_ == 0) {
2152 units_ = Impl::readAndInc(p0, p_);
2153 UnsafeUTFIterator result(*
this);
2158 UnsafeUTFIterator result(*
this);
2173 template<
typename Iter = UnitIter>
2175 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2179 p_ = units_.begin();
2181 units_ = Impl::decAndRead(p_);
2193 template<
typename Iter = UnitIter>
2195 std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2197 UnsafeUTFIterator result(*
this);
2203 friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2206 return state_ <= 0 ? p_ : units_.begin();
2210 mutable UnitIter p_;
2213 mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2218 mutable int8_t state_ = 0;
2223template<
typename CP32,
typename UnitIter>
2224class UnsafeUTFIterator<
2227 std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2228 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
2229 using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2236 explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2237 UnsafeCodeUnits<CP32, UnitIter> &operator*() {
return units_; }
2238 UnsafeCodeUnits<CP32, UnitIter> *operator->() {
return &units_; }
2240 UnsafeCodeUnits<CP32, UnitIter> units_;
2244 using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2245 using reference = value_type;
2246 using pointer = Proxy;
2247 using difference_type = prv::iter_difference_t<UnitIter>;
2248 using iterator_category = std::input_iterator_tag;
2250 U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2252 U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src)
noexcept =
default;
2253 U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src)
noexcept =
default;
2255 U_FORCE_INLINE UnsafeUTFIterator(
const UnsafeUTFIterator &other) =
default;
2256 U_FORCE_INLINE UnsafeUTFIterator &operator=(
const UnsafeUTFIterator &other) =
default;
2259 return p_ == other.p_ && ahead_ == other.ahead_;
2267 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2269 operator==(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
2270 return !iter.ahead_ && iter.p_ == s;
2273#if U_CPLUSPLUS_VERSION < 20
2276 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2278 operator==(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
2279 return !iter.ahead_ && iter.p_ == s;
2284 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2286 operator!=(
const UnsafeUTFIterator &iter,
const Sentinel &s) {
return !(iter == s); }
2290 !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2292 operator!=(
const Sentinel &s,
const UnsafeUTFIterator &iter) {
return !(iter == s); }
2295 U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*()
const {
2297 units_ = Impl::readAndInc(p_, p_);
2305 units_ = Impl::readAndInc(p_, p_);
2308 return Proxy(units_);
2326 units_ = Impl::readAndInc(p_, p_);
2329 return Proxy(units_);
2334 mutable UnitIter p_;
2337 mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2341 mutable bool ahead_ =
false;
2351template<
typename CP32,
typename UnitIter>
2352class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2353 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
2354 using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2361 explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2362 UnsafeCodeUnits_ &operator*() {
return units_; }
2363 UnsafeCodeUnits_ *operator->() {
return &units_; }
2365 UnsafeCodeUnits_ units_;
2369 using value_type = UnsafeCodeUnits_;
2370 using reference = value_type;
2371 using pointer = Proxy;
2373 using iterator_category = std::bidirectional_iterator_tag;
2375 U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
2376 p_(iter.getLogicalPosition()), units_(0, 0, p_, p_) {}
2377 U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2379 U_FORCE_INLINE reverse_iterator(reverse_iterator &&src)
noexcept =
default;
2380 U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src)
noexcept =
default;
2382 U_FORCE_INLINE reverse_iterator(
const reverse_iterator &other) =
default;
2383 U_FORCE_INLINE reverse_iterator &operator=(
const reverse_iterator &other) =
default;
2386 return getLogicalPosition() == other.getLogicalPosition();
2392 units_ = Impl::decAndRead(p_);
2400 units_ = Impl::decAndRead(p_);
2403 return Proxy(units_);
2410 }
else if (state_ == 0) {
2414 p_ = units_.begin();
2423 reverse_iterator result(*
this);
2426 }
else if (state_ == 0) {
2427 units_ = Impl::decAndRead(p_);
2428 reverse_iterator result(*
this);
2433 reverse_iterator result(*
this);
2435 p_ = units_.begin();
2447 units_ = Impl::readAndInc(p0, p_);
2453 reverse_iterator result(*
this);
2460 return state_ >= 0 ? p_ : units_.end();
2464 mutable UnitIter p_;
2467 mutable UnsafeCodeUnits_ units_;
2472 mutable int8_t state_ = 0;
2476namespace U_HEADER_ONLY_NAMESPACE {
2493template<
typename CP32,
typename UnitIter>
2494auto unsafeUTFIterator(UnitIter iter) {
2495 return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2525template<
typename CP32,
typename Range>
2526class UnsafeUTFStringCodePoints {
2527 static_assert(
sizeof(CP32) == 4,
"CP32 must be a 32-bit type to hold a code point");
2533 UnsafeUTFStringCodePoints() =
default;
2540 template<
typename R = Range,
typename = std::enable_if_t<!std::is_reference_v<R>>>
2541 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
2550 template<
typename R = Range,
typename = std::enable_if_t<std::is_reference_v<R>>,
typename =
void>
2551 explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2554 UnsafeUTFStringCodePoints(
const UnsafeUTFStringCodePoints &other) =
default;
2557 UnsafeUTFStringCodePoints &operator=(
const UnsafeUTFStringCodePoints &other) =
default;
2564 return unsafeUTFIterator<CP32>(unitRange.begin());
2571 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
2572 auto begin()
const {
2573 return unsafeUTFIterator<CP32>(unitRange.begin());
2581 using UnitIter =
decltype(unitRange.begin());
2582 using LimitIter =
decltype(unitRange.end());
2583 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2585 return unitRange.end();
2587 return unsafeUTFIterator<CP32>(unitRange.end());
2595 template<
typename R = Range,
typename = std::enable_if_t<prv::range<const R>>>
2597 using UnitIter =
decltype(unitRange.begin());
2598 using LimitIter =
decltype(unitRange.end());
2599 if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2601 return unitRange.end();
2603 return unsafeUTFIterator<CP32>(unitRange.end());
2611 auto rbegin()
const {
2612 return std::make_reverse_iterator(end());
2620 return std::make_reverse_iterator(begin());
2628template<
typename CP32>
2629struct UnsafeUTFStringCodePointsAdaptor
2631 __cpp_lib_bind_back >= 2022'02
2632 : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2636 template<
typename Range>
2637 auto operator()(Range &&unitRange)
const {
2638#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10
2639 return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2641 if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
2644 return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2646 return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2665template<
typename CP32>
2666constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
2671#if defined(__cpp_lib_ranges)
2672template <
typename CP32, UTFIllFormedBehavior behavior,
typename Range>
2673constexpr bool std::ranges::enable_borrowed_range<
2674 U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> =
2675 std::ranges::enable_borrowed_range<Range>;
2677template <
typename CP32,
typename Range>
2678constexpr bool std::ranges::enable_borrowed_range<
2679 U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> =
2680 std::ranges::enable_borrowed_range<Range>;
U_COMMON_API UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
bool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
#define U_SENTINEL
This value is intended for sentinel values for APIs that (take or) return single code points (UChar32...
#define U_FORCE_INLINE
Forces function inlining on compilers that are known to support it.
C API: 16-bit Unicode handling macros.
#define U16_IS_SURROGATE_TRAIL(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a trail surrogate?
#define U16_IS_SURROGATE_LEAD(c)
Assuming c is a surrogate code point (U16_IS_SURROGATE(c)), is it a lead surrogate?
#define U16_GET_SUPPLEMENTARY(lead, trail)
Get a supplementary code point value (U+10000..U+10ffff) from its lead and trail surrogates.
#define U16_IS_SURROGATE(c)
Is this code unit a surrogate (U+d800..U+dfff)?
#define U16_IS_LEAD(c)
Is this code unit a lead surrogate (U+d800..U+dbff)?
#define U16_IS_TRAIL(c)
Is this code unit a trail surrogate (U+dc00..U+dfff)?
C API: 8-bit Unicode handling macros.
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte)
Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1)
Internal 3-byte UTF-8 validity check.
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1)
Internal 4-byte UTF-8 validity check.
#define U8_IS_SINGLE(c)
Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
#define U8_LEAD3_T1_BITS
Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
#define U8_LEAD4_T1_BITS
Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
#define U8_IS_LEAD(c)
Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes)
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
#define U8_IS_TRAIL(c)
Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
typename std::iterator_traits< Iter >::difference_type iter_difference_t
constexpr bool is_basic_string_view_v
constexpr bool forward_iterator
typename std::iterator_traits< Iter >::value_type iter_value_t
constexpr bool bidirectional_iterator
UTFIllFormedBehavior
Some defined behaviors for handling ill-formed Unicode strings.
@ UTF_BEHAVIOR_FFFD
Returns U+FFFD Replacement Character.
@ UTF_BEHAVIOR_SURROGATE
UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point,...
@ UTF_BEHAVIOR_NEGATIVE
Returns a negative value (-1=U_SENTINEL) instead of a code point.
Basic definitions for ICU, for both C and C++ APIs.
C API: API for accessing ICU version numbers.