1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_UTIL_CHARACTER_ITERATOR_H_ 16 #define ICING_UTIL_CHARACTER_ITERATOR_H_ 17 18 #include <string> 19 #include <string_view> 20 21 #include "icing/legacy/core/icing-string-util.h" 22 #include "icing/util/i18n-utils.h" 23 #include "unicode/utypes.h" 24 25 namespace icing { 26 namespace lib { 27 28 class CharacterIterator { 29 public: CharacterIterator(std::string_view text)30 explicit CharacterIterator(std::string_view text) 31 : text_(text), 32 cached_current_char_(i18n_utils::kInvalidUChar32), 33 utf8_index_(0), 34 utf16_index_(0), 35 utf32_index_(0) {} 36 CharacterIterator()37 CharacterIterator() : utf8_index_(-1), utf16_index_(-1), utf32_index_(-1) {} 38 39 // Returns the character that the iterator currently points to. 40 // i18n_utils::kInvalidUChar32 if unable to read that character. 41 // 42 // REQUIRES: the instance is not in an undefined state (i.e. all previous 43 // calls succeeded). 44 // 45 // RETURNS: 46 // - Null character if the iterator is at the end of the text. 47 // - The character that the iterator currently points to, if the iterator is 48 // within the text. 49 // - i18n_utils::kInvalidUChar32, if unable to decode the character. 50 UChar32 GetCurrentChar() const; 51 52 // Moves current position to desired_utf8_index. 53 // REQUIRES: 0 <= desired_utf8_index <= text_.length() 54 bool MoveToUtf8(int desired_utf8_index); 55 56 // Advances from current position to the character that includes the specified 57 // UTF-8 index. 58 // 59 // desired_utf8_index should be in range [0, text_.length()]. Note that it is 60 // allowed to point one index past the end (i.e. equals text_.length()), but 61 // no further. 62 // 63 // REQUIRES: 64 // - The instance is not in an undefined state (i.e. all previous calls 65 // succeeded). 66 // - The current position is not ahead of desired_utf8_index, i.e. 67 // utf8_index() <= desired_utf8_index. 68 // 69 // RETURNS: 70 // - True if successfully advanced. 71 // - False otherwise. Also the iterator will be in an undefined state. 72 bool AdvanceToUtf8(int desired_utf8_index); 73 74 // Rewinds from current position to the character that includes the specified 75 // UTF-8 index. 76 // REQUIRES: 0 <= desired_utf8_index 77 bool RewindToUtf8(int desired_utf8_index); 78 79 // Moves current position to desired_utf16_index. 80 // REQUIRES: 0 <= desired_utf16_index <= text_.utf16_length() 81 bool MoveToUtf16(int desired_utf16_index); 82 83 // Advances current position to desired_utf16_index. 84 // REQUIRES: desired_utf16_index <= text_.utf16_length() 85 // desired_utf16_index is allowed to point one index past the end, but no 86 // further. 87 bool AdvanceToUtf16(int desired_utf16_index); 88 89 // Rewinds current position to desired_utf16_index. 90 // REQUIRES: 0 <= desired_utf16_index 91 bool RewindToUtf16(int desired_utf16_index); 92 93 // Moves current position to desired_utf32_index. 94 // REQUIRES: 0 <= desired_utf32_index <= text_.utf32_length() 95 bool MoveToUtf32(int desired_utf32_index); 96 97 // Advances current position to desired_utf32_index. 98 // REQUIRES: desired_utf32_index <= text_.utf32_length() 99 // desired_utf32_index is allowed to point one index past the end, but no 100 // further. 101 bool AdvanceToUtf32(int desired_utf32_index); 102 103 // Rewinds current position to desired_utf32_index. 104 // REQUIRES: 0 <= desired_utf32_index 105 bool RewindToUtf32(int desired_utf32_index); 106 is_valid()107 bool is_valid() const { 108 return text_.data() != nullptr && utf8_index_ >= 0 && utf16_index_ >= 0 && 109 utf32_index_ >= 0; 110 } 111 text()112 std::string_view text() const { return text_; } utf8_index()113 int utf8_index() const { return utf8_index_; } utf16_index()114 int utf16_index() const { return utf16_index_; } utf32_index()115 int utf32_index() const { return utf32_index_; } 116 117 bool operator==(const CharacterIterator& rhs) const { 118 // cached_current_char_ is just that: a cached value. As such, it's not 119 // considered for equality. 120 return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ && 121 utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_; 122 } 123 DebugString()124 std::string DebugString() const { 125 return IcingStringUtil::StringPrintf("(u8:%d,u16:%d,u32:%d)", utf8_index_, 126 utf16_index_, utf32_index_); 127 } 128 129 private: 130 // Resets the character iterator to the start of the text if any of the 131 // indices are negative. 132 void ResetToStartIfNecessary(); 133 134 std::string_view text_; 135 mutable UChar32 cached_current_char_; 136 int utf8_index_; 137 int utf16_index_; 138 int utf32_index_; 139 }; 140 141 } // namespace lib 142 } // namespace icing 143 144 #endif // ICING_UTIL_CHARACTER_ITERATOR_H_ 145