utils/utf8/unicodetext.h

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_

#include <iterator>
#include <string>
#include <utility>
#include <vector>

#include "utils/base/integral_types.h"
#include "utils/base/logging.h"
#include "utils/strings/stringpiece.h"
#include "absl/strings/string_view.h"

namespace libtextclassifier3 {

// ***************************** UnicodeText **************************
//
// A UnicodeText object is a wrapper around a sequence of Unicode
// codepoint values that allows iteration over these values.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (int32). The iterator is a
// read-only iterator. It becomes invalid if the text is changed.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab.  It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.)
//
// MEMORY MANAGEMENT:
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.

class UnicodeText {
 public:
  class const_iterator;

  UnicodeText();  // Create an empty text.
  UnicodeText(const UnicodeText& src, bool do_copy = true);
  UnicodeText& operator=(UnicodeText&& src);
  ~UnicodeText();

  class const_iterator {
    typedef const_iterator CI;

   public:
    typedef std::bidirectional_iterator_tag iterator_category;
    typedef char32 value_type;
    typedef int difference_type;
    typedef void pointer;            // (Not needed.)
    typedef const char32 reference;  // (Needed for const_reverse_iterator)

    // Iterators are default-constructible.
    const_iterator();

    // It's safe to make multiple passes over a UnicodeText.
    const_iterator(const const_iterator&) = default;
    const_iterator& operator=(const const_iterator&) = default;

    char32 operator*() const;  // Dereference

    const_iterator& operator++();     // Advance (++iter)
    const_iterator operator++(int) {  // (iter++)
      const_iterator result(*this);
      ++*this;
      return result;
    }

    const_iterator& operator--();     // Retreat (--iter)
    const_iterator operator--(int) {  // (iter--)
      const_iterator result(*this);
      --*this;
      return result;
    }

    friend bool operator==(const CI& lhs, const CI& rhs) {
      return lhs.it_ == rhs.it_;
    }
    friend bool operator!=(const CI& lhs, const CI& rhs) {
      return !(lhs == rhs);
    }
    friend bool operator<(const CI& lhs, const CI& rhs);
    friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
    friend bool operator<=(const CI& lhs, const CI& rhs) {
      return !(rhs < lhs);
    }
    friend bool operator>=(const CI& lhs, const CI& rhs) {
      return !(lhs < rhs);
    }

    int utf8_length() const {
      const unsigned char byte = static_cast<unsigned char>(it_[0]);
      if (byte < 0x80) {
        return 1;
      } else if (byte < 0xE0) {
        return 2;
      } else if (byte < 0xF0) {
        return 3;
      } else {
        return 4;
      }
    }
    const char* utf8_data() const { return it_; }

   private:
    friend class UnicodeText;
    explicit const_iterator(const char* it) : it_(it) {}

    const char* it_;
  };

  const_iterator begin() const;
  const_iterator end() const;

  // Gets pointer to the underlying utf8 data.
  const char* data() const;

  // Gets length (in bytes) of the underlying utf8 data.
  int size_bytes() const;

  // Computes length (in number of Unicode codepoints) of the underlying utf8
  // data.
  // NOTE: Complexity O(n).
  int size_codepoints() const;

  bool empty() const;

  // Checks whether the underlying data is valid utf8 data.
  bool is_valid() const;

  bool operator==(const UnicodeText& other) const;

  // x.PointToUTF8(buf,len) changes x so that it points to buf
  // ("becomes an alias"). It does not take ownership or copy buf.
  // This function assumes that the input is interchange valid UTF8.
  UnicodeText& Copy(const UnicodeText& src);
  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);

  // Calling this may invalidate pointers to underlying data.
  UnicodeText& AppendUTF8(const char* utf8, int len);
  UnicodeText& push_back(char32 ch);
  void clear();

  // Returns an iterator for each codepoint.
  std::vector<const_iterator> Codepoints() const;

  // Returns the list of codepoints of the UnicodeText.
  std::vector<char32> CodepointsChar32() const;

  std::string ToUTF8String() const;
  std::string UTF8Substring(int begin_codepoint, int end_codepoint) const;
  static std::string UTF8Substring(const const_iterator& it_begin,
                                   const const_iterator& it_end);
  static UnicodeText Substring(const UnicodeText& text, int begin_codepoint,
                               int end_codepoint, bool do_copy = true);
  static UnicodeText Substring(const const_iterator& it_begin,
                               const const_iterator& it_end,
                               bool do_copy = true);

 private:
  friend class const_iterator;

  class Repr {  // A byte-string.
   public:
    char* data_;
    int size_;
    int capacity_;
    bool ours_;  // Do we own data_?

    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
    Repr& operator=(Repr&& src);
    ~Repr() {
      if (ours_) delete[] data_;
    }

    void clear();
    void reserve(int capacity);
    void resize(int size);

    void append(const char* bytes, int byte_length);
    void Copy(const char* data, int size);
    void PointTo(const char* data, int size);

   private:
    Repr& operator=(const Repr&);
    Repr(const Repr& other);
  };

  Repr repr_;
};

// A pair of iterators into a UnicodeText.
using UnicodeTextRange =
    std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>;

// Factory helpers that build a UnicodeText from UTF-8 encoded bytes, with one
// overload per common string representation.
// NOTE: The following are needed to avoid implicit conversion from char* to
// std::string, or from ::string to std::string, because if this happens it
// often results in invalid memory access to a temporary object created during
// such conversion (if do_copy == false).
// NOTE: These methods don't check if the input string is UTF8 well formed, for
// efficiency reasons. Use UnicodeText::is_valid() when explicitly needed.
// NOTE(review): with do_copy == false the returned text presumably aliases the
// caller's bytes (cf. UnicodeText::PointToUTF8), so the input would have to
// outlive the result -- confirm against the implementation.
// Pointer + length form; `len` is presumably the length in bytes.
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
                              bool do_copy = true);
// Pointer-only form; no length is passed, so the buffer is presumably
// NUL-terminated -- TODO confirm.
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true);
// std::string form.
UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true);
// StringPiece form.
UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true);
// absl::string_view form.
UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true);

// Logging support: appends the raw UTF-8 bytes of `message` to the stream's
// message buffer and returns the stream so that calls can be chained.
inline logging::LoggingStringStream& operator<<(
    logging::LoggingStringStream& stream, const UnicodeText& message) {
  const char* const utf8_bytes = message.data();
  const int num_bytes = message.size_bytes();
  stream.message.append(utf8_bytes, num_bytes);
  return stream;
}

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_