native/utils/tokenizer-utils.cc

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer-utils.h"

#include <iterator>

#include "utils/codepoint-range.h"
#include "utils/strings/utf8.h"
#include "utils/utf8/unicodetext.h"
#include "utils/utf8/unilib-common.h"
#include "absl/container/flat_hash_set.h"

namespace libtextclassifier3 {

using libtextclassifier3::Token;

std::vector<Token> TokenizeOnSpace(const std::string& text) {
  return TokenizeOnDelimiters(text, {' '});
}

// Tokenizes `text` by classifying every codepoint with a filter and handing
// that filter to TokenizeWithFilter.
//
// For each codepoint the filter reports two flags, in this order:
//   1. whether the codepoint is a member of `delimiters`;
//   2. whether `create_tokens_for_non_space_delimiters` is set and the
//      codepoint is not the ASCII space ' '.
// NOTE(review): presumably TokenizeWithFilter splits on the first flag and
// emits the delimiter itself as a token when the second flag is set -- confirm
// against the FilterResult / TokenizeWithFilter declarations.
std::vector<Token> TokenizeOnDelimiters(
    const std::string& text, const absl::flat_hash_set<char32>& delimiters,
    bool create_tokens_for_non_space_delimiters) {
  const auto classify = [&](char32 codepoint) {
    const bool is_delimiter = delimiters.count(codepoint) > 0;
    // The space character is never flagged for keeping, even when delimiter
    // tokens are requested.
    const bool keep_codepoint =
        create_tokens_for_non_space_delimiters && codepoint != ' ';
    return FilterResult{is_delimiter, keep_codepoint};
  };
  return TokenizeWithFilter(text, classify);
}

// Tokenizes `text` by handing TokenizeWithFilter a per-codepoint filter that
// reports two flags, in this order:
//   1. true when the codepoint satisfies IsWhitespace, IsPunctuation or
//      IsChineseLetter;
//   2. true when the codepoint does not satisfy IsWhitespace.
// NOTE(review): presumably the first flag marks a split point and the second
// decides whether the splitting codepoint becomes a token of its own --
// confirm against the FilterResult / TokenizeWithFilter declarations.
std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
    const absl::string_view text) {
  const auto classify = [](char32 codepoint) {
    if (IsWhitespace(codepoint)) {
      // Whitespace: first flag set, second flag cleared. The punctuation and
      // Chinese-letter checks are skipped, as with short-circuit evaluation.
      return FilterResult{true, false};
    }
    const bool is_boundary =
        IsPunctuation(codepoint) || IsChineseLetter(codepoint);
    return FilterResult{is_boundary, true};
  };
  return TokenizeWithFilter(text, classify);
}
}  // namespace libtextclassifier3