1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/tokenizer-utils.h"
18
19 #include <iterator>
20
21 #include "utils/codepoint-range.h"
22 #include "utils/strings/utf8.h"
23 #include "utils/utf8/unicodetext.h"
24 #include "utils/utf8/unilib-common.h"
25 #include "absl/container/flat_hash_set.h"
26
27 namespace libtextclassifier3 {
28
29 using libtextclassifier3::Token;
30
TokenizeOnSpace(const std::string & text)31 std::vector<Token> TokenizeOnSpace(const std::string& text) {
32 return TokenizeOnDelimiters(text, {' '});
33 }
34
TokenizeOnDelimiters(const std::string & text,const absl::flat_hash_set<char32> & delimiters,bool create_tokens_for_non_space_delimiters)35 std::vector<Token> TokenizeOnDelimiters(
36 const std::string& text, const absl::flat_hash_set<char32>& delimiters,
37 bool create_tokens_for_non_space_delimiters) {
38 return TokenizeWithFilter(text, [&](char32 codepoint) {
39 bool to_split = delimiters.find(codepoint) != delimiters.end();
40 bool to_keep =
41 (create_tokens_for_non_space_delimiters) ? codepoint != ' ' : false;
42 return FilterResult{to_split, to_keep};
43 });
44 }
45
TokenizeOnWhiteSpacePunctuationAndChineseLetter(const absl::string_view text)46 std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
47 const absl::string_view text) {
48 return TokenizeWithFilter(text, [](char32 codepoint) {
49 bool is_whitespace = IsWhitespace(codepoint);
50 bool to_split =
51 is_whitespace || IsPunctuation(codepoint) || IsChineseLetter(codepoint);
52 bool to_keep = !is_whitespace;
53 return FilterResult{to_split, to_keep};
54 });
55 }
56 } // namespace libtextclassifier3
57