• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/tokenizer-utils.h"
18 
19 #include <iterator>
20 
21 #include "utils/codepoint-range.h"
22 #include "utils/strings/utf8.h"
23 #include "utils/utf8/unicodetext.h"
24 #include "utils/utf8/unilib-common.h"
25 #include "absl/container/flat_hash_set.h"
26 
27 namespace libtextclassifier3 {
28 
29 using libtextclassifier3::Token;
30 
TokenizeOnSpace(const std::string & text)31 std::vector<Token> TokenizeOnSpace(const std::string& text) {
32   return TokenizeOnDelimiters(text, {' '});
33 }
34 
TokenizeOnDelimiters(const std::string & text,const absl::flat_hash_set<char32> & delimiters,bool create_tokens_for_non_space_delimiters)35 std::vector<Token> TokenizeOnDelimiters(
36     const std::string& text, const absl::flat_hash_set<char32>& delimiters,
37     bool create_tokens_for_non_space_delimiters) {
38   return TokenizeWithFilter(text, [&](char32 codepoint) {
39     bool to_split = delimiters.find(codepoint) != delimiters.end();
40     bool to_keep =
41         (create_tokens_for_non_space_delimiters) ? codepoint != ' ' : false;
42     return FilterResult{to_split, to_keep};
43   });
44 }
45 
TokenizeOnWhiteSpacePunctuationAndChineseLetter(const absl::string_view text)46 std::vector<Token> TokenizeOnWhiteSpacePunctuationAndChineseLetter(
47     const absl::string_view text) {
48   return TokenizeWithFilter(text, [](char32 codepoint) {
49     bool is_whitespace = IsWhitespace(codepoint);
50     bool to_split =
51         is_whitespace || IsPunctuation(codepoint) || IsChineseLetter(codepoint);
52     bool to_keep = !is_whitespace;
53     return FilterResult{to_split, to_keep};
54   });
55 }
56 }  // namespace  libtextclassifier3
57