1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/normalization.h"
18
19 #include "utils/base/logging.h"
20 #include "utils/normalization_generated.h"
21
22 namespace libtextclassifier3 {
23
// Normalizes `text` according to `normalization_options`.
//
// Reads the codepointwise operation bitmask from the options and forwards it,
// together with `unilib` and `text`, to NormalizeTextCodepointWise(), whose
// result is returned.
//
// A null `normalization_options` is treated as an empty bitmask (no
// codepointwise operation requested) instead of being dereferenced.
UnicodeText NormalizeText(const UniLib& unilib,
                          const NormalizationOptions* normalization_options,
                          const UnicodeText& text) {
  const uint32 codepointwise_ops =
      normalization_options != nullptr
          ? normalization_options->codepointwise_normalization()
          : 0;
  return NormalizeTextCodepointWise(unilib, codepointwise_ops, text);
}
30
// Applies the per-codepoint operations selected by the bits of
// `codepointwise_ops` to `text` and returns the result as a new UnicodeText.
//
// Codepoints are visited in order and, for each one:
//   * DROP_WHITESPACE: it is omitted if unilib.IsWhitespace() reports true.
//   * DROP_PUNCTUATION: it is omitted if unilib.IsPunctuation() reports true.
//   * LOWERCASE / UPPERCASE: a kept codepoint is mapped through
//     unilib.ToLower() / unilib.ToUpper() before being appended.
//
// LOWERCASE and UPPERCASE are mutually exclusive; requesting both fails the
// TC3_CHECK below.
UnicodeText NormalizeTextCodepointWise(const UniLib& unilib,
                                       const uint32 codepointwise_ops,
                                       const UnicodeText& text) {
  // The requested operations do not change while iterating, so decode the
  // bitmask once up front.
  const bool drop_whitespace =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE) !=
      0;
  const bool drop_punctuation =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION) !=
      0;
  const bool lowercase =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE) != 0;
  const bool uppercase =
      (codepointwise_ops &
       NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE) != 0;

  // Sanity check: lower- and upper-casing cannot both be requested.
  TC3_CHECK(!(lowercase && uppercase));

  UnicodeText result;
  for (const char32 codepoint : text) {
    // Skip whitespace.
    if (drop_whitespace && unilib.IsWhitespace(codepoint)) {
      continue;
    }

    // Skip punctuation.
    if (drop_punctuation && unilib.IsPunctuation(codepoint)) {
      continue;
    }

    // Keep the working value in the same type as the iterated codepoint so it
    // is not bounced through a signed integer on its way to push_back().
    char32 normalized_codepoint = codepoint;
    if (lowercase) {
      normalized_codepoint = unilib.ToLower(normalized_codepoint);
    } else if (uppercase) {
      normalized_codepoint = unilib.ToUpper(normalized_codepoint);
    }

    result.push_back(normalized_codepoint);
  }
  return result;
}
74
75 } // namespace libtextclassifier3
76