1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "lang_id/features/relevant-script-feature.h"
18
19 #include <string>
20
21 #include "lang_id/common/fel/feature-types.h"
22 #include "lang_id/common/fel/task-context.h"
23 #include "lang_id/common/fel/workspace.h"
24 #include "lang_id/common/lite_base/logging.h"
25 #include "lang_id/common/utf8.h"
26 #include "lang_id/script/script-detector.h"
27
28 namespace libtextclassifier3 {
29 namespace mobile {
30 namespace lang_id {
31
Setup(TaskContext * context)32 bool RelevantScriptFeature::Setup(TaskContext *context) {
33 std::string script_detector_name = GetParameter(
34 "script_detector_name", /* default_value = */ "tiny-script-detector");
35
36 // We don't use absl::WrapUnique, nor the rest of absl, see http://b/71873194
37 script_detector_.reset(ScriptDetector::Create(script_detector_name));
38 if (script_detector_ == nullptr) {
39 // This means ScriptDetector::Create() could not find the requested
40 // script_detector_name. In that case, Create() already logged an error
41 // message.
42 return false;
43 }
44
45 // We use default value 172 because this is the number of scripts supported by
46 // the first model we trained with this feature. See http://b/70617713.
47 // Newer models may support more scripts.
48 num_supported_scripts_ = GetIntParameter("num_supported_scripts", 172);
49 return true;
50 }
51
Init(TaskContext * context)52 bool RelevantScriptFeature::Init(TaskContext *context) {
53 set_feature_type(new NumericFeatureType(name(), num_supported_scripts_));
54 return true;
55 }
56
Evaluate(const WorkspaceSet & workspaces,const LightSentence & sentence,FeatureVector * result) const57 void RelevantScriptFeature::Evaluate(
58 const WorkspaceSet &workspaces, const LightSentence &sentence,
59 FeatureVector *result) const {
60 // counts[s] is the number of characters with script s.
61 std::vector<int> counts(num_supported_scripts_);
62 int total_count = 0;
63 for (const std::string &word : sentence) {
64 const char *const word_end = word.data() + word.size();
65 const char *curr = word.data();
66
67 // Skip over token start '^'.
68 SAFTM_DCHECK_EQ(*curr, '^');
69 curr += utils::OneCharLen(curr);
70 while (true) {
71 const int num_bytes = utils::OneCharLen(curr);
72
73 int script = script_detector_->GetScript(curr, num_bytes);
74
75 // We do this update and the if (...) break below *before* incrementing
76 // counts[script] in order to skip the token end '$'.
77 curr += num_bytes;
78 if (curr >= word_end) {
79 SAFTM_DCHECK_EQ(*(curr - num_bytes), '$');
80 break;
81 }
82 SAFTM_DCHECK_GE(script, 0);
83
84 if (script < num_supported_scripts_) {
85 counts[script]++;
86 total_count++;
87 } else {
88 // Unsupported script: this usually indicates a script that is
89 // recognized by newer versions of the code, after the model was
90 // trained. E.g., new code running with old model.
91 }
92 }
93 }
94
95 for (int script_id = 0; script_id < num_supported_scripts_; ++script_id) {
96 int count = counts[script_id];
97 if (count > 0) {
98 const float weight = static_cast<float>(count) / total_count;
99 FloatFeatureValue value(script_id, weight);
100 result->add(feature_type(), value.discrete_value);
101 }
102 }
103 }
104
// Invokes the SAFTM static-registration macro for RelevantScriptFeature.
// NOTE(review): presumably this makes the feature creatable by name from a
// feature registry -- confirm against the macro's definition.
105 SAFTM_STATIC_REGISTRATION(RelevantScriptFeature);
106
107 } // namespace lang_id
108 } // namespace mobile
109 } // namespace libtextclassifier3
110