• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "lang_id/features/relevant-script-feature.h"
18 
19 #include <string>
20 #include <vector>
21 
22 #include "lang_id/common/fel/feature-types.h"
23 #include "lang_id/common/fel/task-context.h"
24 #include "lang_id/common/fel/workspace.h"
25 #include "lang_id/common/lite_base/logging.h"
26 #include "lang_id/common/utf8.h"
27 #include "lang_id/script/script-detector.h"
28 
29 namespace libtextclassifier3 {
30 namespace mobile {
31 namespace lang_id {
32 
Setup(TaskContext * context)33 bool RelevantScriptFeature::Setup(TaskContext *context) {
34   std::string script_detector_name = GetParameter(
35       "script_detector_name", /* default_value = */ "tiny-script-detector");
36 
37   // We don't use absl::WrapUnique, nor the rest of absl, see http://b/71873194
38   script_detector_.reset(ScriptDetector::Create(script_detector_name));
39   if (script_detector_ == nullptr) {
40     // This means ScriptDetector::Create() could not find the requested
41     // script_detector_name.  In that case, Create() already logged an error
42     // message.
43     return false;
44   }
45 
46   // We use default value 172 because this is the number of scripts supported by
47   // the first model we trained with this feature.  See http://b/70617713.
48   // Newer models may support more scripts.
49   num_supported_scripts_ = GetIntParameter("num_supported_scripts", 172);
50   return true;
51 }
52 
Init(TaskContext * context)53 bool RelevantScriptFeature::Init(TaskContext *context) {
54   set_feature_type(new NumericFeatureType(name(), num_supported_scripts_));
55   return true;
56 }
57 
Evaluate(const WorkspaceSet & workspaces,const LightSentence & sentence,FeatureVector * result) const58 void RelevantScriptFeature::Evaluate(
59     const WorkspaceSet &workspaces, const LightSentence &sentence,
60     FeatureVector *result) const {
61   // counts[s] is the number of characters with script s.
62   std::vector<int> counts(num_supported_scripts_);
63   int total_count = 0;
64   for (const std::string &word : sentence) {
65     const char *const word_end = word.data() + word.size();
66     const char *curr = word.data();
67 
68     // Skip over token start '^'.
69     SAFTM_DCHECK_EQ(*curr, '^');
70     curr += utils::OneCharLen(curr);
71     while (true) {
72       const int num_bytes = utils::OneCharLen(curr);
73 
74       int script = script_detector_->GetScript(curr, num_bytes);
75 
76       // We do this update and the if (...) break below *before* incrementing
77       // counts[script] in order to skip the token end '$'.
78       curr += num_bytes;
79       if (curr >= word_end) {
80         SAFTM_DCHECK_EQ(*(curr - num_bytes), '$');
81         break;
82       }
83       SAFTM_DCHECK_GE(script, 0);
84 
85       if (script < num_supported_scripts_) {
86         counts[script]++;
87         total_count++;
88       } else {
89         // Unsupported script: this usually indicates a script that is
90         // recognized by newer versions of the code, after the model was
91         // trained.  E.g., new code running with old model.
92       }
93     }
94   }
95 
96   for (int script_id = 0; script_id < num_supported_scripts_; ++script_id) {
97     int count = counts[script_id];
98     if (count > 0) {
99       const float weight = static_cast<float>(count) / total_count;
100       FloatFeatureValue value(script_id, weight);
101       result->add(feature_type(), value.discrete_value);
102     }
103   }
104 }
105 
// Statically registers RelevantScriptFeature.
// NOTE(review): presumably this makes the feature creatable by name from a
// feature specification -- confirm against the macro's definition.
106 SAFTM_STATIC_REGISTRATION(RelevantScriptFeature);
107 
108 }  // namespace lang_id
109 }  // namespace mobile
110 }  // namespace libtextclassifier3
111