• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/grammar/analyzer.h"
18 
19 #include "utils/base/status_macros.h"
20 #include "utils/utf8/unicodetext.h"
21 
22 namespace libtextclassifier3::grammar {
23 
Analyzer(const UniLib * unilib,const RulesSet * rules_set)24 Analyzer::Analyzer(const UniLib* unilib, const RulesSet* rules_set)
25     // TODO(smillius): Add tokenizer options to `RulesSet`.
26     : owned_tokenizer_(new Tokenizer(libtextclassifier3::TokenizationType_ICU,
27                                      unilib,
28                                      /*codepoint_ranges=*/{},
29                                      /*internal_tokenizer_codepoint_ranges=*/{},
30                                      /*split_on_script_change=*/false,
31                                      /*icu_preserve_whitespace_tokens=*/false)),
32       tokenizer_(owned_tokenizer_.get()),
33       parser_(unilib, rules_set),
34       semantic_evaluator_(rules_set->semantic_values_schema() != nullptr
35                               ? flatbuffers::GetRoot<reflection::Schema>(
36                                     rules_set->semantic_values_schema()->data())
37                               : nullptr) {}
38 
Analyzer(const UniLib * unilib,const RulesSet * rules_set,const Tokenizer * tokenizer)39 Analyzer::Analyzer(const UniLib* unilib, const RulesSet* rules_set,
40                    const Tokenizer* tokenizer)
41     : tokenizer_(tokenizer),
42       parser_(unilib, rules_set),
43       semantic_evaluator_(rules_set->semantic_values_schema() != nullptr
44                               ? flatbuffers::GetRoot<reflection::Schema>(
45                                     rules_set->semantic_values_schema()->data())
46                               : nullptr) {}
47 
Parse(const TextContext & input,UnsafeArena * arena,bool deduplicate_derivations) const48 StatusOr<std::vector<EvaluatedDerivation>> Analyzer::Parse(
49     const TextContext& input, UnsafeArena* arena,
50     bool deduplicate_derivations) const {
51   std::vector<EvaluatedDerivation> result;
52 
53   std::vector<Derivation> derivations = parser_.Parse(input, arena);
54   if (deduplicate_derivations) {
55     derivations = DeduplicateDerivations<Derivation>(derivations);
56   }
57   // Evaluate each derivation.
58   for (const Derivation& derivation : derivations) {
59     if (derivation.IsValid()) {
60       TC3_ASSIGN_OR_RETURN(const SemanticValue* value,
61                            semantic_evaluator_.Eval(input, derivation, arena));
62       result.emplace_back(
63           EvaluatedDerivation{{/*parse_tree=*/derivation.parse_tree,
64                                /*rule_id=*/derivation.rule_id},
65                               /*semantic_value=*/value});
66     }
67   }
68 
69   return result;
70 }
71 
Parse(const UnicodeText & text,const std::vector<Locale> & locales,UnsafeArena * arena,bool deduplicate_derivations) const72 StatusOr<std::vector<EvaluatedDerivation>> Analyzer::Parse(
73     const UnicodeText& text, const std::vector<Locale>& locales,
74     UnsafeArena* arena, bool deduplicate_derivations) const {
75   return Parse(BuildTextContextForInput(text, locales), arena,
76                deduplicate_derivations);
77 }
78 
BuildTextContextForInput(const UnicodeText & text,const std::vector<Locale> & locales) const79 TextContext Analyzer::BuildTextContextForInput(
80     const UnicodeText& text, const std::vector<Locale>& locales) const {
81   TextContext context;
82   context.text = UnicodeText(text, /*do_copy=*/false);
83   context.tokens = tokenizer_->Tokenize(context.text);
84   context.codepoints = context.text.Codepoints();
85   context.codepoints.push_back(context.text.end());
86   context.locales = locales;
87   context.context_span.first = 0;
88   context.context_span.second = context.tokens.size();
89   return context;
90 }
91 
92 }  // namespace libtextclassifier3::grammar
93