1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // Inference code for the text classification model.
18
19 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
20 #define LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
21
22 #include <memory>
23 #include <set>
24 #include <string>
25 #include <unordered_set>
26 #include <vector>
27
28 #include "annotator/contact/contact-engine.h"
29 #include "annotator/datetime/datetime-grounder.h"
30 #include "annotator/datetime/parser.h"
31 #include "annotator/duration/duration.h"
32 #include "annotator/experimental/experimental.h"
33 #include "annotator/feature-processor.h"
34 #include "annotator/grammar/grammar-annotator.h"
35 #include "annotator/installed_app/installed-app-engine.h"
36 #include "annotator/knowledge/knowledge-engine.h"
37 #include "annotator/model-executor.h"
38 #include "annotator/model_generated.h"
39 #include "annotator/number/number.h"
40 #include "annotator/person_name/person-name-engine.h"
41 #include "annotator/pod_ner/pod-ner.h"
42 #include "annotator/strip-unpaired-brackets.h"
43 #include "annotator/translate/translate.h"
44 #include "annotator/types.h"
45 #include "annotator/vocab/vocab-annotator.h"
46 #include "annotator/zlib-utils.h"
47 #include "utils/base/status.h"
48 #include "utils/base/statusor.h"
49 #include "utils/calendar/calendar.h"
50 #include "utils/flatbuffers/flatbuffers.h"
51 #include "utils/flatbuffers/mutable.h"
52 #include "utils/i18n/locale.h"
53 #include "utils/memory/mmap.h"
54 #include "utils/utf8/unicodetext.h"
55 #include "utils/utf8/unilib.h"
56 #include "utils/zlib/zlib.h"
57 #include "lang_id/lang-id.h"
58
59 namespace libtextclassifier3 {
60
61 // Holds TFLite interpreters for selection and classification models.
62 // NOTE: This class is not thread-safe, thus should NOT be re-used across
63 // threads.
64 class InterpreterManager {
65 public:
66 // The constructor can be called with nullptr for any of the executors, and is
67 // a defined behavior, as long as the corresponding *Interpreter() method is
68 // not called when the executor is null.
InterpreterManager(const ModelExecutor * selection_executor,const ModelExecutor * classification_executor)69 InterpreterManager(const ModelExecutor* selection_executor,
70 const ModelExecutor* classification_executor)
71 : selection_executor_(selection_executor),
72 classification_executor_(classification_executor) {}
73
74 // Gets or creates and caches an interpreter for the selection model.
75 tflite::Interpreter* SelectionInterpreter();
76
77 // Gets or creates and caches an interpreter for the classification model.
78 tflite::Interpreter* ClassificationInterpreter();
79
80 private:
81 const ModelExecutor* selection_executor_;
82 const ModelExecutor* classification_executor_;
83
84 std::unique_ptr<tflite::Interpreter> selection_interpreter_;
85 std::unique_ptr<tflite::Interpreter> classification_interpreter_;
86 };
87
// Predicate over entity type names: wraps the set of entity types enabled for
// annotation and reports, through operator(), whether a given type is enabled.
// An empty set enables every entity type.
// NOTE: The set is held by reference, not copied.
class EnabledEntityTypes {
 public:
  explicit EnabledEntityTypes(
      const std::unordered_set<std::string>& entity_types)
      : entity_types_(entity_types) {}

  // Returns true when the set is empty (no restriction) or when 'entity_type'
  // is one of its members.
  bool operator()(const std::string& entity_type) const {
    if (entity_types_.empty()) {
      return true;
    }
    return entity_types_.count(entity_type) != 0;
  }

 private:
  const std::unordered_set<std::string>& entity_types_;
};
104
105 // A text processing model that provides text classification, annotation,
106 // selection suggestion for various types.
107 // NOTE: This class is not thread-safe.
108 class Annotator {
109 public:
110 static std::unique_ptr<Annotator> FromUnownedBuffer(
111 const char* buffer, int size, const UniLib* unilib = nullptr,
112 const CalendarLib* calendarlib = nullptr);
113 // Copies the underlying model buffer string.
114 static std::unique_ptr<Annotator> FromString(
115 const std::string& buffer, const UniLib* unilib = nullptr,
116 const CalendarLib* calendarlib = nullptr);
117 // Takes ownership of the mmap.
118 static std::unique_ptr<Annotator> FromScopedMmap(
119 std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib = nullptr,
120 const CalendarLib* calendarlib = nullptr);
121 static std::unique_ptr<Annotator> FromScopedMmap(
122 std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,
123 std::unique_ptr<CalendarLib> calendarlib);
124 static std::unique_ptr<Annotator> FromFileDescriptor(
125 int fd, int offset, int size, const UniLib* unilib = nullptr,
126 const CalendarLib* calendarlib = nullptr);
127 static std::unique_ptr<Annotator> FromFileDescriptor(
128 int fd, int offset, int size, std::unique_ptr<UniLib> unilib,
129 std::unique_ptr<CalendarLib> calendarlib);
130 static std::unique_ptr<Annotator> FromFileDescriptor(
131 int fd, const UniLib* unilib = nullptr,
132 const CalendarLib* calendarlib = nullptr);
133 static std::unique_ptr<Annotator> FromFileDescriptor(
134 int fd, std::unique_ptr<UniLib> unilib,
135 std::unique_ptr<CalendarLib> calendarlib);
136 static std::unique_ptr<Annotator> FromPath(
137 const std::string& path, const UniLib* unilib = nullptr,
138 const CalendarLib* calendarlib = nullptr);
139 static std::unique_ptr<Annotator> FromPath(
140 const std::string& path, std::unique_ptr<UniLib> unilib,
141 std::unique_ptr<CalendarLib> calendarlib);
142
143 // Returns true if the model is ready for use.
IsInitialized()144 bool IsInitialized() { return initialized_; }
145
146 // Initializes the knowledge engine with the given config.
147 bool InitializeKnowledgeEngine(const std::string& serialized_config);
148
149 // Initializes the contact engine with the given config.
150 bool InitializeContactEngine(const std::string& serialized_config);
151
152 // Cleans up the resources associated with the contact engine.
153 void CleanUpContactEngine();
154
155 // Initializes the installed app engine with the given config.
156 bool InitializeInstalledAppEngine(const std::string& serialized_config);
157
158 // Initializes the person name engine with the given person name model in the
159 // provided buffer. The buffer needs to outlive the annotator.
160 bool InitializePersonNameEngineFromUnownedBuffer(const void* buffer,
161 int size);
162
163 // Initializes the person name engine with the given person name model from
164 // the provided mmap.
165 bool InitializePersonNameEngineFromScopedMmap(const ScopedMmap& mmap);
166
167 // Initializes the person name engine with the given person name model in the
168 // provided file path.
169 bool InitializePersonNameEngineFromPath(const std::string& path);
170
171 // Initializes the person name engine with the given person name model in the
172 // provided file descriptor.
173 bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
174 int size);
175
176 // Initializes the experimental annotators if available.
177 // Returns true if there is an implementation of experimental annotators
178 // linked in.
179 bool InitializeExperimentalAnnotators();
180
181 // Sets up the lang-id instance that should be used.
182 bool SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id);
183
184 // Runs inference for given a context and current selection (i.e. index
185 // of the first and one past last selected characters (utf8 codepoint
186 // offsets)). Returns the indices (utf8 codepoint offsets) of the selection
187 // beginning character and one past selection end character.
188 // Returns the original click_indices if an error occurs.
189 // NOTE: The selection indices are passed in and returned in terms of
190 // UTF8 codepoints (not bytes).
191 // Requires that the model is a smart selection model.
192 CodepointSpan SuggestSelection(
193 const std::string& context, CodepointSpan click_indices,
194 const SelectionOptions& options = SelectionOptions()) const;
195
196 // Classifies the selected text given the context string.
197 // Returns an empty result if an error occurs.
198 std::vector<ClassificationResult> ClassifyText(
199 const std::string& context, const CodepointSpan& selection_indices,
200 const ClassificationOptions& options = ClassificationOptions()) const;
201
202 // Annotates the given structed input request. Models which handle the full
203 // context request will receive all the metadata they require. While models
204 // that don't use the extra context are called using only a string.
205 // For each fragment the annotations are sorted by their position in
206 // the fragment and exclude spans classified as 'other'.
207 //
208 // The number of vectors of annotated spans will match the number
209 // of input fragments. The order of annotation span vectors will match the
210 // order of input fragments. If annotation is not possible for any of the
211 // annotators, no annotation is returned.
212 StatusOr<Annotations> AnnotateStructuredInput(
213 const std::vector<InputFragment>& string_fragments,
214 const AnnotationOptions& options = AnnotationOptions()) const;
215
216 // Annotates given input text. The annotations are sorted by their position
217 // in the context string and exclude spans classified as 'other'.
218 std::vector<AnnotatedSpan> Annotate(
219 const std::string& context,
220 const AnnotationOptions& options = AnnotationOptions()) const;
221
222 // Looks up a knowledge entity by its id. Returns the serialized knowledge
223 // result.
224 StatusOr<std::string> LookUpKnowledgeEntity(const std::string& id) const;
225
226 // Looks up an entity's property.
227 StatusOr<std::string> LookUpKnowledgeEntityProperty(
228 const std::string& mid_str, const std::string& property) const;
229
230 const Model* model() const;
231 const reflection::Schema* entity_data_schema() const;
232
233 // Exposes the feature processor for tests and evaluations.
234 const FeatureProcessor* SelectionFeatureProcessorForTests() const;
235 const FeatureProcessor* ClassificationFeatureProcessorForTests() const;
236
237 // Exposes the date time parser for tests and evaluations.
238 const DatetimeParser* DatetimeParserForTests() const;
239
240 static const std::string& kPhoneCollection;
241 static const std::string& kAddressCollection;
242 static const std::string& kDateCollection;
243 static const std::string& kUrlCollection;
244 static const std::string& kEmailCollection;
245
246 protected:
247 struct ScoredChunk {
248 TokenSpan token_span;
249 float score;
250 };
251
252 // NOTE: ValidateAndInitialize needs to be called before any other method.
Annotator()253 Annotator() : initialized_(false) {}
254
255 // Checks that model contains all required fields, and initializes internal
256 // datastructures.
257 // Needs to be called before any other method is.
258 void ValidateAndInitialize(const Model* model, const UniLib* unilib,
259 const CalendarLib* calendarlib);
260
261 // Initializes regular expressions for the regex model.
262 bool InitializeRegexModel(ZlibDecompressor* decompressor);
263
264 // Resolves conflicts in the list of candidates by removing some overlapping
265 // ones. Returns indices of the surviving ones.
266 // NOTE: Assumes that the candidates are sorted according to their position in
267 // the span.
268 bool ResolveConflicts(const std::vector<AnnotatedSpan>& candidates,
269 const std::string& context,
270 const std::vector<Token>& cached_tokens,
271 const std::vector<Locale>& detected_text_language_tags,
272 const BaseOptions& options,
273 InterpreterManager* interpreter_manager,
274 std::vector<int>* result) const;
275
276 // Resolves one conflict between candidates on indices 'start_index'
277 // (inclusive) and 'end_index' (exclusive). Assigns the winning candidate
278 // indices to 'chosen_indices'. Returns false if a problem arises.
279 bool ResolveConflict(const std::string& context,
280 const std::vector<Token>& cached_tokens,
281 const std::vector<AnnotatedSpan>& candidates,
282 const std::vector<Locale>& detected_text_language_tags,
283 int start_index, int end_index,
284 const BaseOptions& options,
285 InterpreterManager* interpreter_manager,
286 std::vector<int>* chosen_indices) const;
287
288 // Gets selection candidates from the ML model.
289 // Provides the tokens produced during tokenization of the context string for
290 // reuse.
291 bool ModelSuggestSelection(
292 const UnicodeText& context_unicode, const CodepointSpan& click_indices,
293 const std::vector<Locale>& detected_text_language_tags,
294 InterpreterManager* interpreter_manager, std::vector<Token>* tokens,
295 std::vector<AnnotatedSpan>* result) const;
296
297 // Classifies the selected text given the context string with the
298 // classification model.
299 // The following arguments are optional:
300 // - cached_tokens - can be given as empty
301 // - embedding_cache - can be given as nullptr
302 // - tokens - can be given as nullptr
303 // Returns true if no error occurred.
304 bool ModelClassifyText(
305 const std::string& context, const std::vector<Token>& cached_tokens,
306 const std::vector<Locale>& detected_text_language_tags,
307 const CodepointSpan& selection_indices, const BaseOptions& options,
308 InterpreterManager* interpreter_manager,
309 FeatureProcessor::EmbeddingCache* embedding_cache,
310 std::vector<ClassificationResult>* classification_results,
311 std::vector<Token>* tokens) const;
312
313 // Same as above, but (for optimization) takes the context as UnicodeText and
314 // takes the following extra arguments:
315 // - span_begin, span_end - iterators in context_unicode corresponding to
316 // selection_indices
317 // - line - a UnicodeTextRange within context_unicode corresponding to the
318 // line containing the selection - optional, can be given as nullptr
319 bool ModelClassifyText(
320 const UnicodeText& context_unicode,
321 const std::vector<Token>& cached_tokens,
322 const std::vector<Locale>& detected_text_language_tags,
323 const UnicodeText::const_iterator& span_begin,
324 const UnicodeText::const_iterator& span_end, const UnicodeTextRange* line,
325 const CodepointSpan& selection_indices, const BaseOptions& options,
326 InterpreterManager* interpreter_manager,
327 FeatureProcessor::EmbeddingCache* embedding_cache,
328 std::vector<ClassificationResult>* classification_results,
329 std::vector<Token>* tokens) const;
330
331 // Returns a relative token span that represents how many tokens on the left
332 // from the selection and right from the selection are needed for the
333 // classifier input.
334 TokenSpan ClassifyTextUpperBoundNeededTokens() const;
335
336 // Classifies the selected text with the regular expressions models.
337 // Returns true if no error happened, false otherwise.
338 bool RegexClassifyText(
339 const std::string& context, const CodepointSpan& selection_indices,
340 std::vector<ClassificationResult>* classification_result) const;
341
342 // Classifies the selected text with the date time model.
343 // Returns true if no error happened, false otherwise.
344 bool DatetimeClassifyText(
345 const std::string& context, const CodepointSpan& selection_indices,
346 const ClassificationOptions& options,
347 std::vector<ClassificationResult>* classification_results) const;
348
349 // Chunks given input text with the selection model and classifies the spans
350 // with the classification model.
351 // The annotations are sorted by their position in the context string and
352 // exclude spans classified as 'other'.
353 // Provides the tokens produced during tokenization of the context string for
354 // reuse.
355 bool ModelAnnotate(const std::string& context,
356 const std::vector<Locale>& detected_text_language_tags,
357 const AnnotationOptions& options,
358 InterpreterManager* interpreter_manager,
359 std::vector<Token>* tokens,
360 std::vector<AnnotatedSpan>* result) const;
361
362 // Groups the tokens into chunks. A chunk is a token span that should be the
363 // suggested selection when any of its contained tokens is clicked. The chunks
364 // are non-overlapping and are sorted by their position in the context string.
365 // "num_tokens" is the total number of tokens available (as this method does
366 // not need the actual vector of tokens).
367 // "span_of_interest" is a span of all the tokens that could be clicked.
368 // The resulting chunks all have to overlap with it and they cover this span
369 // completely. The first and last chunk might extend beyond it.
370 // The chunks vector is cleared before filling.
371 bool ModelChunk(int num_tokens, const TokenSpan& span_of_interest,
372 tflite::Interpreter* selection_interpreter,
373 const CachedFeatures& cached_features,
374 std::vector<TokenSpan>* chunks) const;
375
376 // A helper method for ModelChunk(). It generates scored chunk candidates for
377 // a click context model.
378 // NOTE: The returned chunks can (and most likely do) overlap.
379 bool ModelClickContextScoreChunks(
380 int num_tokens, const TokenSpan& span_of_interest,
381 const CachedFeatures& cached_features,
382 tflite::Interpreter* selection_interpreter,
383 std::vector<ScoredChunk>* scored_chunks) const;
384
385 // A helper method for ModelChunk(). It generates scored chunk candidates for
386 // a bounds-sensitive model.
387 // NOTE: The returned chunks can (and most likely do) overlap.
388 bool ModelBoundsSensitiveScoreChunks(
389 int num_tokens, const TokenSpan& span_of_interest,
390 const TokenSpan& inference_span, const CachedFeatures& cached_features,
391 tflite::Interpreter* selection_interpreter,
392 std::vector<ScoredChunk>* scored_chunks) const;
393
394 // Produces chunks isolated by a set of regular expressions.
395 bool RegexChunk(const UnicodeText& context_unicode,
396 const std::vector<int>& rules,
397 bool is_serialized_entity_data_enabled,
398 const EnabledEntityTypes& enabled_entity_types,
399 const AnnotationUsecase& annotation_usecase,
400
401 std::vector<AnnotatedSpan>* result) const;
402
403 // Produces chunks from the datetime parser.
404 bool DatetimeChunk(const UnicodeText& context_unicode,
405 int64 reference_time_ms_utc,
406 const std::string& reference_timezone,
407 const std::string& locales, ModeFlag mode,
408 AnnotationUsecase annotation_usecase,
409 bool is_serialized_entity_data_enabled,
410 std::vector<AnnotatedSpan>* result) const;
411
412 // Returns whether a classification should be filtered.
413 bool FilteredForAnnotation(const AnnotatedSpan& span) const;
414 bool FilteredForClassification(
415 const ClassificationResult& classification) const;
416 bool FilteredForSelection(const AnnotatedSpan& span) const;
417
418 // Computes the selection boundaries from a regular expression match.
419 CodepointSpan ComputeSelectionBoundaries(
420 const UniLib::RegexMatcher* match,
421 const RegexModel_::Pattern* config) const;
422
423 // Returns whether a regex pattern provides entity data from a match.
424 bool HasEntityData(const RegexModel_::Pattern* pattern) const;
425
426 // Constructs and serializes entity data from regex matches.
427 bool SerializedEntityDataFromRegexMatch(
428 const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,
429 std::string* serialized_entity_data) const;
430
431 // For knowledge candidates which have a ContactPointer, fill in the
432 // appropriate contact metadata, if possible.
433 void AddContactMetadataToKnowledgeClassificationResults(
434 std::vector<AnnotatedSpan>* candidates) const;
435
436 // Gets priority score from the list of classification results.
437 float GetPriorityScore(
438 const std::vector<ClassificationResult>& classification) const;
439
440 // Verifies a regex match and returns true if verification was successful.
441 bool VerifyRegexMatchCandidate(
442 const std::string& context,
443 const VerificationOptions* verification_options, const std::string& match,
444 const UniLib::RegexMatcher* matcher) const;
445
446 const Model* model_;
447
448 std::unique_ptr<const ModelExecutor> selection_executor_;
449 std::unique_ptr<const ModelExecutor> classification_executor_;
450 std::unique_ptr<const EmbeddingExecutor> embedding_executor_;
451
452 std::unique_ptr<const FeatureProcessor> selection_feature_processor_;
453 std::unique_ptr<const FeatureProcessor> classification_feature_processor_;
454
455 std::unique_ptr<const grammar::Analyzer> analyzer_;
456 std::unique_ptr<const DatetimeGrounder> datetime_grounder_;
457 std::unique_ptr<const DatetimeParser> datetime_parser_;
458 std::unique_ptr<const GrammarAnnotator> grammar_annotator_;
459
460 std::string owned_buffer_;
461 std::unique_ptr<UniLib> owned_unilib_;
462 std::unique_ptr<CalendarLib> owned_calendarlib_;
463
464 private:
465 struct CompiledRegexPattern {
466 const RegexModel_::Pattern* config;
467 std::unique_ptr<UniLib::RegexPattern> pattern;
468 };
469
470 // Removes annotations the entity type of which is not in the set of enabled
471 // entity types.
472 void RemoveNotEnabledEntityTypes(
473 const EnabledEntityTypes& is_entity_type_enabled,
474 std::vector<AnnotatedSpan>* annotated_spans) const;
475
476 // Runs only annotators that do not support structured input. Does conflict
477 // resolution, removal of disallowed entities and sorting on both new
478 // generated candidates and passed in entities.
479 // Returns Status::Error if the annotation failed, in which case the vector of
480 // candidates should be ignored.
481 Status AnnotateSingleInput(const std::string& context,
482 const AnnotationOptions& options,
483 std::vector<AnnotatedSpan>* candidates) const;
484
485 // Parses the money amount into whole and decimal part and fills in the
486 // entity data information.
487 bool ParseAndFillInMoneyAmount(std::string* serialized_entity_data,
488 const UniLib::RegexMatcher* match,
489 const RegexModel_::Pattern* config,
490 const UnicodeText& context_unicode) const;
491
492 // Given the regex capturing groups, extract the one representing the money
493 // quantity and fills in the actual string and the power of 10 the amount
494 // should be multiplied with.
495 void GetMoneyQuantityFromCapturingGroup(const UniLib::RegexMatcher* match,
496 const RegexModel_::Pattern* config,
497 const UnicodeText& context_unicode,
498 std::string* quantity,
499 int* exponent) const;
500
501 // Returns true if any of the ff-model entity types is enabled.
502 bool IsAnyModelEntityTypeEnabled(
503 const EnabledEntityTypes& is_entity_type_enabled) const;
504
505 // Returns true if any of the regex entity types is enabled.
506 bool IsAnyRegexEntityTypeEnabled(
507 const EnabledEntityTypes& is_entity_type_enabled) const;
508
509 // Returns true if any of the POD NER entity types is enabled.
510 bool IsAnyPodNerEntityTypeEnabled(
511 const EnabledEntityTypes& is_entity_type_enabled) const;
512
513 std::unique_ptr<ScopedMmap> mmap_;
514 bool initialized_ = false;
515 bool enabled_for_annotation_ = false;
516 bool enabled_for_classification_ = false;
517 bool enabled_for_selection_ = false;
518 std::unordered_set<std::string> filtered_collections_annotation_;
519 std::unordered_set<std::string> filtered_collections_classification_;
520 std::unordered_set<std::string> filtered_collections_selection_;
521
522 std::vector<CompiledRegexPattern> regex_patterns_;
523
524 // Indices into regex_patterns_ for the different modes.
525 std::vector<int> annotation_regex_patterns_, classification_regex_patterns_,
526 selection_regex_patterns_;
527
528 const UniLib* unilib_;
529 const CalendarLib* calendarlib_;
530
531 std::unique_ptr<const KnowledgeEngine> knowledge_engine_;
532 std::unique_ptr<const ContactEngine> contact_engine_;
533 std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
534 std::unique_ptr<const NumberAnnotator> number_annotator_;
535 std::unique_ptr<const DurationAnnotator> duration_annotator_;
536 std::unique_ptr<const PersonNameEngine> person_name_engine_;
537 std::unique_ptr<const TranslateAnnotator> translate_annotator_;
538 std::unique_ptr<const PodNerAnnotator> pod_ner_annotator_;
539 std::unique_ptr<const ExperimentalAnnotator> experimental_annotator_;
540 std::unique_ptr<const VocabAnnotator> vocab_annotator_;
541
542 // Builder for creating extra data.
543 const reflection::Schema* entity_data_schema_;
544 std::unique_ptr<MutableFlatbufferBuilder> entity_data_builder_;
545
546 // Locales for which the entire model triggers.
547 std::vector<Locale> model_triggering_locales_;
548
549 // Locales for which the ML model triggers.
550 std::vector<Locale> ml_model_triggering_locales_;
551
552 // Locales that the dictionary classification support.
553 std::vector<Locale> dictionary_locales_;
554
555 // Decimal and thousands number separators.
556 std::unordered_set<char32> money_separators_;
557
558 // Model for language identification.
559 const libtextclassifier3::mobile::lang_id::LangId* lang_id_ = nullptr;
560
561 // If true, will prioritize the longest annotation during conflict resolution.
562 bool prioritize_longest_annotation_ = false;
563
564 // If true, the annotator will perform conflict resolution between the
565 // different sub-annotators also in the RAW mode. If false, no conflict
566 // resolution will be performed in RAW mode.
567 bool do_conflict_resolution_in_raw_mode_ = true;
568 };
569
570 namespace internal {
571
572 // Helper function, which if the initial 'span' contains only white-spaces,
573 // moves the selection to a single-codepoint selection on the left side
574 // of this block of white-space.
575 CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,
576 const UnicodeText& context_unicode,
577 const UniLib& unilib);
578
579 // Copies tokens from 'cached_tokens' that are
580 // 'tokens_around_selection_to_copy' (on the left, and right) tokens distant
581 // from the tokens that correspond to 'selection_indices'.
582 std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,
583 const CodepointSpan& selection_indices,
584 TokenSpan tokens_around_selection_to_copy);
585 } // namespace internal
586
587 // Interprets the buffer as a Model flatbuffer and returns it for reading.
588 const Model* ViewModel(const void* buffer, int size);
589
590 // Opens model from given path and runs a function, passing the loaded Model
591 // flatbuffer as an argument.
592 //
593 // This is mainly useful if we don't want to pay the cost for the model
594 // initialization because we'll be only reading some flatbuffer values from the
595 // file.
596 template <typename ReturnType, typename Func>
VisitAnnotatorModel(const std::string & path,Func function)597 ReturnType VisitAnnotatorModel(const std::string& path, Func function) {
598 ScopedMmap mmap(path);
599 if (!mmap.handle().ok()) {
600 function(/*model=*/nullptr);
601 }
602 const Model* model =
603 ViewModel(mmap.handle().start(), mmap.handle().num_bytes());
604 return function(model);
605 }
606
607 } // namespace libtextclassifier3
608
609 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
610