1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // Inference code for the text classification model.
18
19 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
20 #define LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
21
22 #include <memory>
23 #include <set>
24 #include <string>
25 #include <unordered_set>
26 #include <vector>
27
28 #include "annotator/contact/contact-engine.h"
29 #include "annotator/datetime/datetime-grounder.h"
30 #include "annotator/datetime/parser.h"
31 #include "annotator/duration/duration.h"
32 #include "annotator/experimental/experimental.h"
33 #include "annotator/feature-processor.h"
34 #include "annotator/grammar/grammar-annotator.h"
35 #include "annotator/installed_app/installed-app-engine.h"
36 #include "annotator/knowledge/knowledge-engine.h"
37 #include "annotator/model-executor.h"
38 #include "annotator/model_generated.h"
39 #include "annotator/number/number.h"
40 #include "annotator/person_name/person-name-engine.h"
41 #include "annotator/pod_ner/pod-ner.h"
42 #include "annotator/strip-unpaired-brackets.h"
43 #include "annotator/translate/translate.h"
44 #include "annotator/types.h"
45 #include "annotator/vocab/vocab-annotator.h"
46 #include "annotator/zlib-utils.h"
47 #include "utils/base/status.h"
48 #include "utils/base/statusor.h"
49 #include "utils/calendar/calendar.h"
50 #include "utils/flatbuffers/flatbuffers.h"
51 #include "utils/flatbuffers/mutable.h"
52 #include "utils/i18n/locale.h"
53 #include "utils/memory/mmap.h"
54 #include "utils/utf8/unicodetext.h"
55 #include "utils/utf8/unilib.h"
56 #include "utils/zlib/zlib.h"
57 #include "lang_id/lang-id.h"
58
59 namespace libtextclassifier3 {
60
61 // Holds TFLite interpreters for selection and classification models.
62 // NOTE: This class is not thread-safe, thus should NOT be re-used across
63 // threads.
64 class InterpreterManager {
65 public:
66 // The constructor can be called with nullptr for any of the executors, and is
67 // a defined behavior, as long as the corresponding *Interpreter() method is
68 // not called when the executor is null.
InterpreterManager(const ModelExecutor * selection_executor,const ModelExecutor * classification_executor)69 InterpreterManager(const ModelExecutor* selection_executor,
70 const ModelExecutor* classification_executor)
71 : selection_executor_(selection_executor),
72 classification_executor_(classification_executor) {}
73
74 // Gets or creates and caches an interpreter for the selection model.
75 tflite::Interpreter* SelectionInterpreter();
76
77 // Gets or creates and caches an interpreter for the classification model.
78 tflite::Interpreter* ClassificationInterpreter();
79
80 private:
81 const ModelExecutor* selection_executor_;
82 const ModelExecutor* classification_executor_;
83
84 std::unique_ptr<tflite::Interpreter> selection_interpreter_;
85 std::unique_ptr<tflite::Interpreter> classification_interpreter_;
86 };
87
// Predicate deciding whether an entity type is enabled for annotation.
// An empty set means "no restriction": every entity type is reported as
// enabled.
// NOTE: The set is held by reference, not copied, so it has to outlive this
// object.
class EnabledEntityTypes {
 public:
  explicit EnabledEntityTypes(
      const std::unordered_set<std::string>& entity_types)
      : entity_types_(entity_types) {}

  // Returns true if 'entity_type' is enabled, i.e. the set is empty or it
  // contains 'entity_type'.
  bool operator()(const std::string& entity_type) const {
    if (entity_types_.empty()) {
      return true;
    }
    return entity_types_.count(entity_type) != 0;
  }

 private:
  // Not owned.
  const std::unordered_set<std::string>& entity_types_;
};
104
105 // A text processing model that provides text classification, annotation,
106 // selection suggestion for various types.
107 // NOTE: This class is not thread-safe.
108 class Annotator {
109 public:
110 static std::unique_ptr<Annotator> FromUnownedBuffer(
111 const char* buffer, int size, const UniLib* unilib = nullptr,
112 const CalendarLib* calendarlib = nullptr);
113 // Copies the underlying model buffer string.
114 static std::unique_ptr<Annotator> FromString(
115 const std::string& buffer, const UniLib* unilib = nullptr,
116 const CalendarLib* calendarlib = nullptr);
117 // Takes ownership of the mmap.
118 static std::unique_ptr<Annotator> FromScopedMmap(
119 std::unique_ptr<ScopedMmap>* mmap, const UniLib* unilib = nullptr,
120 const CalendarLib* calendarlib = nullptr);
121 static std::unique_ptr<Annotator> FromScopedMmap(
122 std::unique_ptr<ScopedMmap>* mmap, std::unique_ptr<UniLib> unilib,
123 std::unique_ptr<CalendarLib> calendarlib);
124 static std::unique_ptr<Annotator> FromFileDescriptor(
125 int fd, int offset, int size, const UniLib* unilib = nullptr,
126 const CalendarLib* calendarlib = nullptr);
127 static std::unique_ptr<Annotator> FromFileDescriptor(
128 int fd, int offset, int size, std::unique_ptr<UniLib> unilib,
129 std::unique_ptr<CalendarLib> calendarlib);
130 static std::unique_ptr<Annotator> FromFileDescriptor(
131 int fd, const UniLib* unilib = nullptr,
132 const CalendarLib* calendarlib = nullptr);
133 static std::unique_ptr<Annotator> FromFileDescriptor(
134 int fd, std::unique_ptr<UniLib> unilib,
135 std::unique_ptr<CalendarLib> calendarlib);
136 static std::unique_ptr<Annotator> FromPath(
137 const std::string& path, const UniLib* unilib = nullptr,
138 const CalendarLib* calendarlib = nullptr);
139 static std::unique_ptr<Annotator> FromPath(
140 const std::string& path, std::unique_ptr<UniLib> unilib,
141 std::unique_ptr<CalendarLib> calendarlib);
142
143 // Returns true if the model is ready for use.
IsInitialized()144 bool IsInitialized() { return initialized_; }
145
146 // Initializes the knowledge engine with the given config.
147 bool InitializeKnowledgeEngine(const std::string& serialized_config);
148
149 // Initializes the contact engine with the given config.
150 bool InitializeContactEngine(const std::string& serialized_config);
151
152 // Initializes the installed app engine with the given config.
153 bool InitializeInstalledAppEngine(const std::string& serialized_config);
154
155 // Initializes the person name engine with the given person name model in the
156 // provided buffer. The buffer needs to outlive the annotator.
157 bool InitializePersonNameEngineFromUnownedBuffer(const void* buffer,
158 int size);
159
160 // Initializes the person name engine with the given person name model from
161 // the provided mmap.
162 bool InitializePersonNameEngineFromScopedMmap(const ScopedMmap& mmap);
163
164 // Initializes the person name engine with the given person name model in the
165 // provided file path.
166 bool InitializePersonNameEngineFromPath(const std::string& path);
167
168 // Initializes the person name engine with the given person name model in the
169 // provided file descriptor.
170 bool InitializePersonNameEngineFromFileDescriptor(int fd, int offset,
171 int size);
172
173 // Initializes the experimental annotators if available.
174 // Returns true if there is an implementation of experimental annotators
175 // linked in.
176 bool InitializeExperimentalAnnotators();
177
178 // Sets up the lang-id instance that should be used.
179 bool SetLangId(const libtextclassifier3::mobile::lang_id::LangId* lang_id);
180
181 // Runs inference for given a context and current selection (i.e. index
182 // of the first and one past last selected characters (utf8 codepoint
183 // offsets)). Returns the indices (utf8 codepoint offsets) of the selection
184 // beginning character and one past selection end character.
185 // Returns the original click_indices if an error occurs.
186 // NOTE: The selection indices are passed in and returned in terms of
187 // UTF8 codepoints (not bytes).
188 // Requires that the model is a smart selection model.
189 CodepointSpan SuggestSelection(
190 const std::string& context, CodepointSpan click_indices,
191 const SelectionOptions& options = SelectionOptions()) const;
192
193 // Classifies the selected text given the context string.
194 // Returns an empty result if an error occurs.
195 std::vector<ClassificationResult> ClassifyText(
196 const std::string& context, const CodepointSpan& selection_indices,
197 const ClassificationOptions& options = ClassificationOptions()) const;
198
199 // Annotates the given structed input request. Models which handle the full
200 // context request will receive all the metadata they require. While models
201 // that don't use the extra context are called using only a string.
202 // For each fragment the annotations are sorted by their position in
203 // the fragment and exclude spans classified as 'other'.
204 //
205 // The number of vectors of annotated spans will match the number
206 // of input fragments. The order of annotation span vectors will match the
207 // order of input fragments. If annotation is not possible for any of the
208 // annotators, no annotation is returned.
209 StatusOr<Annotations> AnnotateStructuredInput(
210 const std::vector<InputFragment>& string_fragments,
211 const AnnotationOptions& options = AnnotationOptions()) const;
212
213 // Annotates given input text. The annotations are sorted by their position
214 // in the context string and exclude spans classified as 'other'.
215 std::vector<AnnotatedSpan> Annotate(
216 const std::string& context,
217 const AnnotationOptions& options = AnnotationOptions()) const;
218
219 // Looks up a knowledge entity by its id. Returns the serialized knowledge
220 // result.
221 StatusOr<std::string> LookUpKnowledgeEntity(const std::string& id) const;
222
223 // Looks up an entity's property.
224 StatusOr<std::string> LookUpKnowledgeEntityProperty(
225 const std::string& mid_str, const std::string& property) const;
226
227 const Model* model() const;
228 const reflection::Schema* entity_data_schema() const;
229
230 // Exposes the feature processor for tests and evaluations.
231 const FeatureProcessor* SelectionFeatureProcessorForTests() const;
232 const FeatureProcessor* ClassificationFeatureProcessorForTests() const;
233
234 // Exposes the date time parser for tests and evaluations.
235 const DatetimeParser* DatetimeParserForTests() const;
236
237 static const std::string& kPhoneCollection;
238 static const std::string& kAddressCollection;
239 static const std::string& kDateCollection;
240 static const std::string& kUrlCollection;
241 static const std::string& kEmailCollection;
242
243 protected:
244 struct ScoredChunk {
245 TokenSpan token_span;
246 float score;
247 };
248
249 // NOTE: ValidateAndInitialize needs to be called before any other method.
Annotator()250 Annotator() : initialized_(false) {}
251
252 // Checks that model contains all required fields, and initializes internal
253 // datastructures.
254 // Needs to be called before any other method is.
255 void ValidateAndInitialize(const Model* model, const UniLib* unilib,
256 const CalendarLib* calendarlib);
257
258 // Initializes regular expressions for the regex model.
259 bool InitializeRegexModel(ZlibDecompressor* decompressor);
260
261 // Resolves conflicts in the list of candidates by removing some overlapping
262 // ones. Returns indices of the surviving ones.
263 // NOTE: Assumes that the candidates are sorted according to their position in
264 // the span.
265 bool ResolveConflicts(const std::vector<AnnotatedSpan>& candidates,
266 const std::string& context,
267 const std::vector<Token>& cached_tokens,
268 const std::vector<Locale>& detected_text_language_tags,
269 const BaseOptions& options,
270 InterpreterManager* interpreter_manager,
271 std::vector<int>* result) const;
272
273 // Resolves one conflict between candidates on indices 'start_index'
274 // (inclusive) and 'end_index' (exclusive). Assigns the winning candidate
275 // indices to 'chosen_indices'. Returns false if a problem arises.
276 bool ResolveConflict(const std::string& context,
277 const std::vector<Token>& cached_tokens,
278 const std::vector<AnnotatedSpan>& candidates,
279 const std::vector<Locale>& detected_text_language_tags,
280 int start_index, int end_index,
281 const BaseOptions& options,
282 InterpreterManager* interpreter_manager,
283 std::vector<int>* chosen_indices) const;
284
285 // Gets selection candidates from the ML model.
286 // Provides the tokens produced during tokenization of the context string for
287 // reuse.
288 bool ModelSuggestSelection(
289 const UnicodeText& context_unicode, const CodepointSpan& click_indices,
290 const std::vector<Locale>& detected_text_language_tags,
291 InterpreterManager* interpreter_manager, std::vector<Token>* tokens,
292 std::vector<AnnotatedSpan>* result) const;
293
294 // Classifies the selected text given the context string with the
295 // classification model.
296 // The following arguments are optional:
297 // - cached_tokens - can be given as empty
298 // - embedding_cache - can be given as nullptr
299 // - tokens - can be given as nullptr
300 // Returns true if no error occurred.
301 bool ModelClassifyText(
302 const std::string& context, const std::vector<Token>& cached_tokens,
303 const std::vector<Locale>& detected_text_language_tags,
304 const CodepointSpan& selection_indices, const BaseOptions& options,
305 InterpreterManager* interpreter_manager,
306 FeatureProcessor::EmbeddingCache* embedding_cache,
307 std::vector<ClassificationResult>* classification_results,
308 std::vector<Token>* tokens) const;
309
310 // Same as above, but (for optimization) takes the context as UnicodeText and
311 // takes the following extra arguments:
312 // - span_begin, span_end - iterators in context_unicode corresponding to
313 // selection_indices
314 // - line - a UnicodeTextRange within context_unicode corresponding to the
315 // line containing the selection - optional, can be given as nullptr
316 bool ModelClassifyText(
317 const UnicodeText& context_unicode,
318 const std::vector<Token>& cached_tokens,
319 const std::vector<Locale>& detected_text_language_tags,
320 const UnicodeText::const_iterator& span_begin,
321 const UnicodeText::const_iterator& span_end, const UnicodeTextRange* line,
322 const CodepointSpan& selection_indices, const BaseOptions& options,
323 InterpreterManager* interpreter_manager,
324 FeatureProcessor::EmbeddingCache* embedding_cache,
325 std::vector<ClassificationResult>* classification_results,
326 std::vector<Token>* tokens) const;
327
328 // Returns a relative token span that represents how many tokens on the left
329 // from the selection and right from the selection are needed for the
330 // classifier input.
331 TokenSpan ClassifyTextUpperBoundNeededTokens() const;
332
333 // Classifies the selected text with the regular expressions models.
334 // Returns true if no error happened, false otherwise.
335 bool RegexClassifyText(
336 const std::string& context, const CodepointSpan& selection_indices,
337 std::vector<ClassificationResult>* classification_result) const;
338
339 // Classifies the selected text with the date time model.
340 // Returns true if no error happened, false otherwise.
341 bool DatetimeClassifyText(
342 const std::string& context, const CodepointSpan& selection_indices,
343 const ClassificationOptions& options,
344 std::vector<ClassificationResult>* classification_results) const;
345
346 // Chunks given input text with the selection model and classifies the spans
347 // with the classification model.
348 // The annotations are sorted by their position in the context string and
349 // exclude spans classified as 'other'.
350 // Provides the tokens produced during tokenization of the context string for
351 // reuse.
352 bool ModelAnnotate(const std::string& context,
353 const std::vector<Locale>& detected_text_language_tags,
354 const AnnotationOptions& options,
355 InterpreterManager* interpreter_manager,
356 std::vector<Token>* tokens,
357 std::vector<AnnotatedSpan>* result) const;
358
359 // Groups the tokens into chunks. A chunk is a token span that should be the
360 // suggested selection when any of its contained tokens is clicked. The chunks
361 // are non-overlapping and are sorted by their position in the context string.
362 // "num_tokens" is the total number of tokens available (as this method does
363 // not need the actual vector of tokens).
364 // "span_of_interest" is a span of all the tokens that could be clicked.
365 // The resulting chunks all have to overlap with it and they cover this span
366 // completely. The first and last chunk might extend beyond it.
367 // The chunks vector is cleared before filling.
368 bool ModelChunk(int num_tokens, const TokenSpan& span_of_interest,
369 tflite::Interpreter* selection_interpreter,
370 const CachedFeatures& cached_features,
371 std::vector<TokenSpan>* chunks) const;
372
373 // A helper method for ModelChunk(). It generates scored chunk candidates for
374 // a click context model.
375 // NOTE: The returned chunks can (and most likely do) overlap.
376 bool ModelClickContextScoreChunks(
377 int num_tokens, const TokenSpan& span_of_interest,
378 const CachedFeatures& cached_features,
379 tflite::Interpreter* selection_interpreter,
380 std::vector<ScoredChunk>* scored_chunks) const;
381
382 // A helper method for ModelChunk(). It generates scored chunk candidates for
383 // a bounds-sensitive model.
384 // NOTE: The returned chunks can (and most likely do) overlap.
385 bool ModelBoundsSensitiveScoreChunks(
386 int num_tokens, const TokenSpan& span_of_interest,
387 const TokenSpan& inference_span, const CachedFeatures& cached_features,
388 tflite::Interpreter* selection_interpreter,
389 std::vector<ScoredChunk>* scored_chunks) const;
390
391 // Produces chunks isolated by a set of regular expressions.
392 bool RegexChunk(const UnicodeText& context_unicode,
393 const std::vector<int>& rules,
394 bool is_serialized_entity_data_enabled,
395 const EnabledEntityTypes& enabled_entity_types,
396 const AnnotationUsecase& annotation_usecase,
397
398 std::vector<AnnotatedSpan>* result) const;
399
400 // Produces chunks from the datetime parser.
401 bool DatetimeChunk(const UnicodeText& context_unicode,
402 int64 reference_time_ms_utc,
403 const std::string& reference_timezone,
404 const std::string& locales, ModeFlag mode,
405 AnnotationUsecase annotation_usecase,
406 bool is_serialized_entity_data_enabled,
407 std::vector<AnnotatedSpan>* result) const;
408
409 // Returns whether a classification should be filtered.
410 bool FilteredForAnnotation(const AnnotatedSpan& span) const;
411 bool FilteredForClassification(
412 const ClassificationResult& classification) const;
413 bool FilteredForSelection(const AnnotatedSpan& span) const;
414
415 // Computes the selection boundaries from a regular expression match.
416 CodepointSpan ComputeSelectionBoundaries(
417 const UniLib::RegexMatcher* match,
418 const RegexModel_::Pattern* config) const;
419
420 // Returns whether a regex pattern provides entity data from a match.
421 bool HasEntityData(const RegexModel_::Pattern* pattern) const;
422
423 // Constructs and serializes entity data from regex matches.
424 bool SerializedEntityDataFromRegexMatch(
425 const RegexModel_::Pattern* pattern, UniLib::RegexMatcher* matcher,
426 std::string* serialized_entity_data) const;
427
428 // For knowledge candidates which have a ContactPointer, fill in the
429 // appropriate contact metadata, if possible.
430 void AddContactMetadataToKnowledgeClassificationResults(
431 std::vector<AnnotatedSpan>* candidates) const;
432
433 // Gets priority score from the list of classification results.
434 float GetPriorityScore(
435 const std::vector<ClassificationResult>& classification) const;
436
437 // Verifies a regex match and returns true if verification was successful.
438 bool VerifyRegexMatchCandidate(
439 const std::string& context,
440 const VerificationOptions* verification_options, const std::string& match,
441 const UniLib::RegexMatcher* matcher) const;
442
443 const Model* model_;
444
445 std::unique_ptr<const ModelExecutor> selection_executor_;
446 std::unique_ptr<const ModelExecutor> classification_executor_;
447 std::unique_ptr<const EmbeddingExecutor> embedding_executor_;
448
449 std::unique_ptr<const FeatureProcessor> selection_feature_processor_;
450 std::unique_ptr<const FeatureProcessor> classification_feature_processor_;
451
452 std::unique_ptr<const grammar::Analyzer> analyzer_;
453 std::unique_ptr<const DatetimeGrounder> datetime_grounder_;
454 std::unique_ptr<const DatetimeParser> datetime_parser_;
455 std::unique_ptr<const GrammarAnnotator> grammar_annotator_;
456
457 std::string owned_buffer_;
458 std::unique_ptr<UniLib> owned_unilib_;
459 std::unique_ptr<CalendarLib> owned_calendarlib_;
460
461 private:
462 struct CompiledRegexPattern {
463 const RegexModel_::Pattern* config;
464 std::unique_ptr<UniLib::RegexPattern> pattern;
465 };
466
467 // Removes annotations the entity type of which is not in the set of enabled
468 // entity types.
469 void RemoveNotEnabledEntityTypes(
470 const EnabledEntityTypes& is_entity_type_enabled,
471 std::vector<AnnotatedSpan>* annotated_spans) const;
472
473 // Runs only annotators that do not support structured input. Does conflict
474 // resolution, removal of disallowed entities and sorting on both new
475 // generated candidates and passed in entities.
476 // Returns Status::Error if the annotation failed, in which case the vector of
477 // candidates should be ignored.
478 Status AnnotateSingleInput(const std::string& context,
479 const AnnotationOptions& options,
480 std::vector<AnnotatedSpan>* candidates) const;
481
482 // Parses the money amount into whole and decimal part and fills in the
483 // entity data information.
484 bool ParseAndFillInMoneyAmount(std::string* serialized_entity_data,
485 const UniLib::RegexMatcher* match,
486 const RegexModel_::Pattern* config,
487 const UnicodeText& context_unicode) const;
488
489 // Given the regex capturing groups, extract the one representing the money
490 // quantity and fills in the actual string and the power of 10 the amount
491 // should be multiplied with.
492 void GetMoneyQuantityFromCapturingGroup(const UniLib::RegexMatcher* match,
493 const RegexModel_::Pattern* config,
494 const UnicodeText& context_unicode,
495 std::string* quantity,
496 int* exponent) const;
497
498 // Returns true if any of the ff-model entity types is enabled.
499 bool IsAnyModelEntityTypeEnabled(
500 const EnabledEntityTypes& is_entity_type_enabled) const;
501
502 // Returns true if any of the regex entity types is enabled.
503 bool IsAnyRegexEntityTypeEnabled(
504 const EnabledEntityTypes& is_entity_type_enabled) const;
505
506 // Returns true if any of the POD NER entity types is enabled.
507 bool IsAnyPodNerEntityTypeEnabled(
508 const EnabledEntityTypes& is_entity_type_enabled) const;
509
510 std::unique_ptr<ScopedMmap> mmap_;
511 bool initialized_ = false;
512 bool enabled_for_annotation_ = false;
513 bool enabled_for_classification_ = false;
514 bool enabled_for_selection_ = false;
515 std::unordered_set<std::string> filtered_collections_annotation_;
516 std::unordered_set<std::string> filtered_collections_classification_;
517 std::unordered_set<std::string> filtered_collections_selection_;
518
519 std::vector<CompiledRegexPattern> regex_patterns_;
520
521 // Indices into regex_patterns_ for the different modes.
522 std::vector<int> annotation_regex_patterns_, classification_regex_patterns_,
523 selection_regex_patterns_;
524
525 const UniLib* unilib_;
526 const CalendarLib* calendarlib_;
527
528 std::unique_ptr<const KnowledgeEngine> knowledge_engine_;
529 std::unique_ptr<const ContactEngine> contact_engine_;
530 std::unique_ptr<const InstalledAppEngine> installed_app_engine_;
531 std::unique_ptr<const NumberAnnotator> number_annotator_;
532 std::unique_ptr<const DurationAnnotator> duration_annotator_;
533 std::unique_ptr<const PersonNameEngine> person_name_engine_;
534 std::unique_ptr<const TranslateAnnotator> translate_annotator_;
535 std::unique_ptr<const PodNerAnnotator> pod_ner_annotator_;
536 std::unique_ptr<const ExperimentalAnnotator> experimental_annotator_;
537 std::unique_ptr<const VocabAnnotator> vocab_annotator_;
538
539 // Builder for creating extra data.
540 const reflection::Schema* entity_data_schema_;
541 std::unique_ptr<MutableFlatbufferBuilder> entity_data_builder_;
542
543 // Locales for which the entire model triggers.
544 std::vector<Locale> model_triggering_locales_;
545
546 // Locales for which the ML model triggers.
547 std::vector<Locale> ml_model_triggering_locales_;
548
549 // Locales that the dictionary classification support.
550 std::vector<Locale> dictionary_locales_;
551
552 // Decimal and thousands number separators.
553 std::unordered_set<char32> money_separators_;
554
555 // Model for language identification.
556 const libtextclassifier3::mobile::lang_id::LangId* lang_id_ = nullptr;
557
558 // If true, will prioritize the longest annotation during conflict resolution.
559 bool prioritize_longest_annotation_ = false;
560
561 // If true, the annotator will perform conflict resolution between the
562 // different sub-annotators also in the RAW mode. If false, no conflict
563 // resolution will be performed in RAW mode.
564 bool do_conflict_resolution_in_raw_mode_ = true;
565 };
566
567 namespace internal {
568
569 // Helper function, which if the initial 'span' contains only white-spaces,
570 // moves the selection to a single-codepoint selection on the left side
571 // of this block of white-space.
572 CodepointSpan SnapLeftIfWhitespaceSelection(const CodepointSpan& span,
573 const UnicodeText& context_unicode,
574 const UniLib& unilib);
575
576 // Copies tokens from 'cached_tokens' that are
577 // 'tokens_around_selection_to_copy' (on the left, and right) tokens distant
578 // from the tokens that correspond to 'selection_indices'.
579 std::vector<Token> CopyCachedTokens(const std::vector<Token>& cached_tokens,
580 const CodepointSpan& selection_indices,
581 TokenSpan tokens_around_selection_to_copy);
582 } // namespace internal
583
584 // Interprets the buffer as a Model flatbuffer and returns it for reading.
585 const Model* ViewModel(const void* buffer, int size);
586
587 // Opens model from given path and runs a function, passing the loaded Model
588 // flatbuffer as an argument.
589 //
590 // This is mainly useful if we don't want to pay the cost for the model
591 // initialization because we'll be only reading some flatbuffer values from the
592 // file.
593 template <typename ReturnType, typename Func>
VisitAnnotatorModel(const std::string & path,Func function)594 ReturnType VisitAnnotatorModel(const std::string& path, Func function) {
595 ScopedMmap mmap(path);
596 if (!mmap.handle().ok()) {
597 function(/*model=*/nullptr);
598 }
599 const Model* model =
600 ViewModel(mmap.handle().start(), mmap.handle().num_bytes());
601 return function(model);
602 }
603
604 } // namespace libtextclassifier3
605
606 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_ANNOTATOR_H_
607