utils/grammar/matcher.h

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// A token matcher based on context-free grammars.
//
// A lexer passes token to the matcher: literal terminal strings and token
// types. It passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
// The lexer passes each token along with the [begin, end) position range
// in which it occurs.  So for an input string "Groundhog February 2, 2007", the
// lexer would tell the matcher that:
//
//     "Groundhog" occurs at [0, 9)
//     <space> occurs at [9, 10)
//     "February" occurs at [10, 18)
//     <space> occurs at [18, 19)
//     <string_of_digits> occurs at [19, 20)
//     "," occurs at [20, 21)
//     <space> occurs at [21, 22)
//     <string_of_digits> occurs at [22, 26)
//
// The lexer passes tokens to the matcher by calling AddTerminal() and
// AddMatch() for literal terminals and token types, respectively.
//
// Although it is unnecessary for this example grammar, a lexer can
// output multiple tokens for the same input range.  So our lexer could
// additionally output:
//     "2" occurs at [19, 20)        // a second token for [19, 20)
//     "2007" occurs at [22, 26)
//     <syllable> occurs at [0, 6)   // overlaps with (Groundhog [0, 9))
//     <syllable> occurs at [6, 9)
// The only constraint on the lexer's output is that it has to pass tokens
// to the matcher in left-to-right order, strictly speaking, their "end"
// positions must be nondecreasing.  (This constraint allows a more
// efficient matching algorithm.)  The "begin" positions can be in any
// order.
//
// There are two kinds of supported callbacks:
// (1) OUTPUT:  Callbacks are the only output mechanism a matcher has.  For each
// "top-level" rule in your grammar, like the rule for <date> above -- something
// you're trying to find instances of -- you use a callback which the matcher
// will invoke every time it finds an instance of <date>.
// (2) FILTERS:
// Callbacks allow you to put extra conditions on when a grammar rule
// applies.  In the example grammar, the rule
//
//       <day> ::= <string_of_digits>     // must be between 1 and 31
//
// should only apply for *some* <string_of_digits> tokens, not others.
// By using a filter callback on this rule, you can tell the matcher that
// an instance of the rule's RHS is only *sometimes* considered an
// instance of its LHS.  The filter callback will get invoked whenever
// the matcher finds an instance of <string_of_digits>.  The callback can
// look at the digits and decide whether they represent a number between
// 1 and 31.  If so, the callback calls Matcher::AddMatch() to tell the
// matcher there's a <day> there.  If not, the callback simply exits
// without calling AddMatch().
//
// Technically, a FILTER callback can make any number of calls to
// AddMatch() or even AddTerminal().  But the expected usage is to just
// make zero or one call to AddMatch().  OUTPUT callbacks are not expected
// to call either of these -- output callbacks are invoked merely as a
// side-effect, not in order to decide whether a rule applies or not.
//
// In the above example, you would probably use three callbacks.  Filter
// callbacks on the rules for <day> and <year> would check the numeric
// value of the <string_of_digits>.  An output callback on the rule for
// <date> would simply increment the counter of dates found on the page.
//
// Note that callbacks are attached to rules, not to nonterminals.  You
// could have two alternative rules for <date> and use a different
// callback for each one.

#ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_
#define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_

#include <array>
#include <functional>
#include <vector>

#include "annotator/types.h"
#include "utils/base/arena.h"
#include "utils/grammar/callback-delegate.h"
#include "utils/grammar/match.h"
#include "utils/grammar/rules_generated.h"
#include "utils/strings/stringpiece.h"
#include "utils/utf8/unilib.h"

namespace libtextclassifier3::grammar {

class Matcher {
 public:
  explicit Matcher(const UniLib* unilib, const RulesSet* rules,
                   const std::vector<const RulesSet_::Rules*> rules_shards,
                   CallbackDelegate* delegate)
      : state_(STATE_DEFAULT),
        unilib_(*unilib),
        arena_(kBlocksize),
        rules_(rules),
        rules_shards_(rules_shards),
        delegate_(delegate) {
    TC3_CHECK(rules_ != nullptr);
    Reset();
  }
  explicit Matcher(const UniLib* unilib, const RulesSet* rules,
                   CallbackDelegate* delegate)
      : Matcher(unilib, rules, {}, delegate) {
    rules_shards_.reserve(rules->rules()->size());
    rules_shards_.insert(rules_shards_.end(), rules->rules()->begin(),
                         rules->rules()->end());
  }

  // Resets the matcher.
  void Reset();

  // Finish the matching.
  void Finish();

  // Tells the matcher that the given terminal was found occupying position
  // range [begin, end) in the input.
  // The matcher may invoke callback functions before returning, if this
  // terminal triggers any new matches for rules in the grammar.
  // Calls to AddTerminal() and AddMatch() must be in left-to-right order,
  // that is, the sequence of `end` values must be non-decreasing.
  void AddTerminal(const CodepointSpan codepoint_span, const int match_offset,
                   StringPiece terminal);
  void AddTerminal(const CodepointIndex begin, const CodepointIndex end,
                   StringPiece terminal) {
    AddTerminal(CodepointSpan{begin, end}, begin, terminal);
  }

  // Adds a nonterminal match to the chart.
  // This can be invoked by the lexer if the lexer needs to add nonterminals to
  // the chart.
  void AddMatch(Match* match);

  // Allocates memory from an area for a new match.
  // The `size` parameter is there to allow subclassing of the match object
  // with additional fields.
  Match* AllocateMatch(const size_t size) {
    return reinterpret_cast<Match*>(arena_.Alloc(size));
  }

  template <typename T>
  T* AllocateMatch() {
    return reinterpret_cast<T*>(arena_.Alloc(sizeof(T)));
  }

  template <typename T, typename... Args>
  T* AllocateAndInitMatch(Args... args) {
    T* match = AllocateMatch<T>();
    match->Init(args...);
    return match;
  }

  // Returns the current number of bytes allocated for all match objects.
  size_t ArenaSize() const { return arena_.status().bytes_allocated(); }

 private:
  static constexpr int kBlocksize = 16 << 10;

  // The state of the matcher.
  enum State {
    // The matcher is in the default state.
    STATE_DEFAULT = 0,

    // The matcher is currently processing queued match items.
    STATE_PROCESSING = 1,
  };
  State state_;

  // Process matches from lhs set.
  void ExecuteLhsSet(const CodepointSpan codepoint_span, const int match_offset,
                     const int whitespace_gap,
                     const std::function<void(Match*)>& initializer,
                     const RulesSet_::LhsSet* lhs_set,
                     CallbackDelegate* delegate);

  // Queues a newly created match item.
  void QueueForProcessing(Match* item);

  // Queues a match item for later post checking of the exclusion condition.
  // For exclusions we need to check that the `item->excluded_nonterminal`
  // doesn't match the same span. As we cannot know which matches have already
  // been added, we queue the item for later post checking - once all matches
  // up to `item->codepoint_span.second` have been added.
  void QueueForPostCheck(ExclusionMatch* item);

  // Adds pending items to the chart, possibly generating new matches as a
  // result.
  void ProcessPendingSet();

  // Returns whether the chart contains a match for a given nonterminal.
  bool ContainsMatch(const Nonterm nonterm, const CodepointSpan& span) const;

  // Checks all pending exclusion matches that their exclusion condition is
  // fulfilled.
  void ProcessPendingExclusionMatches();

  UniLib unilib_;

  // Memory arena for match allocation.
  UnsafeArena arena_;

  // The end position of the most recent match or terminal, for sanity
  // checking.
  int last_end_;

  // Rules.
  const RulesSet* rules_;

  // The set of items pending to be added to the chart as a singly-linked list.
  Match* pending_items_;

  // The set of items pending to be post-checked as a singly-linked list.
  ExclusionMatch* pending_exclusion_items_;

  // The chart data structure: a hashtable containing all matches, indexed by
  // their end positions.
  static constexpr int kChartHashTableNumBuckets = 1 << 8;
  static constexpr int kChartHashTableBitmask = kChartHashTableNumBuckets - 1;
  std::array<Match*, kChartHashTableNumBuckets> chart_;

  // The active rule shards.
  std::vector<const RulesSet_::Rules*> rules_shards_;

  // The callback handler.
  CallbackDelegate* delegate_;
};

}  // namespace libtextclassifier3::grammar

#endif  // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCHER_H_