1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_
18 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_
19
20 #include <functional>
21 #include <vector>
22
23 #include "annotator/types.h"
24 #include "utils/grammar/types.h"
25 #include "utils/strings/stringpiece.h"
26
27 namespace libtextclassifier3::grammar {
28
29 // Represents a single match that was found for a particular nonterminal.
30 // Instances should be created by calling Matcher::AllocateMatch().
31 // This uses an arena to allocate matches (and subclasses thereof).
32 struct Match {
33 static constexpr int16 kUnknownType = 0;
34 static constexpr int16 kTokenType = -1;
35 static constexpr int16 kDigitsType = -2;
36 static constexpr int16 kBreakType = -3;
37 static constexpr int16 kAssertionMatch = -4;
38 static constexpr int16 kMappingMatch = -5;
39 static constexpr int16 kExclusionMatch = -6;
40 static constexpr int16 kAnnotationMatch = -7;
41
42 void Init(const Nonterm arg_lhs, const CodepointSpan arg_codepoint_span,
43 const int arg_match_offset, const int arg_type = kUnknownType) {
44 lhs = arg_lhs;
45 codepoint_span = arg_codepoint_span;
46 match_offset = arg_match_offset;
47 type = arg_type;
48 rhs1 = nullptr;
49 rhs2 = nullptr;
50 }
51
InitMatch52 void Init(const Match& other) { *this = other; }
53
54 // For binary rule matches: rhs1 != NULL and rhs2 != NULL
55 // unary rule matches: rhs1 == NULL and rhs2 != NULL
56 // terminal rule matches: rhs1 != NULL and rhs2 == NULL
57 // custom leaves: rhs1 == NULL and rhs2 == NULL
IsInteriorNodeMatch58 bool IsInteriorNode() const { return rhs2 != nullptr; }
IsLeafMatch59 bool IsLeaf() const { return !rhs2; }
60
IsBinaryRuleMatch61 bool IsBinaryRule() const { return rhs1 && rhs2; }
IsUnaryRuleMatch62 bool IsUnaryRule() const { return !rhs1 && rhs2; }
IsTerminalRuleMatch63 bool IsTerminalRule() const { return rhs1 && !rhs2; }
HasLeadingWhitespaceMatch64 bool HasLeadingWhitespace() const {
65 return codepoint_span.first != match_offset;
66 }
67
unary_rule_rhsMatch68 const Match* unary_rule_rhs() const { return rhs2; }
69
70 // Used in singly-linked queue of matches for processing.
71 Match* next = nullptr;
72
73 // Nonterminal we found a match for.
74 Nonterm lhs = kUnassignedNonterm;
75
76 // Type of the match.
77 int16 type = kUnknownType;
78
79 // The span in codepoints.
80 CodepointSpan codepoint_span;
81
82 // The begin codepoint offset used during matching.
83 // This is usually including any prefix whitespace.
84 int match_offset;
85
86 union {
87 // The first sub match for binary rules.
88 const Match* rhs1 = nullptr;
89
90 // The terminal, for terminal rules.
91 const char* terminal;
92 };
93 // First or second sub-match for interior nodes.
94 const Match* rhs2 = nullptr;
95 };
96
97 // Match type to keep track of associated values.
98 struct MappingMatch : public Match {
99 // The associated id or value.
100 int64 id;
101 };
102
103 // Match type to keep track of assertions.
104 struct AssertionMatch : public Match {
105 // If true, the assertion is negative and will be valid if the input doesn't
106 // match.
107 bool negative;
108 };
109
110 // Match type to define exclusions.
111 struct ExclusionMatch : public Match {
112 // The nonterminal that denotes matches to exclude from a successful match.
113 // So the match is only valid if there is no match of `exclusion_nonterm`
114 // spanning the same text range.
115 Nonterm exclusion_nonterm;
116 };
117
118 // Match to represent an annotator annotated span in the grammar.
119 struct AnnotationMatch : public Match {
120 const ClassificationResult* annotation;
121 };
122
123 // Utility functions for parse tree traversal.
124
125 // Does a preorder traversal, calling `node_fn` on each node.
126 // `node_fn` is expected to return whether to continue expanding a node.
127 void Traverse(const Match* root,
128 const std::function<bool(const Match*)>& node_fn);
129
130 // Does a preorder traversal, calling `pred_fn` and returns the first node
131 // on which `pred_fn` returns true.
132 const Match* SelectFirst(const Match* root,
133 const std::function<bool(const Match*)>& pred_fn);
134
135 // Does a preorder traversal, selecting all nodes where `pred_fn` returns true.
136 std::vector<const Match*> SelectAll(
137 const Match* root, const std::function<bool(const Match*)>& pred_fn);
138
139 // Selects all terminals from a parse tree.
SelectTerminals(const Match * root)140 inline std::vector<const Match*> SelectTerminals(const Match* root) {
141 return SelectAll(root, &Match::IsTerminalRule);
142 }
143
144 // Selects all leaves from a parse tree.
SelectLeaves(const Match * root)145 inline std::vector<const Match*> SelectLeaves(const Match* root) {
146 return SelectAll(root, &Match::IsLeaf);
147 }
148
149 // Retrieves the first child node of a given type.
150 template <typename T>
SelectFirstOfType(const Match * root,const int16 type)151 const T* SelectFirstOfType(const Match* root, const int16 type) {
152 return static_cast<const T*>(SelectFirst(
153 root, [type](const Match* node) { return node->type == type; }));
154 }
155
156 // Retrieves all nodes of a given type.
157 template <typename T>
SelectAllOfType(const Match * root,const int16 type)158 const std::vector<const T*> SelectAllOfType(const Match* root,
159 const int16 type) {
160 std::vector<const T*> result;
161 Traverse(root, [&result, type](const Match* node) {
162 if (node->type == type) {
163 result.push_back(static_cast<const T*>(node));
164 }
165 return true;
166 });
167 return result;
168 }
169
170 } // namespace libtextclassifier3::grammar
171
172 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_
173