1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
18 #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
19
20 #include <functional>
21 #include <vector>
22
23 #include "annotator/types.h"
24 #include "utils/grammar/semantics/expression_generated.h"
25 #include "utils/grammar/types.h"
26 #include "utils/strings/stringpiece.h"
27
28 namespace libtextclassifier3::grammar {
29
30 // Represents a parse tree for a match that was found for a nonterminal.
31 struct ParseTree {
32 enum class Type : int8 {
33 // Default, untyped match.
34 kDefault = 0,
35
36 // An assertion match (see: AssertionNode).
37 kAssertion = 1,
38
39 // A value mapping match (see: MappingNode).
40 kMapping = 2,
41
42 // An exclusion match (see: ExclusionNode).
43 kExclusion = 3,
44
45 // A match for an annotation (see: AnnotationNode).
46 kAnnotation = 4,
47
48 // A match for a semantic annotation (see: SemanticExpressionNode).
49 kExpression = 5,
50 };
51
52 explicit ParseTree() = default;
ParseTreeParseTree53 explicit ParseTree(const Nonterm lhs, const CodepointSpan& codepoint_span,
54 const int match_offset, const Type type)
55 : lhs(lhs),
56 type(type),
57 codepoint_span(codepoint_span),
58 match_offset(match_offset) {}
59
60 // For binary rule matches: rhs1 != NULL and rhs2 != NULL
61 // unary rule matches: rhs1 == NULL and rhs2 != NULL
62 // terminal rule matches: rhs1 != NULL and rhs2 == NULL
63 // custom leaves: rhs1 == NULL and rhs2 == NULL
IsInteriorNodeParseTree64 bool IsInteriorNode() const { return rhs2 != nullptr; }
IsLeafParseTree65 bool IsLeaf() const { return !rhs2; }
66
IsBinaryRuleParseTree67 bool IsBinaryRule() const { return rhs1 && rhs2; }
IsUnaryRuleParseTree68 bool IsUnaryRule() const { return !rhs1 && rhs2; }
IsTerminalRuleParseTree69 bool IsTerminalRule() const { return rhs1 && !rhs2; }
HasLeadingWhitespaceParseTree70 bool HasLeadingWhitespace() const {
71 return codepoint_span.first != match_offset;
72 }
73
unary_rule_rhsParseTree74 const ParseTree* unary_rule_rhs() const { return rhs2; }
75
76 // Used in singly-linked queue of matches for processing.
77 ParseTree* next = nullptr;
78
79 // Nonterminal we found a match for.
80 Nonterm lhs = kUnassignedNonterm;
81
82 // Type of the match.
83 Type type = Type::kDefault;
84
85 // The span in codepoints.
86 CodepointSpan codepoint_span;
87
88 // The begin codepoint offset used during matching.
89 // This is usually including any prefix whitespace.
90 int match_offset;
91
92 union {
93 // The first sub match for binary rules.
94 const ParseTree* rhs1 = nullptr;
95
96 // The terminal, for terminal rules.
97 const char* terminal;
98 };
99 // First or second sub-match for interior nodes.
100 const ParseTree* rhs2 = nullptr;
101 };
102
103 // Node type to keep track of associated values.
104 struct MappingNode : public ParseTree {
MappingNodeMappingNode105 explicit MappingNode(const Nonterm arg_lhs,
106 const CodepointSpan arg_codepoint_span,
107 const int arg_match_offset, const int64 arg_value)
108 : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
109 Type::kMapping),
110 id(arg_value) {}
111 // The associated id or value.
112 int64 id;
113 };
114
115 // Node type to keep track of assertions.
116 struct AssertionNode : public ParseTree {
AssertionNodeAssertionNode117 explicit AssertionNode(const Nonterm arg_lhs,
118 const CodepointSpan arg_codepoint_span,
119 const int arg_match_offset, const bool arg_negative)
120 : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
121 Type::kAssertion),
122 negative(arg_negative) {}
123 // If true, the assertion is negative and will be valid if the input doesn't
124 // match.
125 bool negative;
126 };
127
128 // Node type to define exclusions.
129 struct ExclusionNode : public ParseTree {
ExclusionNodeExclusionNode130 explicit ExclusionNode(const Nonterm arg_lhs,
131 const CodepointSpan arg_codepoint_span,
132 const int arg_match_offset,
133 const Nonterm arg_exclusion_nonterm)
134 : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
135 Type::kExclusion),
136 exclusion_nonterm(arg_exclusion_nonterm) {}
137 // The nonterminal that denotes matches to exclude from a successful match.
138 // So the match is only valid if there is no match of `exclusion_nonterm`
139 // spanning the same text range.
140 Nonterm exclusion_nonterm;
141 };
142
143 // Match to represent an annotator annotated span in the grammar.
144 struct AnnotationNode : public ParseTree {
AnnotationNodeAnnotationNode145 explicit AnnotationNode(const Nonterm arg_lhs,
146 const CodepointSpan arg_codepoint_span,
147 const int arg_match_offset,
148 const ClassificationResult* arg_annotation)
149 : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
150 Type::kAnnotation),
151 annotation(arg_annotation) {}
152 const ClassificationResult* annotation;
153 };
154
155 // Node type to represent an associated semantic expression.
156 struct SemanticExpressionNode : public ParseTree {
SemanticExpressionNodeSemanticExpressionNode157 explicit SemanticExpressionNode(const Nonterm arg_lhs,
158 const CodepointSpan arg_codepoint_span,
159 const int arg_match_offset,
160 const SemanticExpression* arg_expression)
161 : ParseTree(arg_lhs, arg_codepoint_span, arg_match_offset,
162 Type::kExpression),
163 expression(arg_expression) {}
164 const SemanticExpression* expression;
165 };
166
167 // Utility functions for parse tree traversal.
168
169 // Does a preorder traversal, calling `node_fn` on each node.
170 // `node_fn` is expected to return whether to continue expanding a node.
171 void Traverse(const ParseTree* root,
172 const std::function<bool(const ParseTree*)>& node_fn);
173
174 // Does a preorder traversal, selecting all nodes where `pred_fn` returns true.
175 std::vector<const ParseTree*> SelectAll(
176 const ParseTree* root,
177 const std::function<bool(const ParseTree*)>& pred_fn);
178
179 // Retrieves all nodes of a given type.
180 template <typename T>
SelectAllOfType(const ParseTree * root,const ParseTree::Type type)181 const std::vector<const T*> SelectAllOfType(const ParseTree* root,
182 const ParseTree::Type type) {
183 std::vector<const T*> result;
184 Traverse(root, [&result, type](const ParseTree* node) {
185 if (node->type == type) {
186 result.push_back(static_cast<const T*>(node));
187 }
188 return true;
189 });
190 return result;
191 }
192
193 } // namespace libtextclassifier3::grammar
194
195 #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_PARSING_PARSE_TREE_H_
196