1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2023 Google LLC. All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7
8 #ifndef UPB_TEXT_ENCODE_INTERNAL_H_
9 #define UPB_TEXT_ENCODE_INTERNAL_H_
10
11 #include <stdarg.h>
12 #include <string.h>
13
14 #include "upb/base/descriptor_constants.h"
15 #include "upb/base/string_view.h"
16 #include "upb/message/array.h"
17 #include "upb/message/internal/map_sorter.h"
18 #include "upb/port/vsnprintf_compat.h"
19 #include "upb/text/options.h"
20 #include "upb/wire/eps_copy_input_stream.h"
21 #include "utf8_range.h"
22
23 // Must be last.
24 #include "upb/port/def.inc"
25
26 typedef struct {
27 char *buf, *ptr, *end;
28 size_t overflow;
29 int indent_depth;
30 int options;
31 const struct upb_DefPool* ext_pool;
32 _upb_mapsorter sorter;
33 } txtenc;
34
UPB_PRIVATE(_upb_TextEncode_PutBytes)35 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e,
36 const void* data,
37 size_t len) {
38 size_t have = e->end - e->ptr;
39 if (UPB_LIKELY(have >= len)) {
40 memcpy(e->ptr, data, len);
41 e->ptr += len;
42 } else {
43 if (have) {
44 memcpy(e->ptr, data, have);
45 e->ptr += have;
46 }
47 e->overflow += (len - have);
48 }
49 }
50
UPB_PRIVATE(_upb_TextEncode_PutStr)51 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e,
52 const char* str) {
53 UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, strlen(str));
54 }
55
UPB_PRIVATE(_upb_TextEncode_Printf)56 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt,
57 ...) {
58 size_t n;
59 size_t have = e->end - e->ptr;
60 va_list args;
61
62 va_start(args, fmt);
63 n = _upb_vsnprintf(e->ptr, have, fmt, args);
64 va_end(args);
65
66 if (UPB_LIKELY(have > n)) {
67 e->ptr += n;
68 } else {
69 e->ptr = UPB_PTRADD(e->ptr, have);
70 e->overflow += (n - have);
71 }
72 }
73
UPB_PRIVATE(_upb_TextEncode_Indent)74 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) {
75 if ((e->options & UPB_TXTENC_SINGLELINE) == 0) {
76 int i = e->indent_depth;
77 while (i-- > 0) {
78 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " ");
79 }
80 }
81 }
82
UPB_PRIVATE(_upb_TextEncode_EndField)83 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) {
84 if (e->options & UPB_TXTENC_SINGLELINE) {
85 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " ");
86 } else {
87 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\n");
88 }
89 }
90
UPB_PRIVATE(_upb_TextEncode_Escaped)91 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e,
92 unsigned char ch) {
93 switch (ch) {
94 case '\n':
95 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\n");
96 break;
97 case '\r':
98 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\r");
99 break;
100 case '\t':
101 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\t");
102 break;
103 case '\"':
104 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\"");
105 break;
106 case '\'':
107 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\'");
108 break;
109 case '\\':
110 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\\");
111 break;
112 default:
113 UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch);
114 break;
115 }
116 }
117
118 // Returns true if `ch` needs to be escaped in TextFormat, independent of any
119 // UTF-8 validity issues.
UPB_PRIVATE(_upb_DefinitelyNeedsEscape)120 UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) {
121 if (ch < 32) return true;
122 switch (ch) {
123 case '\"':
124 case '\'':
125 case '\\':
126 case 127:
127 return true;
128 }
129 return false;
130 }
131
UPB_PRIVATE(_upb_AsciiIsPrint)132 UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) {
133 return ch >= 32 && ch < 127;
134 }
135
136 // Returns true if this is a high byte that requires UTF-8 validation. If the
137 // UTF-8 validation fails, we must escape the byte.
UPB_PRIVATE(_upb_NeedsUtf8Validation)138 UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) {
139 return ch > 127;
140 }
141
142 // Returns the number of bytes in the prefix of `val` that do not need escaping.
143 // This is like utf8_range::SpanStructurallyValid(), except that it also
144 // terminates at any ASCII char that needs to be escaped in TextFormat (any char
145 // that has `DefinitelyNeedsEscape(ch) == true`).
146 //
147 // If we could get a variant of utf8_range::SpanStructurallyValid() that could
148 // terminate on any of these chars, that might be more efficient, but it would
149 // be much more complicated to modify that heavily SIMD code.
UPB_PRIVATE(_SkipPassthroughBytes)150 UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr,
151 size_t size) {
152 for (size_t i = 0; i < size; i++) {
153 unsigned char uc = ptr[i];
154 if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i;
155 if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) {
156 // Find the end of this region of consecutive high bytes, so that we only
157 // give high bytes to the UTF-8 checker. This avoids needing to perform
158 // a second scan of the ASCII characters looking for characters that
159 // need escaping.
160 //
161 // We assume that high bytes are less frequent than plain, printable ASCII
162 // bytes, so we accept the double-scan of high bytes.
163 size_t end = i + 1;
164 for (; end < size; end++) {
165 if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break;
166 }
167 size_t n = end - i;
168 size_t ok = utf8_range_ValidPrefix(ptr + i, n);
169 if (ok != n) return i + ok;
170 i += ok - 1;
171 }
172 }
173 return size;
174 }
175
UPB_PRIVATE(_upb_HardenedPrintString)176 UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e,
177 const char* ptr,
178 size_t len) {
179 // Print as UTF-8, while guarding against any invalid UTF-8 in the string
180 // field.
181 //
182 // If in the future we have a guaranteed invariant that invalid UTF-8 will
183 // never be present, we could avoid the UTF-8 check here.
184 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
185 const char* end = ptr + len;
186 while (ptr < end) {
187 size_t n = UPB_PRIVATE(_SkipPassthroughBytes)(ptr, end - ptr);
188 if (n != 0) {
189 UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, n);
190 ptr += n;
191 if (ptr == end) break;
192 }
193
194 // If repeated calls to CEscape() and PrintString() are expensive, we could
195 // consider batching them, at the cost of some complexity.
196 UPB_PRIVATE(_upb_TextEncode_Escaped)(e, *ptr);
197 ptr++;
198 }
199 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
200 }
201
UPB_PRIVATE(_upb_TextEncode_Bytes)202 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e,
203 upb_StringView data) {
204 const char* ptr = data.data;
205 const char* end = ptr + data.size;
206 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
207 for (; ptr < end; ptr++) {
208 unsigned char uc = *ptr;
209 if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) {
210 UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1);
211 } else {
212 UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc);
213 }
214 }
215 UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
216 }
217
UPB_PRIVATE(_upb_TextEncode_Nullz)218 UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) {
219 size_t ret = e->ptr - e->buf + e->overflow;
220
221 if (size > 0) {
222 if (e->ptr == e->end) e->ptr--;
223 *e->ptr = '\0';
224 }
225
226 return ret;
227 }
228
229 const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr,
230 upb_EpsCopyInputStream* stream,
231 int groupnum);
232
233 // Must not be called for ctype = kUpb_CType_Enum, as they require different
234 // handling depending on whether or not we're doing reflection-based encoding.
235 void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val,
236 upb_CType ctype);
237
238 #include "upb/port/undef.inc"
239
240 #endif // UPB_TEXT_ENCODE_INTERNAL_H_
241