• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2023 Google LLC.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 #ifndef UPB_TEXT_ENCODE_INTERNAL_H_
9 #define UPB_TEXT_ENCODE_INTERNAL_H_
10 
11 #include <stdarg.h>
12 #include <string.h>
13 
14 #include "upb/base/descriptor_constants.h"
15 #include "upb/base/string_view.h"
16 #include "upb/message/array.h"
17 #include "upb/message/internal/map_sorter.h"
18 #include "upb/port/vsnprintf_compat.h"
19 #include "upb/text/options.h"
20 #include "upb/wire/eps_copy_input_stream.h"
21 #include "utf8_range.h"
22 
23 // Must be last.
24 #include "upb/port/def.inc"
25 
26 typedef struct {
27   char *buf, *ptr, *end;
28   size_t overflow;
29   int indent_depth;
30   int options;
31   const struct upb_DefPool* ext_pool;
32   _upb_mapsorter sorter;
33 } txtenc;
34 
UPB_PRIVATE(_upb_TextEncode_PutBytes)35 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutBytes)(txtenc* e,
36                                                       const void* data,
37                                                       size_t len) {
38   size_t have = e->end - e->ptr;
39   if (UPB_LIKELY(have >= len)) {
40     memcpy(e->ptr, data, len);
41     e->ptr += len;
42   } else {
43     if (have) {
44       memcpy(e->ptr, data, have);
45       e->ptr += have;
46     }
47     e->overflow += (len - have);
48   }
49 }
50 
UPB_PRIVATE(_upb_TextEncode_PutStr)51 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_PutStr)(txtenc* e,
52                                                     const char* str) {
53   UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, str, strlen(str));
54 }
55 
UPB_PRIVATE(_upb_TextEncode_Printf)56 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Printf)(txtenc* e, const char* fmt,
57                                                     ...) {
58   size_t n;
59   size_t have = e->end - e->ptr;
60   va_list args;
61 
62   va_start(args, fmt);
63   n = _upb_vsnprintf(e->ptr, have, fmt, args);
64   va_end(args);
65 
66   if (UPB_LIKELY(have > n)) {
67     e->ptr += n;
68   } else {
69     e->ptr = UPB_PTRADD(e->ptr, have);
70     e->overflow += (n - have);
71   }
72 }
73 
UPB_PRIVATE(_upb_TextEncode_Indent)74 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Indent)(txtenc* e) {
75   if ((e->options & UPB_TXTENC_SINGLELINE) == 0) {
76     int i = e->indent_depth;
77     while (i-- > 0) {
78       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "  ");
79     }
80   }
81 }
82 
UPB_PRIVATE(_upb_TextEncode_EndField)83 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_EndField)(txtenc* e) {
84   if (e->options & UPB_TXTENC_SINGLELINE) {
85     UPB_PRIVATE(_upb_TextEncode_PutStr)(e, " ");
86   } else {
87     UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\n");
88   }
89 }
90 
UPB_PRIVATE(_upb_TextEncode_Escaped)91 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Escaped)(txtenc* e,
92                                                      unsigned char ch) {
93   switch (ch) {
94     case '\n':
95       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\n");
96       break;
97     case '\r':
98       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\r");
99       break;
100     case '\t':
101       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\t");
102       break;
103     case '\"':
104       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\"");
105       break;
106     case '\'':
107       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\'");
108       break;
109     case '\\':
110       UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\\\\");
111       break;
112     default:
113       UPB_PRIVATE(_upb_TextEncode_Printf)(e, "\\%03o", ch);
114       break;
115   }
116 }
117 
118 // Returns true if `ch` needs to be escaped in TextFormat, independent of any
119 // UTF-8 validity issues.
UPB_PRIVATE(_upb_DefinitelyNeedsEscape)120 UPB_INLINE bool UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(unsigned char ch) {
121   if (ch < 32) return true;
122   switch (ch) {
123     case '\"':
124     case '\'':
125     case '\\':
126     case 127:
127       return true;
128   }
129   return false;
130 }
131 
UPB_PRIVATE(_upb_AsciiIsPrint)132 UPB_INLINE bool UPB_PRIVATE(_upb_AsciiIsPrint)(unsigned char ch) {
133   return ch >= 32 && ch < 127;
134 }
135 
136 // Returns true if this is a high byte that requires UTF-8 validation.  If the
137 // UTF-8 validation fails, we must escape the byte.
UPB_PRIVATE(_upb_NeedsUtf8Validation)138 UPB_INLINE bool UPB_PRIVATE(_upb_NeedsUtf8Validation)(unsigned char ch) {
139   return ch > 127;
140 }
141 
142 // Returns the number of bytes in the prefix of `val` that do not need escaping.
143 // This is like utf8_range::SpanStructurallyValid(), except that it also
144 // terminates at any ASCII char that needs to be escaped in TextFormat (any char
145 // that has `DefinitelyNeedsEscape(ch) == true`).
146 //
147 // If we could get a variant of utf8_range::SpanStructurallyValid() that could
148 // terminate on any of these chars, that might be more efficient, but it would
149 // be much more complicated to modify that heavily SIMD code.
UPB_PRIVATE(_SkipPassthroughBytes)150 UPB_INLINE size_t UPB_PRIVATE(_SkipPassthroughBytes)(const char* ptr,
151                                                      size_t size) {
152   for (size_t i = 0; i < size; i++) {
153     unsigned char uc = ptr[i];
154     if (UPB_PRIVATE(_upb_DefinitelyNeedsEscape)(uc)) return i;
155     if (UPB_PRIVATE(_upb_NeedsUtf8Validation)(uc)) {
156       // Find the end of this region of consecutive high bytes, so that we only
157       // give high bytes to the UTF-8 checker.  This avoids needing to perform
158       // a second scan of the ASCII characters looking for characters that
159       // need escaping.
160       //
161       // We assume that high bytes are less frequent than plain, printable ASCII
162       // bytes, so we accept the double-scan of high bytes.
163       size_t end = i + 1;
164       for (; end < size; end++) {
165         if (!UPB_PRIVATE(_upb_NeedsUtf8Validation)(ptr[end])) break;
166       }
167       size_t n = end - i;
168       size_t ok = utf8_range_ValidPrefix(ptr + i, n);
169       if (ok != n) return i + ok;
170       i += ok - 1;
171     }
172   }
173   return size;
174 }
175 
UPB_PRIVATE(_upb_HardenedPrintString)176 UPB_INLINE void UPB_PRIVATE(_upb_HardenedPrintString)(txtenc* e,
177                                                       const char* ptr,
178                                                       size_t len) {
179   // Print as UTF-8, while guarding against any invalid UTF-8 in the string
180   // field.
181   //
182   // If in the future we have a guaranteed invariant that invalid UTF-8 will
183   // never be present, we could avoid the UTF-8 check here.
184   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
185   const char* end = ptr + len;
186   while (ptr < end) {
187     size_t n = UPB_PRIVATE(_SkipPassthroughBytes)(ptr, end - ptr);
188     if (n != 0) {
189       UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, n);
190       ptr += n;
191       if (ptr == end) break;
192     }
193 
194     // If repeated calls to CEscape() and PrintString() are expensive, we could
195     // consider batching them, at the cost of some complexity.
196     UPB_PRIVATE(_upb_TextEncode_Escaped)(e, *ptr);
197     ptr++;
198   }
199   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
200 }
201 
UPB_PRIVATE(_upb_TextEncode_Bytes)202 UPB_INLINE void UPB_PRIVATE(_upb_TextEncode_Bytes)(txtenc* e,
203                                                    upb_StringView data) {
204   const char* ptr = data.data;
205   const char* end = ptr + data.size;
206   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
207   for (; ptr < end; ptr++) {
208     unsigned char uc = *ptr;
209     if (UPB_PRIVATE(_upb_AsciiIsPrint)(uc)) {
210       UPB_PRIVATE(_upb_TextEncode_PutBytes)(e, ptr, 1);
211     } else {
212       UPB_PRIVATE(_upb_TextEncode_Escaped)(e, uc);
213     }
214   }
215   UPB_PRIVATE(_upb_TextEncode_PutStr)(e, "\"");
216 }
217 
UPB_PRIVATE(_upb_TextEncode_Nullz)218 UPB_INLINE size_t UPB_PRIVATE(_upb_TextEncode_Nullz)(txtenc* e, size_t size) {
219   size_t ret = e->ptr - e->buf + e->overflow;
220 
221   if (size > 0) {
222     if (e->ptr == e->end) e->ptr--;
223     *e->ptr = '\0';
224   }
225 
226   return ret;
227 }
228 
// Declared here; the definition is not in this header.
// NOTE(review): presumably prints the unknown fields read from `stream`
// starting at `ptr` and returns the advanced read position, with `groupnum`
// identifying the enclosing group -- confirm against the definition.
const char* UPB_PRIVATE(_upb_TextEncode_Unknown)(txtenc* e, const char* ptr,
                                                 upb_EpsCopyInputStream* stream,
                                                 int groupnum);
232 
// Declared here; the definition is not in this header.
// NOTE(review): presumably emits the text form of `val` interpreted as
// `ctype` -- confirm against the definition.
//
// Must not be called for ctype = kUpb_CType_Enum, as they require different
// handling depending on whether or not we're doing reflection-based encoding.
void UPB_PRIVATE(_upb_TextEncode_Scalar)(txtenc* e, upb_MessageValue val,
                                         upb_CType ctype);
237 
238 #include "upb/port/undef.inc"
239 
240 #endif  // UPB_TEXT_ENCODE_INTERNAL_H_
241