• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 //
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file or at
6 // https://developers.google.com/open-source/licenses/bsd
7 
8 #include "google/protobuf/json/internal/writer.h"
9 
10 #include <cstdint>
11 #include <initializer_list>
12 #include <limits>
13 #include <utility>
14 
15 #include "absl/algorithm/container.h"
16 #include "absl/log/absl_check.h"
17 
18 // Must be included last.
19 #include "google/protobuf/port_def.inc"
20 
21 namespace google {
22 namespace protobuf {
23 namespace json_internal {
24 
25 // Tries to write a non-finite double if necessary; returns false if
26 // nothing was written.
MaybeWriteSpecialFp(double val)27 bool JsonWriter::MaybeWriteSpecialFp(double val) {
28   if (val == std::numeric_limits<double>::infinity()) {
29     Write("\"Infinity\"");
30   } else if (val == -std::numeric_limits<double>::infinity()) {
31     Write("\"-Infinity\"");
32   } else if (std::isnan(val)) {
33     Write("\"NaN\"");
34   } else {
35     return false;
36   }
37   return true;
38 }
39 
WriteBase64(absl::string_view str)40 void JsonWriter::WriteBase64(absl::string_view str) {
41   // This is the regular base64, not the "web-safe" version.
42   constexpr absl::string_view kBase64 =
43       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
44   const char* ptr = str.data();
45   const char* end = ptr + str.size();
46 
47   // Reads the `n`th character off of `ptr` while gracefully avoiding
48   // sign extension due to implicit conversions
49   auto read = [&](size_t n) {
50     return static_cast<size_t>(static_cast<uint8_t>(ptr[n]));
51   };
52 
53   char buf[4];
54   absl::string_view view(buf, sizeof(buf));
55   Write("\"");
56 
57   while (end - ptr >= 3) {
58     buf[0] = kBase64[read(0) >> 2];
59     buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
60     buf[2] = kBase64[((read(1) & 0xf) << 2) | (read(2) >> 6)];
61     buf[3] = kBase64[read(2) & 0x3f];
62     Write(view);
63     ptr += 3;
64   }
65 
66   switch (end - ptr) {
67     case 2:
68       buf[0] = kBase64[read(0) >> 2];
69       buf[1] = kBase64[((read(0) & 0x3) << 4) | (read(1) >> 4)];
70       buf[2] = kBase64[(read(1) & 0xf) << 2];
71       buf[3] = '=';
72       Write(view);
73       break;
74     case 1:
75       buf[0] = kBase64[read(0) >> 2];
76       buf[1] = kBase64[((read(0) & 0x3) << 4)];
77       buf[2] = '=';
78       buf[3] = '=';
79       Write(view);
80       break;
81   }
82 
83   Write("\"");
84 }
85 
// First UTF-16 code unit of the high-surrogate (a.k.a. leading-surrogate)
// range.
// See http://www.unicode.org/glossary/#high_surrogate_code_unit
static constexpr uint16_t kMinHighSurrogate = 0xD800;

// First UTF-16 code unit of the low-surrogate (a.k.a. trailing-surrogate)
// range.
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMinLowSurrogate = 0xDC00;

// Last UTF-16 code unit of the low-surrogate (a.k.a. trailing-surrogate)
// range.
// See http://www.unicode.org/glossary/#low_surrogate_code_unit
static constexpr uint16_t kMaxLowSurrogate = 0xDFFF;

// Smallest supplementary code point, i.e. the first one that does not fit in
// a single UTF-16 code unit.
// See http://www.unicode.org/glossary/#supplementary_code_point
static constexpr uint32_t kMinSupplementaryCodePoint = 0x010000;

// Largest Unicode code point.
// See http://www.unicode.org/glossary/#code_point
static constexpr uint32_t kMaxCodePoint = 0x10FFFF;

// Marker for a decoding failure. It is greater than kMaxCodePoint, so it can
// never be mistaken for a valid Unicode scalar.
static constexpr uint32_t kErrorSentinel = 0xAAAAAAAA;
111 
112 // A Unicode Scalar encoded two ways.
113 struct Utf8Scalar {
114   // The Unicode scalar value as a 32-bit integer. If decoding failed, this
115   // is equal to kErrorSentinel.
116   uint32_t u32;
117   // The Unicode scalar value encoded as UTF-8 bytes. May not reflect the
118   // contents of `u32` if it is kErrorSentinel.
119   absl::string_view utf8;
120 };
121 
122 // Parses a single UTF-8-encoded Unicode scalar from `str`. Returns a pair of
123 // the scalar and the UTF-8-encoded content corresponding to it from `str`.
124 //
125 // Returns U+FFFD on failure, and consumes an unspecified number of bytes in
126 // doing so.
ConsumeUtf8Scalar(absl::string_view & str)127 static Utf8Scalar ConsumeUtf8Scalar(absl::string_view& str) {
128   ABSL_DCHECK(!str.empty());
129   uint32_t scalar = static_cast<uint8_t>(str[0]);
130   const char* start = str.data();
131   size_t len = 1;
132 
133   str = str.substr(1);
134 
135   // Verify this is valid UTF-8. UTF-8 is a varint encoding satisfying
136   // one of the following (big-endian) patterns:
137   //
138   // 0b0xxxxxxx
139   // 0b110xxxxx'10xxxxxx
140   // 0b1110xxxx'10xxxxxx'10xxxxxx
141   // 0b11110xxx'10xxxxxx'10xxxxxx'10xxxxxx
142   //
143   // We don't need to decode it; just validate it.
144   int lookahead = 0;
145   switch (absl::countl_one(static_cast<uint8_t>(scalar))) {
146     case 0:
147       break;
148     case 2:
149       lookahead = 1;
150       scalar &= (1 << 5) - 1;
151       break;
152     case 3:
153       lookahead = 2;
154       scalar &= (1 << 4) - 1;
155       break;
156     case 4:
157       lookahead = 3;
158       scalar &= (1 << 3) - 1;
159       break;
160     default:
161       scalar = kErrorSentinel;
162       break;
163   }
164 
165   for (int i = 0; i < lookahead; ++i) {
166     if (str.empty()) {
167       scalar = kErrorSentinel;
168       break;
169     }
170 
171     uint8_t next = str[0];
172     str = str.substr(1);
173     ++len;
174 
175     // Looking for top 2 bits are 0b10.
176     if (next >> 6 != 2) {
177       scalar = kErrorSentinel;
178       break;
179     }
180     next &= (1 << 6) - 1;
181     scalar <<= 6;
182     scalar |= next;
183   }
184 
185   if (scalar > kMaxCodePoint) {
186     scalar = kErrorSentinel;
187   }
188 
189   return {scalar, absl::string_view(start, len)};
190 }
191 
192 // Decides whether we must escape `scalar`.
193 //
194 // If the given Unicode scalar would not use a \u escape, `custom_escape` will
195 // be set to a non-empty string.
MustEscape(uint32_t scalar,absl::string_view & custom_escape)196 static bool MustEscape(uint32_t scalar, absl::string_view& custom_escape) {
197   switch (scalar) {
198     // These escapes are defined by the JSON spec. We do not escape /.
199     case '\n':
200       custom_escape = R"(\n)";
201       return true;
202     case '\r':
203       custom_escape = R"(\r)";
204       return true;
205     case '\t':
206       custom_escape = R"(\t)";
207       return true;
208     case '\"':
209       custom_escape = R"(\")";
210       return true;
211     case '\f':
212       custom_escape = R"(\f)";
213       return true;
214     case '\b':
215       custom_escape = R"(\b)";
216       return true;
217     case '\\':
218       custom_escape = R"(\\)";
219       return true;
220 
221     case kErrorSentinel:
222       // Decoding failure turns into spaces, *not* replacement characters. We
223       // handle this separately from "normal" spaces so that it follows the
224       // escaping code-path.
225       //
226       // Note that literal replacement characters in the input string DO NOT
227       // get turned into spaces; this is only for decoding failures!
228       custom_escape = " ";
229       return true;
230 
231     // These are not required by the JSON spec, but help
232     // to prevent security bugs in JavaScript.
233     //
234     // These were originally present in the ESF parser, so they are kept for
235     // legacy compatibility (and because escaping most of these is in good
236     // taste, regardless).
237     case '<':
238     case '>':
239     case 0xfeff:      // Zero width no-break space.
240     case 0xfff9:      // Interlinear annotation anchor.
241     case 0xfffa:      // Interlinear annotation separator.
242     case 0xfffb:      // Interlinear annotation terminator.
243     case 0x00ad:      // Soft-hyphen.
244     case 0x06dd:      // Arabic end of ayah.
245     case 0x070f:      // Syriac abbreviation mark.
246     case 0x17b4:      // Khmer vowel inherent Aq.
247     case 0x17b5:      // Khmer vowel inherent Aa.
248     case 0x000e0001:  // Language tag.
249       return true;
250     default:
251       static constexpr std::pair<uint32_t, uint32_t> kEscapedRanges[] = {
252           {0x0000, 0x001f},          // ASCII control.
253           {0x007f, 0x009f},          // High ASCII bytes.
254           {0x0600, 0x0603},          // Arabic signs.
255           {0x200b, 0x200f},          // Zero width etc.
256           {0x2028, 0x202e},          // Separators etc.
257           {0x2060, 0x2064},          // Invisible etc.
258           {0x206a, 0x206f},          // Shaping etc.
259           {0x0001d173, 0x0001d17a},  // Music formatting.
260           {0x000e0020, 0x000e007f},  // TAG symbols.
261       };
262 
263       return absl::c_any_of(kEscapedRanges, [scalar](auto range) {
264         return range.first <= scalar && scalar <= range.second;
265       });
266   }
267 }
268 
WriteEscapedUtf8(absl::string_view str)269 void JsonWriter::WriteEscapedUtf8(absl::string_view str) {
270   while (!str.empty()) {
271     auto scalar = ConsumeUtf8Scalar(str);
272     absl::string_view custom_escape;
273 
274     if (!MustEscape(scalar.u32, custom_escape)) {
275       Write(scalar.utf8);
276       continue;
277     }
278 
279     if (!custom_escape.empty()) {
280       Write(custom_escape);
281       continue;
282     }
283 
284     if (scalar.u32 < 0x10000) {
285       WriteUEscape(scalar.u32);
286       continue;
287     }
288 
289     uint16_t lo =
290         (scalar.u32 & (kMaxLowSurrogate - kMinLowSurrogate)) + kMinLowSurrogate;
291     uint16_t hi = (scalar.u32 >> 10) +
292                   (kMinHighSurrogate - (kMinSupplementaryCodePoint >> 10));
293     WriteUEscape(hi);
294     WriteUEscape(lo);
295   }
296 }
297 
WriteUEscape(uint16_t val)298 void JsonWriter::WriteUEscape(uint16_t val) {
299   char hex[7];
300   int len = absl::SNPrintF(hex, sizeof(hex), R"(\u%04x)", val);
301   Write(absl::string_view(hex, static_cast<size_t>(len)));
302 }
303 }  // namespace json_internal
304 }  // namespace protobuf
305 }  // namespace google
306 
307 #include "google/protobuf/port_undef.inc"
308