1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/page/cpdf_streamparser.h"
8
9 #include <ctype.h>
10
11 #include <algorithm>
12 #include <memory>
13 #include <utility>
14
15 #include "constants/stream_dict_common.h"
16 #include "core/fpdfapi/page/cpdf_docpagedata.h"
17 #include "core/fpdfapi/parser/cpdf_array.h"
18 #include "core/fpdfapi/parser/cpdf_boolean.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_name.h"
21 #include "core/fpdfapi/parser/cpdf_null.h"
22 #include "core/fpdfapi/parser/cpdf_number.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
26 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
27 #include "core/fxcodec/data_and_bytes_consumed.h"
28 #include "core/fxcodec/jpeg/jpegmodule.h"
29 #include "core/fxcodec/scanlinedecoder.h"
30 #include "core/fxcrt/check.h"
31 #include "core/fxcrt/data_vector.h"
32 #include "core/fxcrt/fx_extension.h"
33 #include "core/fxcrt/fx_safe_types.h"
34 #include "core/fxcrt/span_util.h"
35 #include "core/fxge/calculate_pitch.h"
36
37 namespace {
38
// Deepest nesting of arrays/dictionaries ReadNextObject() will descend into.
constexpr uint32_t kMaxNestedParsingLevel = 512;
// Longest literal or hex string, in bytes, handed back by the string readers.
constexpr size_t kMaxStringLength = 32767;

// Keyword spellings that are turned into boolean / null objects.
constexpr char kTrue[] = "true";
constexpr char kFalse[] = "false";
constexpr char kNull[] = "null";
45
DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder)46 uint32_t DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder) {
47 if (!pDecoder)
48 return FX_INVALID_OFFSET;
49
50 int ncomps = pDecoder->CountComps();
51 int bpc = pDecoder->GetBPC();
52 int width = pDecoder->GetWidth();
53 int height = pDecoder->GetHeight();
54 if (width <= 0 || height <= 0)
55 return FX_INVALID_OFFSET;
56
57 std::optional<uint32_t> maybe_size =
58 fxge::CalculatePitch8(bpc, ncomps, width);
59 if (!maybe_size.has_value())
60 return FX_INVALID_OFFSET;
61
62 FX_SAFE_UINT32 size = maybe_size.value();
63 size *= height;
64 if (size.ValueOrDefault(0) == 0)
65 return FX_INVALID_OFFSET;
66
67 for (int row = 0; row < height; ++row) {
68 if (pDecoder->GetScanline(row).empty())
69 break;
70 }
71 return pDecoder->GetSrcOffset();
72 }
73
DecodeInlineStream(pdfium::span<const uint8_t> src_span,int width,int height,const ByteString & decoder,RetainPtr<const CPDF_Dictionary> pParam,uint32_t orig_size)74 uint32_t DecodeInlineStream(pdfium::span<const uint8_t> src_span,
75 int width,
76 int height,
77 const ByteString& decoder,
78 RetainPtr<const CPDF_Dictionary> pParam,
79 uint32_t orig_size) {
80 // |decoder| should not be an abbreviation.
81 DCHECK(decoder != "A85");
82 DCHECK(decoder != "AHx");
83 DCHECK(decoder != "CCF");
84 DCHECK(decoder != "DCT");
85 DCHECK(decoder != "Fl");
86 DCHECK(decoder != "LZW");
87 DCHECK(decoder != "RL");
88
89 if (decoder == "FlateDecode") {
90 return FlateOrLZWDecode(/*use_lzw=*/false, src_span, pParam.Get(),
91 /*estimated_size=*/orig_size)
92 .bytes_consumed;
93 }
94 if (decoder == "LZWDecode") {
95 return FlateOrLZWDecode(
96 /*use_lzw=*/true, src_span, pParam.Get(),
97 /*estimated_size=*/0)
98 .bytes_consumed;
99 }
100 if (decoder == "DCTDecode") {
101 std::unique_ptr<ScanlineDecoder> pDecoder = JpegModule::CreateDecoder(
102 src_span, width, height, 0,
103 !pParam || pParam->GetIntegerFor("ColorTransform", 1));
104 return DecodeAllScanlines(std::move(pDecoder));
105 }
106 if (decoder == "CCITTFaxDecode") {
107 std::unique_ptr<ScanlineDecoder> pDecoder =
108 CreateFaxDecoder(src_span, width, height, pParam.Get());
109 return DecodeAllScanlines(std::move(pDecoder));
110 }
111
112 if (decoder == "ASCII85Decode") {
113 return A85Decode(src_span).bytes_consumed;
114 }
115 if (decoder == "ASCIIHexDecode") {
116 return HexDecode(src_span).bytes_consumed;
117 }
118 if (decoder == "RunLengthDecode") {
119 return RunLengthDecode(src_span).bytes_consumed;
120 }
121
122 return FX_INVALID_OFFSET;
123 }
124
125 } // namespace
126
CPDF_StreamParser(pdfium::span<const uint8_t> span)127 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span)
128 : m_pBuf(span) {}
129
CPDF_StreamParser(pdfium::span<const uint8_t> span,const WeakPtr<ByteStringPool> & pPool)130 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span,
131 const WeakPtr<ByteStringPool>& pPool)
132 : m_pPool(pPool), m_pBuf(span) {}
133
134 CPDF_StreamParser::~CPDF_StreamParser() = default;
135
ReadInlineStream(CPDF_Document * pDoc,RetainPtr<CPDF_Dictionary> pDict,const CPDF_Object * pCSObj)136 RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
137 CPDF_Document* pDoc,
138 RetainPtr<CPDF_Dictionary> pDict,
139 const CPDF_Object* pCSObj) {
140 if (m_Pos < m_pBuf.size() && PDFCharIsWhitespace(m_pBuf[m_Pos]))
141 m_Pos++;
142
143 if (m_Pos == m_pBuf.size())
144 return nullptr;
145
146 ByteString decoder;
147 RetainPtr<const CPDF_Dictionary> pParam;
148 RetainPtr<const CPDF_Object> pFilter = pDict->GetDirectObjectFor("Filter");
149 if (pFilter) {
150 const CPDF_Array* pArray = pFilter->AsArray();
151 if (pArray) {
152 decoder = pArray->GetByteStringAt(0);
153 RetainPtr<const CPDF_Array> pParams =
154 pDict->GetArrayFor(pdfium::stream::kDecodeParms);
155 if (pParams)
156 pParam = pParams->GetDictAt(0);
157 } else {
158 decoder = pFilter->GetString();
159 pParam = pDict->GetDictFor(pdfium::stream::kDecodeParms);
160 }
161 }
162 uint32_t width = pDict->GetIntegerFor("Width");
163 uint32_t height = pDict->GetIntegerFor("Height");
164 uint32_t bpc = 1;
165 uint32_t nComponents = 1;
166 if (pCSObj) {
167 RetainPtr<CPDF_ColorSpace> pCS =
168 CPDF_DocPageData::FromDocument(pDoc)->GetColorSpace(pCSObj, nullptr);
169 nComponents = pCS ? pCS->ComponentCount() : 3;
170 bpc = pDict->GetIntegerFor("BitsPerComponent");
171 }
172 std::optional<uint32_t> maybe_size =
173 fxge::CalculatePitch8(bpc, nComponents, width);
174 if (!maybe_size.has_value())
175 return nullptr;
176
177 FX_SAFE_UINT32 size = maybe_size.value();
178 size *= height;
179 if (!size.IsValid())
180 return nullptr;
181
182 uint32_t dwOrigSize = size.ValueOrDie();
183 DataVector<uint8_t> data;
184 uint32_t dwStreamSize;
185 if (decoder.IsEmpty()) {
186 dwOrigSize = std::min<uint32_t>(dwOrigSize, m_pBuf.size() - m_Pos);
187 auto src_span = m_pBuf.subspan(m_Pos, dwOrigSize);
188 data = DataVector<uint8_t>(src_span.begin(), src_span.end());
189 dwStreamSize = dwOrigSize;
190 m_Pos += dwOrigSize;
191 } else {
192 dwStreamSize = DecodeInlineStream(m_pBuf.subspan(m_Pos), width, height,
193 decoder, std::move(pParam), dwOrigSize);
194 if (!pdfium::IsValueInRangeForNumericType<int>(dwStreamSize)) {
195 return nullptr;
196 }
197
198 uint32_t dwSavePos = m_Pos;
199 m_Pos += dwStreamSize;
200 while (true) {
201 uint32_t dwPrevPos = m_Pos;
202 ElementType type = ParseNextElement();
203 if (type == ElementType::kEndOfData)
204 break;
205
206 if (type != ElementType::kKeyword) {
207 dwStreamSize += m_Pos - dwPrevPos;
208 continue;
209 }
210 if (GetWord() == "EI") {
211 m_Pos = dwPrevPos;
212 break;
213 }
214 dwStreamSize += m_Pos - dwPrevPos;
215 }
216 m_Pos = dwSavePos;
217 auto src_span = m_pBuf.subspan(m_Pos, dwStreamSize);
218 data = DataVector<uint8_t>(src_span.begin(), src_span.end());
219 m_Pos += dwStreamSize;
220 }
221 pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(dwStreamSize));
222 return pdfium::MakeRetain<CPDF_Stream>(std::move(data), std::move(pDict));
223 }
224
ParseNextElement()225 CPDF_StreamParser::ElementType CPDF_StreamParser::ParseNextElement() {
226 m_pLastObj.Reset();
227 m_WordSize = 0;
228 if (!PositionIsInBounds())
229 return ElementType::kEndOfData;
230
231 uint8_t ch = m_pBuf[m_Pos++];
232 while (true) {
233 while (PDFCharIsWhitespace(ch)) {
234 if (!PositionIsInBounds())
235 return ElementType::kEndOfData;
236
237 ch = m_pBuf[m_Pos++];
238 }
239
240 if (ch != '%')
241 break;
242
243 while (true) {
244 if (!PositionIsInBounds())
245 return ElementType::kEndOfData;
246
247 ch = m_pBuf[m_Pos++];
248 if (PDFCharIsLineEnding(ch))
249 break;
250 }
251 }
252
253 if (PDFCharIsDelimiter(ch) && ch != '/') {
254 m_Pos--;
255 m_pLastObj = ReadNextObject(false, false, 0);
256 return ElementType::kOther;
257 }
258
259 bool bIsNumber = true;
260 while (true) {
261 if (m_WordSize < kMaxWordLength)
262 m_WordBuffer[m_WordSize++] = ch;
263
264 if (!PDFCharIsNumeric(ch))
265 bIsNumber = false;
266
267 if (!PositionIsInBounds())
268 break;
269
270 ch = m_pBuf[m_Pos++];
271
272 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
273 m_Pos--;
274 break;
275 }
276 }
277
278 m_WordBuffer[m_WordSize] = 0;
279 if (bIsNumber)
280 return ElementType::kNumber;
281
282 if (m_WordBuffer[0] == '/')
283 return ElementType::kName;
284
285 if (m_WordSize == 4) {
286 if (GetWord() == kTrue) {
287 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(true);
288 return ElementType::kOther;
289 }
290 if (GetWord() == kNull) {
291 m_pLastObj = pdfium::MakeRetain<CPDF_Null>();
292 return ElementType::kOther;
293 }
294 } else if (m_WordSize == 5) {
295 if (GetWord() == kFalse) {
296 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(false);
297 return ElementType::kOther;
298 }
299 }
300 return ElementType::kKeyword;
301 }
302
ReadNextObject(bool bAllowNestedArray,bool bInArray,uint32_t dwRecursionLevel)303 RetainPtr<CPDF_Object> CPDF_StreamParser::ReadNextObject(
304 bool bAllowNestedArray,
305 bool bInArray,
306 uint32_t dwRecursionLevel) {
307 bool bIsNumber;
308 // Must get the next word before returning to avoid infinite loops.
309 GetNextWord(bIsNumber);
310 if (!m_WordSize || dwRecursionLevel > kMaxNestedParsingLevel)
311 return nullptr;
312
313 if (bIsNumber) {
314 m_WordBuffer[m_WordSize] = 0;
315 return pdfium::MakeRetain<CPDF_Number>(GetWord());
316 }
317
318 int first_char = m_WordBuffer[0];
319 if (first_char == '/') {
320 ByteString name = PDF_NameDecode(GetWord().Substr(1));
321 return pdfium::MakeRetain<CPDF_Name>(m_pPool, name);
322 }
323
324 if (first_char == '(') {
325 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadString());
326 }
327
328 if (first_char == '<') {
329 if (m_WordSize == 1) {
330 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(),
331 CPDF_String::DataType::kIsHex);
332 }
333
334 auto pDict = pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
335 while (true) {
336 GetNextWord(bIsNumber);
337 if (m_WordSize == 2 && m_WordBuffer[0] == '>')
338 break;
339
340 if (!m_WordSize || m_WordBuffer[0] != '/')
341 return nullptr;
342
343 ByteString key = PDF_NameDecode(GetWord().Substr(1));
344 RetainPtr<CPDF_Object> pObj =
345 ReadNextObject(true, bInArray, dwRecursionLevel + 1);
346 if (!pObj)
347 return nullptr;
348
349 pDict->SetFor(key, std::move(pObj));
350 }
351 return pDict;
352 }
353
354 if (first_char == '[') {
355 if ((!bAllowNestedArray && bInArray))
356 return nullptr;
357
358 auto pArray = pdfium::MakeRetain<CPDF_Array>();
359 while (true) {
360 RetainPtr<CPDF_Object> pObj =
361 ReadNextObject(bAllowNestedArray, true, dwRecursionLevel + 1);
362 if (pObj) {
363 pArray->Append(std::move(pObj));
364 continue;
365 }
366 if (!m_WordSize || m_WordBuffer[0] == ']')
367 break;
368 }
369 return pArray;
370 }
371
372 if (GetWord() == kFalse)
373 return pdfium::MakeRetain<CPDF_Boolean>(false);
374 if (GetWord() == kTrue)
375 return pdfium::MakeRetain<CPDF_Boolean>(true);
376 if (GetWord() == kNull)
377 return pdfium::MakeRetain<CPDF_Null>();
378 return nullptr;
379 }
380
381 // TODO(npm): the following methods are almost identical in cpdf_syntaxparser
GetNextWord(bool & bIsNumber)382 void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
383 m_WordSize = 0;
384 bIsNumber = true;
385 if (!PositionIsInBounds())
386 return;
387
388 uint8_t ch = m_pBuf[m_Pos++];
389 while (true) {
390 while (PDFCharIsWhitespace(ch)) {
391 if (!PositionIsInBounds()) {
392 return;
393 }
394 ch = m_pBuf[m_Pos++];
395 }
396
397 if (ch != '%')
398 break;
399
400 while (true) {
401 if (!PositionIsInBounds())
402 return;
403 ch = m_pBuf[m_Pos++];
404 if (PDFCharIsLineEnding(ch))
405 break;
406 }
407 }
408
409 if (PDFCharIsDelimiter(ch)) {
410 bIsNumber = false;
411 m_WordBuffer[m_WordSize++] = ch;
412 if (ch == '/') {
413 while (true) {
414 if (!PositionIsInBounds())
415 return;
416 ch = m_pBuf[m_Pos++];
417 if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
418 m_Pos--;
419 return;
420 }
421 if (m_WordSize < kMaxWordLength)
422 m_WordBuffer[m_WordSize++] = ch;
423 }
424 } else if (ch == '<') {
425 if (!PositionIsInBounds())
426 return;
427 ch = m_pBuf[m_Pos++];
428 if (ch == '<')
429 m_WordBuffer[m_WordSize++] = ch;
430 else
431 m_Pos--;
432 } else if (ch == '>') {
433 if (!PositionIsInBounds())
434 return;
435 ch = m_pBuf[m_Pos++];
436 if (ch == '>')
437 m_WordBuffer[m_WordSize++] = ch;
438 else
439 m_Pos--;
440 }
441 return;
442 }
443
444 while (true) {
445 if (m_WordSize < kMaxWordLength)
446 m_WordBuffer[m_WordSize++] = ch;
447 if (!PDFCharIsNumeric(ch))
448 bIsNumber = false;
449 if (!PositionIsInBounds())
450 return;
451
452 ch = m_pBuf[m_Pos++];
453 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
454 m_Pos--;
455 break;
456 }
457 }
458 }
459
ReadString()460 ByteString CPDF_StreamParser::ReadString() {
461 if (!PositionIsInBounds())
462 return ByteString();
463
464 ByteString buf;
465 int parlevel = 0;
466 int status = 0;
467 int iEscCode = 0;
468 uint8_t ch = m_pBuf[m_Pos++];
469 while (true) {
470 switch (status) {
471 case 0:
472 if (ch == ')') {
473 if (parlevel == 0) {
474 return buf.First(std::min(buf.GetLength(), kMaxStringLength));
475 }
476 parlevel--;
477 buf += ')';
478 } else if (ch == '(') {
479 parlevel++;
480 buf += '(';
481 } else if (ch == '\\') {
482 status = 1;
483 } else {
484 buf += static_cast<char>(ch);
485 }
486 break;
487 case 1:
488 if (FXSYS_IsOctalDigit(ch)) {
489 iEscCode = FXSYS_DecimalCharToInt(static_cast<char>(ch));
490 status = 2;
491 break;
492 }
493 if (ch == '\r') {
494 status = 4;
495 break;
496 }
497 if (ch == '\n') {
498 // Do nothing.
499 } else if (ch == 'n') {
500 buf += '\n';
501 } else if (ch == 'r') {
502 buf += '\r';
503 } else if (ch == 't') {
504 buf += '\t';
505 } else if (ch == 'b') {
506 buf += '\b';
507 } else if (ch == 'f') {
508 buf += '\f';
509 } else {
510 buf += static_cast<char>(ch);
511 }
512 status = 0;
513 break;
514 case 2:
515 if (FXSYS_IsOctalDigit(ch)) {
516 iEscCode =
517 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
518 status = 3;
519 } else {
520 buf += static_cast<char>(iEscCode);
521 status = 0;
522 continue;
523 }
524 break;
525 case 3:
526 if (FXSYS_IsOctalDigit(ch)) {
527 iEscCode =
528 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
529 buf += static_cast<char>(iEscCode);
530 status = 0;
531 } else {
532 buf += static_cast<char>(iEscCode);
533 status = 0;
534 continue;
535 }
536 break;
537 case 4:
538 status = 0;
539 if (ch != '\n')
540 continue;
541 break;
542 }
543 if (!PositionIsInBounds())
544 return buf.First(std::min(buf.GetLength(), kMaxStringLength));
545
546 ch = m_pBuf[m_Pos++];
547 }
548 }
549
ReadHexString()550 DataVector<uint8_t> CPDF_StreamParser::ReadHexString() {
551 if (!PositionIsInBounds()) {
552 return DataVector<uint8_t>();
553 }
554
555 // TODO(thestig): Deduplicate CPDF_SyntaxParser::ReadHexString()?
556 DataVector<uint8_t> buf;
557 bool bFirst = true;
558 uint8_t code = 0;
559 while (PositionIsInBounds()) {
560 uint8_t ch = m_pBuf[m_Pos++];
561 if (ch == '>')
562 break;
563
564 if (!isxdigit(ch))
565 continue;
566
567 int val = FXSYS_HexCharToInt(ch);
568 if (bFirst) {
569 code = val * 16;
570 } else {
571 code += val;
572 buf.push_back(code);
573 }
574 bFirst = !bFirst;
575 }
576 if (!bFirst) {
577 buf.push_back(code);
578 }
579
580 if (buf.size() > kMaxStringLength) {
581 buf.resize(kMaxStringLength);
582 }
583 return buf;
584 }
585
PositionIsInBounds() const586 bool CPDF_StreamParser::PositionIsInBounds() const {
587 return m_Pos < m_pBuf.size();
588 }
589