1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/page/cpdf_streamparser.h"
8
9 #include <limits.h>
10
11 #include <algorithm>
12 #include <memory>
13 #include <sstream>
14 #include <utility>
15
16 #include "constants/stream_dict_common.h"
17 #include "core/fpdfapi/page/cpdf_docpagedata.h"
18 #include "core/fpdfapi/parser/cpdf_array.h"
19 #include "core/fpdfapi/parser/cpdf_boolean.h"
20 #include "core/fpdfapi/parser/cpdf_dictionary.h"
21 #include "core/fpdfapi/parser/cpdf_document.h"
22 #include "core/fpdfapi/parser/cpdf_name.h"
23 #include "core/fpdfapi/parser/cpdf_null.h"
24 #include "core/fpdfapi/parser/cpdf_number.h"
25 #include "core/fpdfapi/parser/cpdf_stream.h"
26 #include "core/fpdfapi/parser/cpdf_string.h"
27 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
28 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
29 #include "core/fxcodec/fx_codec.h"
30 #include "core/fxcodec/jpeg/jpegmodule.h"
31 #include "core/fxcodec/scanlinedecoder.h"
32 #include "core/fxcrt/fx_extension.h"
33 #include "core/fxcrt/fx_memory_wrappers.h"
34 #include "core/fxcrt/fx_safe_types.h"
35 #include "third_party/base/ptr_util.h"
36
37 namespace {
38
// Deepest recursion level ReadNextObject() accepts before giving up.
constexpr uint32_t kMaxNestedParsingLevel = 512;
// Upper bound, in bytes, on the strings returned by ReadString() and
// ReadHexString(); longer results are truncated to this length.
constexpr size_t kMaxStringLength = 32767;

// Spellings of the literal words that are turned into boolean/null objects.
constexpr char kTrue[] = "true";
constexpr char kFalse[] = "false";
constexpr char kNull[] = "null";
45
DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder)46 uint32_t DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder) {
47 if (!pDecoder)
48 return FX_INVALID_OFFSET;
49
50 int ncomps = pDecoder->CountComps();
51 int bpc = pDecoder->GetBPC();
52 int width = pDecoder->GetWidth();
53 int height = pDecoder->GetHeight();
54 if (width <= 0 || height <= 0)
55 return FX_INVALID_OFFSET;
56
57 FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, ncomps, width);
58 size *= height;
59 if (size.ValueOrDefault(0) == 0)
60 return FX_INVALID_OFFSET;
61
62 for (int row = 0; row < height; ++row) {
63 if (!pDecoder->GetScanline(row))
64 break;
65 }
66 return pDecoder->GetSrcOffset();
67 }
68
DecodeInlineStream(pdfium::span<const uint8_t> src_span,int width,int height,const ByteString & decoder,const CPDF_Dictionary * pParam,uint32_t orig_size)69 uint32_t DecodeInlineStream(pdfium::span<const uint8_t> src_span,
70 int width,
71 int height,
72 const ByteString& decoder,
73 const CPDF_Dictionary* pParam,
74 uint32_t orig_size) {
75 // |decoder| should not be an abbreviation.
76 ASSERT(decoder != "A85");
77 ASSERT(decoder != "AHx");
78 ASSERT(decoder != "CCF");
79 ASSERT(decoder != "DCT");
80 ASSERT(decoder != "Fl");
81 ASSERT(decoder != "LZW");
82 ASSERT(decoder != "RL");
83
84 std::unique_ptr<uint8_t, FxFreeDeleter> ignored_result;
85 uint32_t ignored_size;
86 if (decoder == "FlateDecode") {
87 return FlateOrLZWDecode(false, src_span, pParam, orig_size, &ignored_result,
88 &ignored_size);
89 }
90 if (decoder == "LZWDecode") {
91 return FlateOrLZWDecode(true, src_span, pParam, 0, &ignored_result,
92 &ignored_size);
93 }
94 if (decoder == "DCTDecode") {
95 std::unique_ptr<ScanlineDecoder> pDecoder =
96 fxcodec::ModuleMgr::GetInstance()->GetJpegModule()->CreateDecoder(
97 src_span, width, height, 0,
98 !pParam || pParam->GetIntegerFor("ColorTransform", 1));
99 return DecodeAllScanlines(std::move(pDecoder));
100 }
101 if (decoder == "CCITTFaxDecode") {
102 std::unique_ptr<ScanlineDecoder> pDecoder =
103 CreateFaxDecoder(src_span, width, height, pParam);
104 return DecodeAllScanlines(std::move(pDecoder));
105 }
106
107 if (decoder == "ASCII85Decode")
108 return A85Decode(src_span, &ignored_result, &ignored_size);
109 if (decoder == "ASCIIHexDecode")
110 return HexDecode(src_span, &ignored_result, &ignored_size);
111 if (decoder == "RunLengthDecode")
112 return RunLengthDecode(src_span, &ignored_result, &ignored_size);
113
114 return FX_INVALID_OFFSET;
115 }
116
117 } // namespace
118
CPDF_StreamParser(pdfium::span<const uint8_t> span)119 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span)
120 : m_pBuf(span) {}
121
CPDF_StreamParser(pdfium::span<const uint8_t> span,const WeakPtr<ByteStringPool> & pPool)122 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span,
123 const WeakPtr<ByteStringPool>& pPool)
124 : m_pPool(pPool), m_pBuf(span) {}
125
~CPDF_StreamParser()126 CPDF_StreamParser::~CPDF_StreamParser() {}
127
ReadInlineStream(CPDF_Document * pDoc,RetainPtr<CPDF_Dictionary> pDict,const CPDF_Object * pCSObj)128 RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
129 CPDF_Document* pDoc,
130 RetainPtr<CPDF_Dictionary> pDict,
131 const CPDF_Object* pCSObj) {
132 if (m_Pos < m_pBuf.size() && PDFCharIsWhitespace(m_pBuf[m_Pos]))
133 m_Pos++;
134
135 if (m_Pos == m_pBuf.size())
136 return nullptr;
137
138 ByteString decoder;
139 const CPDF_Dictionary* pParam = nullptr;
140 CPDF_Object* pFilter = pDict->GetDirectObjectFor("Filter");
141 if (pFilter) {
142 const CPDF_Array* pArray = pFilter->AsArray();
143 if (pArray) {
144 decoder = pArray->GetStringAt(0);
145 const CPDF_Array* pParams =
146 pDict->GetArrayFor(pdfium::stream::kDecodeParms);
147 if (pParams)
148 pParam = pParams->GetDictAt(0);
149 } else {
150 decoder = pFilter->GetString();
151 pParam = pDict->GetDictFor(pdfium::stream::kDecodeParms);
152 }
153 }
154 uint32_t width = pDict->GetIntegerFor("Width");
155 uint32_t height = pDict->GetIntegerFor("Height");
156 uint32_t bpc = 1;
157 uint32_t nComponents = 1;
158 if (pCSObj) {
159 RetainPtr<CPDF_ColorSpace> pCS =
160 CPDF_DocPageData::FromDocument(pDoc)->GetColorSpace(pCSObj, nullptr);
161 nComponents = pCS ? pCS->CountComponents() : 3;
162 bpc = pDict->GetIntegerFor("BitsPerComponent");
163 }
164 FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, nComponents, width);
165 size *= height;
166 if (!size.IsValid())
167 return nullptr;
168
169 uint32_t dwOrigSize = size.ValueOrDie();
170 std::unique_ptr<uint8_t, FxFreeDeleter> pData;
171 uint32_t dwStreamSize;
172 if (decoder.IsEmpty()) {
173 dwOrigSize = std::min<uint32_t>(dwOrigSize, m_pBuf.size() - m_Pos);
174 pData.reset(FX_Alloc(uint8_t, dwOrigSize));
175 auto copy_span = m_pBuf.subspan(m_Pos, dwOrigSize);
176 memcpy(pData.get(), copy_span.data(), copy_span.size());
177 dwStreamSize = dwOrigSize;
178 m_Pos += dwOrigSize;
179 } else {
180 dwStreamSize = DecodeInlineStream(m_pBuf.subspan(m_Pos), width, height,
181 decoder, pParam, dwOrigSize);
182 if (!pdfium::base::IsValueInRangeForNumericType<int>(dwStreamSize))
183 return nullptr;
184
185 uint32_t dwSavePos = m_Pos;
186 m_Pos += dwStreamSize;
187 while (1) {
188 uint32_t dwPrevPos = m_Pos;
189 CPDF_StreamParser::SyntaxType type = ParseNextElement();
190 if (type == CPDF_StreamParser::EndOfData)
191 break;
192
193 if (type != CPDF_StreamParser::Keyword) {
194 dwStreamSize += m_Pos - dwPrevPos;
195 continue;
196 }
197 if (GetWord() == "EI") {
198 m_Pos = dwPrevPos;
199 break;
200 }
201 dwStreamSize += m_Pos - dwPrevPos;
202 }
203 m_Pos = dwSavePos;
204 pData.reset(FX_Alloc(uint8_t, dwStreamSize));
205 auto copy_span = m_pBuf.subspan(m_Pos, dwStreamSize);
206 memcpy(pData.get(), copy_span.data(), copy_span.size());
207 m_Pos += dwStreamSize;
208 }
209 pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(dwStreamSize));
210 return pdfium::MakeRetain<CPDF_Stream>(std::move(pData), dwStreamSize,
211 std::move(pDict));
212 }
213
ParseNextElement()214 CPDF_StreamParser::SyntaxType CPDF_StreamParser::ParseNextElement() {
215 m_pLastObj.Reset();
216 m_WordSize = 0;
217 if (!PositionIsInBounds())
218 return EndOfData;
219
220 uint8_t ch = m_pBuf[m_Pos++];
221 while (1) {
222 while (PDFCharIsWhitespace(ch)) {
223 if (!PositionIsInBounds())
224 return EndOfData;
225
226 ch = m_pBuf[m_Pos++];
227 }
228
229 if (ch != '%')
230 break;
231
232 while (1) {
233 if (!PositionIsInBounds())
234 return EndOfData;
235
236 ch = m_pBuf[m_Pos++];
237 if (PDFCharIsLineEnding(ch))
238 break;
239 }
240 }
241
242 if (PDFCharIsDelimiter(ch) && ch != '/') {
243 m_Pos--;
244 m_pLastObj = ReadNextObject(false, false, 0);
245 return Others;
246 }
247
248 bool bIsNumber = true;
249 while (1) {
250 if (m_WordSize < kMaxWordLength)
251 m_WordBuffer[m_WordSize++] = ch;
252
253 if (!PDFCharIsNumeric(ch))
254 bIsNumber = false;
255
256 if (!PositionIsInBounds())
257 break;
258
259 ch = m_pBuf[m_Pos++];
260
261 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
262 m_Pos--;
263 break;
264 }
265 }
266
267 m_WordBuffer[m_WordSize] = 0;
268 if (bIsNumber)
269 return Number;
270
271 if (m_WordBuffer[0] == '/')
272 return Name;
273
274 if (m_WordSize == 4) {
275 if (WordBufferMatches(kTrue)) {
276 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(true);
277 return Others;
278 }
279 if (WordBufferMatches(kNull)) {
280 m_pLastObj = pdfium::MakeRetain<CPDF_Null>();
281 return Others;
282 }
283 } else if (m_WordSize == 5) {
284 if (WordBufferMatches(kFalse)) {
285 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(false);
286 return Others;
287 }
288 }
289 return Keyword;
290 }
291
ReadNextObject(bool bAllowNestedArray,bool bInArray,uint32_t dwRecursionLevel)292 RetainPtr<CPDF_Object> CPDF_StreamParser::ReadNextObject(
293 bool bAllowNestedArray,
294 bool bInArray,
295 uint32_t dwRecursionLevel) {
296 bool bIsNumber;
297 // Must get the next word before returning to avoid infinite loops.
298 GetNextWord(bIsNumber);
299 if (!m_WordSize || dwRecursionLevel > kMaxNestedParsingLevel)
300 return nullptr;
301
302 if (bIsNumber) {
303 m_WordBuffer[m_WordSize] = 0;
304 return pdfium::MakeRetain<CPDF_Number>(
305 ByteStringView(m_WordBuffer, m_WordSize));
306 }
307
308 int first_char = m_WordBuffer[0];
309 if (first_char == '/') {
310 ByteString name =
311 PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
312 return pdfium::MakeRetain<CPDF_Name>(m_pPool, name);
313 }
314
315 if (first_char == '(') {
316 ByteString str = ReadString();
317 return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
318 }
319
320 if (first_char == '<') {
321 if (m_WordSize == 1)
322 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(), true);
323
324 auto pDict = pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
325 while (1) {
326 GetNextWord(bIsNumber);
327 if (m_WordSize == 2 && m_WordBuffer[0] == '>')
328 break;
329
330 if (!m_WordSize || m_WordBuffer[0] != '/')
331 return nullptr;
332
333 ByteString key =
334 PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
335 RetainPtr<CPDF_Object> pObj =
336 ReadNextObject(true, bInArray, dwRecursionLevel + 1);
337 if (!pObj)
338 return nullptr;
339
340 if (!key.IsEmpty())
341 pDict->SetFor(key, std::move(pObj));
342 }
343 return pDict;
344 }
345
346 if (first_char == '[') {
347 if ((!bAllowNestedArray && bInArray))
348 return nullptr;
349
350 auto pArray = pdfium::MakeRetain<CPDF_Array>();
351 while (1) {
352 RetainPtr<CPDF_Object> pObj =
353 ReadNextObject(bAllowNestedArray, true, dwRecursionLevel + 1);
354 if (pObj) {
355 pArray->Add(std::move(pObj));
356 continue;
357 }
358 if (!m_WordSize || m_WordBuffer[0] == ']')
359 break;
360 }
361 return pArray;
362 }
363
364 if (WordBufferMatches(kFalse))
365 return pdfium::MakeRetain<CPDF_Boolean>(false);
366 if (WordBufferMatches(kTrue))
367 return pdfium::MakeRetain<CPDF_Boolean>(true);
368 if (WordBufferMatches(kNull))
369 return pdfium::MakeRetain<CPDF_Null>();
370 return nullptr;
371 }
372
373 // TODO(npm): the following methods are almost identical in cpdf_syntaxparser
GetNextWord(bool & bIsNumber)374 void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
375 m_WordSize = 0;
376 bIsNumber = true;
377 if (!PositionIsInBounds())
378 return;
379
380 uint8_t ch = m_pBuf[m_Pos++];
381 while (1) {
382 while (PDFCharIsWhitespace(ch)) {
383 if (!PositionIsInBounds()) {
384 return;
385 }
386 ch = m_pBuf[m_Pos++];
387 }
388
389 if (ch != '%')
390 break;
391
392 while (1) {
393 if (!PositionIsInBounds())
394 return;
395 ch = m_pBuf[m_Pos++];
396 if (PDFCharIsLineEnding(ch))
397 break;
398 }
399 }
400
401 if (PDFCharIsDelimiter(ch)) {
402 bIsNumber = false;
403 m_WordBuffer[m_WordSize++] = ch;
404 if (ch == '/') {
405 while (1) {
406 if (!PositionIsInBounds())
407 return;
408 ch = m_pBuf[m_Pos++];
409 if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
410 m_Pos--;
411 return;
412 }
413 if (m_WordSize < kMaxWordLength)
414 m_WordBuffer[m_WordSize++] = ch;
415 }
416 } else if (ch == '<') {
417 if (!PositionIsInBounds())
418 return;
419 ch = m_pBuf[m_Pos++];
420 if (ch == '<')
421 m_WordBuffer[m_WordSize++] = ch;
422 else
423 m_Pos--;
424 } else if (ch == '>') {
425 if (!PositionIsInBounds())
426 return;
427 ch = m_pBuf[m_Pos++];
428 if (ch == '>')
429 m_WordBuffer[m_WordSize++] = ch;
430 else
431 m_Pos--;
432 }
433 return;
434 }
435
436 while (1) {
437 if (m_WordSize < kMaxWordLength)
438 m_WordBuffer[m_WordSize++] = ch;
439 if (!PDFCharIsNumeric(ch))
440 bIsNumber = false;
441 if (!PositionIsInBounds())
442 return;
443
444 ch = m_pBuf[m_Pos++];
445 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
446 m_Pos--;
447 break;
448 }
449 }
450 }
451
ReadString()452 ByteString CPDF_StreamParser::ReadString() {
453 if (!PositionIsInBounds())
454 return ByteString();
455
456 uint8_t ch = m_pBuf[m_Pos++];
457 std::ostringstream buf;
458 int parlevel = 0;
459 int status = 0;
460 int iEscCode = 0;
461 while (1) {
462 switch (status) {
463 case 0:
464 if (ch == ')') {
465 if (parlevel == 0) {
466 if (buf.tellp() <= 0)
467 return ByteString();
468
469 return ByteString(
470 buf.str().c_str(),
471 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
472 }
473 parlevel--;
474 buf << ')';
475 } else if (ch == '(') {
476 parlevel++;
477 buf << '(';
478 } else if (ch == '\\') {
479 status = 1;
480 } else {
481 buf << static_cast<char>(ch);
482 }
483 break;
484 case 1:
485 if (FXSYS_IsOctalDigit(ch)) {
486 iEscCode = FXSYS_DecimalCharToInt(static_cast<char>(ch));
487 status = 2;
488 break;
489 }
490 if (ch == '\r') {
491 status = 4;
492 break;
493 }
494 if (ch == '\n') {
495 // Do nothing.
496 } else if (ch == 'n') {
497 buf << '\n';
498 } else if (ch == 'r') {
499 buf << '\r';
500 } else if (ch == 't') {
501 buf << '\t';
502 } else if (ch == 'b') {
503 buf << '\b';
504 } else if (ch == 'f') {
505 buf << '\f';
506 } else {
507 buf << static_cast<char>(ch);
508 }
509 status = 0;
510 break;
511 case 2:
512 if (FXSYS_IsOctalDigit(ch)) {
513 iEscCode =
514 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
515 status = 3;
516 } else {
517 buf << static_cast<char>(iEscCode);
518 status = 0;
519 continue;
520 }
521 break;
522 case 3:
523 if (FXSYS_IsOctalDigit(ch)) {
524 iEscCode =
525 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
526 buf << static_cast<char>(iEscCode);
527 status = 0;
528 } else {
529 buf << static_cast<char>(iEscCode);
530 status = 0;
531 continue;
532 }
533 break;
534 case 4:
535 status = 0;
536 if (ch != '\n')
537 continue;
538 break;
539 }
540 if (!PositionIsInBounds())
541 break;
542
543 ch = m_pBuf[m_Pos++];
544 }
545 if (PositionIsInBounds())
546 ++m_Pos;
547
548 if (buf.tellp() <= 0)
549 return ByteString();
550
551 return ByteString(
552 buf.str().c_str(),
553 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
554 }
555
ReadHexString()556 ByteString CPDF_StreamParser::ReadHexString() {
557 if (!PositionIsInBounds())
558 return ByteString();
559
560 std::ostringstream buf;
561 bool bFirst = true;
562 int code = 0;
563 while (PositionIsInBounds()) {
564 uint8_t ch = m_pBuf[m_Pos++];
565 if (ch == '>')
566 break;
567
568 if (!std::isxdigit(ch))
569 continue;
570
571 int val = FXSYS_HexCharToInt(ch);
572 if (bFirst) {
573 code = val * 16;
574 } else {
575 code += val;
576 buf << static_cast<uint8_t>(code);
577 }
578 bFirst = !bFirst;
579 }
580 if (!bFirst)
581 buf << static_cast<char>(code);
582
583 if (buf.tellp() <= 0)
584 return ByteString();
585
586 return ByteString(
587 buf.str().c_str(),
588 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
589 }
590
PositionIsInBounds() const591 bool CPDF_StreamParser::PositionIsInBounds() const {
592 return m_Pos < m_pBuf.size();
593 }
594
WordBufferMatches(const char * pWord) const595 bool CPDF_StreamParser::WordBufferMatches(const char* pWord) const {
596 const size_t iLength = strlen(pWord);
597 return m_WordSize == iLength && memcmp(m_WordBuffer, pWord, iLength) == 0;
598 }
599