1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/page/cpdf_streamparser.h"
8
9 #include <ctype.h>
10
11 #include <algorithm>
12 #include <memory>
13 #include <utility>
14
15 #include "constants/stream_dict_common.h"
16 #include "core/fpdfapi/page/cpdf_docpagedata.h"
17 #include "core/fpdfapi/parser/cpdf_array.h"
18 #include "core/fpdfapi/parser/cpdf_boolean.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_name.h"
21 #include "core/fpdfapi/parser/cpdf_null.h"
22 #include "core/fpdfapi/parser/cpdf_number.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
26 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
27 #include "core/fxcodec/jpeg/jpegmodule.h"
28 #include "core/fxcodec/scanlinedecoder.h"
29 #include "core/fxcrt/data_vector.h"
30 #include "core/fxcrt/fx_extension.h"
31 #include "core/fxcrt/fx_memory_wrappers.h"
32 #include "core/fxcrt/fx_safe_types.h"
33 #include "core/fxcrt/span_util.h"
34 #include "core/fxge/calculate_pitch.h"
35 #include "third_party/base/check.h"
36
37 namespace {
38
// Deepest recursion level ReadNextObject() accepts before giving up.
constexpr uint32_t kMaxNestedParsingLevel = 512;

// Literal and hex strings are truncated to this many bytes when returned.
constexpr size_t kMaxStringLength = 32767;

// Spellings of the boolean and null keywords recognised by the parser.
constexpr char kTrue[] = "true";
constexpr char kFalse[] = "false";
constexpr char kNull[] = "null";
45
DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder)46 uint32_t DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder) {
47 if (!pDecoder)
48 return FX_INVALID_OFFSET;
49
50 int ncomps = pDecoder->CountComps();
51 int bpc = pDecoder->GetBPC();
52 int width = pDecoder->GetWidth();
53 int height = pDecoder->GetHeight();
54 if (width <= 0 || height <= 0)
55 return FX_INVALID_OFFSET;
56
57 absl::optional<uint32_t> maybe_size =
58 fxge::CalculatePitch8(bpc, ncomps, width);
59 if (!maybe_size.has_value())
60 return FX_INVALID_OFFSET;
61
62 FX_SAFE_UINT32 size = maybe_size.value();
63 size *= height;
64 if (size.ValueOrDefault(0) == 0)
65 return FX_INVALID_OFFSET;
66
67 for (int row = 0; row < height; ++row) {
68 if (pDecoder->GetScanline(row).empty())
69 break;
70 }
71 return pDecoder->GetSrcOffset();
72 }
73
DecodeInlineStream(pdfium::span<const uint8_t> src_span,int width,int height,const ByteString & decoder,RetainPtr<const CPDF_Dictionary> pParam,uint32_t orig_size)74 uint32_t DecodeInlineStream(pdfium::span<const uint8_t> src_span,
75 int width,
76 int height,
77 const ByteString& decoder,
78 RetainPtr<const CPDF_Dictionary> pParam,
79 uint32_t orig_size) {
80 // |decoder| should not be an abbreviation.
81 DCHECK(decoder != "A85");
82 DCHECK(decoder != "AHx");
83 DCHECK(decoder != "CCF");
84 DCHECK(decoder != "DCT");
85 DCHECK(decoder != "Fl");
86 DCHECK(decoder != "LZW");
87 DCHECK(decoder != "RL");
88
89 std::unique_ptr<uint8_t, FxFreeDeleter> ignored_result;
90 uint32_t ignored_size;
91 if (decoder == "FlateDecode") {
92 return FlateOrLZWDecode(false, src_span, pParam.Get(), orig_size,
93 &ignored_result, &ignored_size);
94 }
95 if (decoder == "LZWDecode") {
96 return FlateOrLZWDecode(true, src_span, pParam.Get(), 0, &ignored_result,
97 &ignored_size);
98 }
99 if (decoder == "DCTDecode") {
100 std::unique_ptr<ScanlineDecoder> pDecoder = JpegModule::CreateDecoder(
101 src_span, width, height, 0,
102 !pParam || pParam->GetIntegerFor("ColorTransform", 1));
103 return DecodeAllScanlines(std::move(pDecoder));
104 }
105 if (decoder == "CCITTFaxDecode") {
106 std::unique_ptr<ScanlineDecoder> pDecoder =
107 CreateFaxDecoder(src_span, width, height, pParam.Get());
108 return DecodeAllScanlines(std::move(pDecoder));
109 }
110
111 if (decoder == "ASCII85Decode")
112 return A85Decode(src_span, &ignored_result, &ignored_size);
113 if (decoder == "ASCIIHexDecode")
114 return HexDecode(src_span, &ignored_result, &ignored_size);
115 if (decoder == "RunLengthDecode")
116 return RunLengthDecode(src_span, &ignored_result, &ignored_size);
117
118 return FX_INVALID_OFFSET;
119 }
120
121 } // namespace
122
CPDF_StreamParser(pdfium::span<const uint8_t> span)123 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span)
124 : m_pBuf(span) {}
125
CPDF_StreamParser(pdfium::span<const uint8_t> span,const WeakPtr<ByteStringPool> & pPool)126 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span,
127 const WeakPtr<ByteStringPool>& pPool)
128 : m_pPool(pPool), m_pBuf(span) {}
129
130 CPDF_StreamParser::~CPDF_StreamParser() = default;
131
ReadInlineStream(CPDF_Document * pDoc,RetainPtr<CPDF_Dictionary> pDict,const CPDF_Object * pCSObj)132 RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
133 CPDF_Document* pDoc,
134 RetainPtr<CPDF_Dictionary> pDict,
135 const CPDF_Object* pCSObj) {
136 if (m_Pos < m_pBuf.size() && PDFCharIsWhitespace(m_pBuf[m_Pos]))
137 m_Pos++;
138
139 if (m_Pos == m_pBuf.size())
140 return nullptr;
141
142 ByteString decoder;
143 RetainPtr<const CPDF_Dictionary> pParam;
144 RetainPtr<const CPDF_Object> pFilter = pDict->GetDirectObjectFor("Filter");
145 if (pFilter) {
146 const CPDF_Array* pArray = pFilter->AsArray();
147 if (pArray) {
148 decoder = pArray->GetByteStringAt(0);
149 RetainPtr<const CPDF_Array> pParams =
150 pDict->GetArrayFor(pdfium::stream::kDecodeParms);
151 if (pParams)
152 pParam = pParams->GetDictAt(0);
153 } else {
154 decoder = pFilter->GetString();
155 pParam = pDict->GetDictFor(pdfium::stream::kDecodeParms);
156 }
157 }
158 uint32_t width = pDict->GetIntegerFor("Width");
159 uint32_t height = pDict->GetIntegerFor("Height");
160 uint32_t bpc = 1;
161 uint32_t nComponents = 1;
162 if (pCSObj) {
163 RetainPtr<CPDF_ColorSpace> pCS =
164 CPDF_DocPageData::FromDocument(pDoc)->GetColorSpace(pCSObj, nullptr);
165 nComponents = pCS ? pCS->CountComponents() : 3;
166 bpc = pDict->GetIntegerFor("BitsPerComponent");
167 }
168 absl::optional<uint32_t> maybe_size =
169 fxge::CalculatePitch8(bpc, nComponents, width);
170 if (!maybe_size.has_value())
171 return nullptr;
172
173 FX_SAFE_UINT32 size = maybe_size.value();
174 size *= height;
175 if (!size.IsValid())
176 return nullptr;
177
178 uint32_t dwOrigSize = size.ValueOrDie();
179 DataVector<uint8_t> data;
180 uint32_t dwStreamSize;
181 if (decoder.IsEmpty()) {
182 dwOrigSize = std::min<uint32_t>(dwOrigSize, m_pBuf.size() - m_Pos);
183 auto src_span = m_pBuf.subspan(m_Pos, dwOrigSize);
184 data = DataVector<uint8_t>(src_span.begin(), src_span.end());
185 dwStreamSize = dwOrigSize;
186 m_Pos += dwOrigSize;
187 } else {
188 dwStreamSize = DecodeInlineStream(m_pBuf.subspan(m_Pos), width, height,
189 decoder, std::move(pParam), dwOrigSize);
190 if (!pdfium::base::IsValueInRangeForNumericType<int>(dwStreamSize))
191 return nullptr;
192
193 uint32_t dwSavePos = m_Pos;
194 m_Pos += dwStreamSize;
195 while (true) {
196 uint32_t dwPrevPos = m_Pos;
197 ElementType type = ParseNextElement();
198 if (type == ElementType::kEndOfData)
199 break;
200
201 if (type != ElementType::kKeyword) {
202 dwStreamSize += m_Pos - dwPrevPos;
203 continue;
204 }
205 if (GetWord() == "EI") {
206 m_Pos = dwPrevPos;
207 break;
208 }
209 dwStreamSize += m_Pos - dwPrevPos;
210 }
211 m_Pos = dwSavePos;
212 auto src_span = m_pBuf.subspan(m_Pos, dwStreamSize);
213 data = DataVector<uint8_t>(src_span.begin(), src_span.end());
214 m_Pos += dwStreamSize;
215 }
216 pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(dwStreamSize));
217 return pdfium::MakeRetain<CPDF_Stream>(std::move(data), std::move(pDict));
218 }
219
ParseNextElement()220 CPDF_StreamParser::ElementType CPDF_StreamParser::ParseNextElement() {
221 m_pLastObj.Reset();
222 m_WordSize = 0;
223 if (!PositionIsInBounds())
224 return ElementType::kEndOfData;
225
226 uint8_t ch = m_pBuf[m_Pos++];
227 while (true) {
228 while (PDFCharIsWhitespace(ch)) {
229 if (!PositionIsInBounds())
230 return ElementType::kEndOfData;
231
232 ch = m_pBuf[m_Pos++];
233 }
234
235 if (ch != '%')
236 break;
237
238 while (true) {
239 if (!PositionIsInBounds())
240 return ElementType::kEndOfData;
241
242 ch = m_pBuf[m_Pos++];
243 if (PDFCharIsLineEnding(ch))
244 break;
245 }
246 }
247
248 if (PDFCharIsDelimiter(ch) && ch != '/') {
249 m_Pos--;
250 m_pLastObj = ReadNextObject(false, false, 0);
251 return ElementType::kOther;
252 }
253
254 bool bIsNumber = true;
255 while (true) {
256 if (m_WordSize < kMaxWordLength)
257 m_WordBuffer[m_WordSize++] = ch;
258
259 if (!PDFCharIsNumeric(ch))
260 bIsNumber = false;
261
262 if (!PositionIsInBounds())
263 break;
264
265 ch = m_pBuf[m_Pos++];
266
267 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
268 m_Pos--;
269 break;
270 }
271 }
272
273 m_WordBuffer[m_WordSize] = 0;
274 if (bIsNumber)
275 return ElementType::kNumber;
276
277 if (m_WordBuffer[0] == '/')
278 return ElementType::kName;
279
280 if (m_WordSize == 4) {
281 if (GetWord() == kTrue) {
282 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(true);
283 return ElementType::kOther;
284 }
285 if (GetWord() == kNull) {
286 m_pLastObj = pdfium::MakeRetain<CPDF_Null>();
287 return ElementType::kOther;
288 }
289 } else if (m_WordSize == 5) {
290 if (GetWord() == kFalse) {
291 m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(false);
292 return ElementType::kOther;
293 }
294 }
295 return ElementType::kKeyword;
296 }
297
ReadNextObject(bool bAllowNestedArray,bool bInArray,uint32_t dwRecursionLevel)298 RetainPtr<CPDF_Object> CPDF_StreamParser::ReadNextObject(
299 bool bAllowNestedArray,
300 bool bInArray,
301 uint32_t dwRecursionLevel) {
302 bool bIsNumber;
303 // Must get the next word before returning to avoid infinite loops.
304 GetNextWord(bIsNumber);
305 if (!m_WordSize || dwRecursionLevel > kMaxNestedParsingLevel)
306 return nullptr;
307
308 if (bIsNumber) {
309 m_WordBuffer[m_WordSize] = 0;
310 return pdfium::MakeRetain<CPDF_Number>(GetWord());
311 }
312
313 int first_char = m_WordBuffer[0];
314 if (first_char == '/') {
315 ByteString name = PDF_NameDecode(GetWord().Substr(1));
316 return pdfium::MakeRetain<CPDF_Name>(m_pPool, name);
317 }
318
319 if (first_char == '(') {
320 ByteString str = ReadString();
321 return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
322 }
323
324 if (first_char == '<') {
325 if (m_WordSize == 1)
326 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(), true);
327
328 auto pDict = pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
329 while (true) {
330 GetNextWord(bIsNumber);
331 if (m_WordSize == 2 && m_WordBuffer[0] == '>')
332 break;
333
334 if (!m_WordSize || m_WordBuffer[0] != '/')
335 return nullptr;
336
337 ByteString key = PDF_NameDecode(GetWord().Substr(1));
338 RetainPtr<CPDF_Object> pObj =
339 ReadNextObject(true, bInArray, dwRecursionLevel + 1);
340 if (!pObj)
341 return nullptr;
342
343 pDict->SetFor(key, std::move(pObj));
344 }
345 return pDict;
346 }
347
348 if (first_char == '[') {
349 if ((!bAllowNestedArray && bInArray))
350 return nullptr;
351
352 auto pArray = pdfium::MakeRetain<CPDF_Array>();
353 while (true) {
354 RetainPtr<CPDF_Object> pObj =
355 ReadNextObject(bAllowNestedArray, true, dwRecursionLevel + 1);
356 if (pObj) {
357 pArray->Append(std::move(pObj));
358 continue;
359 }
360 if (!m_WordSize || m_WordBuffer[0] == ']')
361 break;
362 }
363 return pArray;
364 }
365
366 if (GetWord() == kFalse)
367 return pdfium::MakeRetain<CPDF_Boolean>(false);
368 if (GetWord() == kTrue)
369 return pdfium::MakeRetain<CPDF_Boolean>(true);
370 if (GetWord() == kNull)
371 return pdfium::MakeRetain<CPDF_Null>();
372 return nullptr;
373 }
374
375 // TODO(npm): the following methods are almost identical in cpdf_syntaxparser
GetNextWord(bool & bIsNumber)376 void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
377 m_WordSize = 0;
378 bIsNumber = true;
379 if (!PositionIsInBounds())
380 return;
381
382 uint8_t ch = m_pBuf[m_Pos++];
383 while (true) {
384 while (PDFCharIsWhitespace(ch)) {
385 if (!PositionIsInBounds()) {
386 return;
387 }
388 ch = m_pBuf[m_Pos++];
389 }
390
391 if (ch != '%')
392 break;
393
394 while (true) {
395 if (!PositionIsInBounds())
396 return;
397 ch = m_pBuf[m_Pos++];
398 if (PDFCharIsLineEnding(ch))
399 break;
400 }
401 }
402
403 if (PDFCharIsDelimiter(ch)) {
404 bIsNumber = false;
405 m_WordBuffer[m_WordSize++] = ch;
406 if (ch == '/') {
407 while (true) {
408 if (!PositionIsInBounds())
409 return;
410 ch = m_pBuf[m_Pos++];
411 if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
412 m_Pos--;
413 return;
414 }
415 if (m_WordSize < kMaxWordLength)
416 m_WordBuffer[m_WordSize++] = ch;
417 }
418 } else if (ch == '<') {
419 if (!PositionIsInBounds())
420 return;
421 ch = m_pBuf[m_Pos++];
422 if (ch == '<')
423 m_WordBuffer[m_WordSize++] = ch;
424 else
425 m_Pos--;
426 } else if (ch == '>') {
427 if (!PositionIsInBounds())
428 return;
429 ch = m_pBuf[m_Pos++];
430 if (ch == '>')
431 m_WordBuffer[m_WordSize++] = ch;
432 else
433 m_Pos--;
434 }
435 return;
436 }
437
438 while (true) {
439 if (m_WordSize < kMaxWordLength)
440 m_WordBuffer[m_WordSize++] = ch;
441 if (!PDFCharIsNumeric(ch))
442 bIsNumber = false;
443 if (!PositionIsInBounds())
444 return;
445
446 ch = m_pBuf[m_Pos++];
447 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
448 m_Pos--;
449 break;
450 }
451 }
452 }
453
ReadString()454 ByteString CPDF_StreamParser::ReadString() {
455 if (!PositionIsInBounds())
456 return ByteString();
457
458 ByteString buf;
459 int parlevel = 0;
460 int status = 0;
461 int iEscCode = 0;
462 uint8_t ch = m_pBuf[m_Pos++];
463 while (true) {
464 switch (status) {
465 case 0:
466 if (ch == ')') {
467 if (parlevel == 0) {
468 return buf.First(std::min(buf.GetLength(), kMaxStringLength));
469 }
470 parlevel--;
471 buf += ')';
472 } else if (ch == '(') {
473 parlevel++;
474 buf += '(';
475 } else if (ch == '\\') {
476 status = 1;
477 } else {
478 buf += static_cast<char>(ch);
479 }
480 break;
481 case 1:
482 if (FXSYS_IsOctalDigit(ch)) {
483 iEscCode = FXSYS_DecimalCharToInt(static_cast<char>(ch));
484 status = 2;
485 break;
486 }
487 if (ch == '\r') {
488 status = 4;
489 break;
490 }
491 if (ch == '\n') {
492 // Do nothing.
493 } else if (ch == 'n') {
494 buf += '\n';
495 } else if (ch == 'r') {
496 buf += '\r';
497 } else if (ch == 't') {
498 buf += '\t';
499 } else if (ch == 'b') {
500 buf += '\b';
501 } else if (ch == 'f') {
502 buf += '\f';
503 } else {
504 buf += static_cast<char>(ch);
505 }
506 status = 0;
507 break;
508 case 2:
509 if (FXSYS_IsOctalDigit(ch)) {
510 iEscCode =
511 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
512 status = 3;
513 } else {
514 buf += static_cast<char>(iEscCode);
515 status = 0;
516 continue;
517 }
518 break;
519 case 3:
520 if (FXSYS_IsOctalDigit(ch)) {
521 iEscCode =
522 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
523 buf += static_cast<char>(iEscCode);
524 status = 0;
525 } else {
526 buf += static_cast<char>(iEscCode);
527 status = 0;
528 continue;
529 }
530 break;
531 case 4:
532 status = 0;
533 if (ch != '\n')
534 continue;
535 break;
536 }
537 if (!PositionIsInBounds())
538 return buf.First(std::min(buf.GetLength(), kMaxStringLength));
539
540 ch = m_pBuf[m_Pos++];
541 }
542 }
543
ReadHexString()544 ByteString CPDF_StreamParser::ReadHexString() {
545 if (!PositionIsInBounds())
546 return ByteString();
547
548 ByteString buf;
549 bool bFirst = true;
550 int code = 0;
551 while (PositionIsInBounds()) {
552 uint8_t ch = m_pBuf[m_Pos++];
553 if (ch == '>')
554 break;
555
556 if (!isxdigit(ch))
557 continue;
558
559 int val = FXSYS_HexCharToInt(ch);
560 if (bFirst) {
561 code = val * 16;
562 } else {
563 code += val;
564 buf += static_cast<uint8_t>(code);
565 }
566 bFirst = !bFirst;
567 }
568 if (!bFirst)
569 buf += static_cast<char>(code);
570
571 return buf.First(std::min<size_t>(buf.GetLength(), kMaxStringLength));
572 }
573
PositionIsInBounds() const574 bool CPDF_StreamParser::PositionIsInBounds() const {
575 return m_Pos < m_pBuf.size();
576 }
577