• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/cfx_seekablestreamproxy.h"
8 
9 #include <stdint.h>
10 
11 #include <algorithm>
12 #include <limits>
13 #include <utility>
14 
15 #include "build/build_config.h"
16 #include "core/fxcrt/check.h"
17 #include "core/fxcrt/check_op.h"
18 #include "core/fxcrt/data_vector.h"
19 #include "core/fxcrt/fx_extension.h"
20 #include "core/fxcrt/fx_safe_types.h"
21 #include "core/fxcrt/span.h"
22 #include "core/fxcrt/span_util.h"
23 
24 namespace {
25 
26 // Returns {src bytes consumed, dst chars produced}.
27 // Invalid sequences are silently not output.
UTF8Decode(pdfium::span<const uint8_t> pSrc,pdfium::span<wchar_t> pDst)28 std::pair<size_t, size_t> UTF8Decode(pdfium::span<const uint8_t> pSrc,
29                                      pdfium::span<wchar_t> pDst) {
30   DCHECK(!pDst.empty());
31 
32   uint32_t dwCode = 0;
33   int32_t iPending = 0;
34   size_t iSrcNum = 0;
35   size_t iDstNum = 0;
36   for (size_t iIndex = 0; iIndex < pSrc.size() && iDstNum < pDst.size();
37        ++iIndex) {
38     ++iSrcNum;
39     uint8_t byte = pSrc[iIndex];
40     if (byte < 0x80) {
41       iPending = 0;
42       pDst[iDstNum++] = byte;
43     } else if (byte < 0xc0) {
44       if (iPending < 1)
45         continue;
46 
47       dwCode = dwCode << 6;
48       dwCode |= (byte & 0x3f);
49       --iPending;
50       if (iPending == 0)
51         pDst[iDstNum++] = dwCode;
52     } else if (byte < 0xe0) {
53       iPending = 1;
54       dwCode = (byte & 0x1f);
55     } else if (byte < 0xf0) {
56       iPending = 2;
57       dwCode = (byte & 0x0f);
58     } else if (byte < 0xf8) {
59       iPending = 3;
60       dwCode = (byte & 0x07);
61     } else if (byte < 0xfc) {
62       iPending = 4;
63       dwCode = (byte & 0x03);
64     } else if (byte < 0xfe) {
65       iPending = 5;
66       dwCode = (byte & 0x01);
67     }
68   }
69   return {iSrcNum, iDstNum};
70 }
71 
UTF16ToWChar(pdfium::span<wchar_t> buffer)72 void UTF16ToWChar(pdfium::span<wchar_t> buffer) {
73 #if defined(WCHAR_T_IS_32_BIT)
74   auto src = fxcrt::reinterpret_span<uint16_t>(buffer);
75   // Perform self-intersecting copy in reverse order.
76   for (size_t i = buffer.size(); i > 0; --i) {
77     buffer[i - 1] = static_cast<wchar_t>(src[i - 1]);
78   }
79 #endif  // defined(WCHAR_T_IS_32_BIT)
80 }
81 
SwapByteOrder(pdfium::span<uint16_t> str)82 void SwapByteOrder(pdfium::span<uint16_t> str) {
83   for (auto& wch : str) {
84     wch = (wch >> 8) | (wch << 8);
85   }
86 }
87 
88 }  // namespace
89 
// Byte-order-mark signatures, expressed as 32-bit values whose low-order byte
// is the first byte of the stream. Typed constants instead of macros so they
// obey scoping and type checking.
constexpr uint32_t BOM_UTF8_MASK = 0x00FFFFFF;
constexpr uint32_t BOM_UTF8 = 0x00BFBBEF;  // Bytes EF BB BF.
constexpr uint32_t BOM_UTF16_MASK = 0x0000FFFF;
constexpr uint32_t BOM_UTF16_BE = 0x0000FFFE;  // Bytes FE FF.
constexpr uint32_t BOM_UTF16_LE = 0x0000FEFF;  // Bytes FF FE.
95 
CFX_SeekableStreamProxy(const RetainPtr<IFX_SeekableReadStream> & stream)96 CFX_SeekableStreamProxy::CFX_SeekableStreamProxy(
97     const RetainPtr<IFX_SeekableReadStream>& stream)
98     : m_pStream(stream) {
99   DCHECK(m_pStream);
100 
101   Seek(From::Begin, 0);
102 
103   uint32_t bom = 0;
104   ReadData(pdfium::byte_span_from_ref(bom).first<3>());
105 
106   bom &= BOM_UTF8_MASK;
107   if (bom == BOM_UTF8) {
108     m_wBOMLength = 3;
109     m_wCodePage = FX_CodePage::kUTF8;
110   } else {
111     bom &= BOM_UTF16_MASK;
112     if (bom == BOM_UTF16_BE) {
113       m_wBOMLength = 2;
114       m_wCodePage = FX_CodePage::kUTF16BE;
115     } else if (bom == BOM_UTF16_LE) {
116       m_wBOMLength = 2;
117       m_wCodePage = FX_CodePage::kUTF16LE;
118     } else {
119       m_wBOMLength = 0;
120       m_wCodePage = FX_GetACP();
121     }
122   }
123 
124   Seek(From::Begin, static_cast<FX_FILESIZE>(m_wBOMLength));
125 }
126 
127 CFX_SeekableStreamProxy::~CFX_SeekableStreamProxy() = default;
128 
GetSize() const129 FX_FILESIZE CFX_SeekableStreamProxy::GetSize() const {
130   return m_pStream->GetSize();
131 }
132 
GetPosition() const133 FX_FILESIZE CFX_SeekableStreamProxy::GetPosition() const {
134   return m_iPosition;
135 }
136 
IsEOF() const137 bool CFX_SeekableStreamProxy::IsEOF() const {
138   return m_iPosition >= GetSize();
139 }
140 
Seek(From eSeek,FX_FILESIZE iOffset)141 void CFX_SeekableStreamProxy::Seek(From eSeek, FX_FILESIZE iOffset) {
142   switch (eSeek) {
143     case From::Begin:
144       m_iPosition = iOffset;
145       break;
146     case From::Current: {
147       FX_SAFE_FILESIZE new_pos = m_iPosition;
148       new_pos += iOffset;
149       m_iPosition =
150           new_pos.ValueOrDefault(std::numeric_limits<FX_FILESIZE>::max());
151     } break;
152   }
153   m_iPosition = std::clamp(m_iPosition, static_cast<FX_FILESIZE>(0), GetSize());
154 }
155 
SetCodePage(FX_CodePage wCodePage)156 void CFX_SeekableStreamProxy::SetCodePage(FX_CodePage wCodePage) {
157   if (m_wBOMLength > 0)
158     return;
159   m_wCodePage = wCodePage;
160 }
161 
ReadData(pdfium::span<uint8_t> buffer)162 size_t CFX_SeekableStreamProxy::ReadData(pdfium::span<uint8_t> buffer) {
163   DCHECK(!buffer.empty());
164   const size_t remaining = static_cast<size_t>(GetSize() - m_iPosition);
165   size_t read_size = std::min(buffer.size(), remaining);
166   if (read_size == 0) {
167     return 0;
168   }
169   if (!m_pStream->ReadBlockAtOffset(buffer.first(read_size), m_iPosition)) {
170     return 0;
171   }
172   FX_SAFE_FILESIZE new_pos = m_iPosition;
173   new_pos += read_size;
174   m_iPosition = new_pos.ValueOrDefault(m_iPosition);
175   return new_pos.IsValid() ? read_size : 0;
176 }
177 
ReadBlock(pdfium::span<wchar_t> buffer)178 size_t CFX_SeekableStreamProxy::ReadBlock(pdfium::span<wchar_t> buffer) {
179   if (buffer.empty()) {
180     return 0;
181   }
182   if (m_wCodePage == FX_CodePage::kUTF16LE ||
183       m_wCodePage == FX_CodePage::kUTF16BE) {
184     size_t bytes_to_read = buffer.size() * sizeof(uint16_t);
185     size_t bytes_read =
186         ReadData(pdfium::as_writable_bytes(buffer).first(bytes_to_read));
187     size_t elements = bytes_read / sizeof(uint16_t);
188     if (m_wCodePage == FX_CodePage::kUTF16BE) {
189       SwapByteOrder(fxcrt::reinterpret_span<uint16_t>(buffer).first(elements));
190     }
191     UTF16ToWChar(buffer.first(elements));
192     return elements;
193   }
194   FX_FILESIZE pos = GetPosition();
195   size_t bytes_to_read =
196       std::min(buffer.size(), static_cast<size_t>(GetSize() - pos));
197   if (bytes_to_read == 0) {
198     return 0;
199   }
200   DataVector<uint8_t> byte_buf(bytes_to_read);
201   size_t bytes_read = ReadData(byte_buf);
202   if (m_wCodePage != FX_CodePage::kUTF8) {
203     return 0;
204   }
205   auto [src_bytes_consumed, dest_wchars_produced] =
206       UTF8Decode(pdfium::make_span(byte_buf).first(bytes_read), buffer);
207   Seek(From::Current, src_bytes_consumed - bytes_read);
208   return dest_wchars_produced;
209 }
210