1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/cfx_seekablestreamproxy.h"
8 
9 #include <stdint.h>
10 
11 #include <algorithm>
12 #include <limits>
13 #include <utility>
14 
15 #include "build/build_config.h"
16 #include "core/fxcrt/data_vector.h"
17 #include "core/fxcrt/fx_extension.h"
18 #include "core/fxcrt/fx_safe_types.h"
19 #include "third_party/base/check.h"
20 #include "third_party/base/check_op.h"
21 #include "third_party/base/cxx17_backports.h"
22 
23 namespace {
24 
25 // Returns {src bytes consumed, dst chars produced}.
26 // Invalid sequences are silently not output.
UTF8Decode(pdfium::span<const uint8_t> pSrc,pdfium::span<wchar_t> pDst)27 std::pair<size_t, size_t> UTF8Decode(pdfium::span<const uint8_t> pSrc,
28                                      pdfium::span<wchar_t> pDst) {
29   DCHECK(!pDst.empty());
30 
31   uint32_t dwCode = 0;
32   int32_t iPending = 0;
33   size_t iSrcNum = 0;
34   size_t iDstNum = 0;
35   for (size_t iIndex = 0; iIndex < pSrc.size() && iDstNum < pDst.size();
36        ++iIndex) {
37     ++iSrcNum;
38     uint8_t byte = pSrc[iIndex];
39     if (byte < 0x80) {
40       iPending = 0;
41       pDst[iDstNum++] = byte;
42     } else if (byte < 0xc0) {
43       if (iPending < 1)
44         continue;
45 
46       dwCode = dwCode << 6;
47       dwCode |= (byte & 0x3f);
48       --iPending;
49       if (iPending == 0)
50         pDst[iDstNum++] = dwCode;
51     } else if (byte < 0xe0) {
52       iPending = 1;
53       dwCode = (byte & 0x1f);
54     } else if (byte < 0xf0) {
55       iPending = 2;
56       dwCode = (byte & 0x0f);
57     } else if (byte < 0xf8) {
58       iPending = 3;
59       dwCode = (byte & 0x07);
60     } else if (byte < 0xfc) {
61       iPending = 4;
62       dwCode = (byte & 0x03);
63     } else if (byte < 0xfe) {
64       iPending = 5;
65       dwCode = (byte & 0x01);
66     }
67   }
68   return {iSrcNum, iDstNum};
69 }
70 
#if defined(WCHAR_T_IS_UTF32)
static_assert(sizeof(wchar_t) > 2, "wchar_t is too small");

// Widens, in place, the `iLength` 16-bit units stored at the start of
// `pBuffer` into `iLength` wchar_t values occupying the same buffer. Each unit
// is converted on its own with a plain cast. The buffer must therefore be
// large enough for `iLength` wchar_t elements.
void UTF16ToWChar(void* pBuffer, size_t iLength) {
  DCHECK(pBuffer);
  DCHECK_GT(iLength, 0);

  const uint16_t* narrow_units = static_cast<const uint16_t*>(pBuffer);
  wchar_t* wide_units = static_cast<wchar_t*>(pBuffer);

  // Source and destination overlap and the destination elements are wider, so
  // walk from the last element to the first: that way no 16-bit unit is
  // overwritten before it has been read.
  size_t remaining = iLength;
  while (remaining > 0) {
    --remaining;
    wide_units[remaining] = static_cast<wchar_t>(narrow_units[remaining]);
  }
}
#endif  // defined(WCHAR_T_IS_UTF32)
86 
// Exchanges the high and low byte of each of the `iLength` 16-bit values
// starting at `pStr`, in place. A length of zero leaves the buffer untouched.
void SwapByteOrder(uint16_t* pStr, size_t iLength) {
  for (size_t i = 0; i < iLength; ++i) {
    const uint16_t unit = pStr[i];
    const uint16_t former_high = static_cast<uint16_t>(unit >> 8);
    const uint16_t former_low = static_cast<uint16_t>(unit << 8);
    pStr[i] = static_cast<uint16_t>(former_high | former_low);
  }
}
93 
94 }  // namespace
95 
// Byte-order-mark patterns used to sniff the stream's encoding. Each value is
// written the way the leading bytes of the stream look when the first byte is
// the least-significant byte of a 32-bit word; the masks select how many of
// those bytes take part in the comparison.
//
// These are typed constants rather than macros so they obey normal scoping
// and cannot rewrite identically named tokens later in the translation unit.
constexpr uint32_t BOM_UTF8_MASK = 0x00FFFFFF;   // Compare three bytes.
constexpr uint32_t BOM_UTF8 = 0x00BFBBEF;        // Bytes EF BB BF.
constexpr uint32_t BOM_UTF16_MASK = 0x0000FFFF;  // Compare two bytes.
constexpr uint32_t BOM_UTF16_BE = 0x0000FFFE;    // Bytes FE FF.
constexpr uint32_t BOM_UTF16_LE = 0x0000FEFF;    // Bytes FF FE.
101 
CFX_SeekableStreamProxy(const RetainPtr<IFX_SeekableReadStream> & stream)102 CFX_SeekableStreamProxy::CFX_SeekableStreamProxy(
103     const RetainPtr<IFX_SeekableReadStream>& stream)
104     : m_pStream(stream) {
105   DCHECK(m_pStream);
106 
107   Seek(From::Begin, 0);
108 
109   uint32_t bom = 0;
110   ReadData(reinterpret_cast<uint8_t*>(&bom), 3);
111 
112   bom &= BOM_UTF8_MASK;
113   if (bom == BOM_UTF8) {
114     m_wBOMLength = 3;
115     m_wCodePage = FX_CodePage::kUTF8;
116   } else {
117     bom &= BOM_UTF16_MASK;
118     if (bom == BOM_UTF16_BE) {
119       m_wBOMLength = 2;
120       m_wCodePage = FX_CodePage::kUTF16BE;
121     } else if (bom == BOM_UTF16_LE) {
122       m_wBOMLength = 2;
123       m_wCodePage = FX_CodePage::kUTF16LE;
124     } else {
125       m_wBOMLength = 0;
126       m_wCodePage = FX_GetACP();
127     }
128   }
129 
130   Seek(From::Begin, static_cast<FX_FILESIZE>(m_wBOMLength));
131 }
132 
// Out-of-line defaulted destructor; no explicit teardown is performed here.
133 CFX_SeekableStreamProxy::~CFX_SeekableStreamProxy() = default;
134 
GetSize()135 FX_FILESIZE CFX_SeekableStreamProxy::GetSize() {
136   return m_pStream->GetSize();
137 }
138 
GetPosition()139 FX_FILESIZE CFX_SeekableStreamProxy::GetPosition() {
140   return m_iPosition;
141 }
142 
IsEOF()143 bool CFX_SeekableStreamProxy::IsEOF() {
144   return m_iPosition >= GetSize();
145 }
146 
Seek(From eSeek,FX_FILESIZE iOffset)147 void CFX_SeekableStreamProxy::Seek(From eSeek, FX_FILESIZE iOffset) {
148   switch (eSeek) {
149     case From::Begin:
150       m_iPosition = iOffset;
151       break;
152     case From::Current: {
153       FX_SAFE_FILESIZE new_pos = m_iPosition;
154       new_pos += iOffset;
155       m_iPosition =
156           new_pos.ValueOrDefault(std::numeric_limits<FX_FILESIZE>::max());
157     } break;
158   }
159   m_iPosition =
160       pdfium::clamp(m_iPosition, static_cast<FX_FILESIZE>(0), GetSize());
161 }
162 
SetCodePage(FX_CodePage wCodePage)163 void CFX_SeekableStreamProxy::SetCodePage(FX_CodePage wCodePage) {
164   if (m_wBOMLength > 0)
165     return;
166   m_wCodePage = wCodePage;
167 }
168 
ReadData(uint8_t * pBuffer,size_t iBufferSize)169 size_t CFX_SeekableStreamProxy::ReadData(uint8_t* pBuffer, size_t iBufferSize) {
170   DCHECK(pBuffer);
171   DCHECK(iBufferSize > 0);
172 
173   iBufferSize =
174       std::min(iBufferSize, static_cast<size_t>(GetSize() - m_iPosition));
175   if (iBufferSize <= 0)
176     return 0;
177 
178   if (!m_pStream->ReadBlockAtOffset({pBuffer, iBufferSize}, m_iPosition))
179     return 0;
180 
181   FX_SAFE_FILESIZE new_pos = m_iPosition;
182   new_pos += iBufferSize;
183   m_iPosition = new_pos.ValueOrDefault(m_iPosition);
184   return new_pos.IsValid() ? iBufferSize : 0;
185 }
186 
ReadBlock(wchar_t * pStr,size_t size)187 size_t CFX_SeekableStreamProxy::ReadBlock(wchar_t* pStr, size_t size) {
188   if (!pStr || size == 0)
189     return 0;
190 
191   if (m_wCodePage == FX_CodePage::kUTF16LE ||
192       m_wCodePage == FX_CodePage::kUTF16BE) {
193     size_t iBytes = size * 2;
194     size_t iLen = ReadData(reinterpret_cast<uint8_t*>(pStr), iBytes);
195     size = iLen / 2;
196     if (m_wCodePage == FX_CodePage::kUTF16BE)
197       SwapByteOrder(reinterpret_cast<uint16_t*>(pStr), size);
198 
199 #if defined(WCHAR_T_IS_UTF32)
200     if (size > 0)
201       UTF16ToWChar(pStr, size);
202 #endif
203     return size;
204   }
205 
206   FX_FILESIZE pos = GetPosition();
207   size_t iBytes = std::min(size, static_cast<size_t>(GetSize() - pos));
208   if (iBytes == 0)
209     return 0;
210 
211   DataVector<uint8_t> buf(iBytes);
212   size_t iLen = ReadData(buf.data(), iBytes);
213   if (m_wCodePage != FX_CodePage::kUTF8)
214     return 0;
215 
216   size_t iSrc;
217   std::tie(iSrc, size) = UTF8Decode({buf.data(), iLen}, {pStr, size});
218   Seek(From::Current, iSrc - iLen);
219   return size;
220 }
221