1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fxcrt/cfx_seekablestreamproxy.h"
8
9 #if defined(OS_WIN)
10 #include <io.h>
11 #endif
12
13 #include <algorithm>
14 #include <limits>
15 #include <memory>
16 #include <utility>
17 #include <vector>
18
19 #include "build/build_config.h"
20 #include "core/fxcrt/fx_codepage.h"
21 #include "core/fxcrt/fx_extension.h"
22 #include "core/fxcrt/fx_memory_wrappers.h"
23 #include "core/fxcrt/fx_safe_types.h"
24 #include "third_party/base/stl_util.h"
25
26 namespace {
27
// Decodes UTF-8 bytes from |pSrc| into wide characters written to |pDst|.
//
// Decoding stops once |srcLen| bytes have been examined or |dstLen|
// characters have been written, whichever happens first.
//
// Returns {src bytes consumed, dst chars produced}.
//
// Invalid sequences are silently not output:
//  - a continuation byte with no sequence in progress is skipped;
//  - an ASCII byte or a new lead byte abandons an unfinished sequence;
//  - 0xFE and 0xFF, which never occur in UTF-8, abandon it as well.
//
// A multi-byte sequence that is cut off by the end of the input is counted as
// consumed but produces no character. Overlong encodings and the legacy
// 5- and 6-byte lead bytes are not rejected. The decoded value is narrowed to
// wchar_t when stored.
std::pair<size_t, size_t> UTF8Decode(const char* pSrc,
                                     size_t srcLen,
                                     wchar_t* pDst,
                                     size_t dstLen) {
  ASSERT(pDst);
  ASSERT(dstLen > 0);

  if (srcLen < 1)
    return {0, 0};

  uint32_t dwCode = 0;   // Code point accumulated so far.
  int32_t iPending = 0;  // Continuation bytes still expected.
  size_t iSrcNum = 0;
  size_t iDstNum = 0;
  for (size_t iIndex = 0; iIndex < srcLen && iDstNum < dstLen; ++iIndex) {
    ++iSrcNum;
    const uint8_t byte = static_cast<uint8_t>(pSrc[iIndex]);
    if (byte < 0x80) {
      // Plain ASCII; also drops any sequence that was left unfinished.
      iPending = 0;
      ++iDstNum;
      *pDst++ = byte;
    } else if (byte < 0xc0) {
      // Continuation byte (10xxxxxx).
      if (iPending < 1)
        continue;

      dwCode = dwCode << 6;
      dwCode |= (byte & 0x3f);
      --iPending;
      if (iPending == 0) {
        ++iDstNum;
        *pDst++ = static_cast<wchar_t>(dwCode);
      }
    } else if (byte < 0xe0) {
      iPending = 1;
      dwCode = (byte & 0x1f);
    } else if (byte < 0xf0) {
      iPending = 2;
      dwCode = (byte & 0x0f);
    } else if (byte < 0xf8) {
      iPending = 3;
      dwCode = (byte & 0x07);
    } else if (byte < 0xfc) {
      iPending = 4;
      dwCode = (byte & 0x03);
    } else if (byte < 0xfe) {
      iPending = 5;
      dwCode = (byte & 0x01);
    } else {
      // 0xFE / 0xFF cannot start or continue a sequence. Without this reset,
      // continuation bytes that follow would complete the sequence that was
      // in progress and emit a character for invalid input.
      iPending = 0;
    }
  }
  return {iSrcNum, iDstNum};
}
81
82 #if defined(WCHAR_T_IS_UTF32)
static_assert(sizeof(wchar_t) > 2, "wchar_t is too small");

// Widens, in place, the |iLength| 16-bit code units stored at the start of
// |pBuffer| into |iLength| wchar_t values occupying the same buffer. The
// buffer therefore has to be large enough for |iLength| wchar_t elements.
// Every unit is converted on its own; surrogate pairs are not combined.
void UTF16ToWChar(void* pBuffer, size_t iLength) {
  ASSERT(pBuffer);
  ASSERT(iLength > 0);

  const uint16_t* narrow = static_cast<const uint16_t*>(pBuffer);
  wchar_t* wide = static_cast<wchar_t*>(pBuffer);

  // Source and destination overlap and each destination element is wider
  // than its source, so walk from the last element down to the first to
  // avoid overwriting units that have not been read yet.
  size_t remaining = iLength;
  while (remaining != 0) {
    --remaining;
    wide[remaining] = static_cast<wchar_t>(narrow[remaining]);
  }
}
96 #endif // defined(WCHAR_T_IS_UTF32)
97
// Exchanges the high and low byte of each of the |iLength| 16-bit units
// starting at |pStr|, in place.
void SwapByteOrder(uint16_t* pStr, size_t iLength) {
  for (size_t i = 0; i < iLength; ++i) {
    const uint16_t unit = pStr[i];
    const uint16_t upper = static_cast<uint16_t>(unit >> 8);
    const uint16_t lower = static_cast<uint16_t>(unit & 0x00FF);
    pStr[i] = static_cast<uint16_t>((lower << 8) | upper);
  }
}
104
105 } // namespace
106
107 #define BOM_UTF8_MASK 0x00FFFFFF
108 #define BOM_UTF8 0x00BFBBEF
109 #define BOM_UTF16_MASK 0x0000FFFF
110 #define BOM_UTF16_BE 0x0000FFFE
111 #define BOM_UTF16_LE 0x0000FEFF
112
CFX_SeekableStreamProxy(const RetainPtr<IFX_SeekableReadStream> & stream)113 CFX_SeekableStreamProxy::CFX_SeekableStreamProxy(
114 const RetainPtr<IFX_SeekableReadStream>& stream)
115 : m_wCodePage(FX_CODEPAGE_DefANSI),
116 m_wBOMLength(0),
117 m_iPosition(0),
118 m_pStream(stream) {
119 ASSERT(m_pStream);
120
121 Seek(From::Begin, 0);
122
123 uint32_t bom = 0;
124 ReadData(reinterpret_cast<uint8_t*>(&bom), 3);
125
126 bom &= BOM_UTF8_MASK;
127 if (bom == BOM_UTF8) {
128 m_wBOMLength = 3;
129 m_wCodePage = FX_CODEPAGE_UTF8;
130 } else {
131 bom &= BOM_UTF16_MASK;
132 if (bom == BOM_UTF16_BE) {
133 m_wBOMLength = 2;
134 m_wCodePage = FX_CODEPAGE_UTF16BE;
135 } else if (bom == BOM_UTF16_LE) {
136 m_wBOMLength = 2;
137 m_wCodePage = FX_CODEPAGE_UTF16LE;
138 } else {
139 m_wBOMLength = 0;
140 m_wCodePage = FXSYS_GetACP();
141 }
142 }
143
144 Seek(From::Begin, static_cast<FX_FILESIZE>(m_wBOMLength));
145 }
146
// No explicit cleanup: the defaulted destructor only runs the members' own
// destructors.
CFX_SeekableStreamProxy::~CFX_SeekableStreamProxy() = default;
148
GetSize()149 FX_FILESIZE CFX_SeekableStreamProxy::GetSize() {
150 return m_pStream->GetSize();
151 }
152
GetPosition()153 FX_FILESIZE CFX_SeekableStreamProxy::GetPosition() {
154 return m_iPosition;
155 }
156
IsEOF()157 bool CFX_SeekableStreamProxy::IsEOF() {
158 return m_iPosition >= GetSize();
159 }
160
Seek(From eSeek,FX_FILESIZE iOffset)161 void CFX_SeekableStreamProxy::Seek(From eSeek, FX_FILESIZE iOffset) {
162 switch (eSeek) {
163 case From::Begin:
164 m_iPosition = iOffset;
165 break;
166 case From::Current: {
167 FX_SAFE_FILESIZE new_pos = m_iPosition;
168 new_pos += iOffset;
169 m_iPosition =
170 new_pos.ValueOrDefault(std::numeric_limits<FX_FILESIZE>::max());
171 } break;
172 }
173 m_iPosition =
174 pdfium::clamp(m_iPosition, static_cast<FX_FILESIZE>(0), GetSize());
175 }
176
SetCodePage(uint16_t wCodePage)177 void CFX_SeekableStreamProxy::SetCodePage(uint16_t wCodePage) {
178 if (m_wBOMLength > 0)
179 return;
180 m_wCodePage = wCodePage;
181 }
182
ReadData(uint8_t * pBuffer,size_t iBufferSize)183 size_t CFX_SeekableStreamProxy::ReadData(uint8_t* pBuffer, size_t iBufferSize) {
184 ASSERT(pBuffer);
185 ASSERT(iBufferSize > 0);
186
187 iBufferSize =
188 std::min(iBufferSize, static_cast<size_t>(GetSize() - m_iPosition));
189 if (iBufferSize <= 0)
190 return 0;
191
192 if (!m_pStream->ReadBlockAtOffset(pBuffer, m_iPosition, iBufferSize))
193 return 0;
194
195 FX_SAFE_FILESIZE new_pos = m_iPosition;
196 new_pos += iBufferSize;
197 m_iPosition = new_pos.ValueOrDefault(m_iPosition);
198 return new_pos.IsValid() ? iBufferSize : 0;
199 }
200
ReadBlock(wchar_t * pStr,size_t size)201 size_t CFX_SeekableStreamProxy::ReadBlock(wchar_t* pStr, size_t size) {
202 if (!pStr || size == 0)
203 return 0;
204
205 if (m_wCodePage == FX_CODEPAGE_UTF16LE ||
206 m_wCodePage == FX_CODEPAGE_UTF16BE) {
207 size_t iBytes = size * 2;
208 size_t iLen = ReadData(reinterpret_cast<uint8_t*>(pStr), iBytes);
209 size = iLen / 2;
210 if (m_wCodePage == FX_CODEPAGE_UTF16BE)
211 SwapByteOrder(reinterpret_cast<uint16_t*>(pStr), size);
212
213 #if defined(WCHAR_T_IS_UTF32)
214 if (size > 0)
215 UTF16ToWChar(pStr, size);
216 #endif
217 } else {
218 FX_FILESIZE pos = GetPosition();
219 size_t iBytes = std::min(size, static_cast<size_t>(GetSize() - pos));
220
221 if (iBytes > 0) {
222 std::vector<uint8_t, FxAllocAllocator<uint8_t>> buf(iBytes);
223
224 size_t iLen = ReadData(buf.data(), iBytes);
225 if (m_wCodePage != FX_CODEPAGE_UTF8)
226 return 0;
227
228 size_t iSrc = 0;
229 std::tie(iSrc, size) =
230 UTF8Decode(reinterpret_cast<const char*>(buf.data()), iLen,
231 static_cast<wchar_t*>(pStr), size);
232 Seek(From::Current, iSrc - iLen);
233 } else {
234 size = 0;
235 }
236 }
237
238 return size;
239 }
240