1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8
9 #include <array>
10 #include <utility>
11 #include <vector>
12
13 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
14 #include "core/fpdfapi/font/cpdf_cmapparser.h"
15 #include "core/fpdfapi/font/cpdf_fontglobals.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17 #include "core/fxcrt/check.h"
18 #include "core/fxcrt/fx_memcpy_wrappers.h"
19 #include "core/fxcrt/notreached.h"
20
21 namespace {
22
// A contiguous run of byte values. Both endpoints belong to the range.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;   // Highest byte value in the range (inclusive).
};
27
28 struct PredefinedCMap {
29 const char* m_pName; // Raw, POD struct.
30 CIDSet m_Charset;
31 CIDCoding m_Coding;
32 CPDF_CMap::CodingScheme m_CodingScheme;
33 ByteRange m_LeadingSegs[2];
34 };
35
36 constexpr PredefinedCMap kPredefinedCMaps[] = {
37 {"GB-EUC",
38 CIDSET_GB1,
39 CIDCoding::kGB,
40 CPDF_CMap::MixedTwoBytes,
41 {{0xa1, 0xfe}}},
42 {"GBpc-EUC",
43 CIDSET_GB1,
44 CIDCoding::kGB,
45 CPDF_CMap::MixedTwoBytes,
46 {{0xa1, 0xfc}}},
47 {"GBK-EUC",
48 CIDSET_GB1,
49 CIDCoding::kGB,
50 CPDF_CMap::MixedTwoBytes,
51 {{0x81, 0xfe}}},
52 {"GBKp-EUC",
53 CIDSET_GB1,
54 CIDCoding::kGB,
55 CPDF_CMap::MixedTwoBytes,
56 {{0x81, 0xfe}}},
57 {"GBK2K-EUC",
58 CIDSET_GB1,
59 CIDCoding::kGB,
60 CPDF_CMap::MixedTwoBytes,
61 {{0x81, 0xfe}}},
62 {"GBK2K",
63 CIDSET_GB1,
64 CIDCoding::kGB,
65 CPDF_CMap::MixedTwoBytes,
66 {{0x81, 0xfe}}},
67 {"UniGB-UCS2", CIDSET_GB1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
68 {"UniGB-UTF16", CIDSET_GB1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
69 {"B5pc",
70 CIDSET_CNS1,
71 CIDCoding::kBIG5,
72 CPDF_CMap::MixedTwoBytes,
73 {{0xa1, 0xfc}}},
74 {"HKscs-B5",
75 CIDSET_CNS1,
76 CIDCoding::kBIG5,
77 CPDF_CMap::MixedTwoBytes,
78 {{0x88, 0xfe}}},
79 {"ETen-B5",
80 CIDSET_CNS1,
81 CIDCoding::kBIG5,
82 CPDF_CMap::MixedTwoBytes,
83 {{0xa1, 0xfe}}},
84 {"ETenms-B5",
85 CIDSET_CNS1,
86 CIDCoding::kBIG5,
87 CPDF_CMap::MixedTwoBytes,
88 {{0xa1, 0xfe}}},
89 {"UniCNS-UCS2", CIDSET_CNS1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
90 {"UniCNS-UTF16", CIDSET_CNS1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
91 {"83pv-RKSJ",
92 CIDSET_JAPAN1,
93 CIDCoding::kJIS,
94 CPDF_CMap::MixedTwoBytes,
95 {{0x81, 0x9f}, {0xe0, 0xfc}}},
96 {"90ms-RKSJ",
97 CIDSET_JAPAN1,
98 CIDCoding::kJIS,
99 CPDF_CMap::MixedTwoBytes,
100 {{0x81, 0x9f}, {0xe0, 0xfc}}},
101 {"90msp-RKSJ",
102 CIDSET_JAPAN1,
103 CIDCoding::kJIS,
104 CPDF_CMap::MixedTwoBytes,
105 {{0x81, 0x9f}, {0xe0, 0xfc}}},
106 {"90pv-RKSJ",
107 CIDSET_JAPAN1,
108 CIDCoding::kJIS,
109 CPDF_CMap::MixedTwoBytes,
110 {{0x81, 0x9f}, {0xe0, 0xfc}}},
111 {"Add-RKSJ",
112 CIDSET_JAPAN1,
113 CIDCoding::kJIS,
114 CPDF_CMap::MixedTwoBytes,
115 {{0x81, 0x9f}, {0xe0, 0xfc}}},
116 {"EUC",
117 CIDSET_JAPAN1,
118 CIDCoding::kJIS,
119 CPDF_CMap::MixedTwoBytes,
120 {{0x8e, 0x8e}, {0xa1, 0xfe}}},
121 {"H", CIDSET_JAPAN1, CIDCoding::kJIS, CPDF_CMap::TwoBytes, {{0x21, 0x7e}}},
122 {"V", CIDSET_JAPAN1, CIDCoding::kJIS, CPDF_CMap::TwoBytes, {{0x21, 0x7e}}},
123 {"Ext-RKSJ",
124 CIDSET_JAPAN1,
125 CIDCoding::kJIS,
126 CPDF_CMap::MixedTwoBytes,
127 {{0x81, 0x9f}, {0xe0, 0xfc}}},
128 {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
129 {"UniJIS-UCS2-HW",
130 CIDSET_JAPAN1,
131 CIDCoding::kUCS2,
132 CPDF_CMap::TwoBytes,
133 {}},
134 {"UniJIS-UTF16", CIDSET_JAPAN1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
135 {"KSC-EUC",
136 CIDSET_KOREA1,
137 CIDCoding::kKOREA,
138 CPDF_CMap::MixedTwoBytes,
139 {{0xa1, 0xfe}}},
140 {"KSCms-UHC",
141 CIDSET_KOREA1,
142 CIDCoding::kKOREA,
143 CPDF_CMap::MixedTwoBytes,
144 {{0x81, 0xfe}}},
145 {"KSCms-UHC-HW",
146 CIDSET_KOREA1,
147 CIDCoding::kKOREA,
148 CPDF_CMap::MixedTwoBytes,
149 {{0x81, 0xfe}}},
150 {"KSCpc-EUC",
151 CIDSET_KOREA1,
152 CIDCoding::kKOREA,
153 CPDF_CMap::MixedTwoBytes,
154 {{0xa1, 0xfd}}},
155 {"UniKS-UCS2", CIDSET_KOREA1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
156 {"UniKS-UTF16", CIDSET_KOREA1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
157 };
158
GetPredefinedCMap(ByteStringView cmapid)159 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
160 if (cmapid.GetLength() > 2)
161 cmapid = cmapid.First(cmapid.GetLength() - 2);
162 for (const auto& map : kPredefinedCMaps) {
163 if (cmapid == map.m_pName)
164 return ↦
165 }
166 return nullptr;
167 }
168
LoadLeadingSegments(const PredefinedCMap & map)169 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
170 std::vector<bool> segments(256);
171 const auto seg_span = pdfium::make_span(map.m_LeadingSegs);
172 for (const ByteRange& seg : seg_span) {
173 if (seg.m_First == 0 && seg.m_Last == 0) {
174 break;
175 }
176 for (int b = seg.m_First; b <= seg.m_Last; ++b) {
177 segments[b] = true;
178 }
179 }
180 return segments;
181 }
182
CheckFourByteCodeRange(pdfium::span<uint8_t> codes,pdfium::span<const CPDF_CMap::CodeRange> ranges)183 int CheckFourByteCodeRange(pdfium::span<uint8_t> codes,
184 pdfium::span<const CPDF_CMap::CodeRange> ranges) {
185 for (size_t i = ranges.size(); i > 0; i--) {
186 const auto& range = ranges[i - 1];
187 if (range.m_CharSize < codes.size()) {
188 continue;
189 }
190 size_t iChar = 0;
191 while (iChar < codes.size()) {
192 if (codes[iChar] < range.m_Lower[iChar] ||
193 codes[iChar] > range.m_Upper[iChar]) {
194 break;
195 }
196 ++iChar;
197 }
198 if (iChar == range.m_CharSize) {
199 return 2;
200 }
201 if (iChar) {
202 return (codes.size() == range.m_CharSize) ? 2 : 1;
203 }
204 }
205 return 0;
206 }
207
GetFourByteCharSizeImpl(uint32_t charcode,pdfium::span<const CPDF_CMap::CodeRange> ranges)208 size_t GetFourByteCharSizeImpl(
209 uint32_t charcode,
210 pdfium::span<const CPDF_CMap::CodeRange> ranges) {
211 if (ranges.empty())
212 return 1;
213
214 std::array<uint8_t, 4> codes = {{
215 0x00,
216 0x00,
217 static_cast<uint8_t>(charcode >> 8 & 0xFF),
218 static_cast<uint8_t>(charcode),
219 }};
220 for (size_t offset = 0; offset < 4; offset++) {
221 size_t size = 4 - offset;
222 for (size_t j = 0; j < ranges.size(); j++) {
223 size_t iSeg = (ranges.size() - 1) - j;
224 if (ranges[iSeg].m_CharSize < size)
225 continue;
226 size_t iChar = 0;
227 while (iChar < size) {
228 if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
229 codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
230 break;
231 }
232 ++iChar;
233 }
234 if (iChar == ranges[iSeg].m_CharSize)
235 return size;
236 }
237 }
238 return 1;
239 }
240
FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,ByteStringView bsName)241 const fxcmap::CMap* FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,
242 ByteStringView bsName) {
243 for (size_t i = 0; i < pCMaps.size(); i++) {
244 if (bsName == pCMaps[i].m_Name)
245 return &pCMaps[i];
246 }
247 return nullptr;
248 }
249
250 } // namespace
251
CPDF_CMap(ByteStringView bsPredefinedName)252 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
253 : m_bVertical(bsPredefinedName.Back() == 'V') {
254 if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
255 m_Coding = CIDCoding::kCID;
256 m_bLoaded = true;
257 return;
258 }
259
260 const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
261 if (!map)
262 return;
263
264 m_Charset = map->m_Charset;
265 m_Coding = map->m_Coding;
266 m_CodingScheme = map->m_CodingScheme;
267 if (m_CodingScheme == MixedTwoBytes)
268 m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
269 m_pEmbedMap = FindEmbeddedCMap(
270 CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
271 bsPredefinedName);
272 if (!m_pEmbedMap)
273 return;
274
275 m_bLoaded = true;
276 }
277
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)278 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
279 : m_DirectCharcodeToCIDTable(
280 FixedSizeDataVector<uint16_t>::Zeroed(kDirectMapTableSize)) {
281 CPDF_CMapParser parser(this);
282 CPDF_SimpleParser syntax(spEmbeddedData);
283 while (true) {
284 ByteStringView word = syntax.GetWord();
285 if (word.IsEmpty()) {
286 break;
287 }
288 parser.ParseWord(word);
289 }
290 }
291
292 CPDF_CMap::~CPDF_CMap() = default;
293
CIDFromCharCode(uint32_t charcode) const294 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
295 if (m_Coding == CIDCoding::kCID)
296 return static_cast<uint16_t>(charcode);
297
298 if (m_pEmbedMap)
299 return fxcmap::CIDFromCharCode(m_pEmbedMap, charcode);
300
301 if (m_DirectCharcodeToCIDTable.empty())
302 return static_cast<uint16_t>(charcode);
303
304 auto table_span = m_DirectCharcodeToCIDTable.span();
305 if (charcode < table_span.size())
306 return table_span[charcode];
307
308 auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
309 m_AdditionalCharcodeToCIDMappings.end(), charcode,
310 [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
311 return arg.m_EndCode < val;
312 });
313 if (it == m_AdditionalCharcodeToCIDMappings.end() ||
314 it->m_StartCode > charcode) {
315 return 0;
316 }
317 return it->m_StartCID + charcode - it->m_StartCode;
318 }
319
GetNextChar(ByteStringView pString,size_t * pOffset) const320 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
321 size_t& offset = *pOffset;
322 auto pBytes = pString.unsigned_span();
323 switch (m_CodingScheme) {
324 case OneByte: {
325 return offset < pBytes.size() ? pBytes[offset++] : 0;
326 }
327 case TwoBytes: {
328 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
329 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
330 return 256 * byte1 + byte2;
331 }
332 case MixedTwoBytes: {
333 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
334 if (!m_MixedTwoByteLeadingBytes[byte1])
335 return byte1;
336 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
337 return 256 * byte1 + byte2;
338 }
339 case MixedFourBytes: {
340 std::array<uint8_t, 4> codes;
341 int char_size = 1;
342 codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
343 while (true) {
344 int ret =
345 CheckFourByteCodeRange(pdfium::make_span(codes).first(char_size),
346 m_MixedFourByteLeadingRanges);
347 if (ret == 0)
348 return 0;
349 if (ret == 2) {
350 uint32_t charcode = 0;
351 for (int i = 0; i < char_size; i++)
352 charcode = (charcode << 8) + codes[i];
353 return charcode;
354 }
355 if (char_size == 4 || offset == pBytes.size())
356 return 0;
357 codes[char_size++] = pBytes[offset++];
358 }
359 }
360 }
361 NOTREACHED_NORETURN();
362 }
363
GetCharSize(uint32_t charcode) const364 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
365 switch (m_CodingScheme) {
366 case OneByte:
367 return 1;
368 case TwoBytes:
369 return 2;
370 case MixedTwoBytes:
371 if (charcode < 0x100)
372 return 1;
373 return 2;
374 case MixedFourBytes:
375 if (charcode < 0x100)
376 return 1;
377 if (charcode < 0x10000)
378 return 2;
379 if (charcode < 0x1000000)
380 return 3;
381 return 4;
382 }
383 NOTREACHED_NORETURN();
384 }
385
CountChar(ByteStringView pString) const386 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
387 switch (m_CodingScheme) {
388 case OneByte:
389 return pString.GetLength();
390 case TwoBytes:
391 return (pString.GetLength() + 1) / 2;
392 case MixedTwoBytes: {
393 size_t count = 0;
394 for (size_t i = 0; i < pString.GetLength(); i++) {
395 count++;
396 if (m_MixedTwoByteLeadingBytes[pString[i]])
397 i++;
398 }
399 return count;
400 }
401 case MixedFourBytes: {
402 size_t count = 0;
403 size_t offset = 0;
404 while (offset < pString.GetLength()) {
405 GetNextChar(pString, &offset);
406 count++;
407 }
408 return count;
409 }
410 }
411 NOTREACHED_NORETURN();
412 }
413
AppendChar(ByteString * str,uint32_t charcode) const414 void CPDF_CMap::AppendChar(ByteString* str, uint32_t charcode) const {
415 switch (m_CodingScheme) {
416 case OneByte:
417 *str += static_cast<char>(charcode);
418 return;
419 case TwoBytes:
420 *str += static_cast<char>(charcode / 256);
421 *str += static_cast<char>(charcode % 256);
422 return;
423 case MixedTwoBytes:
424 if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
425 *str += static_cast<char>(charcode);
426 return;
427 }
428 *str += static_cast<char>(charcode >> 8);
429 *str += static_cast<char>(charcode);
430 return;
431 case MixedFourBytes:
432 if (charcode < 0x100) {
433 int iSize = static_cast<int>(
434 GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
435 int pad = iSize != 0 ? iSize - 1 : 0;
436 for (int i = 0; i < pad; ++i) {
437 *str += static_cast<char>(0);
438 }
439 *str += static_cast<char>(charcode);
440 return;
441 }
442 if (charcode < 0x10000) {
443 *str += static_cast<char>(charcode >> 8);
444 *str += static_cast<char>(charcode);
445 return;
446 }
447 if (charcode < 0x1000000) {
448 *str += static_cast<char>(charcode >> 16);
449 *str += static_cast<char>(charcode >> 8);
450 *str += static_cast<char>(charcode);
451 return;
452 }
453 *str += static_cast<char>(charcode >> 24);
454 *str += static_cast<char>(charcode >> 16);
455 *str += static_cast<char>(charcode >> 8);
456 *str += static_cast<char>(charcode);
457 return;
458 }
459 NOTREACHED_NORETURN();
460 }
461
SetAdditionalMappings(std::vector<CIDRange> mappings)462 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
463 DCHECK(m_AdditionalCharcodeToCIDMappings.empty());
464 if (m_CodingScheme != MixedFourBytes || mappings.empty())
465 return;
466
467 std::sort(
468 mappings.begin(), mappings.end(),
469 [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
470 return arg1.m_EndCode < arg2.m_EndCode;
471 });
472 m_AdditionalCharcodeToCIDMappings = std::move(mappings);
473 }
474
SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges)475 void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
476 m_MixedFourByteLeadingRanges = std::move(ranges);
477 }
478
SetDirectCharcodeToCIDTableRange(uint32_t start_code,uint32_t end_code,uint16_t start_cid)479 void CPDF_CMap::SetDirectCharcodeToCIDTableRange(uint32_t start_code,
480 uint32_t end_code,
481 uint16_t start_cid) {
482 pdfium::span<uint16_t> span = m_DirectCharcodeToCIDTable.span();
483 for (uint32_t code = start_code; code <= end_code; ++code) {
484 span[code] = static_cast<uint16_t>(start_cid + code - start_code);
485 }
486 }
487