1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8
9 #include <utility>
10 #include <vector>
11
12 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
13 #include "core/fpdfapi/font/cpdf_cmapparser.h"
14 #include "core/fpdfapi/font/cpdf_fontglobals.h"
15 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
16 #include "third_party/base/check.h"
17
18 namespace {
19
// A closed interval of byte values: every value v with
// m_First <= v <= m_Last belongs to the range.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;   // Highest byte value in the range (inclusive).
};
24
25 struct PredefinedCMap {
26 const char* m_pName; // Raw, POD struct.
27 CIDSet m_Charset;
28 CIDCoding m_Coding;
29 CPDF_CMap::CodingScheme m_CodingScheme;
30 uint8_t m_LeadingSegCount;
31 ByteRange m_LeadingSegs[2];
32 };
33
34 constexpr PredefinedCMap kPredefinedCMaps[] = {
35 {"GB-EUC",
36 CIDSET_GB1,
37 CIDCoding::kGB,
38 CPDF_CMap::MixedTwoBytes,
39 1,
40 {{0xa1, 0xfe}}},
41 {"GBpc-EUC",
42 CIDSET_GB1,
43 CIDCoding::kGB,
44 CPDF_CMap::MixedTwoBytes,
45 1,
46 {{0xa1, 0xfc}}},
47 {"GBK-EUC",
48 CIDSET_GB1,
49 CIDCoding::kGB,
50 CPDF_CMap::MixedTwoBytes,
51 1,
52 {{0x81, 0xfe}}},
53 {"GBKp-EUC",
54 CIDSET_GB1,
55 CIDCoding::kGB,
56 CPDF_CMap::MixedTwoBytes,
57 1,
58 {{0x81, 0xfe}}},
59 {"GBK2K-EUC",
60 CIDSET_GB1,
61 CIDCoding::kGB,
62 CPDF_CMap::MixedTwoBytes,
63 1,
64 {{0x81, 0xfe}}},
65 {"GBK2K",
66 CIDSET_GB1,
67 CIDCoding::kGB,
68 CPDF_CMap::MixedTwoBytes,
69 1,
70 {{0x81, 0xfe}}},
71 {"UniGB-UCS2", CIDSET_GB1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
72 {"UniGB-UTF16", CIDSET_GB1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, 0, {}},
73 {"B5pc",
74 CIDSET_CNS1,
75 CIDCoding::kBIG5,
76 CPDF_CMap::MixedTwoBytes,
77 1,
78 {{0xa1, 0xfc}}},
79 {"HKscs-B5",
80 CIDSET_CNS1,
81 CIDCoding::kBIG5,
82 CPDF_CMap::MixedTwoBytes,
83 1,
84 {{0x88, 0xfe}}},
85 {"ETen-B5",
86 CIDSET_CNS1,
87 CIDCoding::kBIG5,
88 CPDF_CMap::MixedTwoBytes,
89 1,
90 {{0xa1, 0xfe}}},
91 {"ETenms-B5",
92 CIDSET_CNS1,
93 CIDCoding::kBIG5,
94 CPDF_CMap::MixedTwoBytes,
95 1,
96 {{0xa1, 0xfe}}},
97 {"UniCNS-UCS2", CIDSET_CNS1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
98 {"UniCNS-UTF16",
99 CIDSET_CNS1,
100 CIDCoding::kUTF16,
101 CPDF_CMap::TwoBytes,
102 0,
103 {}},
104 {"83pv-RKSJ",
105 CIDSET_JAPAN1,
106 CIDCoding::kJIS,
107 CPDF_CMap::MixedTwoBytes,
108 2,
109 {{0x81, 0x9f}, {0xe0, 0xfc}}},
110 {"90ms-RKSJ",
111 CIDSET_JAPAN1,
112 CIDCoding::kJIS,
113 CPDF_CMap::MixedTwoBytes,
114 2,
115 {{0x81, 0x9f}, {0xe0, 0xfc}}},
116 {"90msp-RKSJ",
117 CIDSET_JAPAN1,
118 CIDCoding::kJIS,
119 CPDF_CMap::MixedTwoBytes,
120 2,
121 {{0x81, 0x9f}, {0xe0, 0xfc}}},
122 {"90pv-RKSJ",
123 CIDSET_JAPAN1,
124 CIDCoding::kJIS,
125 CPDF_CMap::MixedTwoBytes,
126 2,
127 {{0x81, 0x9f}, {0xe0, 0xfc}}},
128 {"Add-RKSJ",
129 CIDSET_JAPAN1,
130 CIDCoding::kJIS,
131 CPDF_CMap::MixedTwoBytes,
132 2,
133 {{0x81, 0x9f}, {0xe0, 0xfc}}},
134 {"EUC",
135 CIDSET_JAPAN1,
136 CIDCoding::kJIS,
137 CPDF_CMap::MixedTwoBytes,
138 2,
139 {{0x8e, 0x8e}, {0xa1, 0xfe}}},
140 {"H",
141 CIDSET_JAPAN1,
142 CIDCoding::kJIS,
143 CPDF_CMap::TwoBytes,
144 1,
145 {{0x21, 0x7e}}},
146 {"V",
147 CIDSET_JAPAN1,
148 CIDCoding::kJIS,
149 CPDF_CMap::TwoBytes,
150 1,
151 {{0x21, 0x7e}}},
152 {"Ext-RKSJ",
153 CIDSET_JAPAN1,
154 CIDCoding::kJIS,
155 CPDF_CMap::MixedTwoBytes,
156 2,
157 {{0x81, 0x9f}, {0xe0, 0xfc}}},
158 {"UniJIS-UCS2",
159 CIDSET_JAPAN1,
160 CIDCoding::kUCS2,
161 CPDF_CMap::TwoBytes,
162 0,
163 {}},
164 {"UniJIS-UCS2-HW",
165 CIDSET_JAPAN1,
166 CIDCoding::kUCS2,
167 CPDF_CMap::TwoBytes,
168 0,
169 {}},
170 {"UniJIS-UTF16",
171 CIDSET_JAPAN1,
172 CIDCoding::kUTF16,
173 CPDF_CMap::TwoBytes,
174 0,
175 {}},
176 {"KSC-EUC",
177 CIDSET_KOREA1,
178 CIDCoding::kKOREA,
179 CPDF_CMap::MixedTwoBytes,
180 1,
181 {{0xa1, 0xfe}}},
182 {"KSCms-UHC",
183 CIDSET_KOREA1,
184 CIDCoding::kKOREA,
185 CPDF_CMap::MixedTwoBytes,
186 1,
187 {{0x81, 0xfe}}},
188 {"KSCms-UHC-HW",
189 CIDSET_KOREA1,
190 CIDCoding::kKOREA,
191 CPDF_CMap::MixedTwoBytes,
192 1,
193 {{0x81, 0xfe}}},
194 {"KSCpc-EUC",
195 CIDSET_KOREA1,
196 CIDCoding::kKOREA,
197 CPDF_CMap::MixedTwoBytes,
198 1,
199 {{0xa1, 0xfd}}},
200 {"UniKS-UCS2", CIDSET_KOREA1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
201 {"UniKS-UTF16",
202 CIDSET_KOREA1,
203 CIDCoding::kUTF16,
204 CPDF_CMap::TwoBytes,
205 0,
206 {}},
207 };
208
GetPredefinedCMap(ByteStringView cmapid)209 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
210 if (cmapid.GetLength() > 2)
211 cmapid = cmapid.First(cmapid.GetLength() - 2);
212 for (const auto& map : kPredefinedCMaps) {
213 if (cmapid == map.m_pName)
214 return ↦
215 }
216 return nullptr;
217 }
218
LoadLeadingSegments(const PredefinedCMap & map)219 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
220 std::vector<bool> segments(256);
221 for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
222 const ByteRange& seg = map.m_LeadingSegs[i];
223 for (int b = seg.m_First; b <= seg.m_Last; ++b)
224 segments[b] = true;
225 }
226 return segments;
227 }
228
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)229 int CheckFourByteCodeRange(uint8_t* codes,
230 size_t size,
231 const std::vector<CPDF_CMap::CodeRange>& ranges) {
232 for (size_t i = ranges.size(); i > 0; i--) {
233 size_t seg = i - 1;
234 if (ranges[seg].m_CharSize < size)
235 continue;
236 size_t iChar = 0;
237 while (iChar < size) {
238 if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
239 codes[iChar] > ranges[seg].m_Upper[iChar]) {
240 break;
241 }
242 ++iChar;
243 }
244 if (iChar == ranges[seg].m_CharSize)
245 return 2;
246 if (iChar)
247 return (size == ranges[seg].m_CharSize) ? 2 : 1;
248 }
249 return 0;
250 }
251
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)252 size_t GetFourByteCharSizeImpl(
253 uint32_t charcode,
254 const std::vector<CPDF_CMap::CodeRange>& ranges) {
255 if (ranges.empty())
256 return 1;
257
258 uint8_t codes[4];
259 codes[0] = codes[1] = 0x00;
260 codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
261 codes[3] = static_cast<uint8_t>(charcode);
262 for (size_t offset = 0; offset < 4; offset++) {
263 size_t size = 4 - offset;
264 for (size_t j = 0; j < ranges.size(); j++) {
265 size_t iSeg = (ranges.size() - 1) - j;
266 if (ranges[iSeg].m_CharSize < size)
267 continue;
268 size_t iChar = 0;
269 while (iChar < size) {
270 if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
271 codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
272 break;
273 }
274 ++iChar;
275 }
276 if (iChar == ranges[iSeg].m_CharSize)
277 return size;
278 }
279 }
280 return 1;
281 }
282
FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,ByteStringView bsName)283 const fxcmap::CMap* FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,
284 ByteStringView bsName) {
285 for (size_t i = 0; i < pCMaps.size(); i++) {
286 if (bsName == pCMaps[i].m_Name)
287 return &pCMaps[i];
288 }
289 return nullptr;
290 }
291
292 } // namespace
293
CPDF_CMap(ByteStringView bsPredefinedName)294 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
295 : m_bVertical(bsPredefinedName.Back() == 'V') {
296 if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
297 m_Coding = CIDCoding::kCID;
298 m_bLoaded = true;
299 return;
300 }
301
302 const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
303 if (!map)
304 return;
305
306 m_Charset = map->m_Charset;
307 m_Coding = map->m_Coding;
308 m_CodingScheme = map->m_CodingScheme;
309 if (m_CodingScheme == MixedTwoBytes)
310 m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
311 m_pEmbedMap = FindEmbeddedCMap(
312 CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
313 bsPredefinedName);
314 if (!m_pEmbedMap)
315 return;
316
317 m_bLoaded = true;
318 }
319
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)320 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
321 : m_DirectCharcodeToCIDTable(kDirectMapTableSize) {
322 CPDF_CMapParser parser(this);
323 CPDF_SimpleParser syntax(spEmbeddedData);
324 while (true) {
325 ByteStringView word = syntax.GetWord();
326 if (word.IsEmpty())
327 break;
328
329 parser.ParseWord(word);
330 }
331 }
332
333 CPDF_CMap::~CPDF_CMap() = default;
334
CIDFromCharCode(uint32_t charcode) const335 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
336 if (m_Coding == CIDCoding::kCID)
337 return static_cast<uint16_t>(charcode);
338
339 if (m_pEmbedMap)
340 return fxcmap::CIDFromCharCode(m_pEmbedMap, charcode);
341
342 if (m_DirectCharcodeToCIDTable.empty())
343 return static_cast<uint16_t>(charcode);
344
345 auto table_span = m_DirectCharcodeToCIDTable.span();
346 if (charcode < table_span.size())
347 return table_span[charcode];
348
349 auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
350 m_AdditionalCharcodeToCIDMappings.end(), charcode,
351 [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
352 return arg.m_EndCode < val;
353 });
354 if (it == m_AdditionalCharcodeToCIDMappings.end() ||
355 it->m_StartCode > charcode) {
356 return 0;
357 }
358 return it->m_StartCID + charcode - it->m_StartCode;
359 }
360
GetNextChar(ByteStringView pString,size_t * pOffset) const361 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
362 size_t& offset = *pOffset;
363 auto pBytes = pString.raw_span();
364 switch (m_CodingScheme) {
365 case OneByte: {
366 return offset < pBytes.size() ? pBytes[offset++] : 0;
367 }
368 case TwoBytes: {
369 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
370 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
371 return 256 * byte1 + byte2;
372 }
373 case MixedTwoBytes: {
374 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
375 if (!m_MixedTwoByteLeadingBytes[byte1])
376 return byte1;
377 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
378 return 256 * byte1 + byte2;
379 }
380 case MixedFourBytes: {
381 uint8_t codes[4];
382 int char_size = 1;
383 codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
384 while (true) {
385 int ret = CheckFourByteCodeRange(codes, char_size,
386 m_MixedFourByteLeadingRanges);
387 if (ret == 0)
388 return 0;
389 if (ret == 2) {
390 uint32_t charcode = 0;
391 for (int i = 0; i < char_size; i++)
392 charcode = (charcode << 8) + codes[i];
393 return charcode;
394 }
395 if (char_size == 4 || offset == pBytes.size())
396 return 0;
397 codes[char_size++] = pBytes[offset++];
398 }
399 }
400 }
401 return 0;
402 }
403
GetCharSize(uint32_t charcode) const404 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
405 switch (m_CodingScheme) {
406 case OneByte:
407 return 1;
408 case TwoBytes:
409 return 2;
410 case MixedTwoBytes:
411 if (charcode < 0x100)
412 return 1;
413 return 2;
414 case MixedFourBytes:
415 if (charcode < 0x100)
416 return 1;
417 if (charcode < 0x10000)
418 return 2;
419 if (charcode < 0x1000000)
420 return 3;
421 return 4;
422 }
423 return 1;
424 }
425
CountChar(ByteStringView pString) const426 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
427 switch (m_CodingScheme) {
428 case OneByte:
429 return pString.GetLength();
430 case TwoBytes:
431 return (pString.GetLength() + 1) / 2;
432 case MixedTwoBytes: {
433 size_t count = 0;
434 for (size_t i = 0; i < pString.GetLength(); i++) {
435 count++;
436 if (m_MixedTwoByteLeadingBytes[pString[i]])
437 i++;
438 }
439 return count;
440 }
441 case MixedFourBytes: {
442 size_t count = 0;
443 size_t offset = 0;
444 while (offset < pString.GetLength()) {
445 GetNextChar(pString, &offset);
446 count++;
447 }
448 return count;
449 }
450 }
451 return pString.GetLength();
452 }
453
AppendChar(char * str,uint32_t charcode) const454 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
455 switch (m_CodingScheme) {
456 case OneByte:
457 str[0] = static_cast<char>(charcode);
458 return 1;
459 case TwoBytes:
460 str[0] = static_cast<char>(charcode / 256);
461 str[1] = static_cast<char>(charcode % 256);
462 return 2;
463 case MixedTwoBytes:
464 if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
465 str[0] = static_cast<char>(charcode);
466 return 1;
467 }
468 str[0] = static_cast<char>(charcode >> 8);
469 str[1] = static_cast<char>(charcode);
470 return 2;
471 case MixedFourBytes:
472 if (charcode < 0x100) {
473 int iSize = static_cast<int>(
474 GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
475 if (iSize == 0)
476 iSize = 1;
477 str[iSize - 1] = static_cast<char>(charcode);
478 if (iSize > 1)
479 memset(str, 0, iSize - 1);
480 return iSize;
481 }
482 if (charcode < 0x10000) {
483 str[0] = static_cast<char>(charcode >> 8);
484 str[1] = static_cast<char>(charcode);
485 return 2;
486 }
487 if (charcode < 0x1000000) {
488 str[0] = static_cast<char>(charcode >> 16);
489 str[1] = static_cast<char>(charcode >> 8);
490 str[2] = static_cast<char>(charcode);
491 return 3;
492 }
493 str[0] = static_cast<char>(charcode >> 24);
494 str[1] = static_cast<char>(charcode >> 16);
495 str[2] = static_cast<char>(charcode >> 8);
496 str[3] = static_cast<char>(charcode);
497 return 4;
498 }
499 return 0;
500 }
501
SetAdditionalMappings(std::vector<CIDRange> mappings)502 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
503 DCHECK(m_AdditionalCharcodeToCIDMappings.empty());
504 if (m_CodingScheme != MixedFourBytes || mappings.empty())
505 return;
506
507 std::sort(
508 mappings.begin(), mappings.end(),
509 [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
510 return arg1.m_EndCode < arg2.m_EndCode;
511 });
512 m_AdditionalCharcodeToCIDMappings = std::move(mappings);
513 }
514
SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges)515 void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
516 m_MixedFourByteLeadingRanges = std::move(ranges);
517 }
518