1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8
9 #include <memory>
10 #include <utility>
11 #include <vector>
12
13 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
14 #include "core/fpdfapi/font/cpdf_cmapparser.h"
15 #include "core/fpdfapi/font/cpdf_fontglobals.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17
18 namespace {
19
// Closed interval of byte values. Used by PredefinedCMap to list the bytes
// that act as the first byte of a two-byte character code.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the interval.
  uint8_t m_Last;   // Highest byte value in the interval (inclusive).
};
24
25 struct PredefinedCMap {
26 const char* m_pName; // Raw, POD struct.
27 CIDSet m_Charset;
28 CIDCoding m_Coding;
29 CPDF_CMap::CodingScheme m_CodingScheme;
30 uint8_t m_LeadingSegCount;
31 ByteRange m_LeadingSegs[2];
32 };
33
34 constexpr PredefinedCMap kPredefinedCMaps[] = {
35 {"GB-EUC",
36 CIDSET_GB1,
37 CIDCODING_GB,
38 CPDF_CMap::MixedTwoBytes,
39 1,
40 {{0xa1, 0xfe}}},
41 {"GBpc-EUC",
42 CIDSET_GB1,
43 CIDCODING_GB,
44 CPDF_CMap::MixedTwoBytes,
45 1,
46 {{0xa1, 0xfc}}},
47 {"GBK-EUC",
48 CIDSET_GB1,
49 CIDCODING_GB,
50 CPDF_CMap::MixedTwoBytes,
51 1,
52 {{0x81, 0xfe}}},
53 {"GBKp-EUC",
54 CIDSET_GB1,
55 CIDCODING_GB,
56 CPDF_CMap::MixedTwoBytes,
57 1,
58 {{0x81, 0xfe}}},
59 {"GBK2K-EUC",
60 CIDSET_GB1,
61 CIDCODING_GB,
62 CPDF_CMap::MixedTwoBytes,
63 1,
64 {{0x81, 0xfe}}},
65 {"GBK2K",
66 CIDSET_GB1,
67 CIDCODING_GB,
68 CPDF_CMap::MixedTwoBytes,
69 1,
70 {{0x81, 0xfe}}},
71 {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
72 {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
73 {"B5pc",
74 CIDSET_CNS1,
75 CIDCODING_BIG5,
76 CPDF_CMap::MixedTwoBytes,
77 1,
78 {{0xa1, 0xfc}}},
79 {"HKscs-B5",
80 CIDSET_CNS1,
81 CIDCODING_BIG5,
82 CPDF_CMap::MixedTwoBytes,
83 1,
84 {{0x88, 0xfe}}},
85 {"ETen-B5",
86 CIDSET_CNS1,
87 CIDCODING_BIG5,
88 CPDF_CMap::MixedTwoBytes,
89 1,
90 {{0xa1, 0xfe}}},
91 {"ETenms-B5",
92 CIDSET_CNS1,
93 CIDCODING_BIG5,
94 CPDF_CMap::MixedTwoBytes,
95 1,
96 {{0xa1, 0xfe}}},
97 {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
98 {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
99 {"83pv-RKSJ",
100 CIDSET_JAPAN1,
101 CIDCODING_JIS,
102 CPDF_CMap::MixedTwoBytes,
103 2,
104 {{0x81, 0x9f}, {0xe0, 0xfc}}},
105 {"90ms-RKSJ",
106 CIDSET_JAPAN1,
107 CIDCODING_JIS,
108 CPDF_CMap::MixedTwoBytes,
109 2,
110 {{0x81, 0x9f}, {0xe0, 0xfc}}},
111 {"90msp-RKSJ",
112 CIDSET_JAPAN1,
113 CIDCODING_JIS,
114 CPDF_CMap::MixedTwoBytes,
115 2,
116 {{0x81, 0x9f}, {0xe0, 0xfc}}},
117 {"90pv-RKSJ",
118 CIDSET_JAPAN1,
119 CIDCODING_JIS,
120 CPDF_CMap::MixedTwoBytes,
121 2,
122 {{0x81, 0x9f}, {0xe0, 0xfc}}},
123 {"Add-RKSJ",
124 CIDSET_JAPAN1,
125 CIDCODING_JIS,
126 CPDF_CMap::MixedTwoBytes,
127 2,
128 {{0x81, 0x9f}, {0xe0, 0xfc}}},
129 {"EUC",
130 CIDSET_JAPAN1,
131 CIDCODING_JIS,
132 CPDF_CMap::MixedTwoBytes,
133 2,
134 {{0x8e, 0x8e}, {0xa1, 0xfe}}},
135 {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
136 {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
137 {"Ext-RKSJ",
138 CIDSET_JAPAN1,
139 CIDCODING_JIS,
140 CPDF_CMap::MixedTwoBytes,
141 2,
142 {{0x81, 0x9f}, {0xe0, 0xfc}}},
143 {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
144 {"UniJIS-UCS2-HW",
145 CIDSET_JAPAN1,
146 CIDCODING_UCS2,
147 CPDF_CMap::TwoBytes,
148 0,
149 {}},
150 {"UniJIS-UTF16",
151 CIDSET_JAPAN1,
152 CIDCODING_UTF16,
153 CPDF_CMap::TwoBytes,
154 0,
155 {}},
156 {"KSC-EUC",
157 CIDSET_KOREA1,
158 CIDCODING_KOREA,
159 CPDF_CMap::MixedTwoBytes,
160 1,
161 {{0xa1, 0xfe}}},
162 {"KSCms-UHC",
163 CIDSET_KOREA1,
164 CIDCODING_KOREA,
165 CPDF_CMap::MixedTwoBytes,
166 1,
167 {{0x81, 0xfe}}},
168 {"KSCms-UHC-HW",
169 CIDSET_KOREA1,
170 CIDCODING_KOREA,
171 CPDF_CMap::MixedTwoBytes,
172 1,
173 {{0x81, 0xfe}}},
174 {"KSCpc-EUC",
175 CIDSET_KOREA1,
176 CIDCODING_KOREA,
177 CPDF_CMap::MixedTwoBytes,
178 1,
179 {{0xa1, 0xfd}}},
180 {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
181 {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
182 };
183
GetPredefinedCMap(ByteStringView cmapid)184 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
185 if (cmapid.GetLength() > 2)
186 cmapid = cmapid.First(cmapid.GetLength() - 2);
187 for (const auto& map : kPredefinedCMaps) {
188 if (cmapid == map.m_pName)
189 return ↦
190 }
191 return nullptr;
192 }
193
LoadLeadingSegments(const PredefinedCMap & map)194 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
195 std::vector<bool> segments(256);
196 for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
197 const ByteRange& seg = map.m_LeadingSegs[i];
198 for (int b = seg.m_First; b <= seg.m_Last; ++b)
199 segments[b] = true;
200 }
201 return segments;
202 }
203
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)204 int CheckFourByteCodeRange(uint8_t* codes,
205 size_t size,
206 const std::vector<CPDF_CMap::CodeRange>& ranges) {
207 for (size_t i = ranges.size(); i > 0; i--) {
208 size_t seg = i - 1;
209 if (ranges[seg].m_CharSize < size)
210 continue;
211 size_t iChar = 0;
212 while (iChar < size) {
213 if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
214 codes[iChar] > ranges[seg].m_Upper[iChar]) {
215 break;
216 }
217 ++iChar;
218 }
219 if (iChar == ranges[seg].m_CharSize)
220 return 2;
221 if (iChar)
222 return (size == ranges[seg].m_CharSize) ? 2 : 1;
223 }
224 return 0;
225 }
226
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)227 size_t GetFourByteCharSizeImpl(
228 uint32_t charcode,
229 const std::vector<CPDF_CMap::CodeRange>& ranges) {
230 if (ranges.empty())
231 return 1;
232
233 uint8_t codes[4];
234 codes[0] = codes[1] = 0x00;
235 codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
236 codes[3] = static_cast<uint8_t>(charcode);
237 for (size_t offset = 0; offset < 4; offset++) {
238 size_t size = 4 - offset;
239 for (size_t j = 0; j < ranges.size(); j++) {
240 size_t iSeg = (ranges.size() - 1) - j;
241 if (ranges[iSeg].m_CharSize < size)
242 continue;
243 size_t iChar = 0;
244 while (iChar < size) {
245 if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
246 codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
247 break;
248 }
249 ++iChar;
250 }
251 if (iChar == ranges[iSeg].m_CharSize)
252 return size;
253 }
254 }
255 return 1;
256 }
257
258 } // namespace
259
CPDF_CMap(ByteStringView bsPredefinedName)260 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
261 : m_bVertical(bsPredefinedName.Back() == 'V') {
262 if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
263 m_Coding = CIDCODING_CID;
264 m_bLoaded = true;
265 return;
266 }
267
268 const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
269 if (!map)
270 return;
271
272 m_Charset = map->m_Charset;
273 m_Coding = map->m_Coding;
274 m_CodingScheme = map->m_CodingScheme;
275 if (m_CodingScheme == MixedTwoBytes)
276 m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
277 m_pEmbedMap = FindEmbeddedCMap(
278 CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
279 bsPredefinedName);
280 if (!m_pEmbedMap)
281 return;
282
283 m_bLoaded = true;
284 }
285
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)286 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
287 : m_DirectCharcodeToCIDTable(65536) {
288 CPDF_CMapParser parser(this);
289 CPDF_SimpleParser syntax(spEmbeddedData);
290 while (1) {
291 ByteStringView word = syntax.GetWord();
292 if (word.IsEmpty())
293 break;
294
295 parser.ParseWord(word);
296 }
297 }
298
299 CPDF_CMap::~CPDF_CMap() = default;
300
CIDFromCharCode(uint32_t charcode) const301 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
302 if (m_Coding == CIDCODING_CID)
303 return static_cast<uint16_t>(charcode);
304
305 if (m_pEmbedMap)
306 return ::CIDFromCharCode(m_pEmbedMap.Get(), charcode);
307
308 if (m_DirectCharcodeToCIDTable.empty())
309 return static_cast<uint16_t>(charcode);
310
311 if (charcode < 0x10000)
312 return m_DirectCharcodeToCIDTable[charcode];
313
314 auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
315 m_AdditionalCharcodeToCIDMappings.end(), charcode,
316 [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
317 return arg.m_EndCode < val;
318 });
319 if (it == m_AdditionalCharcodeToCIDMappings.end() ||
320 it->m_StartCode > charcode) {
321 return 0;
322 }
323 return it->m_StartCID + charcode - it->m_StartCode;
324 }
325
GetNextChar(ByteStringView pString,size_t * pOffset) const326 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
327 size_t& offset = *pOffset;
328 auto pBytes = pString.raw_span();
329 switch (m_CodingScheme) {
330 case OneByte: {
331 return offset < pBytes.size() ? pBytes[offset++] : 0;
332 }
333 case TwoBytes: {
334 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
335 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
336 return 256 * byte1 + byte2;
337 }
338 case MixedTwoBytes: {
339 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
340 if (!m_MixedTwoByteLeadingBytes[byte1])
341 return byte1;
342 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
343 return 256 * byte1 + byte2;
344 }
345 case MixedFourBytes: {
346 uint8_t codes[4];
347 int char_size = 1;
348 codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
349 while (1) {
350 int ret = CheckFourByteCodeRange(codes, char_size,
351 m_MixedFourByteLeadingRanges);
352 if (ret == 0)
353 return 0;
354 if (ret == 2) {
355 uint32_t charcode = 0;
356 for (int i = 0; i < char_size; i++)
357 charcode = (charcode << 8) + codes[i];
358 return charcode;
359 }
360 if (char_size == 4 || offset == pBytes.size())
361 return 0;
362 codes[char_size++] = pBytes[offset++];
363 }
364 break;
365 }
366 }
367 return 0;
368 }
369
GetCharSize(uint32_t charcode) const370 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
371 switch (m_CodingScheme) {
372 case OneByte:
373 return 1;
374 case TwoBytes:
375 return 2;
376 case MixedTwoBytes:
377 if (charcode < 0x100)
378 return 1;
379 return 2;
380 case MixedFourBytes:
381 if (charcode < 0x100)
382 return 1;
383 if (charcode < 0x10000)
384 return 2;
385 if (charcode < 0x1000000)
386 return 3;
387 return 4;
388 }
389 return 1;
390 }
391
CountChar(ByteStringView pString) const392 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
393 switch (m_CodingScheme) {
394 case OneByte:
395 return pString.GetLength();
396 case TwoBytes:
397 return (pString.GetLength() + 1) / 2;
398 case MixedTwoBytes: {
399 size_t count = 0;
400 for (size_t i = 0; i < pString.GetLength(); i++) {
401 count++;
402 if (m_MixedTwoByteLeadingBytes[pString[i]])
403 i++;
404 }
405 return count;
406 }
407 case MixedFourBytes: {
408 size_t count = 0;
409 size_t offset = 0;
410 while (offset < pString.GetLength()) {
411 GetNextChar(pString, &offset);
412 count++;
413 }
414 return count;
415 }
416 }
417 return pString.GetLength();
418 }
419
AppendChar(char * str,uint32_t charcode) const420 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
421 switch (m_CodingScheme) {
422 case OneByte:
423 str[0] = static_cast<char>(charcode);
424 return 1;
425 case TwoBytes:
426 str[0] = static_cast<char>(charcode / 256);
427 str[1] = static_cast<char>(charcode % 256);
428 return 2;
429 case MixedTwoBytes:
430 if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
431 str[0] = static_cast<char>(charcode);
432 return 1;
433 }
434 str[0] = static_cast<char>(charcode >> 8);
435 str[1] = static_cast<char>(charcode);
436 return 2;
437 case MixedFourBytes:
438 if (charcode < 0x100) {
439 int iSize = static_cast<int>(
440 GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
441 if (iSize == 0)
442 iSize = 1;
443 str[iSize - 1] = static_cast<char>(charcode);
444 if (iSize > 1)
445 memset(str, 0, iSize - 1);
446 return iSize;
447 }
448 if (charcode < 0x10000) {
449 str[0] = static_cast<char>(charcode >> 8);
450 str[1] = static_cast<char>(charcode);
451 return 2;
452 }
453 if (charcode < 0x1000000) {
454 str[0] = static_cast<char>(charcode >> 16);
455 str[1] = static_cast<char>(charcode >> 8);
456 str[2] = static_cast<char>(charcode);
457 return 3;
458 }
459 str[0] = static_cast<char>(charcode >> 24);
460 str[1] = static_cast<char>(charcode >> 16);
461 str[2] = static_cast<char>(charcode >> 8);
462 str[3] = static_cast<char>(charcode);
463 return 4;
464 }
465 return 0;
466 }
467
SetAdditionalMappings(std::vector<CIDRange> mappings)468 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
469 ASSERT(m_AdditionalCharcodeToCIDMappings.empty());
470 if (m_CodingScheme != MixedFourBytes || mappings.empty())
471 return;
472
473 std::sort(
474 mappings.begin(), mappings.end(),
475 [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
476 return arg1.m_EndCode < arg2.m_EndCode;
477 });
478 m_AdditionalCharcodeToCIDMappings = std::move(mappings);
479 }
480
SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges)481 void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
482 m_MixedFourByteLeadingRanges = std::move(ranges);
483 }
484