1 // Copyright 2017 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8
9 #include <memory>
10 #include <utility>
11 #include <vector>
12
13 #include "core/fpdfapi/cmaps/cmap_int.h"
14 #include "core/fpdfapi/font/cpdf_cmapmanager.h"
15 #include "core/fpdfapi/font/cpdf_cmapparser.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17
18 namespace {
19
// A closed interval of byte values. The predefined CMap table uses it to
// describe lead-byte segments.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;   // Highest byte value in the range (inclusive).
};
24
25 struct PredefinedCMap {
26 const char* m_pName;
27 CIDSet m_Charset;
28 CIDCoding m_Coding;
29 CPDF_CMap::CodingScheme m_CodingScheme;
30 uint8_t m_LeadingSegCount;
31 ByteRange m_LeadingSegs[2];
32 };
33
34 const PredefinedCMap g_PredefinedCMaps[] = {
35 {"GB-EUC",
36 CIDSET_GB1,
37 CIDCODING_GB,
38 CPDF_CMap::MixedTwoBytes,
39 1,
40 {{0xa1, 0xfe}}},
41 {"GBpc-EUC",
42 CIDSET_GB1,
43 CIDCODING_GB,
44 CPDF_CMap::MixedTwoBytes,
45 1,
46 {{0xa1, 0xfc}}},
47 {"GBK-EUC",
48 CIDSET_GB1,
49 CIDCODING_GB,
50 CPDF_CMap::MixedTwoBytes,
51 1,
52 {{0x81, 0xfe}}},
53 {"GBKp-EUC",
54 CIDSET_GB1,
55 CIDCODING_GB,
56 CPDF_CMap::MixedTwoBytes,
57 1,
58 {{0x81, 0xfe}}},
59 {"GBK2K-EUC",
60 CIDSET_GB1,
61 CIDCODING_GB,
62 CPDF_CMap::MixedTwoBytes,
63 1,
64 {{0x81, 0xfe}}},
65 {"GBK2K",
66 CIDSET_GB1,
67 CIDCODING_GB,
68 CPDF_CMap::MixedTwoBytes,
69 1,
70 {{0x81, 0xfe}}},
71 {"UniGB-UCS2", CIDSET_GB1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
72 {"UniGB-UTF16", CIDSET_GB1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
73 {"B5pc",
74 CIDSET_CNS1,
75 CIDCODING_BIG5,
76 CPDF_CMap::MixedTwoBytes,
77 1,
78 {{0xa1, 0xfc}}},
79 {"HKscs-B5",
80 CIDSET_CNS1,
81 CIDCODING_BIG5,
82 CPDF_CMap::MixedTwoBytes,
83 1,
84 {{0x88, 0xfe}}},
85 {"ETen-B5",
86 CIDSET_CNS1,
87 CIDCODING_BIG5,
88 CPDF_CMap::MixedTwoBytes,
89 1,
90 {{0xa1, 0xfe}}},
91 {"ETenms-B5",
92 CIDSET_CNS1,
93 CIDCODING_BIG5,
94 CPDF_CMap::MixedTwoBytes,
95 1,
96 {{0xa1, 0xfe}}},
97 {"UniCNS-UCS2", CIDSET_CNS1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
98 {"UniCNS-UTF16", CIDSET_CNS1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
99 {"83pv-RKSJ",
100 CIDSET_JAPAN1,
101 CIDCODING_JIS,
102 CPDF_CMap::MixedTwoBytes,
103 2,
104 {{0x81, 0x9f}, {0xe0, 0xfc}}},
105 {"90ms-RKSJ",
106 CIDSET_JAPAN1,
107 CIDCODING_JIS,
108 CPDF_CMap::MixedTwoBytes,
109 2,
110 {{0x81, 0x9f}, {0xe0, 0xfc}}},
111 {"90msp-RKSJ",
112 CIDSET_JAPAN1,
113 CIDCODING_JIS,
114 CPDF_CMap::MixedTwoBytes,
115 2,
116 {{0x81, 0x9f}, {0xe0, 0xfc}}},
117 {"90pv-RKSJ",
118 CIDSET_JAPAN1,
119 CIDCODING_JIS,
120 CPDF_CMap::MixedTwoBytes,
121 2,
122 {{0x81, 0x9f}, {0xe0, 0xfc}}},
123 {"Add-RKSJ",
124 CIDSET_JAPAN1,
125 CIDCODING_JIS,
126 CPDF_CMap::MixedTwoBytes,
127 2,
128 {{0x81, 0x9f}, {0xe0, 0xfc}}},
129 {"EUC",
130 CIDSET_JAPAN1,
131 CIDCODING_JIS,
132 CPDF_CMap::MixedTwoBytes,
133 2,
134 {{0x8e, 0x8e}, {0xa1, 0xfe}}},
135 {"H", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
136 {"V", CIDSET_JAPAN1, CIDCODING_JIS, CPDF_CMap::TwoBytes, 1, {{0x21, 0x7e}}},
137 {"Ext-RKSJ",
138 CIDSET_JAPAN1,
139 CIDCODING_JIS,
140 CPDF_CMap::MixedTwoBytes,
141 2,
142 {{0x81, 0x9f}, {0xe0, 0xfc}}},
143 {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
144 {"UniJIS-UCS2-HW",
145 CIDSET_JAPAN1,
146 CIDCODING_UCS2,
147 CPDF_CMap::TwoBytes,
148 0,
149 {}},
150 {"UniJIS-UTF16",
151 CIDSET_JAPAN1,
152 CIDCODING_UTF16,
153 CPDF_CMap::TwoBytes,
154 0,
155 {}},
156 {"KSC-EUC",
157 CIDSET_KOREA1,
158 CIDCODING_KOREA,
159 CPDF_CMap::MixedTwoBytes,
160 1,
161 {{0xa1, 0xfe}}},
162 {"KSCms-UHC",
163 CIDSET_KOREA1,
164 CIDCODING_KOREA,
165 CPDF_CMap::MixedTwoBytes,
166 1,
167 {{0x81, 0xfe}}},
168 {"KSCms-UHC-HW",
169 CIDSET_KOREA1,
170 CIDCODING_KOREA,
171 CPDF_CMap::MixedTwoBytes,
172 1,
173 {{0x81, 0xfe}}},
174 {"KSCpc-EUC",
175 CIDSET_KOREA1,
176 CIDCODING_KOREA,
177 CPDF_CMap::MixedTwoBytes,
178 1,
179 {{0xa1, 0xfd}}},
180 {"UniKS-UCS2", CIDSET_KOREA1, CIDCODING_UCS2, CPDF_CMap::TwoBytes, 0, {}},
181 {"UniKS-UTF16", CIDSET_KOREA1, CIDCODING_UTF16, CPDF_CMap::TwoBytes, 0, {}},
182 };
183
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)184 int CheckFourByteCodeRange(uint8_t* codes,
185 size_t size,
186 const std::vector<CPDF_CMap::CodeRange>& ranges) {
187 for (size_t i = ranges.size(); i > 0; i--) {
188 size_t seg = i - 1;
189 if (ranges[seg].m_CharSize < size)
190 continue;
191 size_t iChar = 0;
192 while (iChar < size) {
193 if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
194 codes[iChar] > ranges[seg].m_Upper[iChar]) {
195 break;
196 }
197 ++iChar;
198 }
199 if (iChar == ranges[seg].m_CharSize)
200 return 2;
201 if (iChar)
202 return (size == ranges[seg].m_CharSize) ? 2 : 1;
203 }
204 return 0;
205 }
206
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)207 size_t GetFourByteCharSizeImpl(
208 uint32_t charcode,
209 const std::vector<CPDF_CMap::CodeRange>& ranges) {
210 if (ranges.empty())
211 return 1;
212
213 uint8_t codes[4];
214 codes[0] = codes[1] = 0x00;
215 codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
216 codes[3] = static_cast<uint8_t>(charcode);
217 for (size_t offset = 0; offset < 4; offset++) {
218 size_t size = 4 - offset;
219 for (size_t j = 0; j < ranges.size(); j++) {
220 size_t iSeg = (ranges.size() - 1) - j;
221 if (ranges[iSeg].m_CharSize < size)
222 continue;
223 size_t iChar = 0;
224 while (iChar < size) {
225 if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
226 codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
227 break;
228 }
229 ++iChar;
230 }
231 if (iChar == ranges[iSeg].m_CharSize)
232 return size;
233 }
234 }
235 return 1;
236 }
237
238 } // namespace
239
CPDF_CMap()240 CPDF_CMap::CPDF_CMap()
241 : m_bLoaded(false),
242 m_bVertical(false),
243 m_Charset(CIDSET_UNKNOWN),
244 m_CodingScheme(TwoBytes),
245 m_Coding(CIDCODING_UNKNOWN),
246 m_pEmbedMap(nullptr) {}
247
~CPDF_CMap()248 CPDF_CMap::~CPDF_CMap() {}
249
LoadPredefined(CPDF_CMapManager * pMgr,const ByteString & bsName,bool bPromptCJK)250 void CPDF_CMap::LoadPredefined(CPDF_CMapManager* pMgr,
251 const ByteString& bsName,
252 bool bPromptCJK) {
253 m_PredefinedCMap = bsName;
254 if (m_PredefinedCMap == "Identity-H" || m_PredefinedCMap == "Identity-V") {
255 m_Coding = CIDCODING_CID;
256 m_bVertical = bsName.Last() == 'V';
257 m_bLoaded = true;
258 return;
259 }
260 ByteString cmapid = m_PredefinedCMap;
261 m_bVertical = cmapid.Last() == 'V';
262 if (cmapid.GetLength() > 2) {
263 cmapid = cmapid.Left(cmapid.GetLength() - 2);
264 }
265 const PredefinedCMap* map = nullptr;
266 for (size_t i = 0; i < FX_ArraySize(g_PredefinedCMaps); ++i) {
267 if (cmapid == ByteStringView(g_PredefinedCMaps[i].m_pName)) {
268 map = &g_PredefinedCMaps[i];
269 break;
270 }
271 }
272 if (!map)
273 return;
274
275 m_Charset = map->m_Charset;
276 m_Coding = map->m_Coding;
277 m_CodingScheme = map->m_CodingScheme;
278 if (m_CodingScheme == MixedTwoBytes) {
279 m_MixedTwoByteLeadingBytes = std::vector<bool>(256);
280 for (uint32_t i = 0; i < map->m_LeadingSegCount; ++i) {
281 const ByteRange& seg = map->m_LeadingSegs[i];
282 for (int b = seg.m_First; b <= seg.m_Last; ++b)
283 m_MixedTwoByteLeadingBytes[b] = true;
284 }
285 }
286 m_pEmbedMap = FPDFAPI_FindEmbeddedCMap(bsName, m_Charset, m_Coding);
287 if (!m_pEmbedMap)
288 return;
289
290 m_bLoaded = true;
291 }
292
LoadEmbedded(const uint8_t * pData,uint32_t size)293 void CPDF_CMap::LoadEmbedded(const uint8_t* pData, uint32_t size) {
294 m_DirectCharcodeToCIDTable = std::vector<uint16_t>(65536);
295 CPDF_CMapParser parser(this);
296 CPDF_SimpleParser syntax(pData, size);
297 while (1) {
298 ByteStringView word = syntax.GetWord();
299 if (word.IsEmpty()) {
300 break;
301 }
302 parser.ParseWord(word);
303 }
304 if (m_CodingScheme == MixedFourBytes && parser.HasAdditionalMappings()) {
305 m_AdditionalCharcodeToCIDMappings = parser.TakeAdditionalMappings();
306 std::sort(
307 m_AdditionalCharcodeToCIDMappings.begin(),
308 m_AdditionalCharcodeToCIDMappings.end(),
309 [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
310 return arg1.m_EndCode < arg2.m_EndCode;
311 });
312 }
313 }
314
CIDFromCharCode(uint32_t charcode) const315 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
316 if (m_Coding == CIDCODING_CID)
317 return static_cast<uint16_t>(charcode);
318
319 if (m_pEmbedMap)
320 return FPDFAPI_CIDFromCharCode(m_pEmbedMap, charcode);
321
322 if (m_DirectCharcodeToCIDTable.empty())
323 return static_cast<uint16_t>(charcode);
324
325 if (charcode < 0x10000)
326 return m_DirectCharcodeToCIDTable[charcode];
327
328 auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
329 m_AdditionalCharcodeToCIDMappings.end(), charcode,
330 [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
331 return arg.m_EndCode < val;
332 });
333 if (it == m_AdditionalCharcodeToCIDMappings.end() ||
334 it->m_StartCode > charcode) {
335 return 0;
336 }
337 return it->m_StartCID + charcode - it->m_StartCode;
338 }
339
GetNextChar(const char * pString,int nStrLen,int & offset) const340 uint32_t CPDF_CMap::GetNextChar(const char* pString,
341 int nStrLen,
342 int& offset) const {
343 auto* pBytes = reinterpret_cast<const uint8_t*>(pString);
344 switch (m_CodingScheme) {
345 case OneByte: {
346 return pBytes[offset++];
347 }
348 case TwoBytes: {
349 uint8_t byte1 = pBytes[offset++];
350 return 256 * byte1 + pBytes[offset++];
351 }
352 case MixedTwoBytes: {
353 uint8_t byte1 = pBytes[offset++];
354 if (!m_MixedTwoByteLeadingBytes[byte1])
355 return byte1;
356 return 256 * byte1 + pBytes[offset++];
357 }
358 case MixedFourBytes: {
359 uint8_t codes[4];
360 int char_size = 1;
361 codes[0] = pBytes[offset++];
362 while (1) {
363 int ret = CheckFourByteCodeRange(codes, char_size,
364 m_MixedFourByteLeadingRanges);
365 if (ret == 0)
366 return 0;
367 if (ret == 2) {
368 uint32_t charcode = 0;
369 for (int i = 0; i < char_size; i++)
370 charcode = (charcode << 8) + codes[i];
371 return charcode;
372 }
373 if (char_size == 4 || offset == nStrLen)
374 return 0;
375 codes[char_size++] = pBytes[offset++];
376 }
377 break;
378 }
379 }
380 return 0;
381 }
382
GetCharSize(uint32_t charcode) const383 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
384 switch (m_CodingScheme) {
385 case OneByte:
386 return 1;
387 case TwoBytes:
388 return 2;
389 case MixedTwoBytes:
390 if (charcode < 0x100)
391 return 1;
392 return 2;
393 case MixedFourBytes:
394 if (charcode < 0x100)
395 return 1;
396 if (charcode < 0x10000)
397 return 2;
398 if (charcode < 0x1000000)
399 return 3;
400 return 4;
401 }
402 return 1;
403 }
404
CountChar(const char * pString,int size) const405 int CPDF_CMap::CountChar(const char* pString, int size) const {
406 switch (m_CodingScheme) {
407 case OneByte:
408 return size;
409 case TwoBytes:
410 return (size + 1) / 2;
411 case MixedTwoBytes: {
412 int count = 0;
413 for (int i = 0; i < size; i++) {
414 count++;
415 if (m_MixedTwoByteLeadingBytes[reinterpret_cast<const uint8_t*>(
416 pString)[i]]) {
417 i++;
418 }
419 }
420 return count;
421 }
422 case MixedFourBytes: {
423 int count = 0, offset = 0;
424 while (offset < size) {
425 GetNextChar(pString, size, offset);
426 count++;
427 }
428 return count;
429 }
430 }
431 return size;
432 }
433
AppendChar(char * str,uint32_t charcode) const434 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
435 switch (m_CodingScheme) {
436 case OneByte:
437 str[0] = static_cast<char>(charcode);
438 return 1;
439 case TwoBytes:
440 str[0] = static_cast<char>(charcode / 256);
441 str[1] = static_cast<char>(charcode % 256);
442 return 2;
443 case MixedTwoBytes:
444 if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
445 str[0] = static_cast<char>(charcode);
446 return 1;
447 }
448 str[0] = static_cast<char>(charcode >> 8);
449 str[1] = static_cast<char>(charcode);
450 return 2;
451 case MixedFourBytes:
452 if (charcode < 0x100) {
453 int iSize = static_cast<int>(
454 GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
455 if (iSize == 0)
456 iSize = 1;
457 str[iSize - 1] = static_cast<char>(charcode);
458 if (iSize > 1)
459 memset(str, 0, iSize - 1);
460 return iSize;
461 }
462 if (charcode < 0x10000) {
463 str[0] = static_cast<char>(charcode >> 8);
464 str[1] = static_cast<char>(charcode);
465 return 2;
466 }
467 if (charcode < 0x1000000) {
468 str[0] = static_cast<char>(charcode >> 16);
469 str[1] = static_cast<char>(charcode >> 8);
470 str[2] = static_cast<char>(charcode);
471 return 3;
472 }
473 str[0] = static_cast<char>(charcode >> 24);
474 str[1] = static_cast<char>(charcode >> 16);
475 str[2] = static_cast<char>(charcode >> 8);
476 str[3] = static_cast<char>(charcode);
477 return 4;
478 }
479 return 0;
480 }
481