• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_cmap.h"
8 
9 #include <utility>
10 #include <vector>
11 
12 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
13 #include "core/fpdfapi/font/cpdf_cmapparser.h"
14 #include "core/fpdfapi/font/cpdf_fontglobals.h"
15 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
16 #include "third_party/base/check.h"
17 
18 namespace {
19 
// A contiguous range of byte values. Both end points belong to the range.
struct ByteRange {
  uint8_t m_First;  // Lowest byte value in the range.
  uint8_t m_Last;  // Inclusive.
};
24 
// One row of the kPredefinedCMaps table below. Kept as a plain aggregate so
// the table can be brace-initialized as a constexpr array.
struct PredefinedCMap {
  // Name that GetPredefinedCMap() compares the requested CMap name against.
  const char* m_pName;  // Raw, POD struct.
  // Copied into the CPDF_CMap members of the same name by the constructor
  // that takes a predefined name.
  CIDSet m_Charset;
  CIDCoding m_Coding;
  CPDF_CMap::CodingScheme m_CodingScheme;
  // Number of entries of `m_LeadingSegs` that are in use (0, 1 or 2 in the
  // table below).
  uint8_t m_LeadingSegCount;
  // Byte ranges that LoadLeadingSegments() marks as leading bytes. Only
  // consulted when `m_CodingScheme` is MixedTwoBytes.
  ByteRange m_LeadingSegs[2];
};
33 
34 constexpr PredefinedCMap kPredefinedCMaps[] = {
35     {"GB-EUC",
36      CIDSET_GB1,
37      CIDCoding::kGB,
38      CPDF_CMap::MixedTwoBytes,
39      1,
40      {{0xa1, 0xfe}}},
41     {"GBpc-EUC",
42      CIDSET_GB1,
43      CIDCoding::kGB,
44      CPDF_CMap::MixedTwoBytes,
45      1,
46      {{0xa1, 0xfc}}},
47     {"GBK-EUC",
48      CIDSET_GB1,
49      CIDCoding::kGB,
50      CPDF_CMap::MixedTwoBytes,
51      1,
52      {{0x81, 0xfe}}},
53     {"GBKp-EUC",
54      CIDSET_GB1,
55      CIDCoding::kGB,
56      CPDF_CMap::MixedTwoBytes,
57      1,
58      {{0x81, 0xfe}}},
59     {"GBK2K-EUC",
60      CIDSET_GB1,
61      CIDCoding::kGB,
62      CPDF_CMap::MixedTwoBytes,
63      1,
64      {{0x81, 0xfe}}},
65     {"GBK2K",
66      CIDSET_GB1,
67      CIDCoding::kGB,
68      CPDF_CMap::MixedTwoBytes,
69      1,
70      {{0x81, 0xfe}}},
71     {"UniGB-UCS2", CIDSET_GB1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
72     {"UniGB-UTF16", CIDSET_GB1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, 0, {}},
73     {"B5pc",
74      CIDSET_CNS1,
75      CIDCoding::kBIG5,
76      CPDF_CMap::MixedTwoBytes,
77      1,
78      {{0xa1, 0xfc}}},
79     {"HKscs-B5",
80      CIDSET_CNS1,
81      CIDCoding::kBIG5,
82      CPDF_CMap::MixedTwoBytes,
83      1,
84      {{0x88, 0xfe}}},
85     {"ETen-B5",
86      CIDSET_CNS1,
87      CIDCoding::kBIG5,
88      CPDF_CMap::MixedTwoBytes,
89      1,
90      {{0xa1, 0xfe}}},
91     {"ETenms-B5",
92      CIDSET_CNS1,
93      CIDCoding::kBIG5,
94      CPDF_CMap::MixedTwoBytes,
95      1,
96      {{0xa1, 0xfe}}},
97     {"UniCNS-UCS2", CIDSET_CNS1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
98     {"UniCNS-UTF16",
99      CIDSET_CNS1,
100      CIDCoding::kUTF16,
101      CPDF_CMap::TwoBytes,
102      0,
103      {}},
104     {"83pv-RKSJ",
105      CIDSET_JAPAN1,
106      CIDCoding::kJIS,
107      CPDF_CMap::MixedTwoBytes,
108      2,
109      {{0x81, 0x9f}, {0xe0, 0xfc}}},
110     {"90ms-RKSJ",
111      CIDSET_JAPAN1,
112      CIDCoding::kJIS,
113      CPDF_CMap::MixedTwoBytes,
114      2,
115      {{0x81, 0x9f}, {0xe0, 0xfc}}},
116     {"90msp-RKSJ",
117      CIDSET_JAPAN1,
118      CIDCoding::kJIS,
119      CPDF_CMap::MixedTwoBytes,
120      2,
121      {{0x81, 0x9f}, {0xe0, 0xfc}}},
122     {"90pv-RKSJ",
123      CIDSET_JAPAN1,
124      CIDCoding::kJIS,
125      CPDF_CMap::MixedTwoBytes,
126      2,
127      {{0x81, 0x9f}, {0xe0, 0xfc}}},
128     {"Add-RKSJ",
129      CIDSET_JAPAN1,
130      CIDCoding::kJIS,
131      CPDF_CMap::MixedTwoBytes,
132      2,
133      {{0x81, 0x9f}, {0xe0, 0xfc}}},
134     {"EUC",
135      CIDSET_JAPAN1,
136      CIDCoding::kJIS,
137      CPDF_CMap::MixedTwoBytes,
138      2,
139      {{0x8e, 0x8e}, {0xa1, 0xfe}}},
140     {"H",
141      CIDSET_JAPAN1,
142      CIDCoding::kJIS,
143      CPDF_CMap::TwoBytes,
144      1,
145      {{0x21, 0x7e}}},
146     {"V",
147      CIDSET_JAPAN1,
148      CIDCoding::kJIS,
149      CPDF_CMap::TwoBytes,
150      1,
151      {{0x21, 0x7e}}},
152     {"Ext-RKSJ",
153      CIDSET_JAPAN1,
154      CIDCoding::kJIS,
155      CPDF_CMap::MixedTwoBytes,
156      2,
157      {{0x81, 0x9f}, {0xe0, 0xfc}}},
158     {"UniJIS-UCS2",
159      CIDSET_JAPAN1,
160      CIDCoding::kUCS2,
161      CPDF_CMap::TwoBytes,
162      0,
163      {}},
164     {"UniJIS-UCS2-HW",
165      CIDSET_JAPAN1,
166      CIDCoding::kUCS2,
167      CPDF_CMap::TwoBytes,
168      0,
169      {}},
170     {"UniJIS-UTF16",
171      CIDSET_JAPAN1,
172      CIDCoding::kUTF16,
173      CPDF_CMap::TwoBytes,
174      0,
175      {}},
176     {"KSC-EUC",
177      CIDSET_KOREA1,
178      CIDCoding::kKOREA,
179      CPDF_CMap::MixedTwoBytes,
180      1,
181      {{0xa1, 0xfe}}},
182     {"KSCms-UHC",
183      CIDSET_KOREA1,
184      CIDCoding::kKOREA,
185      CPDF_CMap::MixedTwoBytes,
186      1,
187      {{0x81, 0xfe}}},
188     {"KSCms-UHC-HW",
189      CIDSET_KOREA1,
190      CIDCoding::kKOREA,
191      CPDF_CMap::MixedTwoBytes,
192      1,
193      {{0x81, 0xfe}}},
194     {"KSCpc-EUC",
195      CIDSET_KOREA1,
196      CIDCoding::kKOREA,
197      CPDF_CMap::MixedTwoBytes,
198      1,
199      {{0xa1, 0xfd}}},
200     {"UniKS-UCS2", CIDSET_KOREA1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
201     {"UniKS-UTF16",
202      CIDSET_KOREA1,
203      CIDCoding::kUTF16,
204      CPDF_CMap::TwoBytes,
205      0,
206      {}},
207 };
208 
GetPredefinedCMap(ByteStringView cmapid)209 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
210   if (cmapid.GetLength() > 2)
211     cmapid = cmapid.First(cmapid.GetLength() - 2);
212   for (const auto& map : kPredefinedCMaps) {
213     if (cmapid == map.m_pName)
214       return &map;
215   }
216   return nullptr;
217 }
218 
LoadLeadingSegments(const PredefinedCMap & map)219 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
220   std::vector<bool> segments(256);
221   for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
222     const ByteRange& seg = map.m_LeadingSegs[i];
223     for (int b = seg.m_First; b <= seg.m_Last; ++b)
224       segments[b] = true;
225   }
226   return segments;
227 }
228 
CheckFourByteCodeRange(uint8_t * codes,size_t size,const std::vector<CPDF_CMap::CodeRange> & ranges)229 int CheckFourByteCodeRange(uint8_t* codes,
230                            size_t size,
231                            const std::vector<CPDF_CMap::CodeRange>& ranges) {
232   for (size_t i = ranges.size(); i > 0; i--) {
233     size_t seg = i - 1;
234     if (ranges[seg].m_CharSize < size)
235       continue;
236     size_t iChar = 0;
237     while (iChar < size) {
238       if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
239           codes[iChar] > ranges[seg].m_Upper[iChar]) {
240         break;
241       }
242       ++iChar;
243     }
244     if (iChar == ranges[seg].m_CharSize)
245       return 2;
246     if (iChar)
247       return (size == ranges[seg].m_CharSize) ? 2 : 1;
248   }
249   return 0;
250 }
251 
GetFourByteCharSizeImpl(uint32_t charcode,const std::vector<CPDF_CMap::CodeRange> & ranges)252 size_t GetFourByteCharSizeImpl(
253     uint32_t charcode,
254     const std::vector<CPDF_CMap::CodeRange>& ranges) {
255   if (ranges.empty())
256     return 1;
257 
258   uint8_t codes[4];
259   codes[0] = codes[1] = 0x00;
260   codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
261   codes[3] = static_cast<uint8_t>(charcode);
262   for (size_t offset = 0; offset < 4; offset++) {
263     size_t size = 4 - offset;
264     for (size_t j = 0; j < ranges.size(); j++) {
265       size_t iSeg = (ranges.size() - 1) - j;
266       if (ranges[iSeg].m_CharSize < size)
267         continue;
268       size_t iChar = 0;
269       while (iChar < size) {
270         if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
271             codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
272           break;
273         }
274         ++iChar;
275       }
276       if (iChar == ranges[iSeg].m_CharSize)
277         return size;
278     }
279   }
280   return 1;
281 }
282 
FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,ByteStringView bsName)283 const fxcmap::CMap* FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,
284                                      ByteStringView bsName) {
285   for (size_t i = 0; i < pCMaps.size(); i++) {
286     if (bsName == pCMaps[i].m_Name)
287       return &pCMaps[i];
288   }
289   return nullptr;
290 }
291 
292 }  // namespace
293 
CPDF_CMap(ByteStringView bsPredefinedName)294 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
295     : m_bVertical(bsPredefinedName.Back() == 'V') {
296   if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
297     m_Coding = CIDCoding::kCID;
298     m_bLoaded = true;
299     return;
300   }
301 
302   const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
303   if (!map)
304     return;
305 
306   m_Charset = map->m_Charset;
307   m_Coding = map->m_Coding;
308   m_CodingScheme = map->m_CodingScheme;
309   if (m_CodingScheme == MixedTwoBytes)
310     m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
311   m_pEmbedMap = FindEmbeddedCMap(
312       CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
313       bsPredefinedName);
314   if (!m_pEmbedMap)
315     return;
316 
317   m_bLoaded = true;
318 }
319 
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)320 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
321     : m_DirectCharcodeToCIDTable(kDirectMapTableSize) {
322   CPDF_CMapParser parser(this);
323   CPDF_SimpleParser syntax(spEmbeddedData);
324   while (true) {
325     ByteStringView word = syntax.GetWord();
326     if (word.IsEmpty())
327       break;
328 
329     parser.ParseWord(word);
330   }
331 }
332 
// Defaulted destructor, defined out of line in this translation unit.
CPDF_CMap::~CPDF_CMap() = default;
334 
CIDFromCharCode(uint32_t charcode) const335 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
336   if (m_Coding == CIDCoding::kCID)
337     return static_cast<uint16_t>(charcode);
338 
339   if (m_pEmbedMap)
340     return fxcmap::CIDFromCharCode(m_pEmbedMap, charcode);
341 
342   if (m_DirectCharcodeToCIDTable.empty())
343     return static_cast<uint16_t>(charcode);
344 
345   auto table_span = m_DirectCharcodeToCIDTable.span();
346   if (charcode < table_span.size())
347     return table_span[charcode];
348 
349   auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
350                              m_AdditionalCharcodeToCIDMappings.end(), charcode,
351                              [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
352                                return arg.m_EndCode < val;
353                              });
354   if (it == m_AdditionalCharcodeToCIDMappings.end() ||
355       it->m_StartCode > charcode) {
356     return 0;
357   }
358   return it->m_StartCID + charcode - it->m_StartCode;
359 }
360 
GetNextChar(ByteStringView pString,size_t * pOffset) const361 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
362   size_t& offset = *pOffset;
363   auto pBytes = pString.raw_span();
364   switch (m_CodingScheme) {
365     case OneByte: {
366       return offset < pBytes.size() ? pBytes[offset++] : 0;
367     }
368     case TwoBytes: {
369       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
370       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
371       return 256 * byte1 + byte2;
372     }
373     case MixedTwoBytes: {
374       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
375       if (!m_MixedTwoByteLeadingBytes[byte1])
376         return byte1;
377       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
378       return 256 * byte1 + byte2;
379     }
380     case MixedFourBytes: {
381       uint8_t codes[4];
382       int char_size = 1;
383       codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
384       while (true) {
385         int ret = CheckFourByteCodeRange(codes, char_size,
386                                          m_MixedFourByteLeadingRanges);
387         if (ret == 0)
388           return 0;
389         if (ret == 2) {
390           uint32_t charcode = 0;
391           for (int i = 0; i < char_size; i++)
392             charcode = (charcode << 8) + codes[i];
393           return charcode;
394         }
395         if (char_size == 4 || offset == pBytes.size())
396           return 0;
397         codes[char_size++] = pBytes[offset++];
398       }
399     }
400   }
401   return 0;
402 }
403 
GetCharSize(uint32_t charcode) const404 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
405   switch (m_CodingScheme) {
406     case OneByte:
407       return 1;
408     case TwoBytes:
409       return 2;
410     case MixedTwoBytes:
411       if (charcode < 0x100)
412         return 1;
413       return 2;
414     case MixedFourBytes:
415       if (charcode < 0x100)
416         return 1;
417       if (charcode < 0x10000)
418         return 2;
419       if (charcode < 0x1000000)
420         return 3;
421       return 4;
422   }
423   return 1;
424 }
425 
CountChar(ByteStringView pString) const426 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
427   switch (m_CodingScheme) {
428     case OneByte:
429       return pString.GetLength();
430     case TwoBytes:
431       return (pString.GetLength() + 1) / 2;
432     case MixedTwoBytes: {
433       size_t count = 0;
434       for (size_t i = 0; i < pString.GetLength(); i++) {
435         count++;
436         if (m_MixedTwoByteLeadingBytes[pString[i]])
437           i++;
438       }
439       return count;
440     }
441     case MixedFourBytes: {
442       size_t count = 0;
443       size_t offset = 0;
444       while (offset < pString.GetLength()) {
445         GetNextChar(pString, &offset);
446         count++;
447       }
448       return count;
449     }
450   }
451   return pString.GetLength();
452 }
453 
AppendChar(char * str,uint32_t charcode) const454 int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
455   switch (m_CodingScheme) {
456     case OneByte:
457       str[0] = static_cast<char>(charcode);
458       return 1;
459     case TwoBytes:
460       str[0] = static_cast<char>(charcode / 256);
461       str[1] = static_cast<char>(charcode % 256);
462       return 2;
463     case MixedTwoBytes:
464       if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
465         str[0] = static_cast<char>(charcode);
466         return 1;
467       }
468       str[0] = static_cast<char>(charcode >> 8);
469       str[1] = static_cast<char>(charcode);
470       return 2;
471     case MixedFourBytes:
472       if (charcode < 0x100) {
473         int iSize = static_cast<int>(
474             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
475         if (iSize == 0)
476           iSize = 1;
477         str[iSize - 1] = static_cast<char>(charcode);
478         if (iSize > 1)
479           memset(str, 0, iSize - 1);
480         return iSize;
481       }
482       if (charcode < 0x10000) {
483         str[0] = static_cast<char>(charcode >> 8);
484         str[1] = static_cast<char>(charcode);
485         return 2;
486       }
487       if (charcode < 0x1000000) {
488         str[0] = static_cast<char>(charcode >> 16);
489         str[1] = static_cast<char>(charcode >> 8);
490         str[2] = static_cast<char>(charcode);
491         return 3;
492       }
493       str[0] = static_cast<char>(charcode >> 24);
494       str[1] = static_cast<char>(charcode >> 16);
495       str[2] = static_cast<char>(charcode >> 8);
496       str[3] = static_cast<char>(charcode);
497       return 4;
498   }
499   return 0;
500 }
501 
SetAdditionalMappings(std::vector<CIDRange> mappings)502 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
503   DCHECK(m_AdditionalCharcodeToCIDMappings.empty());
504   if (m_CodingScheme != MixedFourBytes || mappings.empty())
505     return;
506 
507   std::sort(
508       mappings.begin(), mappings.end(),
509       [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
510         return arg1.m_EndCode < arg2.m_EndCode;
511       });
512   m_AdditionalCharcodeToCIDMappings = std::move(mappings);
513 }
514 
// Replaces the code ranges that GetNextChar() and AppendChar() consult for
// the MixedFourBytes coding scheme. `ranges` is taken by value and moved into
// place.
void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
  m_MixedFourByteLeadingRanges = std::move(ranges);
}
518