// Copyright 2017 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

7 #include "core/fpdfapi/font/cpdf_cmap.h"
8 
9 #include <array>
10 #include <utility>
11 #include <vector>
12 
13 #include "core/fpdfapi/cmaps/fpdf_cmaps.h"
14 #include "core/fpdfapi/font/cpdf_cmapparser.h"
15 #include "core/fpdfapi/font/cpdf_fontglobals.h"
16 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
17 #include "core/fxcrt/check.h"
18 #include "core/fxcrt/fx_memcpy_wrappers.h"
19 #include "core/fxcrt/notreached.h"
20 
21 namespace {
22 
// A closed interval of byte values: every byte b with
// m_First <= b <= m_Last belongs to the range.
struct ByteRange {
  uint8_t m_First;  // Lower bound of the interval.
  uint8_t m_Last;   // Upper bound of the interval (inclusive).
};
27 
28 struct PredefinedCMap {
29   const char* m_pName;  // Raw, POD struct.
30   CIDSet m_Charset;
31   CIDCoding m_Coding;
32   CPDF_CMap::CodingScheme m_CodingScheme;
33   ByteRange m_LeadingSegs[2];
34 };
35 
36 constexpr PredefinedCMap kPredefinedCMaps[] = {
37     {"GB-EUC",
38      CIDSET_GB1,
39      CIDCoding::kGB,
40      CPDF_CMap::MixedTwoBytes,
41      {{0xa1, 0xfe}}},
42     {"GBpc-EUC",
43      CIDSET_GB1,
44      CIDCoding::kGB,
45      CPDF_CMap::MixedTwoBytes,
46      {{0xa1, 0xfc}}},
47     {"GBK-EUC",
48      CIDSET_GB1,
49      CIDCoding::kGB,
50      CPDF_CMap::MixedTwoBytes,
51      {{0x81, 0xfe}}},
52     {"GBKp-EUC",
53      CIDSET_GB1,
54      CIDCoding::kGB,
55      CPDF_CMap::MixedTwoBytes,
56      {{0x81, 0xfe}}},
57     {"GBK2K-EUC",
58      CIDSET_GB1,
59      CIDCoding::kGB,
60      CPDF_CMap::MixedTwoBytes,
61      {{0x81, 0xfe}}},
62     {"GBK2K",
63      CIDSET_GB1,
64      CIDCoding::kGB,
65      CPDF_CMap::MixedTwoBytes,
66      {{0x81, 0xfe}}},
67     {"UniGB-UCS2", CIDSET_GB1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
68     {"UniGB-UTF16", CIDSET_GB1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
69     {"B5pc",
70      CIDSET_CNS1,
71      CIDCoding::kBIG5,
72      CPDF_CMap::MixedTwoBytes,
73      {{0xa1, 0xfc}}},
74     {"HKscs-B5",
75      CIDSET_CNS1,
76      CIDCoding::kBIG5,
77      CPDF_CMap::MixedTwoBytes,
78      {{0x88, 0xfe}}},
79     {"ETen-B5",
80      CIDSET_CNS1,
81      CIDCoding::kBIG5,
82      CPDF_CMap::MixedTwoBytes,
83      {{0xa1, 0xfe}}},
84     {"ETenms-B5",
85      CIDSET_CNS1,
86      CIDCoding::kBIG5,
87      CPDF_CMap::MixedTwoBytes,
88      {{0xa1, 0xfe}}},
89     {"UniCNS-UCS2", CIDSET_CNS1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
90     {"UniCNS-UTF16", CIDSET_CNS1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
91     {"83pv-RKSJ",
92      CIDSET_JAPAN1,
93      CIDCoding::kJIS,
94      CPDF_CMap::MixedTwoBytes,
95      {{0x81, 0x9f}, {0xe0, 0xfc}}},
96     {"90ms-RKSJ",
97      CIDSET_JAPAN1,
98      CIDCoding::kJIS,
99      CPDF_CMap::MixedTwoBytes,
100      {{0x81, 0x9f}, {0xe0, 0xfc}}},
101     {"90msp-RKSJ",
102      CIDSET_JAPAN1,
103      CIDCoding::kJIS,
104      CPDF_CMap::MixedTwoBytes,
105      {{0x81, 0x9f}, {0xe0, 0xfc}}},
106     {"90pv-RKSJ",
107      CIDSET_JAPAN1,
108      CIDCoding::kJIS,
109      CPDF_CMap::MixedTwoBytes,
110      {{0x81, 0x9f}, {0xe0, 0xfc}}},
111     {"Add-RKSJ",
112      CIDSET_JAPAN1,
113      CIDCoding::kJIS,
114      CPDF_CMap::MixedTwoBytes,
115      {{0x81, 0x9f}, {0xe0, 0xfc}}},
116     {"EUC",
117      CIDSET_JAPAN1,
118      CIDCoding::kJIS,
119      CPDF_CMap::MixedTwoBytes,
120      {{0x8e, 0x8e}, {0xa1, 0xfe}}},
121     {"H", CIDSET_JAPAN1, CIDCoding::kJIS, CPDF_CMap::TwoBytes, {{0x21, 0x7e}}},
122     {"V", CIDSET_JAPAN1, CIDCoding::kJIS, CPDF_CMap::TwoBytes, {{0x21, 0x7e}}},
123     {"Ext-RKSJ",
124      CIDSET_JAPAN1,
125      CIDCoding::kJIS,
126      CPDF_CMap::MixedTwoBytes,
127      {{0x81, 0x9f}, {0xe0, 0xfc}}},
128     {"UniJIS-UCS2", CIDSET_JAPAN1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
129     {"UniJIS-UCS2-HW",
130      CIDSET_JAPAN1,
131      CIDCoding::kUCS2,
132      CPDF_CMap::TwoBytes,
133      {}},
134     {"UniJIS-UTF16", CIDSET_JAPAN1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
135     {"KSC-EUC",
136      CIDSET_KOREA1,
137      CIDCoding::kKOREA,
138      CPDF_CMap::MixedTwoBytes,
139      {{0xa1, 0xfe}}},
140     {"KSCms-UHC",
141      CIDSET_KOREA1,
142      CIDCoding::kKOREA,
143      CPDF_CMap::MixedTwoBytes,
144      {{0x81, 0xfe}}},
145     {"KSCms-UHC-HW",
146      CIDSET_KOREA1,
147      CIDCoding::kKOREA,
148      CPDF_CMap::MixedTwoBytes,
149      {{0x81, 0xfe}}},
150     {"KSCpc-EUC",
151      CIDSET_KOREA1,
152      CIDCoding::kKOREA,
153      CPDF_CMap::MixedTwoBytes,
154      {{0xa1, 0xfd}}},
155     {"UniKS-UCS2", CIDSET_KOREA1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, {}},
156     {"UniKS-UTF16", CIDSET_KOREA1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, {}},
157 };
158 
GetPredefinedCMap(ByteStringView cmapid)159 const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
160   if (cmapid.GetLength() > 2)
161     cmapid = cmapid.First(cmapid.GetLength() - 2);
162   for (const auto& map : kPredefinedCMaps) {
163     if (cmapid == map.m_pName)
164       return &map;
165   }
166   return nullptr;
167 }
168 
LoadLeadingSegments(const PredefinedCMap & map)169 std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
170   std::vector<bool> segments(256);
171   const auto seg_span = pdfium::make_span(map.m_LeadingSegs);
172   for (const ByteRange& seg : seg_span) {
173     if (seg.m_First == 0 && seg.m_Last == 0) {
174       break;
175     }
176     for (int b = seg.m_First; b <= seg.m_Last; ++b) {
177       segments[b] = true;
178     }
179   }
180   return segments;
181 }
182 
CheckFourByteCodeRange(pdfium::span<uint8_t> codes,pdfium::span<const CPDF_CMap::CodeRange> ranges)183 int CheckFourByteCodeRange(pdfium::span<uint8_t> codes,
184                            pdfium::span<const CPDF_CMap::CodeRange> ranges) {
185   for (size_t i = ranges.size(); i > 0; i--) {
186     const auto& range = ranges[i - 1];
187     if (range.m_CharSize < codes.size()) {
188       continue;
189     }
190     size_t iChar = 0;
191     while (iChar < codes.size()) {
192       if (codes[iChar] < range.m_Lower[iChar] ||
193           codes[iChar] > range.m_Upper[iChar]) {
194         break;
195       }
196       ++iChar;
197     }
198     if (iChar == range.m_CharSize) {
199       return 2;
200     }
201     if (iChar) {
202       return (codes.size() == range.m_CharSize) ? 2 : 1;
203     }
204   }
205   return 0;
206 }
207 
GetFourByteCharSizeImpl(uint32_t charcode,pdfium::span<const CPDF_CMap::CodeRange> ranges)208 size_t GetFourByteCharSizeImpl(
209     uint32_t charcode,
210     pdfium::span<const CPDF_CMap::CodeRange> ranges) {
211   if (ranges.empty())
212     return 1;
213 
214   std::array<uint8_t, 4> codes = {{
215       0x00,
216       0x00,
217       static_cast<uint8_t>(charcode >> 8 & 0xFF),
218       static_cast<uint8_t>(charcode),
219   }};
220   for (size_t offset = 0; offset < 4; offset++) {
221     size_t size = 4 - offset;
222     for (size_t j = 0; j < ranges.size(); j++) {
223       size_t iSeg = (ranges.size() - 1) - j;
224       if (ranges[iSeg].m_CharSize < size)
225         continue;
226       size_t iChar = 0;
227       while (iChar < size) {
228         if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
229             codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
230           break;
231         }
232         ++iChar;
233       }
234       if (iChar == ranges[iSeg].m_CharSize)
235         return size;
236     }
237   }
238   return 1;
239 }
240 
FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,ByteStringView bsName)241 const fxcmap::CMap* FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,
242                                      ByteStringView bsName) {
243   for (size_t i = 0; i < pCMaps.size(); i++) {
244     if (bsName == pCMaps[i].m_Name)
245       return &pCMaps[i];
246   }
247   return nullptr;
248 }
249 
250 }  // namespace
251 
CPDF_CMap(ByteStringView bsPredefinedName)252 CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
253     : m_bVertical(bsPredefinedName.Back() == 'V') {
254   if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
255     m_Coding = CIDCoding::kCID;
256     m_bLoaded = true;
257     return;
258   }
259 
260   const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
261   if (!map)
262     return;
263 
264   m_Charset = map->m_Charset;
265   m_Coding = map->m_Coding;
266   m_CodingScheme = map->m_CodingScheme;
267   if (m_CodingScheme == MixedTwoBytes)
268     m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
269   m_pEmbedMap = FindEmbeddedCMap(
270       CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
271       bsPredefinedName);
272   if (!m_pEmbedMap)
273     return;
274 
275   m_bLoaded = true;
276 }
277 
CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)278 CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
279     : m_DirectCharcodeToCIDTable(
280           FixedSizeDataVector<uint16_t>::Zeroed(kDirectMapTableSize)) {
281   CPDF_CMapParser parser(this);
282   CPDF_SimpleParser syntax(spEmbeddedData);
283   while (true) {
284     ByteStringView word = syntax.GetWord();
285     if (word.IsEmpty()) {
286       break;
287     }
288     parser.ParseWord(word);
289   }
290 }
291 
292 CPDF_CMap::~CPDF_CMap() = default;
293 
CIDFromCharCode(uint32_t charcode) const294 uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
295   if (m_Coding == CIDCoding::kCID)
296     return static_cast<uint16_t>(charcode);
297 
298   if (m_pEmbedMap)
299     return fxcmap::CIDFromCharCode(m_pEmbedMap, charcode);
300 
301   if (m_DirectCharcodeToCIDTable.empty())
302     return static_cast<uint16_t>(charcode);
303 
304   auto table_span = m_DirectCharcodeToCIDTable.span();
305   if (charcode < table_span.size())
306     return table_span[charcode];
307 
308   auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
309                              m_AdditionalCharcodeToCIDMappings.end(), charcode,
310                              [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
311                                return arg.m_EndCode < val;
312                              });
313   if (it == m_AdditionalCharcodeToCIDMappings.end() ||
314       it->m_StartCode > charcode) {
315     return 0;
316   }
317   return it->m_StartCID + charcode - it->m_StartCode;
318 }
319 
GetNextChar(ByteStringView pString,size_t * pOffset) const320 uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
321   size_t& offset = *pOffset;
322   auto pBytes = pString.unsigned_span();
323   switch (m_CodingScheme) {
324     case OneByte: {
325       return offset < pBytes.size() ? pBytes[offset++] : 0;
326     }
327     case TwoBytes: {
328       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
329       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
330       return 256 * byte1 + byte2;
331     }
332     case MixedTwoBytes: {
333       uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
334       if (!m_MixedTwoByteLeadingBytes[byte1])
335         return byte1;
336       uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
337       return 256 * byte1 + byte2;
338     }
339     case MixedFourBytes: {
340       std::array<uint8_t, 4> codes;
341       int char_size = 1;
342       codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
343       while (true) {
344         int ret =
345             CheckFourByteCodeRange(pdfium::make_span(codes).first(char_size),
346                                    m_MixedFourByteLeadingRanges);
347         if (ret == 0)
348           return 0;
349         if (ret == 2) {
350           uint32_t charcode = 0;
351           for (int i = 0; i < char_size; i++)
352             charcode = (charcode << 8) + codes[i];
353           return charcode;
354         }
355         if (char_size == 4 || offset == pBytes.size())
356           return 0;
357         codes[char_size++] = pBytes[offset++];
358       }
359     }
360   }
361   NOTREACHED_NORETURN();
362 }
363 
GetCharSize(uint32_t charcode) const364 int CPDF_CMap::GetCharSize(uint32_t charcode) const {
365   switch (m_CodingScheme) {
366     case OneByte:
367       return 1;
368     case TwoBytes:
369       return 2;
370     case MixedTwoBytes:
371       if (charcode < 0x100)
372         return 1;
373       return 2;
374     case MixedFourBytes:
375       if (charcode < 0x100)
376         return 1;
377       if (charcode < 0x10000)
378         return 2;
379       if (charcode < 0x1000000)
380         return 3;
381       return 4;
382   }
383   NOTREACHED_NORETURN();
384 }
385 
CountChar(ByteStringView pString) const386 size_t CPDF_CMap::CountChar(ByteStringView pString) const {
387   switch (m_CodingScheme) {
388     case OneByte:
389       return pString.GetLength();
390     case TwoBytes:
391       return (pString.GetLength() + 1) / 2;
392     case MixedTwoBytes: {
393       size_t count = 0;
394       for (size_t i = 0; i < pString.GetLength(); i++) {
395         count++;
396         if (m_MixedTwoByteLeadingBytes[pString[i]])
397           i++;
398       }
399       return count;
400     }
401     case MixedFourBytes: {
402       size_t count = 0;
403       size_t offset = 0;
404       while (offset < pString.GetLength()) {
405         GetNextChar(pString, &offset);
406         count++;
407       }
408       return count;
409     }
410   }
411   NOTREACHED_NORETURN();
412 }
413 
AppendChar(ByteString * str,uint32_t charcode) const414 void CPDF_CMap::AppendChar(ByteString* str, uint32_t charcode) const {
415   switch (m_CodingScheme) {
416     case OneByte:
417       *str += static_cast<char>(charcode);
418       return;
419     case TwoBytes:
420       *str += static_cast<char>(charcode / 256);
421       *str += static_cast<char>(charcode % 256);
422       return;
423     case MixedTwoBytes:
424       if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
425         *str += static_cast<char>(charcode);
426         return;
427       }
428       *str += static_cast<char>(charcode >> 8);
429       *str += static_cast<char>(charcode);
430       return;
431     case MixedFourBytes:
432       if (charcode < 0x100) {
433         int iSize = static_cast<int>(
434             GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
435         int pad = iSize != 0 ? iSize - 1 : 0;
436         for (int i = 0; i < pad; ++i) {
437           *str += static_cast<char>(0);
438         }
439         *str += static_cast<char>(charcode);
440         return;
441       }
442       if (charcode < 0x10000) {
443         *str += static_cast<char>(charcode >> 8);
444         *str += static_cast<char>(charcode);
445         return;
446       }
447       if (charcode < 0x1000000) {
448         *str += static_cast<char>(charcode >> 16);
449         *str += static_cast<char>(charcode >> 8);
450         *str += static_cast<char>(charcode);
451         return;
452       }
453       *str += static_cast<char>(charcode >> 24);
454       *str += static_cast<char>(charcode >> 16);
455       *str += static_cast<char>(charcode >> 8);
456       *str += static_cast<char>(charcode);
457       return;
458   }
459   NOTREACHED_NORETURN();
460 }
461 
SetAdditionalMappings(std::vector<CIDRange> mappings)462 void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
463   DCHECK(m_AdditionalCharcodeToCIDMappings.empty());
464   if (m_CodingScheme != MixedFourBytes || mappings.empty())
465     return;
466 
467   std::sort(
468       mappings.begin(), mappings.end(),
469       [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
470         return arg1.m_EndCode < arg2.m_EndCode;
471       });
472   m_AdditionalCharcodeToCIDMappings = std::move(mappings);
473 }
474 
SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges)475 void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
476   m_MixedFourByteLeadingRanges = std::move(ranges);
477 }
478 
SetDirectCharcodeToCIDTableRange(uint32_t start_code,uint32_t end_code,uint16_t start_cid)479 void CPDF_CMap::SetDirectCharcodeToCIDTableRange(uint32_t start_code,
480                                                  uint32_t end_code,
481                                                  uint16_t start_cid) {
482   pdfium::span<uint16_t> span = m_DirectCharcodeToCIDTable.span();
483   for (uint32_t code = start_code; code <= end_code; ++code) {
484     span[code] = static_cast<uint16_t>(start_cid + code - start_code);
485   }
486 }
487