• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_tounicodemap.h"
8 
9 #include <set>
10 #include <utility>
11 
12 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
13 #include "core/fpdfapi/font/cpdf_fontglobals.h"
14 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
15 #include "core/fpdfapi/parser/cpdf_stream.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/fx_safe_types.h"
18 #include "third_party/base/containers/contains.h"
19 #include "third_party/base/numerics/safe_conversions.h"
20 
21 namespace {
22 
StringDataAdd(WideString str)23 WideString StringDataAdd(WideString str) {
24   WideString ret;
25   wchar_t value = 1;
26   for (size_t i = str.GetLength(); i > 0; --i) {
27     wchar_t ch = str[i - 1] + value;
28     if (ch < str[i - 1]) {
29       ret.InsertAtFront(0);
30     } else {
31       ret.InsertAtFront(ch);
32       value = 0;
33     }
34   }
35   if (value)
36     ret.InsertAtFront(value);
37   return ret;
38 }
39 
40 }  // namespace
41 
CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream)42 CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) {
43   Load(std::move(pStream));
44 }
45 
46 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
47 
Lookup(uint32_t charcode) const48 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
49   auto it = m_Multimap.find(charcode);
50   if (it == m_Multimap.end()) {
51     if (!m_pBaseMap)
52       return WideString();
53     return WideString(
54         m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)));
55   }
56 
57   uint32_t value = *it->second.begin();
58   wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
59   if (unicode != 0xffff)
60     return WideString(unicode);
61 
62   size_t index = value >> 16;
63   return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString();
64 }
65 
ReverseLookup(wchar_t unicode) const66 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
67   for (const auto& pair : m_Multimap) {
68     if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode)))
69       return pair.first;
70   }
71   return 0;
72 }
73 
GetUnicodeCountByCharcodeForTesting(uint32_t charcode) const74 size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting(
75     uint32_t charcode) const {
76   auto it = m_Multimap.find(charcode);
77   return it != m_Multimap.end() ? it->second.size() : 0u;
78 }
79 
80 // static
StringToCode(ByteStringView str)81 absl::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
82   size_t len = str.GetLength();
83   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
84     return absl::nullopt;
85 
86   FX_SAFE_UINT32 code = 0;
87   for (char c : str.Substr(1, len - 2)) {
88     if (!FXSYS_IsHexDigit(c))
89       return absl::nullopt;
90 
91     code = code * 16 + FXSYS_HexCharToInt(c);
92     if (!code.IsValid())
93       return absl::nullopt;
94   }
95   return absl::optional<uint32_t>(code.ValueOrDie());
96 }
97 
98 // static
StringToWideString(ByteStringView str)99 WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
100   size_t len = str.GetLength();
101   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
102     return WideString();
103 
104   WideString result;
105   int byte_pos = 0;
106   wchar_t ch = 0;
107   for (char c : str.Substr(1, len - 2)) {
108     if (!FXSYS_IsHexDigit(c))
109       break;
110 
111     ch = ch * 16 + FXSYS_HexCharToInt(c);
112     byte_pos++;
113     if (byte_pos == 4) {
114       result += ch;
115       byte_pos = 0;
116       ch = 0;
117     }
118   }
119   return result;
120 }
121 
Load(RetainPtr<const CPDF_Stream> pStream)122 void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) {
123   CIDSet cid_set = CIDSET_UNKNOWN;
124   auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
125   pAcc->LoadAllDataFiltered();
126   CPDF_SimpleParser parser(pAcc->GetSpan());
127   while (true) {
128     ByteStringView word = parser.GetWord();
129     if (word.IsEmpty())
130       break;
131 
132     if (word == "beginbfchar")
133       HandleBeginBFChar(&parser);
134     else if (word == "beginbfrange")
135       HandleBeginBFRange(&parser);
136     else if (word == "/Adobe-Korea1-UCS2")
137       cid_set = CIDSET_KOREA1;
138     else if (word == "/Adobe-Japan1-UCS2")
139       cid_set = CIDSET_JAPAN1;
140     else if (word == "/Adobe-CNS1-UCS2")
141       cid_set = CIDSET_CNS1;
142     else if (word == "/Adobe-GB1-UCS2")
143       cid_set = CIDSET_GB1;
144   }
145   if (cid_set != CIDSET_UNKNOWN) {
146     m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
147   }
148 }
149 
HandleBeginBFChar(CPDF_SimpleParser * pParser)150 void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
151   while (true) {
152     ByteStringView word = pParser->GetWord();
153     if (word.IsEmpty() || word == "endbfchar")
154       return;
155 
156     absl::optional<uint32_t> code = StringToCode(word);
157     if (!code.has_value())
158       return;
159 
160     SetCode(code.value(), StringToWideString(pParser->GetWord()));
161   }
162 }
163 
HandleBeginBFRange(CPDF_SimpleParser * pParser)164 void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
165   while (true) {
166     ByteStringView lowcode_str = pParser->GetWord();
167     if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
168       return;
169 
170     absl::optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
171     if (!lowcode_opt.has_value())
172       return;
173 
174     ByteStringView highcode_str = pParser->GetWord();
175     absl::optional<uint32_t> highcode_opt = StringToCode(highcode_str);
176     if (!highcode_opt.has_value())
177       return;
178 
179     uint32_t lowcode = lowcode_opt.value();
180     uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
181 
182     ByteStringView start = pParser->GetWord();
183     if (start == "[") {
184       for (FX_SAFE_UINT32 code = lowcode;
185            code.IsValid() && code.ValueOrDie() <= highcode; code++) {
186         SetCode(code.ValueOrDie(), StringToWideString(pParser->GetWord()));
187       }
188       pParser->GetWord();
189       continue;
190     }
191 
192     WideString destcode = StringToWideString(start);
193     if (destcode.GetLength() == 1) {
194       absl::optional<uint32_t> value_or_error = StringToCode(start);
195       if (!value_or_error.has_value())
196         return;
197 
198       uint32_t value = value_or_error.value();
199       for (FX_SAFE_UINT32 code = lowcode;
200            code.IsValid() && code.ValueOrDie() <= highcode; code++) {
201         InsertIntoMultimap(code.ValueOrDie(), value++);
202       }
203     } else {
204       for (FX_SAFE_UINT32 code = lowcode;
205            code.IsValid() && code.ValueOrDie() <= highcode; code++) {
206         uint32_t code_value = code.ValueOrDie();
207         WideString retcode =
208             code_value == lowcode ? destcode : StringDataAdd(destcode);
209         InsertIntoMultimap(code_value, GetMultiCharIndexIndicator());
210         m_MultiCharVec.push_back(retcode);
211         destcode = std::move(retcode);
212       }
213     }
214   }
215 }
216 
GetMultiCharIndexIndicator() const217 uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
218   FX_SAFE_UINT32 uni = m_MultiCharVec.size();
219   uni = uni * 0x10000 + 0xffff;
220   return uni.ValueOrDefault(0);
221 }
222 
SetCode(uint32_t srccode,WideString destcode)223 void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
224   size_t len = destcode.GetLength();
225   if (len == 0)
226     return;
227 
228   if (len == 1) {
229     InsertIntoMultimap(srccode, destcode[0]);
230   } else {
231     InsertIntoMultimap(srccode, GetMultiCharIndexIndicator());
232     m_MultiCharVec.push_back(destcode);
233   }
234 }
235 
InsertIntoMultimap(uint32_t code,uint32_t destcode)236 void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) {
237   auto it = m_Multimap.find(code);
238   if (it == m_Multimap.end()) {
239     m_Multimap.emplace(code, std::set<uint32_t>{destcode});
240     return;
241   }
242 
243   it->second.emplace(destcode);
244 }
245