1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/font/cpdf_tounicodemap.h"
8
9 #include <set>
10 #include <utility>
11
12 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
13 #include "core/fpdfapi/font/cpdf_fontglobals.h"
14 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
15 #include "core/fpdfapi/parser/cpdf_stream.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/fx_safe_types.h"
18 #include "third_party/base/containers/contains.h"
19 #include "third_party/base/numerics/safe_conversions.h"
20
21 namespace {
22
StringDataAdd(WideString str)23 WideString StringDataAdd(WideString str) {
24 WideString ret;
25 wchar_t value = 1;
26 for (size_t i = str.GetLength(); i > 0; --i) {
27 wchar_t ch = str[i - 1] + value;
28 if (ch < str[i - 1]) {
29 ret.InsertAtFront(0);
30 } else {
31 ret.InsertAtFront(ch);
32 value = 0;
33 }
34 }
35 if (value)
36 ret.InsertAtFront(value);
37 return ret;
38 }
39
40 } // namespace
41
CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream)42 CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) {
43 Load(std::move(pStream));
44 }
45
46 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
47
Lookup(uint32_t charcode) const48 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
49 auto it = m_Multimap.find(charcode);
50 if (it == m_Multimap.end()) {
51 if (!m_pBaseMap)
52 return WideString();
53 return WideString(
54 m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)));
55 }
56
57 uint32_t value = *it->second.begin();
58 wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
59 if (unicode != 0xffff)
60 return WideString(unicode);
61
62 size_t index = value >> 16;
63 return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString();
64 }
65
ReverseLookup(wchar_t unicode) const66 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
67 for (const auto& pair : m_Multimap) {
68 if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode)))
69 return pair.first;
70 }
71 return 0;
72 }
73
GetUnicodeCountByCharcodeForTesting(uint32_t charcode) const74 size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting(
75 uint32_t charcode) const {
76 auto it = m_Multimap.find(charcode);
77 return it != m_Multimap.end() ? it->second.size() : 0u;
78 }
79
80 // static
StringToCode(ByteStringView str)81 absl::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) {
82 size_t len = str.GetLength();
83 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
84 return absl::nullopt;
85
86 FX_SAFE_UINT32 code = 0;
87 for (char c : str.Substr(1, len - 2)) {
88 if (!FXSYS_IsHexDigit(c))
89 return absl::nullopt;
90
91 code = code * 16 + FXSYS_HexCharToInt(c);
92 if (!code.IsValid())
93 return absl::nullopt;
94 }
95 return absl::optional<uint32_t>(code.ValueOrDie());
96 }
97
98 // static
StringToWideString(ByteStringView str)99 WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
100 size_t len = str.GetLength();
101 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
102 return WideString();
103
104 WideString result;
105 int byte_pos = 0;
106 wchar_t ch = 0;
107 for (char c : str.Substr(1, len - 2)) {
108 if (!FXSYS_IsHexDigit(c))
109 break;
110
111 ch = ch * 16 + FXSYS_HexCharToInt(c);
112 byte_pos++;
113 if (byte_pos == 4) {
114 result += ch;
115 byte_pos = 0;
116 ch = 0;
117 }
118 }
119 return result;
120 }
121
Load(RetainPtr<const CPDF_Stream> pStream)122 void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) {
123 CIDSet cid_set = CIDSET_UNKNOWN;
124 auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
125 pAcc->LoadAllDataFiltered();
126 CPDF_SimpleParser parser(pAcc->GetSpan());
127 while (true) {
128 ByteStringView word = parser.GetWord();
129 if (word.IsEmpty())
130 break;
131
132 if (word == "beginbfchar")
133 HandleBeginBFChar(&parser);
134 else if (word == "beginbfrange")
135 HandleBeginBFRange(&parser);
136 else if (word == "/Adobe-Korea1-UCS2")
137 cid_set = CIDSET_KOREA1;
138 else if (word == "/Adobe-Japan1-UCS2")
139 cid_set = CIDSET_JAPAN1;
140 else if (word == "/Adobe-CNS1-UCS2")
141 cid_set = CIDSET_CNS1;
142 else if (word == "/Adobe-GB1-UCS2")
143 cid_set = CIDSET_GB1;
144 }
145 if (cid_set != CIDSET_UNKNOWN) {
146 m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
147 }
148 }
149
HandleBeginBFChar(CPDF_SimpleParser * pParser)150 void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
151 while (true) {
152 ByteStringView word = pParser->GetWord();
153 if (word.IsEmpty() || word == "endbfchar")
154 return;
155
156 absl::optional<uint32_t> code = StringToCode(word);
157 if (!code.has_value())
158 return;
159
160 SetCode(code.value(), StringToWideString(pParser->GetWord()));
161 }
162 }
163
HandleBeginBFRange(CPDF_SimpleParser * pParser)164 void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
165 while (true) {
166 ByteStringView lowcode_str = pParser->GetWord();
167 if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
168 return;
169
170 absl::optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
171 if (!lowcode_opt.has_value())
172 return;
173
174 ByteStringView highcode_str = pParser->GetWord();
175 absl::optional<uint32_t> highcode_opt = StringToCode(highcode_str);
176 if (!highcode_opt.has_value())
177 return;
178
179 uint32_t lowcode = lowcode_opt.value();
180 uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
181
182 ByteStringView start = pParser->GetWord();
183 if (start == "[") {
184 for (FX_SAFE_UINT32 code = lowcode;
185 code.IsValid() && code.ValueOrDie() <= highcode; code++) {
186 SetCode(code.ValueOrDie(), StringToWideString(pParser->GetWord()));
187 }
188 pParser->GetWord();
189 continue;
190 }
191
192 WideString destcode = StringToWideString(start);
193 if (destcode.GetLength() == 1) {
194 absl::optional<uint32_t> value_or_error = StringToCode(start);
195 if (!value_or_error.has_value())
196 return;
197
198 uint32_t value = value_or_error.value();
199 for (FX_SAFE_UINT32 code = lowcode;
200 code.IsValid() && code.ValueOrDie() <= highcode; code++) {
201 InsertIntoMultimap(code.ValueOrDie(), value++);
202 }
203 } else {
204 for (FX_SAFE_UINT32 code = lowcode;
205 code.IsValid() && code.ValueOrDie() <= highcode; code++) {
206 uint32_t code_value = code.ValueOrDie();
207 WideString retcode =
208 code_value == lowcode ? destcode : StringDataAdd(destcode);
209 InsertIntoMultimap(code_value, GetMultiCharIndexIndicator());
210 m_MultiCharVec.push_back(retcode);
211 destcode = std::move(retcode);
212 }
213 }
214 }
215 }
216
GetMultiCharIndexIndicator() const217 uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
218 FX_SAFE_UINT32 uni = m_MultiCharVec.size();
219 uni = uni * 0x10000 + 0xffff;
220 return uni.ValueOrDefault(0);
221 }
222
SetCode(uint32_t srccode,WideString destcode)223 void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
224 size_t len = destcode.GetLength();
225 if (len == 0)
226 return;
227
228 if (len == 1) {
229 InsertIntoMultimap(srccode, destcode[0]);
230 } else {
231 InsertIntoMultimap(srccode, GetMultiCharIndexIndicator());
232 m_MultiCharVec.push_back(destcode);
233 }
234 }
235
InsertIntoMultimap(uint32_t code,uint32_t destcode)236 void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) {
237 auto it = m_Multimap.find(code);
238 if (it == m_Multimap.end()) {
239 m_Multimap.emplace(code, std::set<uint32_t>{destcode});
240 return;
241 }
242
243 it->second.emplace(destcode);
244 }
245