• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/font/cpdf_tounicodemap.h"
8 
9 #include <limits>
10 #include <set>
11 #include <utility>
12 
13 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
14 #include "core/fpdfapi/font/cpdf_fontglobals.h"
15 #include "core/fpdfapi/parser/cpdf_simple_parser.h"
16 #include "core/fpdfapi/parser/cpdf_stream.h"
17 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
18 #include "core/fxcrt/containers/contains.h"
19 #include "core/fxcrt/fx_extension.h"
20 #include "core/fxcrt/fx_safe_types.h"
21 
22 namespace {
23 
StringDataAdd(WideString str)24 WideString StringDataAdd(WideString str) {
25   WideString ret;
26   wchar_t value = 1;
27   for (size_t i = str.GetLength(); i > 0; --i) {
28     wchar_t ch = str[i - 1] + value;
29     if (ch < str[i - 1]) {
30       ret.InsertAtFront(0);
31     } else {
32       ret.InsertAtFront(ch);
33       value = 0;
34     }
35   }
36   if (value)
37     ret.InsertAtFront(value);
38   return ret;
39 }
40 
41 }  // namespace
42 
CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream)43 CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) {
44   Load(std::move(pStream));
45 }
46 
47 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default;
48 
Lookup(uint32_t charcode) const49 WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
50   auto it = m_Multimap.find(charcode);
51   if (it == m_Multimap.end()) {
52     if (!m_pBaseMap)
53       return WideString();
54     return WideString(
55         m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)));
56   }
57 
58   uint32_t value = *it->second.begin();
59   wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
60   if (unicode != 0xffff)
61     return WideString(unicode);
62 
63   size_t index = value >> 16;
64   return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString();
65 }
66 
ReverseLookup(wchar_t unicode) const67 uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
68   for (const auto& pair : m_Multimap) {
69     if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode)))
70       return pair.first;
71   }
72   return 0;
73 }
74 
GetUnicodeCountByCharcodeForTesting(uint32_t charcode) const75 size_t CPDF_ToUnicodeMap::GetUnicodeCountByCharcodeForTesting(
76     uint32_t charcode) const {
77   auto it = m_Multimap.find(charcode);
78   return it != m_Multimap.end() ? it->second.size() : 0u;
79 }
80 
81 // static
StringToCode(ByteStringView input)82 std::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) {
83   // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065.
84   std::set<char> seen_whitespace_chars;
85   for (char c : input) {
86     if (PDFCharIsWhitespace(c)) {
87       seen_whitespace_chars.insert(c);
88     }
89   }
90   ByteString str_without_whitespace_chars;  // Must outlive `str`.
91   ByteStringView str;
92   if (seen_whitespace_chars.empty()) {
93     str = input;
94   } else {
95     str_without_whitespace_chars.Reserve(input.GetLength());
96     for (char c : input) {
97       if (!pdfium::Contains(seen_whitespace_chars, c)) {
98         str_without_whitespace_chars += c;
99       }
100     }
101     str = str_without_whitespace_chars.AsStringView();
102   }
103 
104   size_t len = str.GetLength();
105   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
106     return std::nullopt;
107 
108   FX_SAFE_UINT32 code = 0;
109   for (char c : str.Substr(1, len - 2)) {
110     if (!FXSYS_IsHexDigit(c))
111       return std::nullopt;
112 
113     code = code * 16 + FXSYS_HexCharToInt(c);
114     if (!code.IsValid())
115       return std::nullopt;
116   }
117   return std::optional<uint32_t>(code.ValueOrDie());
118 }
119 
120 // static
StringToWideString(ByteStringView str)121 WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
122   size_t len = str.GetLength();
123   if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
124     return WideString();
125 
126   WideString result;
127   int byte_pos = 0;
128   wchar_t ch = 0;
129   for (char c : str.Substr(1, len - 2)) {
130     if (!FXSYS_IsHexDigit(c))
131       break;
132 
133     ch = ch * 16 + FXSYS_HexCharToInt(c);
134     byte_pos++;
135     if (byte_pos == 4) {
136       result += ch;
137       byte_pos = 0;
138       ch = 0;
139     }
140   }
141   return result;
142 }
143 
Load(RetainPtr<const CPDF_Stream> pStream)144 void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) {
145   CIDSet cid_set = CIDSET_UNKNOWN;
146   auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
147   pAcc->LoadAllDataFiltered();
148   CPDF_SimpleParser parser(pAcc->GetSpan());
149   while (true) {
150     ByteStringView word = parser.GetWord();
151     if (word.IsEmpty())
152       break;
153 
154     if (word == "beginbfchar")
155       HandleBeginBFChar(&parser);
156     else if (word == "beginbfrange")
157       HandleBeginBFRange(&parser);
158     else if (word == "/Adobe-Korea1-UCS2")
159       cid_set = CIDSET_KOREA1;
160     else if (word == "/Adobe-Japan1-UCS2")
161       cid_set = CIDSET_JAPAN1;
162     else if (word == "/Adobe-CNS1-UCS2")
163       cid_set = CIDSET_CNS1;
164     else if (word == "/Adobe-GB1-UCS2")
165       cid_set = CIDSET_GB1;
166   }
167   if (cid_set != CIDSET_UNKNOWN) {
168     m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
169   }
170 }
171 
HandleBeginBFChar(CPDF_SimpleParser * pParser)172 void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
173   while (true) {
174     ByteStringView word = pParser->GetWord();
175     if (word.IsEmpty() || word == "endbfchar")
176       return;
177 
178     std::optional<uint32_t> code = StringToCode(word);
179     if (!code.has_value())
180       return;
181 
182     SetCode(code.value(), StringToWideString(pParser->GetWord()));
183   }
184 }
185 
HandleBeginBFRange(CPDF_SimpleParser * pParser)186 void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
187   while (true) {
188     ByteStringView lowcode_str = pParser->GetWord();
189     if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
190       return;
191 
192     std::optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
193     if (!lowcode_opt.has_value())
194       return;
195 
196     ByteStringView highcode_str = pParser->GetWord();
197     std::optional<uint32_t> highcode_opt = StringToCode(highcode_str);
198     if (!highcode_opt.has_value())
199       return;
200 
201     uint32_t lowcode = lowcode_opt.value();
202     uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);
203 
204     ByteStringView start = pParser->GetWord();
205     if (start == "[") {
206       for (uint32_t code = lowcode; code <= highcode; ++code) {
207         SetCode(code, StringToWideString(pParser->GetWord()));
208         if (code == std::numeric_limits<uint32_t>::max()) {
209           break;
210         }
211       }
212       pParser->GetWord();
213       continue;
214     }
215 
216     WideString destcode = StringToWideString(start);
217     if (destcode.GetLength() == 1) {
218       std::optional<uint32_t> value_or_error = StringToCode(start);
219       if (!value_or_error.has_value())
220         return;
221 
222       uint32_t value = value_or_error.value();
223       for (uint32_t code = lowcode; code <= highcode; ++code) {
224         InsertIntoMultimap(code, value++);
225         if (code == std::numeric_limits<uint32_t>::max()) {
226           break;
227         }
228       }
229     } else {
230       for (uint32_t code = lowcode; code <= highcode; ++code) {
231         WideString retcode =
232             code == lowcode ? destcode : StringDataAdd(destcode);
233         InsertIntoMultimap(code, GetMultiCharIndexIndicator());
234         m_MultiCharVec.push_back(retcode);
235         destcode = std::move(retcode);
236         if (code == std::numeric_limits<uint32_t>::max()) {
237           break;
238         }
239       }
240     }
241   }
242 }
243 
GetMultiCharIndexIndicator() const244 uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
245   FX_SAFE_UINT32 uni = m_MultiCharVec.size();
246   uni = uni * 0x10000 + 0xffff;
247   return uni.ValueOrDefault(0);
248 }
249 
SetCode(uint32_t srccode,WideString destcode)250 void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
251   size_t len = destcode.GetLength();
252   if (len == 0)
253     return;
254 
255   if (len == 1) {
256     InsertIntoMultimap(srccode, destcode[0]);
257   } else {
258     InsertIntoMultimap(srccode, GetMultiCharIndexIndicator());
259     m_MultiCharVec.push_back(destcode);
260   }
261 }
262 
InsertIntoMultimap(uint32_t code,uint32_t destcode)263 void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) {
264   auto it = m_Multimap.find(code);
265   if (it == m_Multimap.end()) {
266     m_Multimap.emplace(code, std::set<uint32_t>{destcode});
267     return;
268   }
269 
270   it->second.emplace(destcode);
271 }
272