1 // Copyright 2017 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/fpdfapi/font/cpdf_tounicodemap.h" 8 9 #include <utility> 10 11 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h" 12 #include "core/fpdfapi/font/cpdf_fontglobals.h" 13 #include "core/fpdfapi/parser/cpdf_simple_parser.h" 14 #include "core/fpdfapi/parser/cpdf_stream.h" 15 #include "core/fxcrt/fx_extension.h" 16 #include "core/fxcrt/fx_safe_types.h" 17 #include "third_party/base/numerics/safe_conversions.h" 18 19 namespace { 20 StringDataAdd(WideString str)21WideString StringDataAdd(WideString str) { 22 WideString ret; 23 wchar_t value = 1; 24 for (size_t i = str.GetLength(); i > 0; --i) { 25 wchar_t ch = str[i - 1] + value; 26 if (ch < str[i - 1]) { 27 ret.InsertAtFront(0); 28 } else { 29 ret.InsertAtFront(ch); 30 value = 0; 31 } 32 } 33 if (value) 34 ret.InsertAtFront(value); 35 return ret; 36 } 37 38 } // namespace 39 CPDF_ToUnicodeMap(const CPDF_Stream * pStream)40CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(const CPDF_Stream* pStream) { 41 Load(pStream); 42 } 43 44 CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() = default; 45 Lookup(uint32_t charcode) const46WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const { 47 auto it = m_Map.find(charcode); 48 if (it == m_Map.end()) { 49 if (!m_pBaseMap) 50 return WideString(); 51 return m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)); 52 } 53 54 uint32_t value = it->second; 55 wchar_t unicode = static_cast<wchar_t>(value & 0xffff); 56 if (unicode != 0xffff) 57 return unicode; 58 59 WideStringView buf = m_MultiCharBuf.AsStringView(); 60 size_t index = value >> 16; 61 if (!buf.IsValidIndex(index)) 62 return WideString(); 63 return WideString(buf.Substr(index + 1, buf[index])); 64 } 65 ReverseLookup(wchar_t unicode) const66uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const { 67 for (const auto& pair : m_Map) { 68 if (pair.second == static_cast<uint32_t>(unicode)) 69 return pair.first; 70 } 71 return 0; 72 } 73 74 // static StringToCode(ByteStringView str)75pdfium::Optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView str) { 76 size_t len = str.GetLength(); 77 if (len <= 2 || str[0] != '<' || str[len - 1] != '>') 78 return pdfium::nullopt; 79 80 FX_SAFE_UINT32 code = 0; 81 for (char c : str.Substr(1, len - 2)) { 82 if (!FXSYS_IsHexDigit(c)) 83 return pdfium::nullopt; 84 85 code = code * 16 + FXSYS_HexCharToInt(c); 86 if (!code.IsValid()) 87 return pdfium::nullopt; 88 } 89 return pdfium::Optional<uint32_t>(code.ValueOrDie()); 90 } 91 92 // static StringToWideString(ByteStringView str)93WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) { 94 size_t len = str.GetLength(); 95 if (len <= 2 || str[0] != '<' || str[len - 1] != '>') 96 return WideString(); 97 98 WideString result; 99 int byte_pos = 0; 100 wchar_t ch = 0; 101 for (char c : str.Substr(1, len - 2)) { 102 if (!FXSYS_IsHexDigit(c)) 103 break; 104 105 ch = ch * 16 + FXSYS_HexCharToInt(c); 106 byte_pos++; 107 if (byte_pos == 4) { 108 result += ch; 109 byte_pos = 0; 110 ch = 0; 111 } 112 } 113 return result; 114 } 115 Load(const CPDF_Stream * pStream)116void CPDF_ToUnicodeMap::Load(const CPDF_Stream* pStream) { 117 CIDSet cid_set = CIDSET_UNKNOWN; 118 auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(pStream); 119 pAcc->LoadAllDataFiltered(); 120 CPDF_SimpleParser parser(pAcc->GetSpan()); 121 while (1) { 122 ByteStringView word = parser.GetWord(); 123 if (word.IsEmpty()) 124 break; 125 126 if (word == "beginbfchar") 127 HandleBeginBFChar(&parser); 128 else if (word == "beginbfrange") 129 HandleBeginBFRange(&parser); 130 else if (word == "/Adobe-Korea1-UCS2") 131 cid_set = CIDSET_KOREA1; 132 else if (word == "/Adobe-Japan1-UCS2") 133 cid_set = CIDSET_JAPAN1; 134 else if (word == "/Adobe-CNS1-UCS2") 135 cid_set = CIDSET_CNS1; 136 else if (word == "/Adobe-GB1-UCS2") 137 cid_set = CIDSET_GB1; 138 } 139 if (cid_set) { 140 auto* manager = CPDF_FontGlobals::GetInstance()->GetCMapManager(); 141 m_pBaseMap = manager->GetCID2UnicodeMap(cid_set); 142 } 143 } 144 HandleBeginBFChar(CPDF_SimpleParser * pParser)145void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) { 146 while (1) { 147 ByteStringView word = pParser->GetWord(); 148 if (word.IsEmpty() || word == "endbfchar") 149 return; 150 151 pdfium::Optional<uint32_t> code = StringToCode(word); 152 if (!code.has_value()) 153 return; 154 155 SetCode(code.value(), StringToWideString(pParser->GetWord())); 156 } 157 } 158 HandleBeginBFRange(CPDF_SimpleParser * pParser)159void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) { 160 while (1) { 161 ByteStringView lowcode_str = pParser->GetWord(); 162 if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange") 163 return; 164 165 pdfium::Optional<uint32_t> lowcode_opt = StringToCode(lowcode_str); 166 if (!lowcode_opt.has_value()) 167 return; 168 169 ByteStringView highcode_str = pParser->GetWord(); 170 pdfium::Optional<uint32_t> highcode_opt = StringToCode(highcode_str); 171 if (!highcode_opt.has_value()) 172 return; 173 174 uint32_t lowcode = lowcode_opt.value(); 175 uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff); 176 177 ByteStringView start = pParser->GetWord(); 178 if (start == "[") { 179 for (uint32_t code = lowcode; code <= highcode; code++) 180 SetCode(code, StringToWideString(pParser->GetWord())); 181 pParser->GetWord(); 182 continue; 183 } 184 185 WideString destcode = StringToWideString(start); 186 if (destcode.GetLength() == 1) { 187 pdfium::Optional<uint32_t> value_or_error = StringToCode(start); 188 if (!value_or_error.has_value()) 189 return; 190 191 uint32_t value = value_or_error.value(); 192 for (uint32_t code = lowcode; code <= highcode; code++) 193 m_Map[code] = value++; 194 } else { 195 for (uint32_t code = lowcode; code <= highcode; code++) { 196 WideString retcode = 197 code == lowcode ? destcode : StringDataAdd(destcode); 198 m_Map[code] = GetUnicode(); 199 m_MultiCharBuf.AppendChar(retcode.GetLength()); 200 m_MultiCharBuf << retcode; 201 destcode = std::move(retcode); 202 } 203 } 204 } 205 } 206 GetUnicode() const207uint32_t CPDF_ToUnicodeMap::GetUnicode() const { 208 FX_SAFE_UINT32 uni = m_MultiCharBuf.GetLength(); 209 uni = uni * 0x10000 + 0xffff; 210 return uni.ValueOrDefault(0); 211 } 212 SetCode(uint32_t srccode,WideString destcode)213void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) { 214 size_t len = destcode.GetLength(); 215 if (len == 0) 216 return; 217 218 if (len == 1) { 219 m_Map[srccode] = destcode[0]; 220 } else { 221 m_Map[srccode] = GetUnicode(); 222 m_MultiCharBuf.AppendChar(len); 223 m_MultiCharBuf << destcode; 224 } 225 } 226