1 // Copyright 2017 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #include "core/fpdfapi/font/cpdf_tounicodemap.h" 8 9 #include "core/fpdfapi/cpdf_modulemgr.h" 10 #include "core/fpdfapi/font/cpdf_cid2unicodemap.h" 11 #include "core/fpdfapi/page/cpdf_pagemodule.h" 12 #include "core/fpdfapi/parser/cpdf_simple_parser.h" 13 #include "core/fxcrt/fx_extension.h" 14 #include "core/fxcrt/fx_safe_types.h" 15 #include "third_party/base/numerics/safe_conversions.h" 16 Lookup(uint32_t charcode) const17WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const { 18 auto it = m_Map.find(charcode); 19 if (it != m_Map.end()) { 20 uint32_t value = it->second; 21 wchar_t unicode = (wchar_t)(value & 0xffff); 22 if (unicode != 0xffff) { 23 return unicode; 24 } 25 const wchar_t* buf = m_MultiCharBuf.GetBuffer(); 26 uint32_t buf_len = m_MultiCharBuf.GetLength(); 27 if (!buf || buf_len == 0) { 28 return WideString(); 29 } 30 uint32_t index = value >> 16; 31 if (index >= buf_len) { 32 return WideString(); 33 } 34 uint32_t len = buf[index]; 35 if (index + len < index || index + len >= buf_len) { 36 return WideString(); 37 } 38 return WideString(buf + index + 1, len); 39 } 40 if (m_pBaseMap) { 41 return m_pBaseMap->UnicodeFromCID((uint16_t)charcode); 42 } 43 return WideString(); 44 } 45 ReverseLookup(wchar_t unicode) const46uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const { 47 for (const auto& pair : m_Map) { 48 if (pair.second == static_cast<uint32_t>(unicode)) 49 return pair.first; 50 } 51 return 0; 52 } 53 54 // Static. StringToCode(const ByteStringView & str)55uint32_t CPDF_ToUnicodeMap::StringToCode(const ByteStringView& str) { 56 int len = str.GetLength(); 57 if (len == 0) 58 return 0; 59 60 uint32_t result = 0; 61 if (str[0] == '<') { 62 for (int i = 1; i < len && std::isxdigit(str[i]); ++i) 63 result = result * 16 + FXSYS_HexCharToInt(str.CharAt(i)); 64 return result; 65 } 66 67 for (int i = 0; i < len && std::isdigit(str[i]); ++i) 68 result = result * 10 + FXSYS_DecimalCharToInt(str.CharAt(i)); 69 70 return result; 71 } 72 StringDataAdd(WideString str)73static WideString StringDataAdd(WideString str) { 74 WideString ret; 75 int len = str.GetLength(); 76 wchar_t value = 1; 77 for (int i = len - 1; i >= 0; --i) { 78 wchar_t ch = str[i] + value; 79 if (ch < str[i]) { 80 ret.InsertAtFront(0); 81 } else { 82 ret.InsertAtFront(ch); 83 value = 0; 84 } 85 } 86 if (value) 87 ret.InsertAtFront(value); 88 return ret; 89 } 90 91 // Static. StringToWideString(const ByteStringView & str)92WideString CPDF_ToUnicodeMap::StringToWideString(const ByteStringView& str) { 93 int len = str.GetLength(); 94 if (len == 0) 95 return WideString(); 96 97 WideString result; 98 if (str[0] == '<') { 99 int byte_pos = 0; 100 wchar_t ch = 0; 101 for (int i = 1; i < len && std::isxdigit(str[i]); ++i) { 102 ch = ch * 16 + FXSYS_HexCharToInt(str[i]); 103 byte_pos++; 104 if (byte_pos == 4) { 105 result += ch; 106 byte_pos = 0; 107 ch = 0; 108 } 109 } 110 return result; 111 } 112 return result; 113 } 114 CPDF_ToUnicodeMap()115CPDF_ToUnicodeMap::CPDF_ToUnicodeMap() : m_pBaseMap(nullptr) {} 116 ~CPDF_ToUnicodeMap()117CPDF_ToUnicodeMap::~CPDF_ToUnicodeMap() {} 118 GetUnicode()119uint32_t CPDF_ToUnicodeMap::GetUnicode() { 120 FX_SAFE_UINT32 uni = m_MultiCharBuf.GetLength(); 121 uni = uni * 0x10000 + 0xffff; 122 return uni.ValueOrDefault(0); 123 } 124 Load(CPDF_Stream * pStream)125void CPDF_ToUnicodeMap::Load(CPDF_Stream* pStream) { 126 CIDSet cid_set = CIDSET_UNKNOWN; 127 auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(pStream); 128 pAcc->LoadAllDataFiltered(); 129 CPDF_SimpleParser parser(pAcc->GetData(), pAcc->GetSize()); 130 while (1) { 131 ByteStringView word = parser.GetWord(); 132 if (word.IsEmpty()) { 133 break; 134 } 135 if (word == "beginbfchar") { 136 while (1) { 137 word = parser.GetWord(); 138 if (word.IsEmpty() || word == "endbfchar") { 139 break; 140 } 141 uint32_t srccode = StringToCode(word); 142 word = parser.GetWord(); 143 WideString destcode = StringToWideString(word); 144 int len = destcode.GetLength(); 145 if (len == 0) { 146 continue; 147 } 148 if (len == 1) { 149 m_Map[srccode] = destcode[0]; 150 } else { 151 m_Map[srccode] = GetUnicode(); 152 m_MultiCharBuf.AppendChar(destcode.GetLength()); 153 m_MultiCharBuf << destcode; 154 } 155 } 156 } else if (word == "beginbfrange") { 157 while (1) { 158 ByteString low, high; 159 low = parser.GetWord(); 160 if (low.IsEmpty() || low == "endbfrange") { 161 break; 162 } 163 high = parser.GetWord(); 164 uint32_t lowcode = StringToCode(low.AsStringView()); 165 uint32_t highcode = 166 (lowcode & 0xffffff00) | (StringToCode(high.AsStringView()) & 0xff); 167 if (highcode == (uint32_t)-1) { 168 break; 169 } 170 ByteString start(parser.GetWord()); 171 if (start == "[") { 172 for (uint32_t code = lowcode; code <= highcode; code++) { 173 ByteString dest(parser.GetWord()); 174 WideString destcode = StringToWideString(dest.AsStringView()); 175 int len = destcode.GetLength(); 176 if (len == 0) { 177 continue; 178 } 179 if (len == 1) { 180 m_Map[code] = destcode[0]; 181 } else { 182 m_Map[code] = GetUnicode(); 183 m_MultiCharBuf.AppendChar(destcode.GetLength()); 184 m_MultiCharBuf << destcode; 185 } 186 } 187 parser.GetWord(); 188 } else { 189 WideString destcode = StringToWideString(start.AsStringView()); 190 int len = destcode.GetLength(); 191 uint32_t value = 0; 192 if (len == 1) { 193 value = StringToCode(start.AsStringView()); 194 for (uint32_t code = lowcode; code <= highcode; code++) { 195 m_Map[code] = value++; 196 } 197 } else { 198 for (uint32_t code = lowcode; code <= highcode; code++) { 199 WideString retcode; 200 if (code == lowcode) { 201 retcode = destcode; 202 } else { 203 retcode = StringDataAdd(destcode); 204 } 205 m_Map[code] = GetUnicode(); 206 m_MultiCharBuf.AppendChar(retcode.GetLength()); 207 m_MultiCharBuf << retcode; 208 destcode = retcode; 209 } 210 } 211 } 212 } 213 } else if (word == "/Adobe-Korea1-UCS2") { 214 cid_set = CIDSET_KOREA1; 215 } else if (word == "/Adobe-Japan1-UCS2") { 216 cid_set = CIDSET_JAPAN1; 217 } else if (word == "/Adobe-CNS1-UCS2") { 218 cid_set = CIDSET_CNS1; 219 } else if (word == "/Adobe-GB1-UCS2") { 220 cid_set = CIDSET_GB1; 221 } 222 } 223 if (cid_set) { 224 m_pBaseMap = CPDF_ModuleMgr::Get() 225 ->GetPageModule() 226 ->GetFontGlobals() 227 ->GetCMapManager() 228 ->GetCID2UnicodeMap(cid_set, false); 229 } else { 230 m_pBaseMap = nullptr; 231 } 232 } 233