1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/tools/convert_dict/aff_reader.h"
6
7 #include <algorithm>
8
9 #include "base/files/file_util.h"
10 #include "base/i18n/icu_string_conversions.h"
11 #include "base/strings/string_split.h"
12 #include "base/strings/stringprintf.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "chrome/tools/convert_dict/hunspell_reader.h"
15
16 namespace convert_dict {
17
18 namespace {
19
20 // Returns true if the given line begins with the given case-sensitive
21 // NULL-terminated ASCII string.
bool StringBeginsWith(const std::string& str, const char* with) {
  // Walk the prefix one character at a time. The prefix matches only if every
  // one of its characters is present at the same position in |str|; running
  // out of |str| before the prefix's terminating NUL is a mismatch.
  for (size_t pos = 0; with[pos] != 0; ++pos) {
    if (pos >= str.size() || str[pos] != with[pos])
      return false;
  }
  // Reached the terminator without a mismatch (an empty prefix always
  // matches).
  return true;
}
31
32 // Collapses runs of spaces to only one space.
// Predicate for std::unique: true when |a| and |b| are both the space
// character, i.e. |b| is a redundant space following another space.
bool IsRepeatedSpace(char a, char b) {
  return a == ' ' && b == ' ';
}

void CollapseDuplicateSpaces(std::string* str) {
  // Only ' ' is collapsed; tabs and other whitespace are left untouched.
  // std::unique compacts the string in a single pass and returns the new
  // logical end, so the leftover tail is dropped with one erase() instead of
  // erasing (and shifting the remainder) once per duplicate space.
  str->erase(std::unique(str->begin(), str->end(), IsRepeatedSpace),
             str->end());
}
47
48 // Print an error message and terminate execution
void Panic(const char* fmt, ...) {
  // Output goes to stdout as "ERROR: <formatted message>\n".
  fputs("ERROR: ", stdout);

  // |fmt| and the variadic arguments follow printf conventions.
  va_list args;
  va_start(args, fmt);
  vprintf(fmt, args);
  va_end(args);

  putchar('\n');

  // Never returns: the process ends with exit status 1.
  exit(1);
}
58
59 } // namespace
60
AffReader(const base::FilePath & path)61 AffReader::AffReader(const base::FilePath& path)
62 : has_indexed_affixes_(false) {
63 file_ = base::OpenFile(path, "r");
64
65 // Default to Latin1 in case the file doesn't specify it.
66 encoding_ = "ISO8859-1";
67 }
68
~AffReader()69 AffReader::~AffReader() {
70 if (file_)
71 base::CloseFile(file_);
72 }
73
Read()74 bool AffReader::Read() {
75 if (!file_)
76 return false;
77
78 // TODO(brettw) handle byte order mark.
79
80 bool got_command = false;
81 bool got_first_af = false;
82 bool got_first_rep = false;
83
84 has_indexed_affixes_ = false;
85
86 while (!feof(file_)) {
87 std::string line = ReadLine(file_);
88
89 // Save comment lines before any commands.
90 if (!got_command && !line.empty() && line[0] == '#') {
91 intro_comment_.append(line);
92 intro_comment_.push_back('\n');
93 continue;
94 }
95
96 StripComment(&line);
97 if (line.empty())
98 continue;
99 got_command = true;
100
101 if (StringBeginsWith(line, "SET ")) {
102 // Character set encoding.
103 encoding_ = line.substr(4);
104 TrimLine(&encoding_);
105 } else if (StringBeginsWith(line, "AF ")) {
106 // Affix. The first one is the number of ones following which we don't
107 // bother with.
108 has_indexed_affixes_ = true;
109 if (got_first_af) {
110 std::string group(line.substr(3));
111 AddAffixGroup(&group);
112 } else {
113 got_first_af = true;
114 }
115 } else if (StringBeginsWith(line, "SFX ") ||
116 StringBeginsWith(line, "PFX ")) {
117 AddAffix(&line);
118 } else if (StringBeginsWith(line, "REP ")) {
119 // The first rep line is the number of ones following which we don't
120 // bother with.
121 if (got_first_rep) {
122 std::string replacement(line.substr(4));
123 AddReplacement(&replacement);
124 } else {
125 got_first_rep = true;
126 }
127 } else if (StringBeginsWith(line, "TRY ") ||
128 StringBeginsWith(line, "MAP ")) {
129 HandleEncodedCommand(line);
130 } else if (StringBeginsWith(line, "IGNORE ")) {
131 Panic("We don't support the IGNORE command yet. This would change how "
132 "we would insert things in our lookup table.");
133 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
134 Panic("We don't support the COMPLEXPREFIXES command yet. This would "
135 "mean we have to insert words backwards as well (I think)");
136 } else {
137 // All other commands get stored in the other commands list.
138 HandleRawCommand(line);
139 }
140 }
141
142 return true;
143 }
144
EncodingToUTF8(const std::string & encoded,std::string * utf8) const145 bool AffReader::EncodingToUTF8(const std::string& encoded,
146 std::string* utf8) const {
147 std::wstring wide_word;
148 if (!base::CodepageToWide(encoded, encoding(),
149 base::OnStringConversionError::FAIL, &wide_word))
150 return false;
151 *utf8 = base::WideToUTF8(wide_word);
152 return true;
153 }
154
GetAFIndexForAFString(const std::string & af_string)155 int AffReader::GetAFIndexForAFString(const std::string& af_string) {
156 std::map<std::string, int>::iterator found = affix_groups_.find(af_string);
157 if (found != affix_groups_.end())
158 return found->second;
159 std::string my_string(af_string);
160 return AddAffixGroup(&my_string);
161 }
162
163 // We convert the data from our map to an indexed list, and also prefix each
164 // line with "AF" for the parser to read later.
GetAffixGroups() const165 std::vector<std::string> AffReader::GetAffixGroups() const {
166 int max_id = 0;
167 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
168 i != affix_groups_.end(); ++i) {
169 if (i->second > max_id)
170 max_id = i->second;
171 }
172
173 std::vector<std::string> ret;
174
175 ret.resize(max_id);
176 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
177 i != affix_groups_.end(); ++i) {
178 // Convert the indices into 1-based.
179 ret[i->second - 1] = std::string("AF ") + i->first;
180 }
181
182 return ret;
183 }
184
AddAffixGroup(std::string * rule)185 int AffReader::AddAffixGroup(std::string* rule) {
186 TrimLine(rule);
187
188 // We use the 1-based index of the rule. This matches the way Hunspell
189 // refers to the numbers.
190 int affix_id = static_cast<int>(affix_groups_.size()) + 1;
191 affix_groups_.insert(std::make_pair(*rule, affix_id));
192 return affix_id;
193 }
194
AddAffix(std::string * rule)195 void AffReader::AddAffix(std::string* rule) {
196 TrimLine(rule);
197 CollapseDuplicateSpaces(rule);
198
199 // These lines have two forms:
200 // AFX D Y 4 <- First line, lists how many affixes for "D" there are.
201 // AFX D 0 d e <- Following lines.
202 // We want to ensure the two last groups on the last line are encoded in
203 // UTF-8, and we want to make sure that the affix identifier "D" is *not*
204 // encoded, since that's basically an 8-bit identifier.
205
206 // Count to the third space. Everything after that will be re-encoded. This
207 // will re-encode the number on the first line, but that will be a NOP. If
208 // there are not that many groups, we won't reencode it, but pass it through.
209 int found_spaces = 0;
210 std::string token;
211 for (size_t i = 0; i < rule->length(); i++) {
212 if ((*rule)[i] == ' ') {
213 found_spaces++;
214 if (found_spaces == 3) {
215 size_t part_start = i;
216 std::string part;
217 if (token[0] != 'Y' && token[0] != 'N') {
218 // This token represents a stripping prefix or suffix, which is
219 // either a length or a string to be replaced.
220 // We also reencode them to UTF-8.
221 part_start = i - token.length();
222 }
223 part = rule->substr(part_start); // From here to end.
224
225 if (part.find('-') != std::string::npos) {
226 // This rule has a morph rule used by old Hungarian dictionaries.
227 // When a line has a morph rule, its format becomes as listed below.
228 // AFX D 0 d e - M
229 // To make hunspell work more happily, replace this morph rule with
230 // a compound flag as listed below.
231 // AFX D 0 d/M e
232 std::vector<std::string> tokens;
233 base::SplitString(part, ' ', &tokens);
234 if (tokens.size() >= 5) {
235 part = base::StringPrintf("%s %s/%s %s",
236 tokens[0].c_str(),
237 tokens[1].c_str(),
238 tokens[4].c_str(),
239 tokens[2].c_str());
240 }
241 }
242
243 size_t slash_index = part.find('/');
244 if (slash_index != std::string::npos && !has_indexed_affixes()) {
245 // This can also have a rule string associated with it following a
246 // slash. For example:
247 // PFX P 0 foo/Y .
248 // The "Y" is a flag. For example, the aff file might have a line:
249 // COMPOUNDFLAG Y
250 // so that means that this prefix would be a compound one.
251 //
252 // It expects these rules to use the same alias rules as the .dic
253 // file. We've forced it to use aliases, which is a numerical index
254 // instead of these character flags, and this needs to be consistent.
255
256 std::string before_flags = part.substr(0, slash_index + 1);
257
258 // After the slash are both the flags, then whitespace, then the part
259 // that tells us what to strip.
260 std::vector<std::string> after_slash;
261 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
262 if (after_slash.size() == 0) {
263 Panic("Found 0 terms after slash in affix rule '%s', "
264 "but need at least 2.",
265 part.c_str());
266 }
267 if (after_slash.size() == 1) {
268 printf("WARNING: Found 1 term after slash in affix rule '%s', "
269 "but expected at least 2. Adding '.'.\n",
270 part.c_str());
271 after_slash.push_back(".");
272 }
273 // Note that we may get a third term here which is the morphological
274 // description of this rule. This happens in the tests only, so we can
275 // just ignore it.
276
277 part = base::StringPrintf("%s%d %s",
278 before_flags.c_str(),
279 GetAFIndexForAFString(after_slash[0]),
280 after_slash[1].c_str());
281 }
282
283 // Reencode from here
284 std::string reencoded;
285 if (!EncodingToUTF8(part, &reencoded))
286 Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str());
287
288 *rule = rule->substr(0, part_start) + reencoded;
289 break;
290 }
291 token.clear();
292 } else {
293 token.push_back((*rule)[i]);
294 }
295 }
296
297 affix_rules_.push_back(*rule);
298 }
299
AddReplacement(std::string * rule)300 void AffReader::AddReplacement(std::string* rule) {
301 TrimLine(rule);
302 CollapseDuplicateSpaces(rule);
303
304 std::string utf8rule;
305 if (!EncodingToUTF8(*rule, &utf8rule))
306 Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str());
307
308 // The first space separates key and value.
309 size_t space_index = utf8rule.find(' ');
310 if (space_index == std::string::npos)
311 Panic("Did not find a space in '%s'.", utf8rule.c_str());
312
313 std::vector<std::string> split;
314 split.push_back(utf8rule.substr(0, space_index));
315 split.push_back(utf8rule.substr(space_index + 1));
316
317 // Underscores are used to represent spaces in most aff files
318 // (since the line is parsed on spaces).
319 std::replace(split[0].begin(), split[0].end(), '_', ' ');
320 std::replace(split[1].begin(), split[1].end(), '_', ' ');
321
322 replacements_.push_back(std::make_pair(split[0], split[1]));
323 }
324
HandleRawCommand(const std::string & line)325 void AffReader::HandleRawCommand(const std::string& line) {
326 other_commands_.push_back(line);
327 }
328
HandleEncodedCommand(const std::string & line)329 void AffReader::HandleEncodedCommand(const std::string& line) {
330 std::string utf8;
331 if (!EncodingToUTF8(line, &utf8))
332 Panic("Cannot encode command '%s' to utf8.", line.c_str());
333 other_commands_.push_back(utf8);
334 }
335
336 } // namespace convert_dict
337