• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Defines an iterator class that enumerates words supported by our spellchecker
6 // from multi-language text. This class is used for filtering out characters
7 // not supported by our spellchecker.
8 
9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
11 
12 #include <string>
13 
14 #include "base/basictypes.h"
15 #include "base/strings/string16.h"
16 #include "third_party/icu/source/common/unicode/ubrk.h"
17 #include "third_party/icu/source/common/unicode/uscript.h"
18 
19 // A class which encapsulates language-specific operations used by
20 // SpellcheckWordIterator. When we set the spellchecker language, this class
21 // creates rule-sets that filter out the characters not supported by the
22 // spellchecker. (Please read the comment on the SpellcheckWordIterator class
23 // about how to use this class.)
24 class SpellcheckCharAttribute {
25  public:
26   SpellcheckCharAttribute();
27   ~SpellcheckCharAttribute();
28 
29   // Sets the language of the spellchecker. When this function is called with an
30   // ISO language code, this function creates the custom rule-sets used by
31   // the ICU break iterator so it can extract only words used by the language.
32   // GetRuleSet() returns the rule-sets created in this function.
33   void SetDefaultLanguage(const std::string& language);
34 
35   // Returns a custom rule-set string used by the ICU break iterator. This class
36   // has two rule-sets, one splits a contraction and the other does not, so we
37   // can split a concatenated word (e.g. "seven-year-old") into words (e.g.
38   // "seven", "year", and "old") and check their spellings. The result string is
39   // encoded in UTF-16 since ICU needs UTF-16 strings.
40   base::string16 GetRuleSet(bool allow_contraction) const;
41 
42   // Outputs a character only if it is a word character. (Please read the
43   // comments on CreateRuleSets() to see why we need this function.)
44   bool OutputChar(UChar c, base::string16* output) const;
45 
46  private:
47   // Creates the rule-sets that return words possibly used by the given
48   // language. Unfortunately, these rule-sets are not perfect and have some
49   // false-positives. For example, they return combined accent marks even though
50   // we need English words only. We call OutputChar() to filter out such
51   // false-positive characters.
52   void CreateRuleSets(const std::string& language);
53 
54   // Outputs a character only if it is one used by the given language. These
55   // functions are called from OutputChar().
56   bool OutputArabic(UChar c, base::string16* output) const;
57   bool OutputHangul(UChar c, base::string16* output) const;
58   bool OutputHebrew(UChar c, base::string16* output) const;
59   bool OutputDefault(UChar c, base::string16* output) const;
60 
61   // The custom rule-set strings used by the ICU break iterator. Since it is not
62   // so easy to create custom rule-sets from an ISO language code, this class
63   // saves these rule-set strings created when we set the language.
64   base::string16 ruleset_allow_contraction_;
65   base::string16 ruleset_disallow_contraction_;
66 
67   // The script code used by this language.
68   UScriptCode script_code_;
69 
70   DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute);
71 };
72 
73 // A class which extracts words that can be checked for spelling from a
74 // multi-language string. The ICU word-break iterator does not discard some
75 // punctuation characters attached to a word. For example, when we set a word
76 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
77 // it discard characters not used by the language. For example, it returns
78 // Russian words even though we need English words only. To extract only the
79 // words whose spellings our spellchecker can check, this class uses custom
80 // rule-sets created by the SpellcheckCharAttribute class. Also, this class
81 // normalizes extracted words so our spellchecker can check the spellings of
82 // words that include ligatures, combined characters, full-width characters,
83 // etc. This class uses UTF-16 strings as its input and output strings since
84 // UTF-16 is the native encoding of ICU, which avoids unnecessary conversions
85 // when changing the encoding of this string for our spellchecker. (Chrome can
86 // use two or more spellcheckers and we cannot assume their encodings.)
87 // The following snippet is an example that extracts words with this class.
88 //
89 //   // Creates the language-specific attributes for US English.
90 //   SpellcheckCharAttribute attribute;
91 //   attribute.SetDefaultLanguage("en-US");
92 //
93 //   // Set up a SpellcheckWordIterator object which extracts English words,
94 //   // and retrieve them.
95 //   SpellcheckWordIterator iterator;
96 //   base::string16 text(UTF8ToUTF16("this is a test."));
97 //   iterator.Initialize(&attribute, true);
98 //   iterator.SetText(text.c_str(), text.length());
99 //
100 //   base::string16 word;
101 //   int offset;
102 //   int length;
103 //   while (iterator.GetNextWord(&word, &offset, &length)) {
104 //     ...
105 //   }
106 //
107 class SpellcheckWordIterator {
108  public:
109   SpellcheckWordIterator();
110   ~SpellcheckWordIterator();
111 
112   // Initializes a word-iterator object with the language-specific attribute. If
113   // we need to split contractions and concatenated words, call this function
114   // with its 'allow_contraction' parameter false. (This function uses lots of
115   // temporary memory to compile a custom word-break rule into an automaton.)
116   bool Initialize(const SpellcheckCharAttribute* attribute,
117                   bool allow_contraction);
118 
119   // Returns whether this word iterator is initialized.
120   bool IsInitialized() const;
121 
122   // Sets the text to be iterated. (This text does not have to be
123   // NULL-terminated.) This function also resets internal state so we can reuse
124   // this iterator without calling Initialize() again.
125   bool SetText(const char16* text, size_t length);
126 
127   // Retrieves a word (or a contraction), stores its copy to 'word_string', and
128   // stores the position and the length of the input word to 'word_start' and
129   // 'word_length'. Since this function normalizes the output word, the length
130   // of 'word_string' may be different from 'word_length'. Therefore, when we
131   // call functions that change the input text, such as string16::replace(), we
132   // need to use 'word_start' and 'word_length' as in the following snippet.
133   //
134   //   while(iterator.GetNextWord(&word, &offset, &length))
135   //     text.replace(offset, length, word);
136   //
137   bool GetNextWord(base::string16* word_string,
138                    int* word_start,
139                    int* word_length);
140 
141   // Releases all the resources attached to this object.
142   void Reset();
143 
144  private:
145   // Normalizes a non-terminated string returned from an ICU word-break
146   // iterator. A word returned from an ICU break iterator may include characters
147   // not supported by our spellchecker, e.g. ligatures, combining characters,
148   // full-width letters, etc. This function replaces such characters with
149   // alternative characters supported by our spellchecker. This function also
150   // calls SpellcheckCharAttribute::OutputChar() to filter out false-positive
151   // characters.
152   bool Normalize(int input_start,
153                  int input_length,
154                  base::string16* output_string) const;
155 
156   // The pointer to the input string from which we are extracting words.
157   const char16* text_;
158 
159   // The length of the original string.
160   int length_;
161 
162   // The current position in the original string.
163   int position_;
164 
165   // The language-specific attributes used for filtering out non-word
166   // characters.
167   const SpellcheckCharAttribute* attribute_;
168 
169   // The ICU break iterator.
170   UBreakIterator* iterator_;
171 
172   DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator);
173 };
174 
175 #endif  // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_
176 
177