1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Defines an iterator class that enumerates words supported by our spellchecker 6 // from multi-language text. This class is used for filtering out characters 7 // not supported by our spellchecker. 8 9 #ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ 10 #define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ 11 12 #include <string> 13 14 #include "base/basictypes.h" 15 #include "base/memory/scoped_ptr.h" 16 #include "base/strings/string16.h" 17 #include "third_party/icu/source/common/unicode/uscript.h" 18 19 namespace base { 20 namespace i18n { 21 class BreakIterator; 22 } // namespace i18n 23 } // namespace base 24 25 // A class which encapsulates language-specific operations used by 26 // SpellcheckWordIterator. When we set the spellchecker language, this class 27 // creates rule sets that filter out the characters not supported by the 28 // spellchecker. (Please read the comment in the SpellcheckWordIterator class 29 // about how to use this class.) 30 class SpellcheckCharAttribute { 31 public: 32 SpellcheckCharAttribute(); 33 ~SpellcheckCharAttribute(); 34 35 // Sets the language of the spellchecker. When this function is called with an 36 // ISO language code, this function creates the custom rule-sets used by 37 // the ICU break iterator so it can extract only words used by the language. 38 // GetRuleSet() returns the rule-sets created in this function. 39 void SetDefaultLanguage(const std::string& language); 40 41 // Returns a custom rule-set string used by the ICU break iterator. This class 42 // has two rule-sets, one splits a contraction and the other does not, so we 43 // can split a concaticated word (e.g. "seven-year-old") into words (e.g. 44 // "seven", "year", and "old") and check their spellings. The result stirng is 45 // encoded in UTF-16 since ICU needs UTF-16 strings. 46 base::string16 GetRuleSet(bool allow_contraction) const; 47 48 // Outputs a character only if it is a word character. (Please read the 49 // comments in CreateRuleSets() why we need this function.) 50 bool OutputChar(UChar c, base::string16* output) const; 51 52 private: 53 // Creates the rule-sets that return words possibly used by the given 54 // language. Unfortunately, these rule-sets are not perfect and have some 55 // false-positives. For example, they return combined accent marks even though 56 // we need English words only. We call OutputCharacter() to filter out such 57 // false-positive characters. 58 void CreateRuleSets(const std::string& language); 59 60 // Outputs a character only if it is one used by the given language. These 61 // functions are called from OutputChar(). 62 bool OutputArabic(UChar c, base::string16* output) const; 63 bool OutputHangul(UChar c, base::string16* output) const; 64 bool OutputHebrew(UChar c, base::string16* output) const; 65 bool OutputDefault(UChar c, base::string16* output) const; 66 67 // The custom rule-set strings used by ICU break iterator. Since it is not so 68 // easy to create custom rule-sets from an ISO language code, this class 69 // saves these rule-set strings created when we set the language. 70 base::string16 ruleset_allow_contraction_; 71 base::string16 ruleset_disallow_contraction_; 72 73 // The script code used by this language. 74 UScriptCode script_code_; 75 76 DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); 77 }; 78 79 // A class which extracts words that can be checked for spelling from a 80 // multi-language string. The ICU word-break iterator does not discard some 81 // punctuation characters attached to a word. For example, when we set a word 82 // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does 83 // it discard characters not used by the language. For example, it returns 84 // Russian words even though we need English words only. To extract only the 85 // words that our spellchecker can check their spellings, this class uses custom 86 // rule-sets created by the SpellcheckCharAttribute class. Also, this class 87 // normalizes extracted words so our spellchecker can check the spellings of 88 // words that include ligatures, combined characters, full-width characters, 89 // etc. This class uses UTF-16 strings as its input and output strings since 90 // UTF-16 is the native encoding of ICU and avoid unnecessary conversions 91 // when changing the encoding of this string for our spellchecker. (Chrome can 92 // use two or more spellcheckers and we cannot assume their encodings.) 93 // The following snippet is an example that extracts words with this class. 94 // 95 // // Creates the language-specific attributes for US English. 96 // SpellcheckCharAttribute attribute; 97 // attribute.SetDefaultLanguage("en-US"); 98 // 99 // // Set up a SpellcheckWordIterator object which extracts English words, 100 // // and retrieve them. 101 // SpellcheckWordIterator iterator; 102 // base::string16 text(base::UTF8ToUTF16("this is a test.")); 103 // iterator.Initialize(&attribute, true); 104 // iterator.SetText(text.c_str(), text_.length()); 105 // 106 // base::string16 word; 107 // int offset; 108 // int length; 109 // while (iterator.GetNextWord(&word, &offset, &length)) { 110 // ... 111 // } 112 // 113 class SpellcheckWordIterator { 114 public: 115 SpellcheckWordIterator(); 116 ~SpellcheckWordIterator(); 117 118 // Initializes a word-iterator object with the language-specific attribute. If 119 // we need to split contractions and concatenated words, call this function 120 // with its 'allow_contraction' parameter false. (This function uses lots of 121 // temporal memory to compile a custom word-break rule into an automaton.) 122 bool Initialize(const SpellcheckCharAttribute* attribute, 123 bool allow_contraction); 124 125 // Returns whether this word iterator is initialized. 126 bool IsInitialized() const; 127 128 // Set text to be iterated. (This text does not have to be NULL-terminated.) 129 // This function also resets internal state so we can reuse this iterator 130 // without calling Initialize(). 131 bool SetText(const base::char16* text, size_t length); 132 133 // Retrieves a word (or a contraction), stores its copy to 'word_string', and 134 // stores the position and the length for input word to 'word_start'. Since 135 // this function normalizes the output word, the length of 'word_string' may 136 // be different from the 'word_length'. Therefore, when we call functions that 137 // changes the input text, such as string16::replace(), we need to use 138 // 'word_start' and 'word_length' as listed in the following snippet. 139 // 140 // while(iterator.GetNextWord(&word, &offset, &length)) 141 // text.replace(offset, length, word); 142 // 143 bool GetNextWord(base::string16* word_string, 144 int* word_start, 145 int* word_length); 146 147 // Releases all the resources attached to this object. 148 void Reset(); 149 150 private: 151 // Normalizes a non-terminated string returned from an ICU word-break 152 // iterator. A word returned from an ICU break iterator may include characters 153 // not supported by our spellchecker, e.g. ligatures, combining/ characters, 154 // full-width letters, etc. This function replaces such characters with 155 // alternative characters supported by our spellchecker. This function also 156 // calls SpellcheckWordIterator::OutputChar() to filter out false-positive 157 // characters. 158 bool Normalize(int input_start, 159 int input_length, 160 base::string16* output_string) const; 161 162 // The pointer to the input string from which we are extracting words. 163 const base::char16* text_; 164 165 // The language-specific attributes used for filtering out non-word 166 // characters. 167 const SpellcheckCharAttribute* attribute_; 168 169 // The break iterator. 170 scoped_ptr<base::i18n::BreakIterator> iterator_; 171 172 DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); 173 }; 174 175 #endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ 176 177