# Patch summary (commentary placed ahead of the first "diff --git" header; the
# diff text below is kept exactly as found).
#
# NOTE(review): the line breaks of this patch appear to have been collapsed
# onto two physical lines; the original line structure (blank context lines,
# indentation) is not recoverable from here -- confirm against the source patch
# before trying to apply it.
#
# Files touched, all under source/data/brkitr/:
#   brklocal.mk - BRK_CTD_SOURCE drops cjdict.txt (only thaidict.txt remains);
#                 BRK_SOURCE gains word_ja.txt; BRK_RES_SOURCE gains ja.txt;
#                 the "Chrome change" notes are reworded to mention only
#                 line_he.txt / he.txt as removed.
#   root.txt    - removes the Hani, Hira and Kata entries that referenced
#                 "cjdict.ctd" from dictionaries{}, leaving the Thai entry.
#   word.txt    - removes $dictionaryCJK; $dictionary becomes
#                 [:LineBreak = Complex_Context:]; $ALetterPlus no longer
#                 subtracts $dictionaryCJK; the two rules using
#                 [^$CR $LF $Newline $dictionaryCJK] drop $dictionaryCJK from
#                 the negated set; and the "$HangulSyllable $HangulSyllable" /
#                 "$KanaKanji $KanaKanji" chaining rules are deleted in both
#                 places where this diff shows them.
diff --git a/source/data/brkitr/brklocal.mk b/source/data/brkitr/brklocal.mk index 91754f1..ccac4d1 100644 --- a/source/data/brkitr/brklocal.mk +++ b/source/data/brkitr/brklocal.mk @@ -34,15 +34,15 @@ BRK_RES_ALIAS_SOURCE = $(BRK_RES_SYNTHETIC_ALIAS) # List of compact trie dictionary files (ctd). -BRK_CTD_SOURCE = thaidict.txt cjdict.txt +BRK_CTD_SOURCE = thaidict.txt # List of break iterator files (brk). -# Chrome change: remove word_ja.txt and line_he.txt -BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt char.txt word.txt line.txt sent.txt title.txt char_th.txt +# Chrome change: remove line_he.txt +BRK_SOURCE = sent_el.txt word_POSIX.txt line_fi.txt word_ja.txt char.txt word.txt line.txt sent.txt title.txt char_th.txt # Ordinary resources -# Chrome change: remove ja.txt and he.txt +# Chrome change: remove he.txt BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt\ - fi.txt th.txt + fi.txt ja.txt th.txt diff --git a/source/data/brkitr/root.txt b/source/data/brkitr/root.txt index fb83ac3..5d839bd 100644 --- a/source/data/brkitr/root.txt +++ b/source/data/brkitr/root.txt @@ -17,8 +17,5 @@ root{ } dictionaries{ Thai:process(dependency){"thaidict.ctd"} - Hani:process(dependency){"cjdict.ctd"} - Hira:process(dependency){"cjdict.ctd"} - Kata:process(dependency){"cjdict.ctd"} } } diff --git a/source/data/brkitr/word.txt b/source/data/brkitr/word.txt index 0b49377..a0e1ceb 100644 --- a/source/data/brkitr/word.txt +++ b/source/data/brkitr/word.txt @@ -60,11 +60,10 @@ $Control = [\p{Grapheme_Cluster_Break = Control}]; $HangulSyllable = [\uac00-\ud7a3]; $ComplexContext = [:LineBreak = Complex_Context:]; $KanaKanji = [$Han $Hiragana $Katakana]; -$dictionaryCJK = [$KanaKanji $HangulSyllable]; -$dictionary = [$ComplexContext $dictionaryCJK]; +$dictionary = [:LineBreak = Complex_Context:]; -# leave CJK scripts out of ALetterPlus -$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: 
default ALetter does not + # include the dictionary characters. # @@ -99,8 +98,7 @@ $CR $LF; # begins with a group of Format chars, or with a "word" consisting of a single # char that is not in any of the listed word break categories followed by # format char(s). - # format char(s), or is not a CJK dictionary character. -[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+; +[^$CR $LF $Newline]? ($Extend | $Format)+; $NumericEx {100}; $ALetterEx {200}; @@ -155,9 +153,6 @@ $ExtendNumLetEx $ALetterEx {200}; # (13b) $ExtendNumLetEx $NumericEx {100}; # (13b) $ExtendNumLetEx $KatakanaEx {400}; # (13b) -# special handling for CJK characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable {200}; -$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found ## ------------------------------------------------- @@ -179,7 +174,7 @@ $BackHebrewLetEx = ($Format | $Extend)* $HebrewLet; $LF $CR; # rule 4 -($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?; +($Format | $Extend)* [^$CR $LF $Newline]?; # rule 5 @@ -217,10 +212,6 @@ $BackKatakanaEx $BackKatakanaEx; $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; -# special handling for CJK characters: chain for later dictionary segmentation -$HangulSyllable $HangulSyllable; -$KanaKanji $KanaKanji; #different rule status if both kanji and kana found - ## ------------------------------------------------- !!safe_reverse;