diff --git a/source/data/brkitr/rules/word.txt b/source/data/brkitr/rules/word.txt index e9420c8c..b4603823 100644 --- a/source/data/brkitr/rules/word.txt +++ b/source/data/brkitr/rules/word.txt @@ -38,12 +38,34 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; $Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; + +# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. +# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up +# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In +# light of the Chromium-specific change below that breaks on full-stop (period, +# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. +# Thus, although the upstream intent was to not break "user.name@example.com" at +# all, it actually would break down into {"user", ".", "name@example", ".", +# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the +# previous Chromium behavior of breaking at both '@' and '.'. +# +# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors +# (and thus this patch) for '.' and now '@'. $ALetter = [\p{Word_Break = ALetter}]; + $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; -$MidNumLet = [\p{Word_Break = MidNumLet}]; + +# Remove two full stop characters from $MidNumLet and add them to $MidNum +# to break a hostname into its components at the cost of breaking +# 'e.g.' and 'i.e.' as well. +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected +# while rules 6/7 are reverted to the old behavior we want. +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; $MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; -$MidNum = [\p{Word_Break = MidNum}]; +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; + $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; $WSegSpace = [\p{Word_Break = WSegSpace}]; diff --git a/source/data/brkitr/rules/word_POSIX.txt b/source/data/brkitr/rules/word_POSIX.txt index 3cd0556e..8e63ee4c 100644 --- a/source/data/brkitr/rules/word_POSIX.txt +++ b/source/data/brkitr/rules/word_POSIX.txt @@ -38,12 +38,29 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; $Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; + +# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. +# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up +# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In +# light of the Chromium-specific change below that breaks on full-stop (period, +# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. +# Thus, although the upstream intent was to not break "user.name@example.com" at +# all, it actually would break down into {"user", ".", "name@example", ".", +# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the +# previous Chromium behavior of breaking at both '@' and '.'. +# +# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors +# (and thus this patch) for '.' and now '@'. $ALetter = [\p{Word_Break = ALetter}]; + $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; -$MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; +# Remove full-width full stop (\uff0e) from $MidNumLet and add it to $MidNum, in +# addition to the ordinary full stop (dot, period, '.', \u002e). +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; $MidLetter = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]; -$MidNum = [\p{Word_Break = MidNum} [.]]; +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; + $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; $WSegSpace = [\p{Word_Break = WSegSpace}]; diff --git a/source/data/brkitr/rules/word_fi_sv.txt b/source/data/brkitr/rules/word_fi_sv.txt index daf5b355..ca2decfc 100644 --- a/source/data/brkitr/rules/word_fi_sv.txt +++ b/source/data/brkitr/rules/word_fi_sv.txt @@ -38,12 +38,33 @@ $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; $Format = [\p{Word_Break = Format}]; $Katakana = [\p{Word_Break = Katakana}]; $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; + +# Exclude '@' (commercial at, \u0040) from ALetter to maintain breaking at '@'. +# ICU 49d192fefe09, in ICU 72, stopped breaking at '@' in order to not break up +# e-mail addresses (https://unicode-org.atlassian.net/browse/CLDR-15767). In +# light of the Chromium-specific change below that breaks on full-stop (period, +# dot, \u002e below in MidNumLet), e-mail addresses will be broken in any case. +# Thus, although the upstream intent was to not break "user.name@example.com" at +# all, it actually would break down into {"user", ".", "name@example", ".", +# "com"}, which is undesirable. See https://crbug.com/1410331. Maintain the +# previous Chromium behavior of breaking at both '@' and '.'. +# +# TODO: Determine whether it's feasible to drop the Chromium-specific behaviors +# (and thus this patch) for '.' and now '@'. $ALetter = [\p{Word_Break = ALetter}]; + $Single_Quote = [\p{Word_Break = Single_Quote}]; $Double_Quote = [\p{Word_Break = Double_Quote}]; -$MidNumLet = [\p{Word_Break = MidNumLet}]; +# Remove two full stop characters from $MidNumLet and add them to $MidNum +# to break a hostname into its components at the cost of breaking +# 'e.g.' and 'i.e.' as well. +# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12. +# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected +# while rules 6/7 are reverted to the old behavior we want. +$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]]; $MidLetter = [\p{Word_Break = MidLetter}]; -$MidNum = [\p{Word_Break = MidNum}]; +$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]]; + $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; $WSegSpace = [\p{Word_Break = WSegSpace}];