# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
#
# File: Latn_Kana.txt
# Generated from CLDR
#

# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
# NOTE(review): several ranges below look width/compatibility-normalized
# (e.g. the trailing K-Å sorts after ῡῩ, which suggests U+212A-U+212B was
# intended) -- confirm against the CLDR source before relying on this filter.
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn. Small changes to make unambiguous.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yo: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# | de ~i di
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e fe
# | hu ~o fo
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
# Variables used for doubled-consonants with tsu
$kana = [ぁ-ゔ] ;
$voice = [\u3099゛];
$semivoice = [\u309A゜];
$k_start = [カキクケコかきくけこ] ;
$s_start = [サシスセソさしすせそ] ;
$j_start = [シし] $voice ;
$t_start = [タチツテトたちつてと] ;
$n_start = [ナニヌネノンなにぬねの] ;
$h_start = [ハヒヘホはひへほ] ;
$f_start = [フふ] ;
$m_start = [マミムメモまみむめも] ;
$y_start = [ヤユヨやゆよ] ;
$r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]\u3099 ;
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア ! = ナ, so represent as "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
$small_y = [ャィュェョ] ;
$iteration = ゝ ;
#------------------------------------------------------
# katakana rules
# Punctuation
'.' ↔ 。;
',' ↔ 、;
# ' ' } [a-z] → ; # delete spaces before latin
# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter § marks
# TODO
# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
# ~~~ begin shared rules ~~~
#special
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
#normal
a ↔ ア ;
b | '~' ← ヒ \u3099} $small_y ;
by } $vowel → ヒ\u3099 | '~y' ;
ba ↔ ハ\u3099 ;
bi ↔ ヒ\u3099 ;
bu ↔ フ\u3099 ;
be ↔ ヘ\u3099 ;
bo ↔ ホ\u3099 ;
c } i → | s ;
c } e → | s ;
da ↔ タ\u3099 ;
di ↔ テ\u3099ィ ;
du ↔ テ\u3099ゥ ;
de ↔ テ\u3099 ;
do ↔ ト\u3099 ;
dzu ↔ ツ\u3099 ;
dja ← チ\u3099ャ ;
dji'~i' ← チ\u3099ィ ; # liu
dju ← チ\u3099ュ ;
dje ← チ\u3099ェ ;
djo ← チ\u3099ョ ;
dji ↔ チ\u3099 ;
dj } $vowel → チ\u3099 | '~y' ;
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
cha ← チャ ;
chi'~i' ← チィ ; # liu
chu ← チュ ;
che ← チェ ;
cho ← チョ ;
chi ↔ チ ;
ch } $vowel → チ | '~y' ;
e ↔ エ ;
g | '~' ← キ\u3099} $small_y ;
gy } $vowel → キ\u3099 | '~y' ;
ga ↔ カ\u3099 ;
gi ↔ キ\u3099 ;
gu ↔ ク\u3099 ;
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
# j } $vowel → シ\u3099 | '~y' ;
ja ↔ シ\u3099ャ ;
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
je ↔ シ\u3099ェ ;
jo ↔ シ\u3099ョ ;
ji ↔ シ\u3099 ;
k | '~' ← キ} $small_y ;
ky } $vowel → キ | '~y' ;
ka ↔ カ ;
ki ↔ キ ;
ku ↔ ク ;
ke ↔ ケ ;
ko ↔ コ ;
m | '~' ← ミ} $small_y ;
my } $vowel → ミ | '~y' ;
ma ↔ マ ;
mi ↔ ミ ;
mu ↔ ム ;
me ↔ メ ;
mo ↔ モ ;
m } [pbfv] → ン ;
n | '~' ← ニ } $small_y ;
ny } $vowel → ニ | '~y' ;
na ↔ ナ ;
ni ↔ ニ ;
nu ↔ ヌ ;
ne ↔ ネ ;
no ↔ ノ ;
o ↔ オ ;
p | '~' ← ヒ\u309A } $small_y ;
py } $vowel → ヒ\u309A | '~y' ;
pa ↔ ハ\u309A ;
pi ↔ ヒ\u309A ;
pu ↔ フ\u309A ;
pe ↔ ヘ\u309A ;
po ↔ ホ\u309A ;
h | '~' ← ヒ } $small_y ;
hy } $vowel → ヒ | '~y' ;
ha ↔ ハ ;
hi ↔ ヒ ;
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
# f | '~' ← フ } $small_y ;
# f } $vowel → フ | '~' ;
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
fo ↔ フォ ;
fu ↔ フ ;
r | '~' ← リ } $small_y ;
ry } $vowel → リ | '~y' ;
ra ↔ ラ ;
ri ↔ リ ;
ru ↔ ル ;
re ↔ レ ;
ro ↔ ロ ;
za ↔ サ\u3099 ;
zi ↔ セ\u3099ィ ;
zu ↔ ス\u3099 ;
ze ↔ セ\u3099 ;
zo ↔ ソ\u3099 ;
sa ↔ サ ;
si ↔ セィ ;
su ↔ ス ;
se ↔ セ ;
so ↔ ソ ;
sha ← シャ ;
shi'~i' ← シィ ; # liu
shu ← シュ ;
she ← シェ ;
sho ← ショ ;
shi ↔ シ ;
sh } $vowel → シ | '~y' ;
ta ↔ タ ;
ti ↔ ティ ;
tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
# v } $vowel → ウ\u3099 | '~' ;
#'v~a' ← ウ\u3099ァ ; # liu
#'v~i' ← ウ\u3099ィ ; # liu
#'v~e' ← ウ\u3099ェ ; # liu
#'v~o' ← ウ\u3099ォ ; # liu
vu ↔ ウ\u3099 ;
u ↔ ウ ;
# w } $vowel → ウ | '~' ;
wa ↔ ワ ;
wi ↔ ヰ ;
wu → ウ ;
we ↔ ヱ ;
wo ↔ ヲ ;
ya ↔ ヤ ;
yi → イ ;
yu ↔ ユ ;
ye → エ ;
yo ↔ ヨ ;
# double consonants
#specials
s } sh → ッ ;
t } ch → ッ ;
#voiced
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
h } h ↔ ッ } $h_start ;
f } f ↔ ッ } $f_start ;
r } r ↔ ッ } $r_start ;
t } t ↔ ッ } $t_start ;
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
# completeness
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
# y } y → ッ ;
# w } w → ッ ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
#a ← a { ー ; # liu
#e ← e { ー ; # liu
#i ← i { ー ; # liu
#o ← o { ー ; # liu
#u ← u { ー ; # liu
$macron ↔ ー ;
# small forms
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
'~e' ↔ ェ ;
'~o' ↔ ォ ;
'~tsu' ↔ ッ ;
'~wa' ↔ ヮ ;
'~ya' ↔ ャ ;
'~yi' → ィ ;
'~yu' ↔ ュ ;
'~ye' → ェ ;
'~yo' ↔ ョ ;
# iteration marks
# TODO: make more accurate
# Each rule below looks back at the romaji syllable already produced and
# repeats it for a voiced iteration mark, swapping the consonant for its
# voiced counterpart (sh→j, ch→dj, ts→dz, k→g, s→z, t→d, h→b, w→v).
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
g $1 ← k (y* $vowel) {ヽ$voice ;
z $1 ← s (y* $vowel) {ヽ$voice ;
d $1 ← t (y* $vowel) {ヽ$voice ;
# voiced repeat of an h-syllable is the b-syllable (ha + voiced mark → ba);
# placed after the sh/ch rules so those digraphs are matched first.
b $1 ← h (y* $vowel) {ヽ$voice ;
v $1 ← w (y* $vowel) {ヽ$voice ;
sh $1 ← sh (y* $vowel) {ヽ$voice ;
j $1 ← j (y* $vowel) {ヽ$voice ;
ch $1 ← ch (y* $vowel) {ヽ$voice ;
dj $1 ← dj(y* $vowel) {ヽ$voice ;
ts $1 ← ts (y* $vowel) {ヽ$voice ;
dz $1 ← dz (y* $vowel) {ヽ$voice ;
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
← ヽ $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
# one-way latin- → kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
cy → セィ ;
dy → テ\u3099ィ ;
hy → ヒ ;
sy → セィ ;
ty → ティ ;
zy → セ\u3099ィ ;
h → ヘ ;
# isolated consonants listed here so as not to mask
# longer rules above.
ch → チ;
sh → シ ;
dz → ツ\u3099 ;
dj → チ\u3099;
b → フ\u3099 ;
d → テ\u3099 ;
g → ク\u3099 ;
k → ク ;
m → ム ;
n'' ← ン } $n_quoter ;
n ↔ ン ;
p → フ\u309A ;
r → ル ;
s → ス ;
t → テ ;
y → イ ;
z → ス\u3099 ;
v → ウ\u3099 ;
f → フ;
j → シ\u3099;
w → ウ;
ß → | ss ;
æ → | e ;
ð → | d ;
ø → | u ;
þ → | th ;
# simple substitutions using backup
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof