• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# © 2016 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html
3# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
4#
5# File: Latn_Kana.txt
6# Generated from CLDR
7#
8
9# note: a global filter is more efficient, but MUST include all source chars
10#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
11# MINIMAL FILTER GENERATED FOR: Latin-Katakana
12### WARNING -- must add width filter, both here and below!!! ###
13:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
14:: [:Latin:] fullwidth-halfwidth ();
15:: NFD (NFC);
16:: Lower ();    # whenever transliterating from cased to uncased script, include this
17# :: NFD () ;   # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
18# Uses modified Hepburn, with small changes to make it unambiguous.
19# | Kunrei-shiki: Hepburn/MHepburn
20# | ------------------------------
21# | si: shi
22# | si ~ya: sha
23# | si ~yu: shu
24# | si ~yo: sho
25# | zi: ji
26# | zi ~ya: ja
27# | zi ~yu: ju
28# | zi ~yo: jo
29# | ti: chi
30# | ti ~ya: cha
31# | ti ~yu: chu
32# | ti ~yo: cho
33# | tu: tsu
34# | di: ji/dji
35# | du: zu/dzu
36# | hu: fu
37# | For foreign words:
38# | -----------------
39# | se ~i si
40# | si ~e she
41# |
42# | ze ~i zi
43# | zi ~e je
44# |
45# | te ~i ti
46# | ti ~e che
47# | te ~u tu
48# |
49# | de ~i di
50# | de ~u du
51# | de ~i di
52# |
53# | he ~u: hu
54# | hu ~a fa
55# | hu ~i fi
56# | hu ~e he
57# | hu ~o ho
58# Most small forms are generated, but if necessary
59# explicit small forms are given with ~a, ~ya, etc.
60#------------------------------------------------------
61# Variables
62$vowel = [aeiou] ; # Latin vowels; lookahead for the "consonant } $vowel" rules and captured by the iteration-mark rules
63$consonant = [bcdfghjklmnpqrstvwxyz] ; # Latin consonants; used by the generic iteration-mark rule
64$macron = \u0304 ; # combining macron; mapped to and from the prolonged sound mark ー
65# Variables used for doubled-consonants with tsu
66$kana = [ぁ-ゔ] ; # NOTE(review): in the rules visible here, only the commented-out iteration-mark TODO rule references this
67$voice = [\u3099゛]; # U+3099 or ゛
68$semivoice = [\u309A゜]; # U+309A or ゜
69$k_start = [カキクケコかきくけこ] ;
70$s_start = [サシスセソさしすせそ] ;
71$j_start = [シし] $voice ; # already ends in $voice, so the "j } j" rule does not append $voice again
72$t_start = [タチツテトたちつてと] ;
73$n_start = [ナニヌネノンなにぬねの] ; # note: also contains ン
74$h_start = [ハヒヘホはひへほ] ; # フ/ふ are kept separately in $f_start
75$f_start = [フふ] ; # combined with $h_start in the "b } b" and "p } p" rules
76$m_start = [マミムメモまみむめも] ;
77$y_start = [ヤユヨやゆよ] ;
78$r_start = [ラリルレロらりるれろ] ;
79$w_start = [ワヰヱヲわゐゑを] ;
80$v_start = [ワヰヱヲ]\u3099 ; # same kana as the va/vi/ve/vo rules; ウ\u3099 (vu) is not included
81$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ; # left context of the "h → ー" rule
82# if ン is followed by $n_quoter, then it needs an
83# apostrophe after its romaji form to disambiguate it.
84# e.g., ン ア != ナ, so represent as "n'a", not "na".
85$n_quoter  =  [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
86$small_y = [ャィュェョ] ; # small ya/yu/yo plus small i/e; lookahead for the "consonant | '~' ← ... } $small_y" rules
87$iteration = ゝ ; # NOTE(review): only the commented-out TODO rule references this; the active iteration rules spell ヽ literally
88#------------------------------------------------------
89# katakana rules
90# Punctuation
91'.' ↔ 。;
92',' ↔ 、;
93# ' ' } [a-z] → ; # delete spaces before latin
94# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before katakana
95# Iteration Mark
96# Copy previous letter and its marks
97# TODO
98# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
99# Specials for katakana -- not shared with hiragana
100va ↔ ワ\u3099 ;
101vi ↔ ヰ\u3099 ;
102ve ↔ ヱ\u3099 ;
103vo ↔ ヲ\u3099 ;
104'~ka' ↔ ヵ ;
105'~ke' ↔ ヶ ;
106# ~~~ begin shared rules ~~~
107#special
108ya ← '~'ャ;
109yi ← '~'ィ ;
110yu ← '~'ュ;
111ye ← '~'ェ;
112yo ← '~'ョ;
113#normal
114a ↔ ア ;
115b | '~' ← ヒ \u3099} $small_y ;
116by } $vowel → ヒ\u3099 | '~y' ;
117ba ↔ ハ\u3099 ;
118bi ↔ ヒ\u3099 ;
119bu ↔ フ\u3099 ;
120be ↔ ヘ\u3099 ;
121bo ↔ ホ\u3099 ;
122c } i → | s ;
123c } e → | s ;
124da ↔ タ\u3099 ;
125di ↔ テ\u3099ィ ;
126du ↔ テ\u3099ゥ ;
127de ↔ テ\u3099 ;
128do ↔ ト\u3099 ;
129dzu ↔ ツ\u3099 ;
130dja ← チ\u3099ャ ;
131dji'~i' ← チ\u3099ィ ; # liu
132dju ← チ\u3099ュ ;
133dje ← チ\u3099ェ ;
134djo ← チ\u3099ョ ;
135dji ↔ チ\u3099 ;
136dj  } $vowel → チ\u3099 | '~y' ;
137# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
138cha ← チャ ;
139chi'~i' ← チィ ; # liu
140chu ← チュ ;
141che ← チェ ;
142cho ← チョ ;
143chi ↔ チ ;
144ch } $vowel → チ | '~y' ;
145e ↔ エ ;
146g | '~' ← キ\u3099} $small_y ;
147gy  } $vowel → キ\u3099 | '~y' ;
148ga ↔ カ\u3099 ;
149gi ↔ キ\u3099 ;
150gu ↔ ク\u3099 ;
151ge ↔ ケ\u3099 ;
152go ↔ コ\u3099 ;
153i ↔ イ ;
154# j  } $vowel → シ\u3099 | '~y' ;
155ja ↔ シ\u3099ャ ;
156ji'~i' ← シ\u3099ィ ; # liu
157ju ↔ シ\u3099ュ ;
158je ↔ シ\u3099ェ ;
159jo ↔ シ\u3099ョ ;
160ji ↔ シ\u3099 ;
161k | '~' ← キ} $small_y ;
162ky  } $vowel → キ | '~y' ;
163ka ↔ カ ;
164ki ↔ キ ;
165ku ↔ ク ;
166ke ↔ ケ ;
167ko ↔ コ ;
168m | '~' ← ミ} $small_y ;
169my  } $vowel → ミ | '~y' ;
170ma ↔ マ ;
171mi ↔ ミ ;
172mu ↔ ム ;
173me ↔ メ ;
174mo ↔ モ ;
175m } [pbfv] → ン ;
176n | '~' ← ニ } $small_y ;
177ny  } $vowel → ニ | '~y' ;
178na ↔ ナ ;
179ni ↔ ニ ;
180nu ↔ ヌ ;
181ne ↔ ネ ;
182no ↔ ノ ;
183o ↔ オ ;
184p | '~' ← ヒ\u309A } $small_y ;
185py  } $vowel → ヒ\u309A | '~y' ;
186pa ↔ ハ\u309A ;
187pi ↔ ヒ\u309A ;
188pu ↔ フ\u309A ;
189pe ↔ ヘ\u309A ;
190po ↔ ホ\u309A ;
191h | '~' ← ヒ } $small_y ;
192hy  } $vowel → ヒ | '~y' ;
193ha ↔ ハ ;
194hi ↔ ヒ ;
195hu ↔ ヘゥ ;
196he ↔ ヘ ;
197ho ↔ ホ ;
198# f | '~' ← フ } $small_y ;
199# f } $vowel → フ | '~' ;
200fa ↔ ファ ;
201fi ↔ フィ ;
202fe ↔ フェ ;
203fo ↔ フォ ;
204fu ↔ フ ;
205r | '~' ← リ } $small_y ;
206ry  } $vowel → リ | '~y' ;
207ra ↔ ラ ;
208ri ↔ リ ;
209ru ↔ ル ;
210re ↔ レ ;
211ro ↔ ロ ;
212za ↔ サ\u3099 ;
213zi ↔ セ\u3099ィ ;
214zu ↔ ス\u3099 ;
215ze ↔ セ\u3099 ;
216zo ↔ ソ\u3099 ;
217sa ↔ サ ;
218si ↔ セィ ;
219su ↔ ス ;
220se ↔ セ ;
221so ↔ ソ ;
222sha ← シャ ;
223shi'~i' ← シィ ; # liu
224shu ← シュ ;
225she ← シェ ;
226sho ← ショ ;
227shi ↔ シ ;
228sh } $vowel → シ | '~y' ;
229ta ↔ タ ;
230ti ↔ ティ ;
231tu ↔ テゥ ;
232te ↔ テ ;
233to ↔ ト ;
234tsu ↔ ツ ;
235# v  } $vowel → ウ\u3099 | '~' ;
236#'v~a' ← ウ\u3099ァ ; # liu
237#'v~i' ← ウ\u3099ィ ; # liu
238#'v~e' ← ウ\u3099ェ ; # liu
239#'v~o' ← ウ\u3099ォ ; # liu
240vu ↔ ウ\u3099 ;
241u ↔ ウ ;
242# w  } $vowel → ウ | '~' ;
243wa ↔ ワ ;
244wi ↔ ヰ ;
245wu → ウ ;
246we ↔ ヱ ;
247wo ↔ ヲ ;
248ya ↔ ヤ ;
249yi → イ ;
250yu ↔ ユ ;
251ye → エ ;
252yo ↔ ヨ ;
253# double consonants
254#specials
255s } sh → ッ ;
256t } ch → ッ ;
257#voiced
258j } j ↔ ッ } $j_start ;
259b } b ↔ ッ } [$h_start$f_start] $voice;
260d } d ↔ ッ } $t_start $voice;
261g } g ↔ ッ } $k_start $voice;
262p } p ↔ ッ } [$h_start$f_start] $semivoice;
263# v } v ↔ ッ } [ワヰウヱヲう]  $voice ;
264z } z ↔ ッ } $s_start $voice;
265v } v ↔ ッ } $v_start;
266# normal
267k } k ↔ ッ } $k_start ;
268m } m ↔ ッ } $m_start ;
269n } n ↔ ッ } $n_start ;
270h } h ↔ ッ } $h_start ;
271f } f ↔ ッ } $f_start ;
272r } r ↔ ッ } $r_start ;
273t } t ↔ ッ } $t_start ;
274s } s ↔ ッ } $s_start ;
275w } w  ↔ ッ } $w_start;
276y } y ↔ ッ } $y_start;
277# completeness
278x } x → ッ ;
279c } k → ッ ;
280c } c → ッ ;
281c } q → ッ ;
282l } l → ッ ;
283q } q → ッ ;
284# y } y → ッ ;
285# w } w → ッ ;
286# prolonged vowel mark. this indicates a doubling of
287# the preceding vowel sound
288#a ← a { ー ; # liu
289#e ← e { ー ; # liu
290#i ← i { ー ; # liu
291#o ← o { ー ; # liu
292#u ← u { ー ; # liu
293$macron ↔ ー ;
294# small forms
295'~a' ↔ ァ ;
296'~i' ↔ ィ ;
297'~u' ↔ ゥ ;
298'~e' ↔ ェ ;
299'~o' ↔ ォ ;
300'~tsu' ↔ ッ ;
301'~wa' ↔ ヮ ;
302'~ya' ↔ ャ ;
303'~yi' → ィ ;
304'~yu' ↔ ュ ;
305'~ye' → ェ ;
306'~yo' ↔ ョ ;
307# iteration marks
308# TODO: make more accurate
# Every rule in this group uses ←, so it applies only in the kana → Latin direction.
# The text before "{" is left context (already-produced Latin); the match itself is ヽ
# plus a voicing mark. $1 re-emits the captured "y* $vowel" (or captured text) from that context.
# First group: the output consonant differs from the context consonant.
309j $1 ← sh (y* $vowel) {ヽ$voice ;
310dj $1 ← ch (y* $vowel) {ヽ$voice ;
311dz $1 ← ts (y* $vowel) {ヽ$voice ;
312g $1 ← k (y* $vowel) {ヽ$voice ;
313z $1 ← s (y* $vowel) {ヽ$voice ;
314d $1 ← t (y* $vowel) {ヽ$voice ;
315h $1 ← b (y* $vowel) {ヽ$voice ; # NOTE(review): neighbours map unvoiced context → voiced output; this one maps b → h. Possibly meant "b $1 ← h" — confirm against CLDR
316v $1 ← w (y* $vowel) {ヽ$voice ;
# Second group: the output consonant equals the context consonant.
317sh $1 ← sh (y* $vowel) {ヽ$voice ; # NOTE(review): same match text as the "j $1 ← sh" rule above, so presumably never reached — confirm
318j $1 ← j (y* $vowel) {ヽ$voice ;
319ch $1 ← ch (y* $vowel) {ヽ$voice ; # NOTE(review): same match text as the "dj $1 ← ch" rule above — confirm
320dj $1 ← dj(y* $vowel) {ヽ$voice ;
321ts $1 ← ts (y* $vowel) {ヽ$voice ; # NOTE(review): same match text as the "dz $1 ← ts" rule above — confirm
322dz $1 ← dz (y* $vowel) {ヽ$voice ;
# Fallbacks: the voicing mark is optional from here on.
323$1 ← ($consonant y* $vowel) {ヽ$voice? ;
324$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
325← ヽ $voice? ; # delete if no characters found
326# h- rule: lengthens vowel if not followed by a vowel.
327# At the point this is applied, latin [cons]?vowel sequences
328# have been converted to katakana in NFD form.
329$voweled_basekana [\u3099 \u309A]? { h → ー ;
330# one-way latin- → kana rules. these do not occur in
331# well-formed romaji representing actual japanese text.
332# their purpose is to make all romaji map to kana of
333# some sort.
334# the following are not really necessary, but produce
335# slightly more natural results.
336cy → セィ ;
337dy → テ\u3099ィ ;
338hy → ヒ ;
339sy → セィ ;
340ty → ティ ;
341zy → セ\u3099ィ ;
342h → ヘ ;
343# isolated consonants listed here so as not to mask
344# longer rules above.
345ch → チ;
346sh → シ ;
347dz → ツ\u3099 ;
348dj → チ\u3099;
349b → フ\u3099 ;
350d → テ\u3099 ;
351g → ク\u3099 ;
352k → ク ;
353m → ム ;
354n'' ← ン } $n_quoter ;
355n ↔ ン ;
356p → フ\u309A ;
357r → ル ;
358s → ス ;
359t → テ ;
360y → イ ;
361z → ス\u3099 ;
362v → ウ\u3099 ;
363f → フ;
364j  → シ\u3099;
365w → ウ;
366ß → | ss ;
367æ → | e ;
368ð → | d ;
369ø → | u ;
370þ → | th ;
371# simple substitutions using backup
372c → | k ;
373l → | r ;
374q → | k ;
375x → | ks ;
376# ~~~ END shared rules ~~~
377#------------------------------------------------------
378# Final cleanup
379'~' → ; # delete stray tildes between letters
380[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
381# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
382:: NFC (NFD) ;
383:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
384# note: a global filter is more efficient, but MUST include all source chars!!
385#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
386# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
387:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
388# eof
389
390