• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1<?xml version="1.0" encoding="UTF-8" ?>
2<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
3<!--
4Copyright © 1991-2018 Unicode, Inc.
5CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
6For terms of use, see http://www.unicode.org/copyright.html
7-->
8<supplementalData>
9	<version number="$Revision: 14381 $"/>
10	<transforms>
11		<transform source="Zawgyi" target="my" direction="forward" alias="my-t-my-s0-zawgyi">
12			<tRule><![CDATA[
13# This transform converts Zawgyi "encoded" Burmese into proper
14# unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
15# the Myanmar unicode range but assigns different characters or
16# glyphs to some codepoints. In addition to the character mapping,
17# there is reordering of codepoints needed to match the expected
18# unicode order. This reordering is context-based.
19#
20# This transform is done in two main stages:
21# (1) Map all Zawgyi codepoints to their Unicode counterpart.
22# (2) Perform reordering.
23
24# Modern Burmese digits & Unicode code points.
25$nondigits = [^\u1040-\u1049];  # Anything that is not in the range 1040-1049
26$consonant = [\u1000-\u1021];
27$vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
28$vowelsAndConsonants = [\u1000-\u102a];
29
30$umedial = [\u103B-\u103E];    # Medial codepoints in Unicode
31$vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];  # Union of vowel signs and medials, plus 1036, 1037, 103A and 103F
32$ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode
33
34# Zawgyi medial ra has multiple representations
35$zmedialra = [\u103B\u107E-\u1084];
36
37$wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];  # Note: the range 2000-200d includes ZWSP (200b)
38
39
40####
41#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
42####
43
44# Kinzi (predefined ligatures)
45# Move base character to the right
46($consonant) \u103A \u1064 → $ukinzi $1 \u103B;  # Kinzi goes first; the 103A between base and 1064 is emitted as 103B after the base
47($consonant) \u1064 → $ukinzi $1;  # Kinzi sequence is emitted before the captured consonant
48\u1064 → $ukinzi;  # No consonant in front: replace 1064 in place
49
50# Special cases moving base character to right before vowel signs
51($consonant) \u108B → $ukinzi $1 \u102D;  # 108B: kinzi plus 102D
52($consonant) \u108C → $ukinzi $1 \u102E;  # 108C: kinzi plus 102E
53($consonant) \u108D → $ukinzi $1 \u1036;  # 108D: kinzi plus 1036
54
55# Special cases moving Kinzi block to left
56($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;  # 103A is emitted as 103B and 1033 as 102F
57($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
58($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
59($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
60($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;  # No kinzi in the output for 108e
61
62# Fallbacks when 108B-108D do not follow a consonant: expand in place
62\u108B → $ukinzi \u102D ;
63\u108C → $ukinzi \u102E ;
64\u108D → $ukinzi \u1036 ;
65
66# Consonants (only the ones that have to change)
67\u106A → \u1009 ;  # NYA
68\u106B → \u100A ;  # NNYA
69\u108F → \u1014 ;  # NA
70\u1090 → \u101B ;  # RA
71\u1086 → \u103F ;  # GREAT SA
72
73# yapin (set lists exactly two code points; "|" is not an alternation operator inside a set)
74[\u103A\u107D] → \u103B ;
75
76# yayit (one or more of the Zawgyi medial ra forms collapse to a single 103C)
77($zmedialra)+ → \u103C ;
78
79# wasway
80\u103C* \u108A → \u103D \u103E;  # To avoid duplicate medials
81\u103C → \u103D ;
82
83# hatoh (set lists exactly two code points; "|" is not an alternation operator inside a set)
84[\u103D\u1087] → \u103E ;
85\u1088 → \u103E \u102F ;  # 103E plus 102F
86\u1089 → \u103E \u1030 ;  # 103E plus 1030
87
88# Vowels
89\u1033 → \u102F ;
90\u1034 → \u1030 ;
91
92# asat
93\u1039 → \u103A ;
94
95# lower dot
96[\u1094\u1095] → \u1037 ;
97
98# Special cases for 1025 vs 1009: 1025 followed by 1039 or by one of these stacked forms is rewritten with 1009 as the base
99\u1025 \u1039 → \u1009 \u103a;
100\u1025 \u1061 → \u1009 \u1039 \u1001;
101\u1025 \u1062 → \u1009 \u1039 \u1002;
102\u1025 \u1065 → \u1009 \u1039 \u1005;
103\u1025 \u1068 → \u1009 \u1039 \u1007;
104\u1025 \u1076 → \u1009 \u1039 \u1013;
105\u1025 \u1078 → \u1009 \u1039 \u1015;
106\u1025 \u107A → \u1009 \u1039 \u1017;
107\u1025 \u1079 → \u1009 \u1039 \u1016;
108
109# Stacked Consonants (most outputs are 1039 followed by a consonant)
110\u105A → \u102B \u103A ;  # Exception in this list: 102B plus 103A, no 1039
111\u1060 → \u1039 \u1000 ;
112\u1061 → \u1039 \u1001 ;
113\u1062 → \u1039 \u1002 ;
114\u1063 → \u1039 \u1003 ;
115\u1065 → \u1039 \u1005 ;
116[\u1066\u1067] → \u1039 \u1006 ;  # Two Zawgyi code points share one output
117\u1068 → \u1039 \u1007 ;
118\u1069 → \u1039 \u1008 ;
119\u106C → \u1039 \u100B ;
120\u106D → \u1039 \u100C ;
121\u1070 → \u1039 \u100F ;
122[\u1071\u1072] → \u1039 \u1010 ;  # Two Zawgyi code points share one output
123\u1096 → \u1039 \u1010 \u103D;  # Stacked 1010 followed by 103D
124[\u1073\u1074] → \u1039 \u1011 ;  # Two Zawgyi code points share one output
125\u1075 → \u1039 \u1012 ;
126\u1076 → \u1039 \u1013 ;
127\u1077 → \u1039 \u1014 ;
128\u1078 → \u1039 \u1015 ;
129\u1079 → \u1039 \u1016 ;
130\u107A → \u1039 \u1017 ;
131[\u107B\u1093] → \u1039 \u1018 ;  # Two Zawgyi code points share one output
132\u107C → \u1039 \u1019 ;
133\u1085 → \u1039 \u101C ;
134\u108E → \u102D \u1036 ;  # Not a stack: 102D plus 1036
135
136# Pre-defined ligatures (each expands to consonant, 1039, consonant)
137\u106E → \u100D\u1039\u100D ;
138\u106F → \u100D\u1039\u100E ;
139\u1091 → \u100F\u1039\u100D ;
140\u1092 → \u100B\u1039\u100C ;
141\u1097 → \u100B\u1039\u100B ;
142\u104E → \u104E\u1004\u103A\u1038 ;  # 104E is kept and followed by 1004 103A 1038
143
144
145####
146#### STAGE 1.01: Digits 0 and 4 used instead of letters
147# Case of MYANMAR digit being used instead of a letter
148# Lone digit zero and four at start ("^" anchors at the start of the text)
149::Null;
150^ \u1040 ($nondigits) → \u101D $1;  # Digit 1040 becomes 101D
151^ \u1044 ($nondigits) → | \u104E $1 ;  # Digit 1044 becomes 104E; "|" places the cursor before the output
152
153# Lone digit zero or four at end ("$" anchors at the end of the text)
154($nondigits) \u1040 $ → $1 \u101D;
155($nondigits) \u1044 $ → $1 \u104e;
156
157# Evowel and dependent vowel signs before 0 or 4 only
158#   -> convert to the consonant.
159([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2;  # Preceding sign and following non-digit are kept unchanged
160([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;
161
162
163####
164#### STAGE 1.1: Strip spaces immediately before combining characters.
165####   Move e-vowel after consonants and medials
166####   Now every codepoint is Unicode. This starts conversion
167####   from semi-visual order to logical order.
168####
169::Null;
170
171# Don't remove spaces before E vowel or medial Ra at this stage
172($wspace) \u1037 > \u1037 $1;  # A single space before 1037 is moved after it (">" is the ASCII form of "→")
173($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2;  # Set omits 1031 and 103c, so spaces before those are kept
174
175# Remove a duplicate early
176\u1037+ → \u1037;
177
178# Move e-vowel after medials and consonants.
179\u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031;  # A run of 1031 is reduced to one and placed after kinzi + consonant
180\u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ;  # 1037 found between 1031 and the consonant ends up last
181\u1031+ \u103c ($consonant) > $1 \u103c \u1031;  # 103c found before the consonant moves behind it, then 1031
182
183# Move medials other than 103c before the 1031. Leave 103c for
184# the next consonant.
185\u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031;
186\u1031+ ($vowelsAndConsonants) > $1 \u1031;  # Fallback: 1031 goes after the single following letter
187
188
189####
190#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
191####
192::Null;
193
194\u103b \u103a > \u103a \u103b;  # Swap so that 103a precedes 103b
195
196# Simpler replacements for Zawgyi 1025
197\u1025 \u102E → \u1026;  # The pair 1025 102E becomes the single code point 1026
198
199# Asat and dot below reordering, to Unicode NFC.
200\u103A\u1037 → \u1037\u103A;  # 1037 is placed before 103A
201
202# Reorder some vowel signs
203\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;  # 1036 moves behind any medials and the vowel signs that follow it
204([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;  # 102D/102E/1032 are placed before 102B/102C/102F/1030
205
206# Move ra medial which precedes consonant, but not other medials.
207\u103C ($consonant) → $1 \u103C;
208
209
210####
211#### Stage 3
212#### Move \u1036, and \u103C after consonants.
213::Null;
214
215($umedial) \u1039 ($consonant) > \u1039 $2 $1;  # A medial found before 1039 + consonant is moved after that pair
216
217\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;  # 103C in front of 103A 1039 + consonant is moved to the end
218
219\u1036 ($umedial+) → $1 \u1036;  # 1036 is placed after any run of medials that follows it
220
221
222####
223#### Stage 4
224#### Reordering medials, dot below, contractions, E sign, and asat.
225::Null;
226
227# Reorder the medials (target order: 103B, 103C, 103D, 103E)
228([\u103C\u103D\u103E]+) \u103B → \u103B $1;
229([\u103D\u103E]+) \u103C → \u103C $1;
230\u103E\u103D → \u103D\u103E ;
231
232# Contractions with vowel signs
233([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;  # 1039 + consonant move in front of 1031 and any vowel signs
234($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;  # Same move when only vowel signs come first
235
236# Move vowel sign E \u1031 after medials, but not across consonants
237($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;  # Medials from both sides are output first, then the 1031 run
238
239# Reorder dot below after medials and vowel diacritics
240\u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037;
241
242# Move vowel signs after medials
243($vowelsign+) ($umedial+) → $2 $1;
244
245# Reorder modifiers and asat
246($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;  # 103A moves directly after the first consonant
247
248
249####
250#### Stage 5.  More reorderings
251#### Vowel signs after medials, sort medials,
252####
253::Null;
254
255# Replace CA + YA with JHA after moving other things beyond the medials.
256\u1005 \u103b → \u1008;
257
258# More moving vowel signs after medials
259([\u102b-\u1032]) ($umedial) → $2 $1;  # One sign and one medial per match; range 102b-1032 includes 1031
260
261# Sort the medials (single-character form of the Stage 4 sort; target order 103B, 103C, 103D, 103E)
262([\u103C\u103D\u103E]) \u103B → \u103B $1;
263([\u103D\u103E]) \u103C → \u103C $1;
264\u103E\u103D → \u103D\u103E ;
265
266# Move visarga after other signs
267\u1038 ($vowelmedial) → $1 \u1038;  # 1038 swaps with one following member of $vowelmedial
268
269# Reorder
270\u1036 \u102f → \u102f \u1036;  # 102f is placed before 1036
271
272
273###
274### Stage 6
275### Finish conflicting and extra diacritics. Remove some white space
276###
277::Null;
278
279# Fix duplicate combiners (a run of two or more identical signs collapses to one)
280\u102D \u102D+ → \u102D;
281\u102E \u102E+ → \u102E;
282\u102F \u102F+ → \u102F;
283\u1030 \u1030+ → \u1030;
284\u1032 \u1032+ → \u1032;
285\u1036 \u1036+ → \u1036;
286\u1037 \u1037+ → \u1037;
287\u1039 \u1039+ → \u1039;
288\u103a \u103a+ → \u103a;
289\u103b \u103b+ → \u103b;
290\u103c \u103c+ → \u103c;
291\u103d \u103d+ → \u103d;
292\u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386
293
294# Fix overlapping signs
295\u102F [\u1030\u103a] → \u102F;  # 1030 or 103a directly after 102F is dropped
296\u102D \u102E → \u102E;  # 102D directly before 102E is dropped
297
298# Remove space directly before diacritics.
299($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2;
300
301# Remove ZWSP at start and end
302^ \u200b+ → ;
303\u200b+ $ → ;
304
305# Fix multiple spaces around ZWSP to single ZWSP.
306[$wspace - [\u200b]]* \u200b $wspace* → \u200b;  # Leading run excludes ZWSP: $wspace contains 200b, so a greedy $wspace* would consume it and the literal 200b could not match
307			]]></tRule>
308		</transform>
309	</transforms>
310</supplementalData>
311