# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
#
# File: Zawgyi_my.txt
# Generated from CLDR
#

# This transform converts Zawgyi "encoded" Burmese into proper
# Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
# the Myanmar Unicode range but assigns different characters or
# glyphs to some codepoints. In addition to the character mapping,
# there is reordering of codepoints needed to match the expected
# Unicode order. This reordering is context-based.
#
# This transform is done in two main stages:
# (1) Map all Zawgyi codepoints to their Unicode counterpart.
# (2) Perform reordering.
#
# Notes on the rule syntax used below:
#   - "::Null;" separates rule passes: the rules that follow it run as a
#     new pass over the output of the preceding pass.
#   - Inside a [...] set, members are simply listed next to each other;
#     there is no "|" alternation operator in set syntax.

# Modern Burmese digits & Unicode code points.
$nondigits = [^\u1040-\u1049];
$consonant = [\u1000-\u1021];
$vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
$vowelsAndConsonants = [\u1000-\u102a];
$umedial = [\u103B-\u103E];  # Medial codepoints in Unicode
# Union of vowel signs and medials. U+1036 (anusvara) must be spelled
# \u1036 so that visarga is reordered after it in Stage 5.
$vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];
$ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode
# Zawgyi medial ra has multiple representations
$zmedialra = [\u103B\u107E-\u1084];
$wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];

####
#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
####
# Kinzi (predefined ligatures)
# Move base character to the right
($consonant) \u103A \u1064 → $ukinzi $1 \u103B;
($consonant) \u1064 → $ukinzi $1;
\u1064 → $ukinzi;
# Special cases moving base character to right before vowel signs
($consonant) \u108B → $ukinzi $1 \u102D;
($consonant) \u108C → $ukinzi $1 \u102E;
($consonant) \u108D → $ukinzi $1 \u1036;
# Special cases moving Kinzi block to left
($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;
\u108B → $ukinzi \u102D ;
\u108C → $ukinzi \u102E ;
\u108D → $ukinzi \u1036 ;
# Consonants (only the ones that have to change)
\u106A → \u1009 ;  # NYA
\u106B → \u100A ;
\u108F → \u1014 ;
\u1090 → \u101B ;
\u1086 → \u103F ;
# yapin
# (set lists the two Zawgyi codepoints only; no literal "|" member)
[\u103A\u107d] → \u103B ;
# yayit
($zmedialra)+ → \u103C ;
# wasway
\u103C* \u108A → \u103D \u103E;  # To avoid duplicate medials
\u103C → \u103D ;
# hatoh
# (set lists the two Zawgyi codepoints only; no literal "|" member)
[\u103D\u1087] → \u103E ;
\u1088 → \u103E \u102F ;
\u1089 → \u103E \u1030 ;
# Vowels
\u1033 → \u102F ;
\u1034 → \u1030 ;
# asat
\u1039 → \u103A ;
# lower dot
[\u1094\u1095] → \u1037 ;
# Special cases for 1025 vs 1009
\u1025 \u1039 → \u1009 \u103a;
\u1025 \u1061 → \u1009 \u1039 \u1001;
\u1025 \u1062 → \u1009 \u1039 \u1002;
\u1025 \u1065 → \u1009 \u1039 \u1005;
\u1025 \u1068 → \u1009 \u1039 \u1007;
\u1025 \u1076 → \u1009 \u1039 \u1013;
\u1025 \u1078 → \u1009 \u1039 \u1015;
\u1025 \u107A → \u1009 \u1039 \u1017;
\u1025 \u1079 → \u1009 \u1039 \u1016;
# Stacked Consonants
\u105A → \u102B \u103A ;
\u1060 → \u1039 \u1000 ;
\u1061 → \u1039 \u1001 ;
\u1062 → \u1039 \u1002 ;
\u1063 → \u1039 \u1003 ;
\u1065 → \u1039 \u1005 ;
[\u1066\u1067] → \u1039 \u1006 ;
\u1068 → \u1039 \u1007 ;
\u1069 → \u1039 \u1008 ;
\u106C → \u1039 \u100B ;
\u106D → \u1039 \u100C ;
\u1070 → \u1039 \u100F ;
[\u1071\u1072] → \u1039 \u1010 ;
\u1096 → \u1039 \u1010 \u103D;
[\u1073\u1074] → \u1039 \u1011 ;
\u1075 → \u1039 \u1012 ;
\u1076 → \u1039 \u1013 ;
\u1077 → \u1039 \u1014 ;
\u1078 → \u1039 \u1015 ;
\u1079 → \u1039 \u1016 ;
\u107A → \u1039 \u1017 ;
[\u107B\u1093] → \u1039 \u1018 ;
\u107C → \u1039 \u1019 ;
\u1085 → \u1039 \u101C ;
\u108E → \u102D \u1036 ;
# Pre-defined ligatures
\u106E → \u100D\u1039\u100D ;
\u106F → \u100D\u1039\u100E ;
\u1091 → \u100F\u1039\u100D ;
\u1092 → \u100B\u1039\u100C ;
\u1097 → \u100B\u1039\u100B ;
\u104E → \u104E\u1004\u103A\u1038 ;

####
#### STAGE 1.01: Digits 0 and 4 used instead of letters
# Case of MYANMAR digit being used instead of a letter
# Lone digit zero and four at start
::Null;
^ \u1040 ($nondigits) → \u101D $1;
# The "|" here is the cursor marker (outside any set): the replacement
# text is left ahead of the cursor so later rules in this pass can see it.
^ \u1044 ($nondigits) → | \u104E $1 ;
# Lone digit zero or four at end
($nondigits) \u1040 $ → $1 \u101D;
($nondigits) \u1044 $ → $1 \u104e;
# Evowel and dependent vowel signs before 0 or 4 only
# -> convert to the consonant.
([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2;
([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;

####
#### STAGE 1.1: Strip spaces immediately before combining characters.
#### Move e-vowel after consonants and medials
#### Now every codepoint is Unicode. This starts conversion
#### from semi-visual order to logical order.
####
::Null;
# Don't remove spaces before E vowel or medial Ra at this stage
# (the second rule's set skips 1031 and 103c).
($wspace) \u1037 → \u1037 $1;
($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2;
# Remove a duplicate early
\u1037+ → \u1037;
# Move e-vowel after medials and consonants.
\u1031+ $ukinzi ($consonant) → $ukinzi $1 \u1031;
\u1031+ \u1037+ ($consonant) → $1 \u1031 \u1037 ;
\u1031+ \u103c ($consonant) → $1 \u103c \u1031;
# Move medials other than 103c before the 1031. Leave 103c for
# the next consonant.
\u1031+ ($consonant) ([\u103b\u103d\u103e]+) → $1 $2 \u1031;
\u1031+ ($vowelsAndConsonants) → $1 \u1031;

####
#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
####
::Null;
\u103b \u103a → \u103a \u103b;
# Simpler replacements for Zawgyi 1025
\u1025 \u102E → \u1026;
# Asat and dot below reordering, to Unicode NFC.
\u103A\u1037 → \u1037\u103A;
# Reorder some vowel signs
\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;
([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;
# Move ra medial which precedes consonant, but not other medials.
\u103C ($consonant) → $1 \u103C;

####
#### Stage 3
#### Move \u1036, and \u103C after consonants.
::Null;
($umedial) \u1039 ($consonant) → \u1039 $2 $1;
\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;
\u1036 ($umedial+) → $1 \u1036;

####
#### Stage 4
#### Reordering medials, dot below, contractions, E sign, and asat.
::Null;
# Reorder the medials
([\u103C\u103D\u103E]+) \u103B → \u103B $1;
([\u103D\u103E]+) \u103C → \u103C $1;
\u103E\u103D → \u103D\u103E ;
# Contractions with vowel signs
([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
# Move vowel sign E \u1031 after medials, but not across consonants
($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
# Reorder dot below after medials and vowel diacritics
\u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037;
# Move vowel signs after medials
($vowelsign+) ($umedial+) → $2 $1;
# Reorder modifiers and asat
($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;

####
#### Stage 5. More reorderings
#### Vowel signs after medials, sort medials,
####
::Null;
# Replace CA + YA with JHA after moving other things beyond the medials.
\u1005 \u103b → \u1008;
# More moving vowel signs after medials
([\u102b-\u1032]) ($umedial) → $2 $1;
# Sort the medials
([\u103C\u103D\u103E]) \u103B → \u103B $1;
([\u103D\u103E]) \u103C → \u103C $1;
\u103E\u103D → \u103D\u103E ;
# Move visarga after other signs
\u1038 ($vowelmedial) → $1 \u1038;
# Reorder
\u1036 \u102f → \u102f \u1036;

###
### Stage 6
### Finish conflicting and extra diacritics. Remove some white space
###
::Null;
# Fix duplicate combiners
\u102D \u102D+ → \u102D;
\u102E \u102E+ → \u102E;
\u102F \u102F+ → \u102F;
\u1030 \u1030+ → \u1030;
\u1032 \u1032+ → \u1032;
\u1036 \u1036+ → \u1036;
\u1037 \u1037+ → \u1037;
\u1039 \u1039+ → \u1039;
\u103a \u103a+ → \u103a;
\u103b \u103b+ → \u103b;
\u103c \u103c+ → \u103c;
\u103d \u103d+ → \u103d;
\u103e \u103e+ → \u103e;  # http://unicode.org/cldr/trac/ticket/10386
# Fix overlapping signs
\u102F [\u1030\u103a] → \u102F;
\u102D \u102E → \u102E;
# Remove space directly before diacritics.
($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2;
# Remove ZWSP at start and end
^ \u200b+ → ;
\u200b+ $ → ;
# Fix multiple spaces around ZWSP to single ZWSP.
$wspace* \u200b $wspace* → \u200b;