1 /* 2 * 3 * (C) Copyright IBM Corp. 1998-2013 - All Rights Reserved 4 * 5 * Developed at DIT - Government of Bhutan 6 * 7 * Contact person: Pema Geyleg - <pema_geyleg@druknet.bt> 8 * 9 * This file is a modification of the ICU file KhmerReordering.h 10 * by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan 11 * A first module for Dzongkha was developed by Karunakar under Panlocalisation funding. 12 * Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola 13 * 14 */ 15 16 #ifndef __TIBETANREORDERING_H 17 #define __TIBETANREORDERING_H 18 19 /** 20 * \file 21 * \internal 22 */ 23 24 // #include "LETypes.h" 25 // #include "OpenTypeTables.h" 26 27 U_NAMESPACE_BEGIN 28 29 class LEGlyphStorage; 30 31 // Vocabulary 32 // Base -> A consonant in its full (not subscript) form. It is the 33 // center of the syllable, it can be souranded by subjoined consonants, vowels, 34 // signs... but there is only one base in a stack, it has to be coded as 35 // the first character of the syllable.Included here are also groups of base + subjoined 36 // which are represented by one single code point in unicode (e.g. 0F43) Also other characters that might take 37 // subjoined consonants or other combining characters. 38 // Subjoined -> Subjoined consonants and groups of subjoined consonants which have a single code-point 39 // to repersent the group (even if each subjoined consonant is represented independently 40 // by anothe code-point 41 // Tsa Phru --> Tsa Phru character, Bhutanese people will always place it right after the base, but sometimes, due to 42 // "normalization" 43 // is placed after all the subjoined consonants, and it is also permitted there. 44 // A Chung Vowel lengthening mark --> . 0F71 It is placed after the base and any subjoined consonants but before any vowels 45 // Precomposed Sanskrit vowels --> The are combinations of subjoined consonants + vowels that have been assigned 46 // a given code-point (in spite of each single part of them having also a code-point 47 // They are avoided, and users are encouraged to use the combination of code-points that 48 // represents the same sound instead of using this combined characters. This is included here 49 // for compatibility with possible texts that use them (they are not in the Dzongkha keyboard). 50 // Halanta -> The Halanta or Virama character 0F84 indicates that a consonant should not use its inheernt vowel, 51 // in spite of not having other vowels present. It is usually placed immediatly after a base consonant, 52 // but in some special cases it can also be placed after a subjoined consonant, so this is also 53 // permitted in this algorithm. (Halanta is always displayed in Tibetan not used as a connecting char) 54 // 55 // Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There 56 // might be as much as three subjoined vowels in a given stack (only one in general text, but up 57 // to three for abreviations, they have to be permitted). 58 // Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three 59 // times. They can combine with subjoined vowels, and are always coded after these. 60 // Anusvara --> Nasalisation sign. Traditioinally placed in absence of vowels, but also after vowels. In some 61 // special cases it can be placed before a vowel, so this is also permitted 62 // Candrabindu -> Forms of the Anusvara with different glyphs (and different in identity) which can be placed 63 // without vowel or after the vowel, but never before. Cannot combine with Anusvara. 64 // Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining 65 // marks, so they have to be attached to a specific stack. The are using to emphasise a syllable. 66 // 67 // Digits -> Digits are not considered as non-combining characters because there are a few characters which 68 // combine with them, so they have to be considered independently. 69 // Digit combining marks -> dependent marks that combine with digits. 70 // 71 // TODO 72 // There are a number of characters in the CJK block that are used in Tibetan script, two of these are symbols 73 // are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside 74 // of the tibetan block, they have not been treated in this program. 75 76 77 struct TibetanClassTable // This list must include all types of components that can be used inside a syllable 78 { 79 enum CharClassValues // order is important here! This order must be the same that is found in each horizontal 80 // line in the statetable for Tibetan (file TibetanReordering.cpp). It assigns one number 81 // to each type of character that has to be considered when analysing the order in which 82 // characters can be placed 83 { 84 CC_RESERVED = 0, //Non Combining Characters 85 CC_BASE = 1, // Base Consonants, Base Consonants with Subjoined attached in code point, Sanskrit base marks 86 CC_SUBJOINED = 2, // Subjoined Consonats, combination of more than Subjoined Consonants in the code point 87 CC_TSA_PHRU = 3, // Tsa-Phru character 0F39 88 CC_A_CHUNG = 4, // Vowel Lenthening a-chung mark 0F71 89 CC_COMP_SANSKRIT = 5, // Precomposed Sanskrit vowels including Subjoined characters and vowels 90 CC_HALANTA = 6, // Halanta Character 0F84 91 CC_BELOW_VOWEL = 7, // Subjoined vowels 92 CC_ABOVE_VOWEL = 8, // Superscript vowels 93 CC_ANUSVARA = 9, // Tibetan sign Rjes Su Nga Ro 0F7E 94 CC_CANDRABINDU = 10, // Tibetan sign Sna Ldan and Nyi Zla Naa Da 0F82, 0F83 95 CC_VISARGA = 11, // Tibetan sign Rnam Bcad (0F7F) 96 CC_ABOVE_S_MARK = 12, // Stress Marks placed above the text 97 CC_BELOW_S_MARK = 13, // Stress Marks placed below the text 98 CC_DIGIT = 14, // Dzongkha Digits 99 CC_PRE_DIGIT_MARK = 15, // Mark placed before the digit 100 CC_POST_BELOW_DIGIT_M = 16, // Mark placed below or after the digit 101 CC_COUNT = 17 // This is the number of character classes 102 }; 103 104 enum CharClassFlags 105 { 106 CF_CLASS_MASK = 0x0000FFFF, 107 108 CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable 109 CF_DIGIT = 0x01000000, // flag to speed up comparaisson 110 CF_PREDIGIT = 0x02000000, // flag to detect pre-digit marks for reordering 111 112 // position flags 113 CF_POS_BEFORE = 0x00080000, 114 CF_POS_BELOW = 0x00040000, 115 CF_POS_ABOVE = 0x00020000, 116 CF_POS_AFTER = 0x00010000, 117 CF_POS_MASK = 0x000f0000 118 }; 119 120 typedef le_uint32 CharClass; 121 122 typedef le_int32 ScriptFlags; 123 124 LEUnicode firstChar; // for Tibetan this will become xOF00 125 LEUnicode lastChar; // and this x0FFF 126 const CharClass *classTable; 127 128 CharClass getCharClass(LEUnicode ch) const; 129 130 static const TibetanClassTable *getTibetanClassTable(); 131 }; 132 133 134 class TibetanReordering /* not : public UObject because all methods are static */ { 135 public: 136 static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode, 137 LEUnicode *outChars, LEGlyphStorage &glyphStorage); 138 139 static const FeatureMap *getFeatureMap(le_int32 &count); 140 141 private: 142 // do not instantiate 143 TibetanReordering(); 144 145 static le_int32 findSyllable(const TibetanClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); 146 147 }; 148 149 150 U_NAMESPACE_END 151 #endif 152