1<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 3<html xmlns="http://www.w3.org/1999/xhtml"> 4 5<head> 6<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 7<meta http-equiv="Content-Language" content="en-us" /> 8<meta name="VI60_defaultClientScript" content="JavaScript" /> 9<meta name="GENERATOR" content="Microsoft FrontPage 6.0" /> 10<meta name="keywords" content="Unicode, common locale data repository" /> 11<meta name="ProgId" content="FrontPage.Editor.Document" /> 12<title>Common Locale Data Repository</title> 13<link rel="stylesheet" type="text/css" href="http://www.unicode.org/webscripts/standard_styles.css" /> 14<style type="text/css"> 15<!-- 16.major {font-size:95%; font-family: Arial, Geneva, sans-serif; color: #808080; font-weight:bold; } 17.minor {font-size:85%; font-family: Arial, Geneva, sans-serif; color: #808080; font-weight:400; } 18.table2 { margin-top: 1.5em; margin-bottom: 0.5em } 19td,th {border-color:#EEEEEE; vertical-align:top; padding:2px} 20th { background-color: #CCCCCC } 21table {border-collapse: collapse} 22caption { font-weight: bold } 23--> 24</style> 25</head> 26 27<body> 28 29<table width="100%" cellpadding="0" cellspacing="0" border="0"> 30 <tr> 31 <td colspan="2" style="padding:0; margin:0"> 32 <table width="100%" border="0" cellpadding="0" cellspacing="0"> 33 <tr> 34 <td class="icon" style="padding:2px; margin:0"><a href="http://www.unicode.org/"> 35 <img border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" align="middle" alt="[Unicode]" width="34" height="33" /></a> 36 <a class="bar" href="index.html"><font size="3">Common Locale Data Repository</font></a></td> 37 <td class="bar" style="padding:2px; margin:0"> 38 <a href="http://www.unicode.org" class="bar">Home</a> | 39 <a href="http://www.unicode.org/sitemap/" class="bar">Site Map</a> | 40 <a href="http://www.unicode.org/search/" class="bar">Search</a></td> 41 </tr> 42 
</table> 43 </td> 44 </tr> 45 <tr> 46 <td style="padding:2px; margin:0" colspan="2" class="gray"> </td> 47 </tr> 48 <tr> 49 <td style="padding:2px; margin:0" valign="top" width="25%" class="navCol"> 50 <table class="navColTable" border="0" width="100%" cellspacing="4" cellpadding="0"> 51 <tr> 52 <td style="padding:2px; margin:0" class="navColTitle" colspan="2">Contents</td> 53 </tr> 54 <tr> 55 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 56 <a href="#Introduction">Introduction</a></td> 57 </tr> 58 <tr> 59 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 60 <a href="#Variants">Variants</a></td> 61 </tr> 62 <tr> 63 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 64 <a href="#Guidelines">Guidelines</a></td> 65 </tr> 66 <tr> 67 <td style="padding:2px; margin:0" valign="top" class="navColCell" width="1%"> 68 </td> 69 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 70 <a href="#Ambiguity">Ambiguity</a></td> 71 </tr> 72 <tr> 73 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 74 </td> 75 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 76 <a href="#Pronunciation">Pronunciation</a></td> 77 </tr> 78 <tr> 79 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 80 </td> 81 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 82 <a href="#Cautions">Cautions</a></td> 83 </tr> 84 <tr> 85 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 86 <a href="#Available_Transliterations">Available Transliterations</a></td> 87 </tr> 88 <tr> 89 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 90 </td> 91 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 92 <a href="#Korean">Korean</a></td> 93 </tr> 94 <tr> 95 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 96 </td> 97 <td style="padding:2px; margin:0" valign="top" class="navColCell"><a 
href="#Japanese">Japanese</a></td> 98 </tr> 99 <tr> 100 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 101 </td> 102 <td style="padding:2px; margin:0" valign="top" class="navColCell"><a href="#Greek">Greek</a></td> 103 </tr> 104 <tr> 105 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 106 </td> 107 <td style="padding:2px; margin:0" valign="top" class="navColCell"><a href="#Cyrillic">Cyrillic</a></td> 108 </tr> 109 <tr> 110 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 111 </td> 112 <td style="padding:2px; margin:0" valign="top" class="navColCell"><a href="#Indic">Indic</a></td> 113 </tr> 114 <tr> 115 <td style="padding:2px; margin:0" valign="top" class="navColCell"> 116 </td> 117 <td style="padding:2px; margin:0" valign="top" class="navColCell"><a href="#Others">Others</a></td> 118 </tr> 119 <tr> 120 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 121 <a href="#Submitting_Transliterations">Submitting Transliterations</a></td> 122 </tr> 123 <tr> 124 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 125 <a href="#More_Information">More Information</a></td> 126 </tr> 127 <tr> 128 <td style="padding:2px; margin:0" class="navColTitle" colspan="2">Unicode CLDR</td> 129 </tr> 130 <tr> 131 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 132 <a href="index.html">CLDR Project</a></td> 133 </tr> 134 <tr> 135 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 136 <a href="repository_access.html">CLDR Releases (Downloads)</a></td> 137 </tr> 138 <tr> 139 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 140 <a href="survey_tool.html">CLDR Survey Tool</a></td> 141 </tr> 142 <tr> 143 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 144 <a href="filing_bug_reports.html">CLDR Bug Reports</a></td> 145 </tr> 146 <tr> 147 <td style="padding:2px; 
margin:0" valign="top" class="navColCell" colspan="2"> 148 <a href="comparison_charts.html">CLDR Charts</a></td> 149 </tr> 150 <tr> 151 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 152 <a href="process.html">CLDR Process</a></td> 153 </tr> 154 <tr> 155 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 156 <a href="http://www.unicode.org/reports/tr35/">UTS #35: Locale Data Markup Language 157 (LDML)</a></td> 158 </tr> 159 <tr> 160 <td style="padding:2px; margin:0" class="navColTitle" colspan="2">Related Links</td> 161 </tr> 162 <tr> 163 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2">Join the 164 <a href="http://www.unicode.org/consortium/consort.html">Unicode Consortium</a></td> 165 </tr> 166 <tr> 167 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 168 <a href="http://www.unicode.org/reports/">Unicode Technical Reports</a></td> 169 </tr> 170 <tr> 171 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 172 <a href="http://www.unicode.org/faq/reports_process.html">Technical Reports Development 173 and Maintenance Process</a></td> 174 </tr> 175 <tr> 176 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 177 <a href="http://www.unicode.org/consortium/utc.html">Unicode Technical Committee</a></td> 178 </tr> 179 <tr> 180 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 181 <a href="http://www.unicode.org/versions/">Versions of the Unicode Standard</a></td> 182 </tr> 183 <tr> 184 <td style="padding:2px; margin:0" class="navColTitle" colspan="2">Other Publications</td> 185 </tr> 186 <tr> 187 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 188 <a href="http://www.unicode.org/standard/standard.html">The Unicode Standard</a></td> 189 </tr> 190 <tr> 191 <td style="padding:2px; margin:0" valign="top" class="navColCell" colspan="2"> 192 <a 
href="http://www.unicode.org/notes/">Unicode Technical Notes</a></td> 193 </tr> 194 </table> 195 <!-- BEGIN CONTENTS --></td> 196 <td> 197 <table> 198 <tr> 199 <td class="contents" valign="top"> 200 <div class="body"> 201 <h1 align="center">Unicode Transliteration Guidelines</h1><br /> 202 <blockquote> 203 <p><i>This document describes guidelines for the creation and use of CLDR 204 transliterations. Preliminary 205 <a href="http://www.unicode.org/cldr/data/charts/transforms/index.html">charts</a> 206 are available for the available transliterations -- be sure to read the known issues 207 there. Please file any feedback 208 on this document or those charts at 209 <a href="http://www.unicode.org/cldr/bugs/locale-bugs">Locale Bugs</a>.</i></p> 210 </blockquote> 211 <h2><a name="Introduction">Introduction</a></h2> 212 <table border="1" width="33%" id="table21" cellspacing="0" cellpadding="2" style="border-collapse: collapse; float: right; margin:1em; border-color:#BB0000"> 213 <tr> 214 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 215 <font size="2"><i><b>Display. </b></i>Some of the characters in this 216 document may not be visible in your browser, and with some fonts the diacritics 217 will not be correctly placed on the base letters. See 218 <a href="http://www.unicode.org/help/display_problems.html">Display Problems</a>.</font></td> 219 </tr> 220 </table> 221 <p>Transliteration is the general process of converting characters from one script 222 to another, where the result is roughly phonetic for languages in the target script. 223 For example, "Phobos" and "Deimos" are transliterations of Greek mythological "Φόβος" 224 and "Δεῖμος" into Latin letters, used to name the moons of Mars.</p> 225 <p>Transliteration is <i>not</i> translation. Rather, transliteration is the conversion 226 of letters from one script to another without translating the underlying words. 
227 The following shows a sample of transliteration systems:</p> 228 <table id="table20" style="border-collapse: collapse" border="1" cellspacing="0" cellpadding="2"> 229 <caption>Sample Transliteration Systems</caption> 230 <tr> 231 <th width="25%" style="vertical-align: top">Source</th> 232 <th width="25%" style="vertical-align: top">Translation</th> 233 <th style="vertical-align: top" width="25%">Transliteration</th> 234 <th width="25%" style="vertical-align: top">System</th> 235 </tr> 236 <tr> 237 <td bgcolor="#cccccc" style="vertical-align: top" rowspan="2">Αλφαβητικός</td> 238 <td bgcolor="#cccccc" style="vertical-align: top"><i>Alphabetic</i></td> 239 <td bgcolor="#cccccc" style="vertical-align: top">Alphabētikós</td> 240 <td bgcolor="#cccccc" style="vertical-align: top">Classic</td> 241 </tr> 242 <tr> 243 <td bgcolor="#cccccc" style="vertical-align: top"> </td> 244 <td bgcolor="#cccccc" style="vertical-align: top">Alfavi̱tikós</td> 245 <td bgcolor="#cccccc" style="vertical-align: top">UNGEGN</td> 246 </tr> 247 <tr> 248 <td style="vertical-align: top" rowspan="2">しんばし</td> 249 <td style="vertical-align: top" rowspan="2"><i>new bridge<br />(district in Tokyo)</i></td> 250 <td style="vertical-align: top">shimbashi</td> 251 <td style="vertical-align: top">Hepburn</td> 252 </tr> 253 <tr> 254 <td style="vertical-align: top">sinbasi</td> 255 <td style="vertical-align: top">Kunrei</td> 256 </tr> 257 <tr> 258 <td style="vertical-align: top" rowspan="3">яйца Фаберже</td> 259 <td style="vertical-align: top" rowspan="3"><i>Fabergé eggs</i></td> 260 <td style="vertical-align: top">yaytsa Faberzhe</td> 261 <td style="vertical-align: top">BGN/PCGN</td> 262 </tr> 263 <tr> 264 <td style="vertical-align: top">jajca Faberže</td> 265 <td style="vertical-align: top">Scholarly</td> 266 </tr> 267 <tr> 268 <td style="vertical-align: top">âjca Faberže</td> 269 <td style="vertical-align: top">ISO</td> 270 </tr> 271 </table> 272 <p>While an English speaker may 273 not recognize that 
the Japanese word <i>kyanpasu</i> is equivalent 274 to the English word <i>campus</i>, the word <i>kyanpasu</i> is still far easier 275 to recognize and interpret than if the letters were left in the original script. 276 There are several situations where this transliteration is especially useful, 277 such as the following. See the sidebar for examples.</p> 278 <table id="table22" cellpadding="2" style="margin:1em; border-collapse: collapse" border="1" align="right"> 279 <caption><b>Sample Transliterations</b></caption> 280 <tr> 281 <th style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Source</th> 282 <th style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Transliteration</th> 283 </tr> 284 <tr> 285 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">김, 286 국삼</td> 287 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Gim, 288 Gugsam </td> 289 </tr> 290 <tr> 291 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">김, 292 명희</td> 293 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Gim, 294 Myeonghyi </td> 295 </tr> 296 <tr> 297 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">정, 298 병호</td> 299 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Jeong, Byeongho 300 </td> 301 </tr> 302 <tr> 303 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">...</td> 304 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">...</td> 305 </tr> 306 <tr> 307 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">たけだ, まさゆき</td> 308 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Takeda, Masayuki 309 </td> 310 </tr> 311 <tr> 312 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">ますだ, よしひこ</td> 313 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Masuda, Yoshihiko 314 </td> 315 </tr> 316 <tr> 317 <td style="vertical-align: top; border-color: 
#EEEEEE; padding: 2px">やまもと, のぼる 318 </td> 319 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Yamamoto, Noboru 320 </td> 321 </tr> 322 <tr> 323 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">...</td> 324 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">...</td> 325 </tr> 326 <tr> 327 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Ρούτση, Άννα</td> 328 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Roútsē, Ánna</td> 329 </tr> 330 <tr> 331 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Καλούδης, Χρήστος</td> 332 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Kaloúdēs, Chrḗstos</td> 333 </tr> 334 <tr> 335 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Θεοδωράτου, Ελένη</td> 336 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">Theodōrátou, Elénē</td> 337 </tr> 338 </table> 339 <ul> 340 <li>When a user views names that are entered in a world-wide database, it is 341 extremely helpful to view and refer to the names in the user's native script.</li> 342 <li>When the user performs searching and indexing tasks, transliteration can 343 retrieve information in a different script.</li> 344 <li>When a service engineer is sent a program dump that is filled with characters 345 from foreign scripts, it is much easier to diagnose the problem when the text 346 is transliterated and the service engineer can recognize the characters. 347 </li> 348 </ul> 349 <p>The term <i>transliteration</i> 350 is sometimes given a narrow meaning, implying that the transformation is <i>reversible</i> (sometimes called 351 <i>lossless</i>). In CLDR this is not the case; 352 the term <i>transliteration</i> 353 is interpreted broadly to mean both reversible and non-reversible transforms of 354 text. 
(Note that even if theoretically a <span class="nfakPe">transliteration</span> 355 system is supposed to be reversible, in source standards it is often not 356 specified in sufficient detail in the edge cases to actually be reversible.) A 357 non-reversible transliteration is often called a <i>transcription</i>, or called 358 a <i>lossy </i>or<i> ambiguous</i> transcription.</p> 359 <p>Note that reversibility is generally 360 only in one direction, so for native to Latin a transliteration may be reversible, but not the 361 contrary. For example, Hangul is reversible, in that any Hangul to Latin to 362 Hangul should provide the same Hangul as the input. Thus we have the following:</p> 363 <blockquote> 364 <p>갗 365 <font face="Times New Roman">→</font> 366 gach <font face="Times New Roman">→</font> 367 갗</p> 368 </blockquote> 369 <p>However, for completeness, many Latin 370 characters have fallbacks. This means that more than one Latin character may map to the same 371 Hangul. Thus <i>from</i> Latin we don't have reversibility, because two 372 different Latin source strings round-trip back to the same Latin string.</p> 373 <blockquote> 374 <p>gach 375 <font face="Times New Roman">→</font> 376 갗 <font face="Times New Roman">→</font> 377 gach<br>gac 378 <font face="Times New Roman">→</font> 379 갗 <font face="Times New Roman">→</font> 380 gach</p> 381 </blockquote> 382 <p>Transliteration can also be used to convert unfamiliar letters within the same 383 script, such as converting Icelandic THORN (þ) to th. These are not typically reversible.</p> 384 <blockquote> 385 <p><i>There is an online demo using released CLDR data 386 at <a href="http://demo.icu-project.org/icu-bin/translit">ICU Transform Demo</a>.</i></p> 387 </blockquote> 388 <h2><a name="Variants">Variants</a></h2> 389 <p>There are many systems for transliteration between languages: the same text can 390 be transliterated in many different ways. 
For example, for the Greek example above, 391 the transliteration is classical, while the <a href="http://www.eki.ee/wgrs/">UNGEGN</a> 392 alternate has different correspondences, such as φ → <i>f</i> instead of φ → <i>ph</i>.</p> 393 <p>CLDR provides for generic mappings from script to script (such as Cyrillic-Latin), 394 and also language-specific variants (Russian-French, or Serbian-German). There can 395 also be semi-generic mappings, such as Russian-Latin or Cyrillic-French. These can 396 be referred to, respectively, as script transliterations, language-specific transliterations, or 397 script-language transliterations. Transliterations from other scripts to Latin are also called 398 <i>Romanizations</i>.</p> 399 <p>Even within particular languages, there can be variant systems according to different 400 authorities, or even varying across time (if the authority for a system changes its recommendation). 401 The canonical identifier that CLDR uses for these has the form:</p> 402 <blockquote> 403 <p><i>source-target/variant</i></p> 404 </blockquote> 405 <p>The source (and target) can be a language or script, either using the English 406 name or a locale code. The variant should specify the authority for the system, and if necessary 407 for disambiguation, 408 the year. For example, the identifier for the Russian to Latin transliteration according 409 to the UNGEGN system would be:</p> 410 <ul> 411 <li>ru-und_Latn/UNGEGN, or</li> 412 <li>Russian-Latin/UNGEGN</li> 413 </ul> 414 <p>If there were multiple versions of these over time, the variant would be, say, 415 UNGEGN2006.</p> 416 <p>The assumption is that implementations will allow the use of fallbacks, if the 417 exact transliteration specified is unavailable. For example, the following would 418 be the fallback chain for the identifier Russian-English/UNGEGN. 
This is similar 419 to the <i>Lookup Fallback Pattern</i> used in 420 <a href="http://tools.ietf.org/html/bcp47">BCP 47 Tags for Identifying Languages</a>, 421 except that it uses a "stepladder approach" to progressively handle the fallback 422 among source, target, and variant, with priorities being the target, source, and 423 variant, in that order.</p> 424 <ul> 425 <li>Russian-English/UNGEGN</li> 426 <li>Russian-English</li> 427 <li>Cyrillic-English/UNGEGN</li> 428 <li>Cyrillic-English</li> 429 <li>Russian-Latin/UNGEGN</li> 430 <li>Russian-Latin</li> 431 <li>Cyrillic-Latin/UNGEGN</li> 432 <li>Cyrillic-Latin</li> 433 </ul> 434 <h2><a name="Guidelines">Guidelines</a></h2> 435 <p>There are a number of generally desirable guidelines for script transliterations. 436 These guidelines are rarely satisfied simultaneously, so constructing a reasonable 437 transliteration is always a process of balancing different requirements. These requirements 438 are most important for people who are building transliterations, but are also useful 439 as background information for users.</p> 440 <p>The following lists the general guidelines 441 for Unicode CLDR transliterations: </p> 442 <ul> 443 <li><i>standard:</i> follow established systems (standards, authorities, or 444 <i>de facto</i> practice) where possible, deviating sometimes where necessary for reversibility. 445 In CLDR, the systems are generally described in the comments in the XML data files found in 446 the 447 <a target="_blank" href="http://www.unicode.org/cldr/data/common/transforms/">transforms</a> 448 folder online. For example, the system for Arabic transliteration in CLDR is 449 found in the comments in 450 <a target="_blank" href="http://www.unicode.org/cldr/data/common/transforms/Arabic-Latin.xml">Arabic<wbr>-Latin.xml</a>; 451 there is a reference to the 452 <a target="_blank" href="http://www.eki.ee/wgrs/rom1_ar.pdf">UNGEGN Arabic Tables</a>. 
453 Similarly for Hebrew, which also follows the 454 <a href="http://www.eki.ee/wgrs/rom1_he.pdf">Hebrew UNGEGN Tables</a>.</li> 455 <li><i>complete</i>: every well-formed sequence of characters in the source 456 script should transliterate to a sequence of characters from the target script, 457 and vice versa.</li> 458 <li><i>predictable</i>: the letters themselves (without any knowledge of the 459 languages written in that script) should be sufficient for the transliteration, 460 based on a relatively small number of rules. This allows the transliteration 461 to be performed mechanically. </li> 462 <li><i>pronounceable</i>: the resulting characters have reasonable 463 pronunciations in the target script. Transliteration is not as useful if the process simply 464 maps the characters without any regard to their pronunciation. Simply mapping 465 by alphabetic order ("αβγδεζηθ..." to "abcdefgh...") could yield strings that 466 might be complete and unambiguous, but the pronunciation would be completely 467 unexpected.</li> 468 <li><i>reversible</i>: it is possible to recover the text in the source script 469 from the transliteration in the target script. That is, someone that knows the transliteration 470 rules would be able to recover the precise spelling of the original source text. 471 For example, it is possible to go from <i>Elláda</i> back to the original Ελλάδα, 472 while if the transliteration were <i>Ellada</i> (with no accent), it would 473 not be possible.</li> 474 </ul> 475 <p>Some of these principles may not be achievable simultaneously; in particular, 476 adherence to a standard system <i>and</i> reversibility. Often small changes in 477 existing systems can be made to accommodate reversibility. 
However, where a particular 478 system specifies fundamentally non-reversible transliterations, those transliterations 479 as represented in CLDR may not be reversible.</p> 480 <h3><a name="Ambiguity">Ambiguity</a></h3> 481 <p>In transliteration, multiple characters may produce ambiguities 482 (non-reversible mappings) unless the rules 483 are carefully designed. For example, the Greek character PSI (ψ) maps to <i>ps</i>, 484 but <i>ps</i> could also result from the sequence PI, SIGMA (πσ) since PI (π) maps 485 to p and SIGMA (σ) maps to s. </p> 486 <p>The Japanese transliteration standards provide a good mechanism for handling 487 these kinds of ambiguities. Using the Japanese transliteration standards, whenever 488 an ambiguous sequence in the target script does not result from a single letter, 489 the transform uses an apostrophe to disambiguate it. For example, it uses that procedure 490 to distinguish between <i>man'ichi</i> and <i>manichi</i>. Using this procedure, 491 the Greek sequence PI, SIGMA (πσ) maps to <i>p's</i>. This method is recommended 492 for all script transliteration methods, although sometimes the character may vary: 493 for example, "-" is used in Korean. </p> 494 <blockquote> 495 <p><b>Note:</b> We've had a recent proposal to consistently use the hyphenation dot 496 for this code, thus we'd have πσ → p‧s.</p> 497 </blockquote> 498 <p>A second problem is that some characters in a target script are not normally 499 found outside of certain contexts. For example, the small Japanese "ya" character, 500 as in "kya" (キャ), is not normally found in isolation. To handle such characters, 501 the Unicode transliterations currently use different conventions.</p> 502 <ul> 503 <li>Tilde: "ャ" in isolation is represented as "~ya"</li> 504 <li>Diacritics: Greek "ς" in isolation is represented as s̱</li> 505 </ul> 506 <blockquote> 507 <p><b>Note:</b> The CLDR committee is considering converging on a common representation for 508 this. 
The advantage of a common representation is that it allows for easy filtering.</p> 509 </blockquote> 510 <p>For the default script transforms, the goal is to have unambiguous mappings, 511 with variants for any common use mappings that are ambiguous (non-reversible). In 512 some cases, however, case may not be preserved. For example, </p> 513 <table id="table16" cellspacing="1" cellpadding="2" border="1" style="border-collapse: collapse"> 514 <tr> 515 <th>Latin</th> 516 <th>Greek</th> 517 <th>Latin</th> 518 </tr> 519 <tr> 520 <td>ps PS</td> 521 <td>ψ Ψ</td> 522 <td>ps PS</td> 523 </tr> 524 <tr> 525 <td>psa Psa <b>PsA</b></td> 526 <td>ψα Ψα <b>ΨΑ</b></td> 527 <td>psa Psa <b>PSA</b></td> 528 </tr> 529 <tr> 530 <td>psA PSA <b>PSa</b></td> 531 <td>ψΑ ΨΑ <b>Ψα</b></td> 532 <td>psA PSA <b>Psa</b></td> 533 </tr> 534 </table> 535 <p>The following shows Greek text that is mapped to fully reversible Latin: </p> 536 <table id="table5" border="1"> 537 <tr> 538 <th>Greek-Latin</th> 539 <th> </th> 540 </tr> 541 <tr> 542 <td>τί φῄς; γραφὴν σέ τις, ὡς ἔοικε, γέγραπται: οὐ γὰρ ἐκεῖνό γε καταγνώσομαι, 543 ὡς σὺ ἕτερον.</td> 544 <td>tí phḗis; graphḕn sé tis, hōs éoike, gégraptai: ou gàr ekeînó ge katagnṓsomai, 545 hōs sỳ héteron.</td> 546 </tr> 547 </table> 548 <p>If the user wants a version without certain accents, then CLDR's <i>chaining 549 rules </i>can be 550 used to remove the accents. For example, the following transliterates to Latin but 551 removes the macron accents on the long vowels. 
</p> 552 <table id="table6" border="1"> 553 <tr> 554 <th>Greek-Latin; nfd; [\u0304] remove; nfc</th> 555 <th> </th> 556 </tr> 557 <tr> 558 <td>τί φῄς; γραφὴν σέ τις, ὡς ἔοικε, γέγραπται: οὐ γὰρ ἐκεῖνό γε καταγνώσομαι, 559 ὡς σὺ ἕτερον.</td> 560 <td>tí phéis; graphèn sé tis, hos éoike, gégraptai: ou gàr ekeînó ge katagnósomai, 561 hos sỳ héteron.</td> 562 </tr> 563 </table> 564 <p>The above chaining rules, separated by semi-colons, perform the following 565 commands in order:</p> 566 <table id="table23" border="1"> 567 <tr> 568 <th style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 569 Rule</th> 570 <th style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 571 Description</th> 572 </tr> 573 <tr> 574 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px" nowrap>Greek-Latin</td> 575 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 576 transliterate Greek to Latin</td> 577 </tr> 578 <tr> 579 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px" nowrap>nfd 580 </td> 581 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 582 convert to Unicode NFD format (separating accents from base characters)</td> 583 </tr> 584 <tr> 585 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px" nowrap>[\u0304] remove</td> 586 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 587 remove accents, but <i>filter</i> the command to only apply to a single 588 character: <code> 589 <a target="c" href="http://unicode.org/cldr/utility/character.jsp?a=0304"> 590 U+0304</a></code> ( ̄ ) COMBINING MACRON</td> 591 </tr> 592 <tr> 593 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px" nowrap>nfc</td> 594 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 595 convert to Unicode NFC format (rejoining accents to base characters)</td> 596 </tr> 597 </table> 598 <p>The following transliterates to Latin but removes <i>all</i> accents. 
Note 599 that the only change is to expand the filter for the <i>remove</i> command.</p> 600 <table id="table7" border="1"> 601 <tr> 602 <th>Greek-Latin; nfd; [:nonspacing marks:] remove; nfc</th> 603 <th> </th> 604 </tr> 605 <tr> 606 <td>τί φῄς; γραφὴν σέ τις, ὡς ἔοικε, γέγραπται: οὐ γὰρ ἐκεῖνό γε καταγνώσομαι, 607 ὡς σὺ ἕτερον.</td> 608 <td>ti pheis; graphen se tis, hos eoike, gegraptai: ou gar ekeino ge katagnosomai, 609 hos sy heteron.</td> 610 </tr> 611 </table> 612 <h3><a name="Pronunciation">Pronunciation</a></h3> 613 <p>Standard transliteration methods often do not follow the pronunciation rules 614 of any particular language in the target script. For example, the Japanese Hepburn 615 system uses a "j" that has the English phonetic value (as opposed to French, German, 616 or Spanish), but uses vowels that do not have the standard English sounds. A transliteration 617 method might also require some special knowledge to have the correct pronunciation. 618 For example, in the Japanese kunrei-siki system, "ti" is pronounced as English "chee".</p> 619 <p>This is similar to situations where there are different languages within the same 620 script. For example, knowing that the word <i>Gewalt</i> comes from German allows 621 a knowledgeable reader to pronounce the "w" as a "v". 622 When encountering a 623 foreign word like <i>jawa</i>, there is little assurance how it is to be 624 pronounced even when it is not a <span class="nfakPe">transliteration</span> (it is just from another Latin-script language). The <i>j</i> could be 625 pronounced (for an English speaker) as in <i>jump</i>, 626 or <i>Junker</i>, 627 or <i>jour</i>; 628 and so on. Transcriptions are only roughly phonetic, and only so when the 629 specific pronunciation rules are understood.</p> 630 <p>The pronunciation of the characters 631 in the original script may also be influenced by context, which may be 632 particularly misleading in transliteration. 
For example, in the Bengali নিঃশব, 633 transliterated as niḥśaba, the <i>visarga</i> <i>ḥ</i> 634 is not pronounced itself (whereas elsewhere it may be) but lengthens the 635 <i>ś</i> 636 sound, and the final inherent <i>a</i> is pronounced (whereas it 637 commonly is not), and the two inherent a's are pronounced as <i>ɔ</i> 638 and <i>ô</i>, 639 respectively.</p> 640 <p>In some cases, transliteration may be heavily influenced by tradition. For example, 641 the modern Greek letter beta (β) sounds like a "v", but a 642 transliteration may use a <i>b</i> (as in <i>biology</i>). In that case, the user would need to know 643 that a "b" in the transliterated word corresponds to beta (β) and is to be pronounced 644 as a <i>v</i> in modern Greek.</p> 645 <p>Letters may also be transliterated differently according 646 to their context to make the pronunciation more predictable. For example, since 647 the Greek sequence GAMMA GAMMA (γγ) is pronounced as <i>ng</i>, the first GAMMA 648 can be transcribed as an "n" in that context. 649 Similarly, the transliteration can give other guidance to the pronunciation in the 650 source language, for example, using "n" or "m" for the same Japanese character 651 (ん) depending on context, even though there is no distinction in the source 652 script.</p> 653 <p>In general, predictability means that when transliterating Latin script to other 654 scripts using reversible transliterations, English text will not produce phonetic 655 results. This is because the pronunciation of English cannot be predicted easily 656 from the letters in a word: e.g. <i>grove</i>, <i>move</i>, and <i>love</i> all end with "ove", but are 657 pronounced very differently. </p> 658 <h3><a name="Cautions">Cautions</a></h3> 659 <p>Reversibility may require modifications of traditional transcription methods. 660 For example, there are two standard methods for transliterating Japanese katakana 661 and hiragana into Latin letters. The <i>kunrei-siki</i> method is unambiguous. 
The Hepburn 662 method can be more easily pronounced by foreigners but is ambiguous. In the Hepburn 663 method, both ZI (ジ) and DI (ヂ) are represented by "ji" and both ZU (ズ) and DU (ヅ) 664 are represented by "zu". A slightly amended version of Hepburn, that uses "dji" 665 for DI and "dzu" for DU, is unambiguous. </p> 666 <p>When a sequence of two letters maps to one, case mappings (uppercase and lowercase) 667 must be handled carefully to ensure reversibility. For cased scripts, the two letters 668 may need to have different cases, depending on the next letter. For example, the 669 Greek letter PHI (Φ) maps to PH in Latin, but Φο maps to Pho, and not to PHo. 670 </p> 671 <p>Some scripts have characters that take on different shapes depending on their 672 context. Usually, this is done at the display level (such as with Arabic) and does 673 not require special transliteration support. However, in a few cases this is represented 674 with different character codes, such as in Greek and Hebrew. For example, a Greek 675 SIGMA is written in a final form (ς) at the end of words, and a non-final form (σ) 676 in other locations. This also requires the transform to map different characters based 677 on the context.</p> 678 <p>Another thing to look out for when 679 dealing with cased scripts is that some of the characters in the target script may 680 not be able to represent case distinctions, such as some of the IPA characters 681 in the Latin script.</p> 682 <p>It is useful for the reverse mapping to be complete so that arbitrary strings 683 in the target script can be reasonably mapped back to the source script. Complete 684 reverse mapping makes it much easier to do mechanical quality checks and so on. 685 For example, even though the letter "q" might not be necessary in a transliteration 686 of Greek, it can be mapped to a KAPPA (κ). Such reverse mappings will not, in general, 687 be unambiguous. 
</p> 688 <h2><a name="Available_Transliterations">Available Transliterations</a></h2> 689 <p>Currently Unicode CLDR offers Romanizations for certain scripts, plus transliterations 690 between the Indic scripts (excluding Urdu). Additional script transliterations will 691 be added in the future.</p> 692 693 <p>Except where otherwise noted, all of these systems are designed to be reversible. 694 For bicameral scripts (those with uppercase and lowercase), however, case may not 695 be completely preserved.</p> 696 <p>The transliterations are also designed to be complete for any sequence of the 697 Latin letters <i>a-z</i>. A fallback is used for a letter that is not covered by 698 the transliteration, and default letters may be inserted as required. For example, 699 in the Hangul transliteration, <i>rink</i> → 린크 → <i>linkeu</i>. That is, "r" is 700 mapped to the closest other letter, and a default vowel is inserted at the end (since 701 "nk" cannot end a syllable).</p> 702 <p><i>Preliminary 703 <a href="http://www.unicode.org/cldr/data/charts/transforms/index.html">charts</a> 704 are available for the available transliterations. Be sure to read the known issues 705 described there.</i></p> 706 <h3><a name="Korean">Korean</a></h3> 707 <p>There are many Romanizations of Korean. The default transliteration in Unicode 708 CLDR follows the <a href="http://www.korean.go.kr/06_new/rule/rule06.jsp">Korean 709 Ministry of Culture &amp; Tourism Transliteration</a> regulations (see also 710 <a href="http://www.korea.net/korea/kor_loca.asp?code=A020303">English summary</a>). 711 There is an optional clause 8 variant for reversibility:</p> 712 <blockquote> 713 <p>"제 8 항 학술 연구 논문 등 특수 분야에서 한글 복원을 전제로 표기할 경우에는 한글 표기를 대상으로 적는다. 이때 글자 대응은 714 제2장을 따르되 'ㄱ, ㄷ, ㅂ, ㄹ'은 'g, d, b, l'로만 적는다. 음가 없는 'ㅇ'은 붙임표(-)로 표기하되 어두에서는 생략하는 715 것을 원칙으로 한다. 
기타 분절의 필요가 있을 때에도 붙임표(-)를 쓴다."</p> 716 <p><i>translation: </i>"Clause 8: When it is required to recover the original 717 Hangul representation faithfully as in scholarly articles, 'ㄱ, ㄷ, ㅂ, ㄹ' must 718 always be romanized as 'g, d, b, l' while the mapping for the rest of the letters 719 remains the same as specified in clause 2. The placeholder 'ㅇ' at the beginning 720 of a syllable should be represented with '-', but should be omitted at the beginning 721 of a word. In addition, '-' should be used in other cases where a syllable boundary 722 needs to be explicitly marked (be disambiguated)."</p> 723 </blockquote> 724 <p>There are a number of cases where this Romanization may be ambiguous, because 725 sometimes multiple Latin letters map to a single entity (jamo) in Hangul. This happens 726 with vowels and consonants, the latter being slightly more complicated because there 727 are both initial and final consonants:</p> 728 <table border="1" id="table18" style="border-collapse: collapse"> 729 <tr> 730 <th>Type</th> 731 <th>Multi-Character Consonants</th> 732 </tr> 733 <tr> 734 <td>Initial-Only</td> 735 <td>tt pp jj</td> 736 </tr> 737 <tr> 738 <td>Initial-or-Final</td> 739 <td>kk ch ss</td> 740 </tr> 741 <tr> 742 <td>Final-Only</td> 743 <td>gs nj nh lg lm lb ls lt lp lh bs ng</td> 744 </tr> 745 </table> 746 <p>CLDR uses the following rules for disambiguation of the possible boundaries 747 between letters, in order. The first rule comes from Clause 8.</p> 748 <ol> 749 <li>Don't break so as to require an implicit vowel or null consonant (if possible)</li> 750 <li>Don't break within Initial-Only or Initial-or-Final sequences (if possible)</li> 751 <li>Favor longest match first.</li> 752 </ol> 753 <p>If there is a single consonant between vowels, then Rule #1 will group it with 754 the following vowel if there is one (this is the same as the first part of Clause 755 8). 
If there is a sequence of four consonants between vowels, then there is only 756 one possible break (with well-formed text). So the only ambiguities lie with two 757 or three consonants between vowels, where there are possible multi-character consonants 758 involved. Even there, in most cases the resolution is simple, because there isn't 759 a possible multi-character consonant in the case of two, or two possible multi-character 760 consonants in the case of 3. For example, in the following cases, the left side 761 is unambiguous:</p> 762 <blockquote> 763 <p>angda = ang-da → 앙다<br />apda = ap-da → 앞다</p> 764 </blockquote> 765 <p>There are a relatively small number of possible ambiguities, listed below using 766 "a" as a sample vowel.</p> 767 <table border="1" id="table17" style="border-collapse: collapse" cellspacing="0" cellpadding="2"> 768 <tr> 769 <th align="left">No. of<br />Cons. </th> 770 <th align="left">Latin</th> 771 <th align="left">CLDR<br />Disambiguation</th> 772 <th align="left">Hangul</th> 773 <th colspan="2" align="left">Comments</th> 774 </tr> 775 <tr> 776 <td rowspan="18">2</td> 777 <td><code>atta</code></td> 778 <td><code>= a-tta</code></td> 779 <td>아따</td> 780 <td colspan="2" rowspan="3">Rule 1, then 2</td> 781 </tr> 782 <tr> 783 <td><code>appa</code></td> 784 <td><code>= a-ppa</code></td> 785 <td>아빠</td> 786 </tr> 787 <tr> 788 <td><code>ajja</code></td> 789 <td><code>= a-jja</code></td> 790 <td>아짜</td> 791 </tr> 792 <tr> 793 <td><code>akka</code></td> 794 <td><code>= a-kka</code></td> 795 <td>아까</td> 796 <td colspan="2" rowspan="3">Rule 1, then 2</td> 797 </tr> 798 <tr> 799 <td><code>assa</code></td> 800 <td><code>= a-ssa</code></td> 801 <td>아싸</td> 802 </tr> 803 <tr> 804 <td><code>acha</code></td> 805 <td><code>= a-cha</code></td> 806 <td>아차</td> 807 </tr> 808 <tr> 809 <td><code>agsa </code></td> 810 <td><code>= ag-sa</code></td> 811 <td>악사</td> 812 <td colspan="2" rowspan="12">Rule 1</td> 813 </tr> 814 <tr> 815 <td><code>anja </code></td> 816 
<td><code>= an-ja</code></td> 817 <td>안자</td> 818 </tr> 819 <tr> 820 <td><code>anha </code></td> 821 <td><code>= an-ha</code></td> 822 <td>안하</td> 823 </tr> 824 <tr> 825 <td><code>alga </code></td> 826 <td><code>= al-ga</code></td> 827 <td>알가</td> 828 </tr> 829 <tr> 830 <td><code>alma </code></td> 831 <td><code>= al-ma</code></td> 832 <td>알마</td> 833 </tr> 834 <tr> 835 <td><code>alba </code></td> 836 <td><code>= al-ba</code></td> 837 <td>알바</td> 838 </tr> 839 <tr> 840 <td><code>alsa </code></td> 841 <td><code>= al-sa</code></td> 842 <td>알사</td> 843 </tr> 844 <tr> 845 <td><code>alta </code></td> 846 <td><code>= al-ta</code></td> 847 <td>알타</td> 848 </tr> 849 <tr> 850 <td><code>alpa </code></td> 851 <td><code>= al-pa</code></td> 852 <td>알파</td> 853 </tr> 854 <tr> 855 <td><code>alha </code></td> 856 <td><code>= al-ha</code></td> 857 <td>알하</td> 858 </tr> 859 <tr> 860 <td><code>absa </code></td> 861 <td><code>= ab-sa</code></td> 862 <td>압사</td> 863 </tr> 864 <tr> 865 <td><code>anga </code></td> 866 <td><code>= an-ga</code></td> 867 <td>안가</td> 868 </tr> 869 <tr> 870 <td rowspan="9">3</td> 871 <td><code>agssa </code></td> 872 <td><code>= ag-ssa</code></td> 873 <td>악싸</td> 874 <td colspan="2" rowspan="4">Rule 1, then 2</td> 875 </tr> 876 <tr> 877 <td><code>anjja </code></td> 878 <td><code>= an-jja</code></td> 879 <td>안짜</td> 880 </tr> 881 <tr> 882 <td><code>alssa </code></td> 883 <td><code>= al-ssa</code></td> 884 <td>알싸</td> 885 </tr> 886 <tr> 887 <td><code>abssa </code></td> 888 <td><code>= ab-ssa</code></td> 889 <td>압싸</td> 890 </tr> 891 <tr> 892 <td><code>akkka </code></td> 893 <td><code>= akk-ka</code></td> 894 <td>앆카</td> 895 <td colspan="2" rowspan="2">Rule 1, then 2, then 3</td> 896 </tr> 897 <tr> 898 <td><code>asssa </code></td> 899 <td><code>= ass-sa</code></td> 900 <td>았사</td> 901 </tr> 902 <tr> 903 <td colspan="3"><i>Known bugs*</i></td> 904 <td colspan="2"><i>Should be Rule 1, then 2</i></td> 905 </tr> 906 <tr> 907 <td style="vertical-align: top; 
border-color: #EEEEEE; padding: 2px"> 908 <code>altta </code></td> 909 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px"> 910 <code>= alt-ta</code></td> 911 <td style="vertical-align: top; border-color: #EEEEEE; padding: 2px">앑타</td> 912 <td><code>= al-tta</code></td> 913 <td>알따</td> 914 </tr> 915 <tr> 916 <td><code>alppa </code></td> 917 <td><code>= alp-pa</code></td> 918 <td>앒파</td> 919 <td><code>= al-ppa</code></td> 920 <td>알빠</td> 921 </tr> 922 </table> 923 <p><font size="2">* There is one other known bug in CLDR 1.5.1, where "ch" 924 transliterates incorrectly in the degenerate case<font face="Arial"> — </font>when not followed by a vowel.</font></p> 925 <p>For vowel sequences, the situation is simpler. Only Rule #3 applies, so aeo = 926 ae-o → 애오.</p> 927 <h3><a name="Japanese">Japanese</a></h3> 928 <p>The default transliteration for Japanese uses a slight variant of the Hepburn 929 system. With the Hepburn system, both ZI (ジ) and DI (ヂ) are represented by "ji" and 930 both ZU (ズ) and DU (ヅ) are represented by "zu". This is amended slightly for reversibility 931 by using "dji" for DI and "dzu" for DU.</p> 932 <h3><a name="Greek">Greek</a></h3> 933 <p>The default transliteration uses a standard transcription for Greek which is 934 aimed at preserving etymology. 
The ISO 843 variant includes the following differences: 935 </p> 936 <table id="table10" border="1"> 937 <tr> 938 <th>Greek</th> 939 <th>Default</th> 940 <th>ISO 843</th> 941 </tr> 942 <tr> 943 <td>β</td> 944 <td>b</td> 945 <td>v</td> 946 </tr> 947 <tr> 948 <td>γ*</td> 949 <td>n</td> 950 <td>g</td> 951 </tr> 952 <tr> 953 <td>η</td> 954 <td>ē</td> 955 <td>ī</td> 956 </tr> 957 <tr> 958 <td>̔</td> 959 <td>h</td> 960 <td>(omitted)</td> 961 </tr> 962 <tr> 963 <td>̀</td> 964 <td>̀</td> 965 <td>(omitted)</td> 966 </tr> 967 <tr> 968 <td>~</td> 969 <td>~</td> 970 <td>(omitted)</td> 971 </tr> 972 </table> 973 <p>* before γ, κ, ξ, χ </p> 974 <h3><a name="Cyrillic">Cyrillic</a></h3> 975 <p>Cyrillic generally follows ISO 9 for the base Cyrillic set. There are tentative 976 plans to add extended Cyrillic characters in the future, plus variants for GOST 977 and other national standards.</p> 978 <h3><a name="Indic">Indic</a></h3> 979 <p>Transliteration of Indic scripts follows the ISO 15919<i> 980 <strong style="font-weight: 400">Transliteration of Devanagari and related Indic 981 scripts into Latin characters</strong></i>. Internally, all Indic scripts are transliterated 982 by converting first to an internal form, called Inter-Indic, then from Inter-Indic 983 to the target script. Inter-Indic thus provides a pivot between the different 984 scripts, and contains a superset of correspondences for all of them.</p> 985 <p>ISO 15919 differs from ISCII 91 in the application of diacritics 986 for certain characters. 
These differences are shown in the following example (illustrated 987 with Devanagari, although the same principles apply to the other Indic scripts): 988 </p> 989 <table id="table11" border="1"> 990 <tr> 991 <th>Devanagari</th> 992 <th>ISCII 91</th> 993 <th>ISO 15919</th> 994 </tr> 995 <tr> 996 <td>ऋ</td> 997 <td>ṛ</td> 998 <td>r̥</td> 999 </tr> 1000 <tr> 1001 <td>ऌ</td> 1002 <td>ḻ</td> 1003 <td>l̥</td> 1004 </tr> 1005 <tr> 1006 <td>ॠ</td> 1007 <td>ṝ</td> 1008 <td>r̥̄</td> 1009 </tr> 1010 <tr> 1011 <td>ॡ</td> 1012 <td>ḻ̄</td> 1013 <td>l̥̄</td> 1014 </tr> 1015 <tr> 1016 <td>ढ़</td> 1017 <td>d̂ha</td> 1018 <td>ṛha</td> 1019 </tr> 1020 <tr> 1021 <td>ड़</td> 1022 <td>d̂a</td> 1023 <td>ṛa</td> 1024 </tr> 1025 </table> 1026 <p>Transliteration rules from Indic to Latin are reversible with the exception of 1027 the ZWJ and ZWNJ used to request explicit rendering effects. For example:</p> 1028 <table id="table13" border="1"> 1029 <tr> 1030 <th>Devanagari</th> 1031 <th>Romanization</th> 1032 <th>Note</th> 1033 </tr> 1034 <tr> 1035 <td>क्ष</td> 1036 <td>kṣa</td> 1037 <td>normal</td> 1038 </tr> 1039 <tr> 1040 <td>क्ष</td> 1041 <td>kṣa</td> 1042 <td>explicit halant requested</td> 1043 </tr> 1044 <tr> 1045 <td>क्ष</td> 1046 <td>kṣa</td> 1047 <td>half-consonant requested</td> 1048 </tr> 1049 </table> 1050 <p>Transliteration between Indic scripts are roundtrip where there are corresponding 1051 letters. Otherwise, there may be fallbacks.</p> 1052 <p>There are two particular instances where transliterations may produce unexpected 1053 results: (1) where the final vowel is suppressed in speech, and (2) with the transliteration 1054 of 'c'. 
</p> 1055 <p>For example:</p> 1056 <table id="table14" border="1"> 1057 <tr> 1058 <th>Devanagari</th> 1059 <th style="vertical-align: top">Romanization</th> 1060 <th>Notes</th> 1061 </tr> 1062 <tr> 1063 <td>सेन्गुप्त</td> 1064 <td style="vertical-align: top">Sēngupta</td> 1065 <td style="vertical-align: top"> </td> 1066 </tr> 1067 <tr> 1068 <td>सेनगुप्त</td> 1069 <td style="vertical-align: top">Sēnagupta</td> 1070 <td style="vertical-align: top">The final 'a' is not pronounced</td> 1071 </tr> 1072 <tr> 1073 <td style="vertical-align: top">मोनिक</td> 1074 <td style="vertical-align: top">Monika</td> 1075 <td style="vertical-align: top"> </td> 1076 </tr> 1077 <tr> 1078 <td>मोनिच</td> 1079 <td style="vertical-align: top">Monica</td> 1080 <td style="vertical-align: top">The 'c' is pronounced "ch"</td> 1081 </tr> 1082 </table> 1083 <h3><a name="Others">Others</a></h3> 1084 <p>Unicode CLDR provides other transliterations based on the 1085 <a href="http://geonames.usgs.gov/">U.S. Board on Geographic Names</a> (BGN) transliterations. 1086 These are currently unidirectional <font face="Arial">—</font> to Latin only. The goal is to make them bidirectional 1087 in future versions of CLDR.</p> 1088 <p>Other transliterations are generally based on the 1089 <a href="http://www.eki.ee/wgrs/">UNGEGN: Working Group on Romanization Systems</a> 1090 transliterations. These systems are in 1091 wider actual implementation than most ISO standardized transliterations, and are 1092 published freely available on the web (<a target="_blank" href="http://www.eki.ee/wgrs/">http://www.eki.ee/wgrs/</a>) 1093 and thus easily accessible to all. 1094 The UNGEGN also has good documentation. For example, the 1095 <a href="http://www.eki.ee/wgrs/rom1_ar.pdf">UNGEGN Arabic Tables</a> 1096 not only presents the UN system, but compares it with the BGN/PCGN 1956 system, 1097 the I.G.N. 
System 1973, ISO 233:1984, the Royal Jordanian Geographic Centre 1098 System, and the Survey of Egypt System.</p> 1099 <h2><a name="Submitting_Transliterations">Submitting Transliterations</a></h2> 1100 <p>If you are interested in providing transliterations for one or 1101 more scripts, file an initial bug report at <i> 1102 <a href="http://www.unicode.org/cldr/bugs/locale-bugs">Locale Bugs</a></i>. The initial bug should contain the scripts and/or languages 1103 involved, and the system being followed (with a link to a full description of 1104 the proposed transliteration system), and a brief example. The proposed data can 1105 also be in that bug, or be added in a Reply to that bug.</p> 1106 <p>You can also file a bug in <i> 1107 <a href="http://www.unicode.org/cldr/bugs/locale-bugs">Locale Bugs</a></i> if 1108 you find a problem in an existing transliteration.</p> 1109 <p>For submission to CLDR, the data 1110 needs to be supplied in the correct XML format, and should follow an accepted 1111 standard. It is best if the results are 1112 tested using the <i><a href="http://demo.icu-project.org/icu-bin/translit">ICU 1113 Transform Demo</a></i> first, since if the data doesn't validate it would 1114 not be accepted into CLDR. As mentioned above, even if a transliteration is only used 1115 in certain countries or contexts, CLDR can provide for them with different 1116 variant tags.</p> 1117 <p>The format for rules is specified in 1118 <a target="_blank" href="http://www.unicode.org/reports/tr35/#Transform_Rules">Transform_Rules</a>. 1119 The XML format is just a series of rules and comments. For comparison, you can see what is 1120 currently in CLDR in the 1121 <a target="_blank" href="http://www.unicode.org/cldr/data/common/transforms/">transforms</a> 1122 folder online. 
For example, see 1123 <a target="_blank" href="http://www.unicode.org/cldr/data/common/transforms/Hebrew-Latin.xml">Hebrew-Latin.xml</a>.</p> 1124 <h2><a name="More_Information">More Information</a></h2> 1125 <p>For more information, see:</p> 1126 <ul> 1127 <li>BGN: <a href="http://geonames.usgs.gov/">U.S. Board on Geographic Names</a></li> 1128 <li>UNGEGN: <a href="http://www.eki.ee/wgrs/">UNITED NATIONS GROUP OF 1129 EXPERTS ON GEOGRAPHICAL NAMES: Working Group on Romanization 1130 Systems</a></li> 1131 <li><a href="http://transliteration.eki.ee/">Transliteration of Non-Roman Alphabets 1132 and Scripts (Søren Binks)</a> </li> 1133 <li><a href="http://www.archivists.org/catalog/stds99/chapter8.html">Standards 1134 for Archival Description: Romanization</a> </li> 1135 <li><a href="http://transliteration.eki.ee/pdf/Hindi-Marathi-Nepali.pdf">ISO-15919 1136 (Hindi)</a> </li> 1137 <li><a href="http://transliteration.eki.ee/pdf/Gujarati.pdf">ISO-15919 (Gujarati)</a> 1138 </li> 1139 <li><a href="http://transliteration.eki.ee/pdf/Kannada.pdf">ISO-15919 (Kannada)</a> 1140 </li> 1141 <li><a href="http://www.cdacindia.com/html/gist/down/iscii_d.asp">ISCII-91</a> 1142 </li> 1143 <li><a href="http://www.unicode.org/reports/tr35/">UTS #35: Locale Data Markup 1144 Language (LDML)</a></li> 1145 </ul></div></td> 1146 </tr> 1147 </table> 1148 <hr width="50%" /> 1149 <div align="center"> 1150 <center> 1151 <table cellspacing="0" cellpadding="0" border="0"> 1152 <tr> 1153 <td><a href="http://www.unicode.org/copyright.html"> 1154 <img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50" /></a></td> 1155 </tr> 1156 </table> 1157 <script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js"> 1158 1159 1160 1161 1162 </script> 1163 </center></div> 1164 </td> 1165 </tr> 1166</table> 1167 1168</body> 1169 1170</html>