# data/mappings/convrtrs.txt

# ******************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# ******************************************************************************

# If this converter alias table looks very confusing, a much easier to
# understand view can be found at this demo:
# http://demo.icu-project.org/icu-bin/convexp

# IMPORTANT NOTE
#
# This file is not read directly by ICU. If you change it, you need to
# run gencnval, and eventually run pkgdata to update the representation that
# ICU uses for aliases. The gencnval tool will normally compile this file into
# cnvalias.icu. The gencnval -v verbose option will help you when you edit
# this file.

# Please be friendly to the rest of us that edit this table by
# keeping this table free of tabs.

# This is an alias file used by the character set converter.
# A lot of converter information can be found in unicode/ucnv.h, but here
# is more information about this file.
#
# If you are adding a new converter to this list and want to include it in the
# icu data library, please be sure to add an entry to the appropriate ucm*.mk file
# (see ucmfiles.mk for more information).
#
# Here is the file format using BNF-like syntax:
#
# converterTable ::= tags { converterLine* }
# converterLine ::= converterName [ tags ] { taggedAlias* }'\n'
# taggedAlias ::= alias [ tags ]
# tags ::= '{' { tag+ } '}'
# tag ::= standard['*']
# converterName ::= [0-9a-zA-Z:_'-']+
# alias ::= converterName
#
# Except for the converter name, aliases are case insensitive.
# Names are separated by whitespace.
# Line continuation and comment syntax are similar to the GNU make syntax.
# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL
# TABULATION) are presumed to be a continuation of the previous line.
# The # symbol starts a comment and the comment continues till the end of
# the line.
#
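# The first name of an entry is the converter name; every name after it, on
# the same line or on its continuation lines, is an alias of that converter.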
#
# All names can be tagged by including a space-separated list of tags in
# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or
# some-charset{MIME* IANA*}. The order of tags does not matter, and
# whitespace is allowed between the tagged name and the tags list.
#
# The tags can be used to get standard names using ucnv_getStandardName().
#
# The complete list of recognized tags used in this file is defined in
# the affinity list near the beginning of the file.
#
# The * after the standard tag denotes that the previous alias is the
# preferred (default) charset name for that standard. There can only
# be one of these default charset names per converter.


# The world is getting more complicated...
# Supporting XML parsers, HTML, MIME, and similar applications
# that mark encodings with a charset name can be difficult.
# Many of these applications and operating systems will update
# their codepages over time.

# It means that a new codepage, one that differs from an
# old one by changing a code point, e.g., to the Euro sign,
# must not get an old alias, because it would mean that
# old files with this alias would be interpreted differently.

# If a codepage gets updated by assigning characters to previously
# unassigned code points, then a new name is not necessary.
# Also, some codepages map unassigned codepage byte values
# to the same numbers in Unicode for roundtripping. It may be
# industry practice to keep the encoding name in such a case, too
# (example: Windows codepages).

# The aliases listed in the list of character sets
# that is maintained by the IANA (http://www.iana.org/) must
# not be changed to mean encodings different from what this
# list shows. Currently, the IANA list is at
# http://www.iana.org/assignments/character-sets
# It should also be mentioned that the exact mapping table used for each
# IANA name usually isn't specified. This means that some other applications
# and operating systems are left to interpret the exact mappings for the
# underspecified aliases. For instance, Shift-JIS on a Solaris platform
# may be different from Shift-JIS on a Windows platform. This is why
# some of the aliases can be tagged to differentiate different mapping
# tables with the same alias. If an alias is given to more than one converter,
# it is considered to be an ambiguous alias, and the affinity list will
# choose the converter to use when a standard isn't specified with the alias.

# Name matching is case-insensitive. Also, dashes '-', underscores '_'
# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1
# and "cs iso latin 1" are the same).
# However, the names in the left column are directly file names
# or names of algorithmic converters, and their case must not
# be changed - or else code and/or file names must also be changed.
# For example, the converter ibm-921 is expected to be the file ibm-921.cnv.


# The immediately following list is the affinity list of supported standard tags.
# When multiple converters have the same alias under different standards,
# the standard nearest to the top of this list with that alias will
# be the first converter that will be opened. The ordering of the aliases
# after this affinity list does not affect the preferred alias, but it may
# affect the order of the returned list of aliases for a given converter.
#
# The general ordering is from specific and frequently used to more general
# or rarely used at the bottom.
# Affinity list: one standard tag per line, highest priority first. These are
# the only tags recognized in the { } tag lists of the entries below.
# Note: UTR22 is declared here, but no entry visible in this table carries it.
{
    UTR22           # Name format specified by http://www.unicode.org/unicode/reports/tr22/
    HTML            # WHATWG's encoding spec; https://encoding.spec.whatwg.org
    IANA            # Source: http://www.iana.org/assignments/character-sets
    MIME            # Source: http://www.iana.org/assignments/character-sets
    }

# Unicode encodings. In these entries the tags are attached to the converter
# name itself, so the converter name is also the preferred MIME and HTML name.
UTF-8 { MIME* HTML* }
    unicode-1-1-utf-8
    utf8

utf-16be { MIME* HTML* }

# The bare label "utf-16" is an alias of the little-endian converter.
utf-16le { MIME* HTML* }
    utf-16

# Single-byte encodings, part 1: IBM866 and ISO-8859-2 through ISO-8859-7.
# Layout of each entry: the converter name (ending in -html) on the first line,
# then the preferred MIME/HTML name (marked with *), then the remaining aliases.
# NOTE(review): each *-html name presumably corresponds to a <name>.cnv data
# file - confirm against the ucm*.mk lists before renaming any of them.
ibm866-html
    IBM866 { MIME* HTML* }
    866
    cp866
    csibm866

iso-8859-2-html
    ISO-8859-2 { MIME* HTML* }
    csisolatin2
    iso-ir-101
    iso8859-2
    iso88592
    iso_8859-2
    iso_8859-2:1987
    l2
    latin2

iso-8859-3-html
    ISO-8859-3 { MIME* HTML* }
    csisolatin3
    iso-ir-109
    iso8859-3
    iso88593
    iso_8859-3
    iso_8859-3:1988
    l3
    latin3

iso-8859-4-html
    ISO-8859-4 { MIME* HTML* }
    csisolatin4
    iso-ir-110
    iso8859-4
    iso88594
    iso_8859-4
    iso_8859-4:1988
    l4
    latin4

iso-8859-5-html
    ISO-8859-5 { MIME* HTML* }
    csisolatincyrillic
    cyrillic
    iso-ir-144
    iso8859-5
    iso88595
    iso_8859-5
    iso_8859-5:1988

# The -e and -i labels of ISO-8859-6 are plain aliases of this one converter.
iso-8859-6-html
    ISO-8859-6 { MIME* HTML* }
    arabic
    asmo-708
    csiso88596e
    csiso88596i
    csisolatinarabic
    ecma-114
    iso-8859-6-e
    iso-8859-6-i
    iso-ir-127
    iso8859-6
    iso88596
    iso_8859-6
    iso_8859-6:1987

iso-8859-7-html
    ISO-8859-7 { MIME* HTML* }
    csisolatingreek
    ecma-118
    elot_928
    greek
    greek8
    iso-ir-126
    iso8859-7
    iso88597
    iso_8859-7
    iso_8859-7:1987
    sun_eu_greek

# ISO-8859-8 (Hebrew). ISO-8859-8-E and ISO-8859-8-I are both listed as plain
# aliases of this converter, as is the label "visual". csiso88598e carries
# the MIME tag without *, so it is not a preferred name.
iso-8859-8-html
    ISO-8859-8 { MIME* HTML* }
    csiso88598e { MIME }
    csisolatinhebrew
    hebrew
    ISO-8859-8-E
    ISO-8859-8-I
    iso-ir-138
    iso8859-8
    iso88598
    iso_8859-8
    iso_8859-8:1988
    visual
    # The alias below is deliberately left commented out: adding it leads to
    # a failure in encoding-labels.html.
#   csiso88598i


# This alias has to be dealt with by TextCodecICU unless
# multiple encodings can share a single mapping table.
#ISO-8859-8-I { MIME* HTML* }
#   csiso88598i
#   logical

# Single-byte encodings, part 2: ISO-8859-10 through ISO-8859-16.
# ISO-8859-9 and ISO-8859-11 have no entry of their own in this table; those
# names are aliases of windows-1254-html and windows-874-html respectively.
iso-8859-10-html
    ISO-8859-10 { MIME* HTML* }
    csisolatin6
    iso-ir-157
    iso8859-10
    iso885910
    l6
    latin6

iso-8859-13-html
    ISO-8859-13 { MIME* HTML* }
    iso8859-13
    iso885913

iso-8859-14-html
    ISO-8859-14 { MIME* HTML* }
    iso8859-14
    iso885914

iso-8859-15-html
    ISO-8859-15 { MIME* HTML* }
    csisolatin9
    iso8859-15
    iso885915
    iso_8859-15
    l9

# Only the preferred name is listed for ISO-8859-16; it has no other aliases.
iso-8859-16-html
    ISO-8859-16 { MIME* HTML* }

# KOI8 and Mac Roman. The generic labels "koi" and "koi8" are aliases of
# KOI8-R, while "koi8-ru" is an alias of KOI8-U.
koi8-r-html
    KOI8-R { MIME* HTML* }
    cskoi8r
    koi
    koi8
    koi8_r

koi8-u-html
    KOI8-U { MIME* HTML* }
    koi8-ru

macintosh-html
    macintosh { MIME* HTML* }
    csmacintosh
    mac
    x-mac-roman

# Windows code pages and x-mac-cyrillic. Several of these entries also own
# labels that name other standards (see the per-entry notes).

# The ISO-8859-11 and TIS-620 labels are aliases of windows-874.
windows-874-html
    windows-874 { MIME* HTML* }
    dos-874
    iso-8859-11
    iso8859-11
    iso885911
    tis-620

windows-1250-html
    windows-1250 { MIME* HTML* }
    cp1250
    x-cp1250

windows-1251-html
    windows-1251 { MIME* HTML* }
    cp1251
    x-cp1251

# The ASCII and ISO-8859-1 (Latin-1) labels are all aliases of windows-1252.
windows-1252-html
    windows-1252 { MIME* HTML* }
    ansi_x3.4-1968
    ascii
    cp1252
    cp819
    csisolatin1
    ibm819
    iso-8859-1
    iso-ir-100
    iso8859-1
    iso88591
    iso_8859-1
    iso_8859-1:1987
    l1
    latin1
    us-ascii
    x-cp1252

windows-1253-html
    windows-1253 { MIME* HTML* }
    cp1253
    x-cp1253

# The ISO-8859-9 (Latin-5) labels are aliases of windows-1254.
windows-1254-html
    windows-1254 { MIME* HTML* }
    cp1254
    csisolatin5
    iso-8859-9
    iso-ir-148
    iso8859-9
    iso88599
    iso_8859-9
    iso_8859-9:1989
    l5
    latin5
    x-cp1254

windows-1255-html
    windows-1255 { MIME* HTML* }
    cp1255
    x-cp1255

windows-1256-html
    windows-1256 { MIME* HTML* }
    cp1256
    x-cp1256

windows-1257-html
    windows-1257 { MIME* HTML* }
    cp1257
    x-cp1257

windows-1258-html
    windows-1258 { MIME* HTML* }
    cp1258
    x-cp1258

x-mac-cyrillic-html
    x-mac-cyrillic { MIME* HTML* }
    x-mac-ukrainian

# Keep GBK and GB18030 separate for now until we decide
# what to do about them: crbug.com/339862
# The encoding spec requires that decoding to Unicode should use GB18030
# while encoding from Unicode should use GBK.

# GBK is the preferred MIME and IANA name of this converter. Unlike most
# entries in this table, none of its names carries the HTML tag.
windows-936-2000
                       GBK { MIME* IANA* }
                       chinese { IANA }
                       iso-ir-58 { IANA }
                       GB2312 { IANA MIME }
                       GB_2312-80 { IANA }
                       gb_2312
                       csGB2312 { IANA }
                       csiso58gb231280
                       x-gbk

# GB 18030 is partly algorithmic, using the MBCS converter
# The name gb18030 appears twice on this line: once as the converter name
# (tagged IANA*) and once as an alias tagged HTML* MIME*.
# NOTE(review): presumably repeated so one name can be the preferred name for
# all three standards - confirm before simplifying.
gb18030 { IANA* }      gb18030 { HTML* MIME* }

# Traditional Chinese. Big5-HKSCS is listed as a plain alias of this converter
# rather than as a converter of its own.
big5-html
    Big5 { MIME* HTML* }
    cn-big5
    csbig5
    x-x-big5
    Big5-HKSCS

# Japanese encodings.
euc-jp-html
    EUC-JP { MIME* HTML* }
    cseucpkdfmtjapanese
    x-euc-jp

# Unlike the *-html entries, this converter name carries comma-separated
# options (locale, version).
# NOTE(review): presumably selects ICU's algorithmic ISO-2022 converter rather
# than a mapping data file - confirm.
ISO_2022,locale=ja,version=0
    ISO-2022-JP { MIME* HTML* }
    csiso2022jp

# The ms932 and windows-31j labels are aliases of Shift_JIS here.
shift_jis-html
    Shift_JIS { MIME* HTML* }
    csshiftjis
    ms_kanji
    ms932
    shift-jis
    sjis
    windows-31j
    x-sjis

# Korean. The KS C 5601 labels and windows-949 are all aliases of EUC-KR here.
euc-kr-html
    EUC-KR { MIME* HTML* }
    cseuckr
    csksc56011987
    iso-ir-149
    korean
    ks_c_5601-1987
    ks_c_5601-1989
    ksc5601
    ksc_5601
    windows-949

# We need to keep these aliases so that documents labelled with them
# are converted to a single U+FFFD instead of being rendered as gibberish.
# Each line below is a complete entry: the converter name, its tags, and then
# any aliases on the same line.
ISO-2022-KR { HTML* MIME* } csISO2022KR { IANA }
ISO-2022-CN { IANA* HTML* } csISO2022CN  x-ISO-2022-CN-GB
ISO-2022-CN-EXT { IANA* HTML* }
HZ-GB-2312 { HTML* IANA* } HZ