/* * iconv library implemented with Win32 API. * * This file is placed in the public domain. * * Maintainer: Yukihiro Nakadaira * * Win32 API does not support strict encoding conversion for some * codepage. And MLang function drop or replace invalid bytes and does * not return useful error status as iconv. This implementation cannot * be used for encoding validation purpose. */ /* Primary source was http://code.google.com/p/win-iconv/source/checkout but now (Aug 2015) https://raw.githubusercontent.com/win-iconv/win-iconv/master/win_iconv.c Original R version from http://www.gtk.org/download-windows.html R changes: - add some missing encoding names, remove duplicate names. - add iconvlist() - set errno on error - XP-compatibility for WC_NO_BEST_FIT_CHARS -- use only for ASCII by default, control via R_WIN_ICONV_BEST_FIT environment variable - iconv_open option NOBESTFIT A reasonably complete list is at http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx */ static int R_WIN_ICONV_best_fit = 1; /* R addition */ /* for WC_NO_BEST_FIT_CHARS */ #ifndef WINVER # define WINVER 0x0500 #endif #include #include #include #include #define BUILDING_LIBICONV #include #define MB_CHAR_MAX 16 #define UNICODE_MODE_BOM_DONE 1 #define UNICODE_MODE_SWAPPED 2 #define FLAG_USE_BOM 1 #define FLAG_TRANSLIT 2 /* //TRANSLIT */ #define FLAG_IGNORE 4 /* //IGNORE */ #define FLAG_NO_BEST_FIT 8 /* //NOBESTFIT */ typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned int uint; /* typedef void* iconv_t; iconv_t iconv_open(const char *tocode, const char *fromcode); int iconv_close(iconv_t cd); size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); */ typedef struct compat_t compat_t; typedef struct csconv_t csconv_t; typedef struct rec_iconv_t rec_iconv_t; typedef iconv_t (*f_iconv_open)(const char *tocode, const char *fromcode); typedef int (*f_iconv_close)(iconv_t cd); typedef size_t (*f_iconv)(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); typedef int* (*f_errno)(void); typedef int (*f_mbtowc)(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); typedef int (*f_wctomb)(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); typedef int (*f_mblen)(csconv_t *cv, const uchar *buf, int bufsize); typedef int (*f_flush)(csconv_t *cv, uchar *buf, int bufsize); #define COMPAT_IN 1 #define COMPAT_OUT 2 /* unicode mapping for compatibility with other conversion table. */ struct compat_t { uint in; uint out; uint flag; }; struct csconv_t { int codepage; int flags; f_mbtowc mbtowc; f_wctomb wctomb; f_mblen mblen; f_flush flush; DWORD mode; compat_t *compat; }; struct rec_iconv_t { iconv_t cd; f_iconv_close iconv_close; f_iconv iconv; f_errno _errno; csconv_t from; csconv_t to; }; static int win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode); static int win_iconv_close(iconv_t cd); static size_t win_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); static int load_mlang(void); static int make_csconv(const char *name, csconv_t *cv); static int name_to_codepage(const char *name); static uint utf16_to_ucs4(const ushort *wbuf); static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize); static int mbtowc_flags(int codepage); static int must_use_null_useddefaultchar(int codepage); static char *strrstr(const char *str, const char *token); static int seterror(int err); static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize); static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize); static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize); static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize); static struct { int codepage; const char *name; } codepage_alias[] = { {65001, "CP65001"}, {65001, "UTF8"}, {65001, "UTF-8"}, {1200, "CP1200"}, {1200, "UTF16LE"}, {1200, "UTF-16LE"}, {1200, "UCS2LE"}, {1200, "UCS-2LE"}, {1201, "CP1201"}, {1201, "UTF16BE"}, {1201, "UTF-16BE"}, {1201, "UCS2BE"}, {1201, "UCS-2BE"}, {1201, "unicodeFFFE"}, {12000, "CP12000"}, {12000, "UTF32LE"}, {12000, "UTF-32LE"}, {12000, "UCS4LE"}, {12000, "UCS-4LE"}, {12001, "CP12001"}, {12001, "UTF32BE"}, {12001, "UTF-32BE"}, {12001, "UCS4BE"}, {12001, "UCS-4BE"}, #ifndef GLIB_COMPILATION /* * Default is big endian. * See rfc2781 4.3 Interpreting text labelled as UTF-16. */ {1201, "UTF16"}, {1201, "UTF-16"}, {1201, "UCS2"}, {1201, "UCS-2"}, {12001, "UTF32"}, {12001, "UTF-32"}, {12001, "UCS-4"}, {12001, "UCS4"}, #else /* Default is little endian, because the platform is */ {1200, "UTF16"}, {1200, "UTF-16"}, {1200, "UCS2"}, {1200, "UCS-2"}, {12000, "UTF32"}, {12000, "UTF-32"}, {12000, "UCS4"}, {12000, "UCS-4"}, #endif /* R additions */ {12000, "UCS-4LE"}, {12001, "UCS-4BE"}, /* copy from libiconv `iconv -l` */ /* !IsValidCodePage(367) */ {20127, "ANSI_X3.4-1968"}, {20127, "ANSI_X3.4-1986"}, {20127, "ASCII"}, {20127, "CP367"}, {20127, "IBM367"}, {20127, "ISO-IR-6"}, {20127, "ISO646-US"}, {20127, "ISO_646.IRV:1991"}, {20127, "US"}, {20127, "US-ASCII"}, {20127, "CSASCII"}, /* !IsValidCodePage(819) */ {1252, "CP819"}, {1252, "IBM819"}, {28591, "ISO-8859-1"}, {28591, "ISO-IR-100"}, {28591, "ISO8859-1"}, {28591, "ISO_8859-1"}, {28591, "ISO_8859-1:1987"}, {28591, "L1"}, {28591, "LATIN1"}, {28591, "CSISOLATIN1"}, {1250, "CP1250"}, {1250, "MS-EE"}, // {1250, "WINDOWS-1250"}, {1251, "CP1251"}, {1251, "MS-CYRL"}, // {1251, "WINDOWS-1251"}, {1252, "CP1252"}, {1252, "MS-ANSI"}, // {1252, "WINDOWS-1252"}, {1253, "CP1253"}, {1253, "MS-GREEK"}, // {1253, "WINDOWS-1253"}, {1254, "CP1254"}, {1254, "MS-TURK"}, // {1254, "WINDOWS-1254"}, {1255, "CP1255"}, {1255, "MS-HEBR"}, // {1255, "WINDOWS-1255"}, {1256, "CP1256"}, {1256, "MS-ARAB"}, // {1256, "WINDOWS-1256"}, {1257, "CP1257"}, {1257, "WINBALTRIM"}, // {1257, "WINDOWS-1257"}, {1258, "CP1258"}, // {1258, "WINDOWS-1258"}, {850, "850"}, {850, "CP850"}, {850, "IBM850"}, {850, "CSPC850MULTILINGUAL"}, /* !IsValidCodePage(862) */ {862, "862"}, {862, "CP862"}, {862, "IBM862"}, {862, "CSPC862LATINHEBREW"}, {866, "866"}, {866, "CP866"}, {866, "IBM866"}, {866, "CSIBM866"}, /* !IsValidCodePage(154) */ {154, "CP154"}, {154, "CYRILLIC-ASIAN"}, {154, "PT154"}, {154, "PTCP154"}, {154, "CSPTCP154"}, /* !IsValidCodePage(1133) */ {1133, "CP1133"}, {1133, "IBM-CP1133"}, {874, "CP874"}, // {874, "WINDOWS-874"}, /* !IsValidCodePage(51932) */ {51932, "CP51932"}, {51932, "MS51932"}, {51932, "WINDOWS-51932"}, // {51932, "EUC-JP"}, /* duplicate */ {932, "CP932"}, {932, "MS932"}, {932, "SHIFFT_JIS"}, {932, "SHIFFT_JIS-MS"}, {932, "SJIS"}, {932, "SJIS-MS"}, {932, "SJIS-OPEN"}, {932, "SJIS-WIN"}, {932, "WINDOWS-31J"}, {932, "WINDOWS-932"}, {932, "CSWINDOWS31J"}, {50221, "CP50221"}, {50221, "ISO-2022-JP"}, {50221, "ISO-2022-JP-MS"}, {50221, "ISO2022-JP"}, {50221, "ISO2022-JP-MS"}, {50221, "MS50221"}, {50221, "WINDOWS-50221"}, {936, "CP936"}, {936, "GBK"}, {936, "MS936"}, {936, "WINDOWS-936"}, {950, "CP950"}, {950, "BIG5"}, {950, "BIG5HKSCS"}, {950, "BIG5-HKSCS"}, /* R additions */ {950, "BIG-5"}, {950, "BIG-FIVE"}, {949, "CP949"}, {949, "UHC"}, {949, "EUC-KR"}, {1361, "CP1361"}, {1361, "JOHAB"}, {437, "437"}, {437, "CP437"}, {437, "IBM437"}, {437, "CSPC8CODEPAGE437"}, {737, "CP737"}, {775, "CP775"}, {775, "IBM775"}, {775, "CSPC775BALTIC"}, {852, "852"}, {852, "CP852"}, {852, "IBM852"}, {852, "CSPCP852"}, /* !IsValidCodePage(853) */ {853, "CP853"}, {855, "855"}, {855, "CP855"}, {855, "IBM855"}, {855, "CSIBM855"}, {857, "857"}, {857, "CP857"}, {857, "IBM857"}, {857, "CSIBM857"}, /* !IsValidCodePage(858) */ {858, "CP858"}, {860, "860"}, {860, "CP860"}, {860, "IBM860"}, {860, "CSIBM860"}, {861, "861"}, {861, "CP-IS"}, {861, "CP861"}, {861, "IBM861"}, {861, "CSIBM861"}, {863, "863"}, {863, "CP863"}, {863, "IBM863"}, {863, "CSIBM863"}, {864, "CP864"}, {864, "IBM864"}, {864, "CSIBM864"}, {865, "865"}, {865, "CP865"}, {865, "IBM865"}, {865, "CSIBM865"}, {869, "869"}, {869, "CP-GR"}, {869, "CP869"}, {869, "IBM869"}, {869, "CSIBM869"}, /* !IsValidCodePage(1152) */ {1125, "CP1125"}, /* * Code Page Identifiers * http://msdn2.microsoft.com/en-us/library/ms776446.aspx */ {37, "IBM037"}, /* IBM EBCDIC US-Canada */ {437, "IBM437"}, /* OEM United States */ {500, "IBM500"}, /* IBM EBCDIC International */ {708, "ASMO-708"}, /* Arabic (ASMO 708) */ /* 709 Arabic (ASMO-449+, BCON V4) */ /* 710 Arabic - Transparent Arabic */ {720, "DOS-720"}, /* Arabic (Transparent ASMO); Arabic (DOS) */ {737, "ibm737"}, /* OEM Greek (formerly 437G); Greek (DOS) */ {775, "ibm775"}, /* OEM Baltic; Baltic (DOS) */ {850, "ibm850"}, /* OEM Multilingual Latin 1; Western European (DOS) */ {852, "ibm852"}, /* OEM Latin 2; Central European (DOS) */ {855, "IBM855"}, /* OEM Cyrillic (primarily Russian) */ {857, "ibm857"}, /* OEM Turkish; Turkish (DOS) */ {858, "IBM00858"}, /* OEM Multilingual Latin 1 + Euro symbol */ {860, "IBM860"}, /* OEM Portuguese; Portuguese (DOS) */ {861, "ibm861"}, /* OEM Icelandic; Icelandic (DOS) */ {862, "DOS-862"}, /* OEM Hebrew; Hebrew (DOS) */ {863, "IBM863"}, /* OEM French Canadian; French Canadian (DOS) */ {864, "IBM864"}, /* OEM Arabic; Arabic (864) */ {865, "IBM865"}, /* OEM Nordic; Nordic (DOS) */ {866, "cp866"}, /* OEM Russian; Cyrillic (DOS) */ {869, "ibm869"}, /* OEM Modern Greek; Greek, Modern (DOS) */ {870, "IBM870"}, /* IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2 */ {874, "windows-874"}, /* ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows) */ {875, "cp875"}, /* IBM EBCDIC Greek Modern */ {932, "shift_jis"}, /* ANSI/OEM Japanese; Japanese (Shift-JIS) */ {932, "shift-jis"}, /* alternative name for it */ {936, "gb2312"}, /* ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) */ {949, "ks_c_5601-1987"}, /* ANSI/OEM Korean (Unified Hangul Code) */ {950, "big5"}, /* ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) */ {950, "big5hkscs"}, /* ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS) */ {950, "big5-hkscs"}, /* alternative name for it */ {1026, "IBM1026"}, /* IBM EBCDIC Turkish (Latin 5) */ {1047, "IBM01047"}, /* IBM EBCDIC Latin 1/Open System */ {1140, "IBM01140"}, /* IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro) */ {1141, "IBM01141"}, /* IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro) */ {1142, "IBM01142"}, /* IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro) */ {1143, "IBM01143"}, /* IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro) */ {1144, "IBM01144"}, /* IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro) */ {1145, "IBM01145"}, /* IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro) */ {1146, "IBM01146"}, /* IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro) */ {1147, "IBM01147"}, /* IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro) */ {1148, "IBM01148"}, /* IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro) */ {1149, "IBM01149"}, /* IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro) */ {1250, "windows-1250"}, /* ANSI Central European; Central European (Windows) */ {1251, "windows-1251"}, /* ANSI Cyrillic; Cyrillic (Windows) */ {1252, "windows-1252"}, /* ANSI Latin 1; Western European (Windows) */ {1253, "windows-1253"}, /* ANSI Greek; Greek (Windows) */ {1254, "windows-1254"}, /* ANSI Turkish; Turkish (Windows) */ {1255, "windows-1255"}, /* ANSI Hebrew; Hebrew (Windows) */ {1256, "windows-1256"}, /* ANSI Arabic; Arabic (Windows) */ {1257, "windows-1257"}, /* ANSI Baltic; Baltic (Windows) */ {1258, "windows-1258"}, /* ANSI/OEM Vietnamese; Vietnamese (Windows) */ {1361, "Johab"}, /* Korean (Johab) */ {10000, "macintosh"}, /* MAC Roman; Western European (Mac) */ /* R additions to match GNU libiconv and glibc */ {10000, "mac"}, {10000, "macroman"}, {10001, "x-mac-japanese"}, /* Japanese (Mac) */ {10002, "x-mac-chinesetrad"}, /* MAC Traditional Chinese (Big5); Chinese Traditional (Mac) */ {10003, "x-mac-korean"}, /* Korean (Mac) */ {10004, "x-mac-arabic"}, /* Arabic (Mac) */ {10004, "macarabic"}, /* Arabic (Mac), no longer in the master */ {10005, "x-mac-hebrew"}, /* Hebrew (Mac) */ {10005, "machebrew"}, {10006, "x-mac-greek"}, /* Greek (Mac) */ {10006, "macgreek"}, {10007, "x-mac-cyrillic"}, /* Cyrillic (Mac) */ {10007, "maccyrillic"}, {10008, "x-mac-chinesesimp"}, /* MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac) */ {10010, "x-mac-romanian"}, /* Romanian (Mac) */ {10010, "macromania"}, {10017, "x-mac-ukrainian"}, /* Ukrainian (Mac) */ {10017, "macukraine"}, {10017, "macukrainian"}, {10021, "x-mac-thai"}, /* Thai (Mac) */ {10021, "macthai"}, {10029, "x-mac-ce"}, /* MAC Latin 2; Central European (Mac) */ {10029, "maccentraleurope"}, {10029, "mac-centraleurope"}, {10079, "x-mac-icelandic"}, /* Icelandic (Mac) */ {10079, "maciceland"}, {10079, "mac-is"}, {10079, "macis"}, {10081, "x-mac-turkish"}, /* Turkish (Mac) */ {10081, "macturkish"}, {10082, "x-mac-croatian"}, /* Croatian (Mac) */ {10082, "maccroatian"}, {20000, "x-Chinese_CNS"}, /* CNS Taiwan; Chinese Traditional (CNS) */ {20001, "x-cp20001"}, /* TCA Taiwan */ {20002, "x_Chinese-Eten"}, /* Eten Taiwan; Chinese Traditional (Eten) */ {20003, "x-cp20003"}, /* IBM5550 Taiwan */ {20004, "x-cp20004"}, /* TeleText Taiwan */ {20005, "x-cp20005"}, /* Wang Taiwan */ {20105, "x-IA5"}, /* IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5) */ {20106, "x-IA5-German"}, /* IA5 German (7-bit) */ {20107, "x-IA5-Swedish"}, /* IA5 Swedish (7-bit) */ {20108, "x-IA5-Norwegian"}, /* IA5 Norwegian (7-bit) */ // {20127, "us-ascii"}, /* US-ASCII (7-bit), duplicate */ {20261, "x-cp20261"}, /* T.61 */ {20269, "x-cp20269"}, /* ISO 6937 Non-Spacing Accent */ {20273, "IBM273"}, /* IBM EBCDIC Germany */ {20277, "IBM277"}, /* IBM EBCDIC Denmark-Norway */ {20278, "IBM278"}, /* IBM EBCDIC Finland-Sweden */ {20280, "IBM280"}, /* IBM EBCDIC Italy */ {20284, "IBM284"}, /* IBM EBCDIC Latin America-Spain */ {20285, "IBM285"}, /* IBM EBCDIC United Kingdom */ {20290, "IBM290"}, /* IBM EBCDIC Japanese Katakana Extended */ {20297, "IBM297"}, /* IBM EBCDIC France */ {20420, "IBM420"}, /* IBM EBCDIC Arabic */ {20423, "IBM423"}, /* IBM EBCDIC Greek */ {20424, "IBM424"}, /* IBM EBCDIC Hebrew */ {20833, "x-EBCDIC-KoreanExtended"}, /* IBM EBCDIC Korean Extended */ {20838, "IBM-Thai"}, /* IBM EBCDIC Thai */ {20866, "koi8-r"}, /* Russian (KOI8-R); Cyrillic (KOI8-R) */ {20871, "IBM871"}, /* IBM EBCDIC Icelandic */ {20880, "IBM880"}, /* IBM EBCDIC Cyrillic Russian */ {20905, "IBM905"}, /* IBM EBCDIC Turkish */ {20924, "IBM00924"}, /* IBM EBCDIC Latin 1/Open System (1047 + Euro symbol) */ //{20932, "EUC-JP"}, /* Japanese (JIS 0208-1990 and 0121-1990) */ {20936, "x-cp20936"}, /* Simplified Chinese (GB2312); Chinese Simplified (GB2312-80) */ {20949, "x-cp20949"}, /* Korean Wansung */ {21025, "cp1025"}, /* IBM EBCDIC Cyrillic Serbian-Bulgarian */ /* 21027 (deprecated) */ {21866, "koi8-u"}, /* Ukrainian (KOI8-U); Cyrillic (KOI8-U) */ //{28591, "iso-8859-1"}, /* ISO 8859-1 Latin 1; Western European (ISO) */ {28591, "iso8859-1"}, {28591, "iso_8859-1"}, {28591, "iso_8859_1"}, {28592, "iso-8859-2"}, /* ISO 8859-2 Central European; Central European (ISO) */ {28592, "iso8859-2"}, {28592, "iso_8859-2"}, {28592, "iso_8859_2"}, /* latin2 etc are R additions */ {28592, "latin2"}, {28593, "iso-8859-3"}, /* ISO 8859-3 Latin 3 */ {28593, "iso8859-3"}, /* ISO 8859-3 Latin 3 */ {28593, "iso_8859-3"}, {28593, "iso_8859_3"}, {28593, "latin3"}, {28594, "iso-8859-4"}, /* ISO 8859-4 Baltic */ {28594, "iso8859-4"}, /* ISO 8859-4 Baltic */ {28594, "iso_8859-4"}, {28594, "iso_8859_4"}, {28594, "latin4"}, {28595, "iso-8859-5"}, /* ISO 8859-5 Cyrillic */ {28595, "iso8859-5"}, /* ISO 8859-5 Cyrillic */ {28595, "iso_8859-5"}, {28595, "iso_8859_5"}, {28596, "iso-8859-6"}, /* ISO 8859-6 Arabic */ {28596, "iso8859-6"}, /* ISO 8859-6 Arabic */ {28596, "iso_8859-6"}, {28596, "iso_8859_6"}, {28597, "iso-8859-7"}, /* ISO 8859-7 Greek */ {28597, "iso8859-7"}, /* ISO 8859-7 Greek */ {28597, "iso_8859-7"}, {28597, "iso_8859_7"}, {28598, "iso-8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ {28598, "iso8859-8"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Visual) */ {28598, "iso_8859-8"}, {28598, "iso_8859_8"}, {28599, "iso-8859-9"}, /* ISO 8859-9 Turkish */ {28599, "iso8859-9"}, /* ISO 8859-9 Turkish */ {28599, "iso_8859-9"}, {28599, "iso_8859_9"}, {28599, "latin5"}, {28603, "iso-8859-13"}, /* ISO 8859-13 Estonian */ {28603, "iso8859-13"}, /* ISO 8859-13 Estonian */ {28603, "iso_8859-13"}, {28603, "iso_8859_13"}, {28603, "latin7"}, {28605, "iso-8859-15"}, /* ISO 8859-15 Latin 9 */ {28605, "iso8859-15"}, /* ISO 8859-15 Latin 9 */ {28605, "iso_8859-15"}, {28605, "iso_8859_15"}, {28605, "latin9"}, {28605, "latin-9"}, {29001, "x-Europa"}, /* Europa 3 */ {38598, "iso-8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ {38598, "iso8859-8-i"}, /* ISO 8859-8 Hebrew; Hebrew (ISO-Logical) */ {38598, "iso_8859-8-i"}, {38598, "iso_8859_8-i"}, {50220, "iso-2022-jp"}, /* ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) */ {50221, "csISO2022JP"}, /* ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana) */ {50222, "iso-2022-jp"}, /* ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI) */ {50225, "iso-2022-kr"}, /* ISO 2022 Korean */ {50225, "iso2022-kr"}, /* ISO 2022 Korean */ {50227, "x-cp50227"}, /* ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022) */ /* 50229 ISO 2022 Traditional Chinese */ /* 50930 EBCDIC Japanese (Katakana) Extended */ /* 50931 EBCDIC US-Canada and Japanese */ /* 50933 EBCDIC Korean Extended and Korean */ /* 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese */ /* 50936 EBCDIC Simplified Chinese */ /* 50937 EBCDIC US-Canada and Traditional Chinese */ /* 50939 EBCDIC Japanese (Latin) Extended and Japanese */ {51932, "euc-jp"}, /* EUC Japanese */ {51936, "EUC-CN"}, /* EUC Simplified Chinese; Chinese Simplified (EUC) */ {51949, "euc-kr"}, /* EUC Korean */ // R additions {51932, "eucjp"}, {51936, "EUCCN"}, {51949, "euckr"}, /* 51950 EUC Traditional Chinese */ {52936, "hz-gb-2312"}, /* HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ) */ {54936, "GB18030"}, /* Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030) */ {57002, "x-iscii-de"}, /* ISCII Devanagari */ {57003, "x-iscii-be"}, /* ISCII Bengali */ {57004, "x-iscii-ta"}, /* ISCII Tamil */ {57005, "x-iscii-te"}, /* ISCII Telugu */ {57006, "x-iscii-as"}, /* ISCII Assamese */ {57007, "x-iscii-or"}, /* ISCII Oriya */ {57008, "x-iscii-ka"}, /* ISCII Kannada */ {57009, "x-iscii-ma"}, /* ISCII Malayalam */ {57010, "x-iscii-gu"}, /* ISCII Gujarati */ {57011, "x-iscii-pa"}, /* ISCII Punjabi */ {0, NULL} }; /* * SJIS SHIFTJIS table CP932 table * ---- --------------------------- -------------------------------- * 5C U+00A5 YEN SIGN U+005C REVERSE SOLIDUS * 7E U+203E OVERLINE U+007E TILDE * 815C U+2014 EM DASH U+2015 HORIZONTAL BAR * 815F U+005C REVERSE SOLIDUS U+FF3C FULLWIDTH REVERSE SOLIDUS * 8160 U+301C WAVE DASH U+FF5E FULLWIDTH TILDE * 8161 U+2016 DOUBLE VERTICAL LINE U+2225 PARALLEL TO * 817C U+2212 MINUS SIGN U+FF0D FULLWIDTH HYPHEN-MINUS * 8191 U+00A2 CENT SIGN U+FFE0 FULLWIDTH CENT SIGN * 8192 U+00A3 POUND SIGN U+FFE1 FULLWIDTH POUND SIGN * 81CA U+00AC NOT SIGN U+FFE2 FULLWIDTH NOT SIGN * * EUC-JP and ISO-2022-JP should be compatible with CP932. * * Kernel and MLang have different Unicode mapping table. Make sure * which API is used. */ static compat_t cp932_compat[] = { {0x00A5, 0x005C, COMPAT_OUT}, {0x203E, 0x007E, COMPAT_OUT}, {0x2014, 0x2015, COMPAT_OUT}, {0x301C, 0xFF5E, COMPAT_OUT}, {0x2016, 0x2225, COMPAT_OUT}, {0x2212, 0xFF0D, COMPAT_OUT}, {0x00A2, 0xFFE0, COMPAT_OUT}, {0x00A3, 0xFFE1, COMPAT_OUT}, {0x00AC, 0xFFE2, COMPAT_OUT}, {0, 0, 0} }; static compat_t cp20932_compat[] = { {0x00A5, 0x005C, COMPAT_OUT}, {0x203E, 0x007E, COMPAT_OUT}, {0x2014, 0x2015, COMPAT_OUT}, {0xFF5E, 0x301C, COMPAT_OUT|COMPAT_IN}, {0x2225, 0x2016, COMPAT_OUT|COMPAT_IN}, {0xFF0D, 0x2212, COMPAT_OUT|COMPAT_IN}, {0xFFE0, 0x00A2, COMPAT_OUT|COMPAT_IN}, {0xFFE1, 0x00A3, COMPAT_OUT|COMPAT_IN}, {0xFFE2, 0x00AC, COMPAT_OUT|COMPAT_IN}, {0, 0, 0} }; static compat_t *cp51932_compat = cp932_compat; /* cp20932_compat for kernel. cp932_compat for mlang. */ static compat_t *cp5022x_compat = cp932_compat; typedef HRESULT (WINAPI *CONVERTINETSTRING)( LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPBYTE lpDstStr, LPINT lpnDstSize ); typedef HRESULT (WINAPI *CONVERTINETMULTIBYTETOUNICODE)( LPDWORD lpdwMode, DWORD dwSrcEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount ); typedef HRESULT (WINAPI *CONVERTINETUNICODETOMULTIBYTE)( LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount ); typedef HRESULT (WINAPI *ISCONVERTINETSTRINGAVAILABLE)( DWORD dwSrcEncoding, DWORD dwDstEncoding ); typedef HRESULT (WINAPI *LCIDTORFC1766A)( LCID Locale, LPSTR pszRfc1766, int nChar ); typedef HRESULT (WINAPI *LCIDTORFC1766W)( LCID Locale, LPWSTR pszRfc1766, int nChar ); typedef HRESULT (WINAPI *RFC1766TOLCIDA)( LCID *pLocale, LPSTR pszRfc1766 ); typedef HRESULT (WINAPI *RFC1766TOLCIDW)( LCID *pLocale, LPWSTR pszRfc1766 ); static CONVERTINETSTRING ConvertINetString; static CONVERTINETMULTIBYTETOUNICODE ConvertINetMultiByteToUnicode; static CONVERTINETUNICODETOMULTIBYTE ConvertINetUnicodeToMultiByte; static ISCONVERTINETSTRINGAVAILABLE IsConvertINetStringAvailable; static LCIDTORFC1766A LcidToRfc1766A; static RFC1766TOLCIDA Rfc1766ToLcidA; static int load_mlang(void) { HMODULE h; if (ConvertINetString != NULL) return TRUE; h = LoadLibrary(TEXT("mlang.dll")); if (!h) return FALSE; ConvertINetString = (CONVERTINETSTRING)GetProcAddress(h, TEXT("ConvertINetString")); ConvertINetMultiByteToUnicode = (CONVERTINETMULTIBYTETOUNICODE)GetProcAddress(h, TEXT("ConvertINetMultiByteToUnicode")); ConvertINetUnicodeToMultiByte = (CONVERTINETUNICODETOMULTIBYTE)GetProcAddress(h, TEXT("ConvertINetUnicodeToMultiByte")); IsConvertINetStringAvailable = (ISCONVERTINETSTRINGAVAILABLE)GetProcAddress(h, TEXT("IsConvertINetStringAvailable")); LcidToRfc1766A = (LCIDTORFC1766A)GetProcAddress(h, TEXT("LcidToRfc1766A")); Rfc1766ToLcidA = (RFC1766TOLCIDA)GetProcAddress(h, TEXT("Rfc1766ToLcidA")); return TRUE; } iconv_t iconv_open(const char *tocode, const char *fromcode) { rec_iconv_t *cd; cd = (rec_iconv_t *)calloc(1, sizeof(rec_iconv_t)); if (cd == NULL) { // Setting errno is R addition errno = ENOMEM; return (iconv_t)(-1); } /* reset the errno to prevent reporting wrong error code. * 0 for unsorted error. */ errno = 0; if (win_iconv_open(cd, tocode, fromcode)) { R_WIN_ICONV_best_fit = 1; /* legacy default */ char *valstr = getenv("R_WIN_ICONV_BEST_FIT"); if (valstr) { if (!stricmp(valstr, "true") || !stricmp(valstr, "yes") || !strcmp(valstr, "1")) R_WIN_ICONV_best_fit = 1; else if (!stricmp(valstr, "false") || !stricmp(valstr, "no") || !strcmp(valstr, "0")) R_WIN_ICONV_best_fit = 0; } return (iconv_t)cd; } free(cd); // setting errno is R addition errno = EINVAL; return (iconv_t)(-1); } int iconv_close(iconv_t _cd) { rec_iconv_t *cd = (rec_iconv_t *)_cd; int r = cd->iconv_close(cd->cd); int e = *(cd->_errno()); free(cd); errno = e; return r; } size_t iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { rec_iconv_t *cd = (rec_iconv_t *)_cd; size_t r = cd->iconv(cd->cd, inbuf, inbytesleft, outbuf, outbytesleft); errno = *(cd->_errno()); return r; } static int win_iconv_open(rec_iconv_t *cd, const char *tocode, const char *fromcode) { if (!make_csconv(fromcode, &cd->from) || !make_csconv(tocode, &cd->to)) return FALSE; cd->iconv_close = win_iconv_close; cd->iconv = win_iconv; cd->_errno = _errno; cd->cd = (iconv_t)cd; return TRUE; } static int win_iconv_close(iconv_t cd) { return 0; } static size_t win_iconv(iconv_t _cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { rec_iconv_t *cd = (rec_iconv_t *)_cd; ushort wbuf[MB_CHAR_MAX]; /* enough room for one character */ int insize; int outsize; int wsize; DWORD frommode; DWORD tomode; uint wc; compat_t *cp; int i; if (inbuf == NULL || *inbuf == NULL) { if (outbuf != NULL && *outbuf != NULL && cd->to.flush != NULL) { tomode = cd->to.mode; outsize = cd->to.flush(&cd->to, (uchar *)*outbuf, *outbytesleft); if (outsize == -1) { if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { outsize = 0; } else { cd->to.mode = tomode; return (size_t)(-1); } } *outbuf += outsize; *outbytesleft -= outsize; } cd->from.mode = 0; cd->to.mode = 0; return 0; } while (*inbytesleft != 0) { frommode = cd->from.mode; tomode = cd->to.mode; wsize = MB_CHAR_MAX; insize = cd->from.mbtowc(&cd->from, (const uchar *)*inbuf, *inbytesleft, wbuf, &wsize); if (insize == -1) { if (cd->to.flags & FLAG_IGNORE) { cd->from.mode = frommode; insize = 1; wsize = 0; } else { cd->from.mode = frommode; return (size_t)(-1); } } if (wsize == 0) { *inbuf += insize; *inbytesleft -= insize; continue; } if (cd->from.compat != NULL) { wc = utf16_to_ucs4(wbuf); cp = cd->from.compat; for (i = 0; cp[i].in != 0; ++i) { if ((cp[i].flag & COMPAT_IN) && cp[i].out == wc) { ucs4_to_utf16(cp[i].in, wbuf, &wsize); break; } } } if (cd->to.compat != NULL) { wc = utf16_to_ucs4(wbuf); cp = cd->to.compat; for (i = 0; cp[i].in != 0; ++i) { if ((cp[i].flag & COMPAT_OUT) && cp[i].in == wc) { ucs4_to_utf16(cp[i].out, wbuf, &wsize); break; } } } outsize = cd->to.wctomb(&cd->to, wbuf, wsize, (uchar *)*outbuf, *outbytesleft); if (outsize == -1) { if ((cd->to.flags & FLAG_IGNORE) && errno != E2BIG) { cd->to.mode = tomode; outsize = 0; } else { cd->from.mode = frommode; cd->to.mode = tomode; return (size_t)(-1); } } *inbuf += insize; *outbuf += outsize; *inbytesleft -= insize; *outbytesleft -= outsize; } return 0; } static int make_csconv(const char *_name, csconv_t *cv) { CPINFO cpinfo; int use_compat = TRUE; int flag = 0; char *name; char *p; // original uses local strndup here, but copies all the string. // encoding names are never very long, so could use a stack buffer name = strdup(_name); if (name == NULL) return FALSE; /* check for option "enc_name//opt1//opt2" */ while ((p = strrstr(name, "//")) != NULL) { if (_stricmp(p + 2, "nocompat") == 0) use_compat = FALSE; else if (_stricmp(p + 2, "translit") == 0) flag |= FLAG_TRANSLIT; else if (_stricmp(p + 2, "ignore") == 0) flag |= FLAG_IGNORE; else if (_stricmp(p + 2, "nobestfit") == 0) /* R addition for internal use, may be removed without notice */ flag |= FLAG_NO_BEST_FIT; *p = 0; } cv->mode = 0; cv->flags = flag; cv->mblen = NULL; cv->flush = NULL; cv->compat = NULL; cv->codepage = name_to_codepage(name); if (cv->codepage == 1200 || cv->codepage == 1201) { cv->mbtowc = utf16_mbtowc; cv->wctomb = utf16_wctomb; if (_stricmp(name, "UTF-16") == 0 || _stricmp(name, "UTF16") == 0 || _stricmp(name, "UCS-2") == 0 || _stricmp(name, "UCS2") == 0) cv->flags |= FLAG_USE_BOM; } else if (cv->codepage == 12000 || cv->codepage == 12001) { cv->mbtowc = utf32_mbtowc; cv->wctomb = utf32_wctomb; if (_stricmp(name, "UTF-32") == 0 || _stricmp(name, "UTF32") == 0 || _stricmp(name, "UCS-4") == 0 || _stricmp(name, "UCS4") == 0) cv->flags |= FLAG_USE_BOM; } else if (cv->codepage == 65001) { cv->mbtowc = kernel_mbtowc; cv->wctomb = kernel_wctomb; cv->mblen = utf8_mblen; } else if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && load_mlang()) { cv->mbtowc = iso2022jp_mbtowc; cv->wctomb = iso2022jp_wctomb; cv->flush = iso2022jp_flush; } else if (cv->codepage == 51932 && load_mlang()) { cv->mbtowc = mlang_mbtowc; cv->wctomb = mlang_wctomb; cv->mblen = eucjp_mblen; } else if (IsValidCodePage(cv->codepage) && GetCPInfo(cv->codepage, &cpinfo) != 0) { cv->mbtowc = kernel_mbtowc; cv->wctomb = kernel_wctomb; if (cpinfo.MaxCharSize == 1) cv->mblen = sbcs_mblen; else if (cpinfo.MaxCharSize == 2) cv->mblen = dbcs_mblen; else cv->mblen = mbcs_mblen; } else { /* not supported */ free(name); errno = EINVAL; return FALSE; } if (use_compat) { switch (cv->codepage) { case 932: cv->compat = cp932_compat; break; case 20932: cv->compat = cp20932_compat; break; case 51932: cv->compat = cp51932_compat; break; case 50220: case 50221: case 50222: cv->compat = cp5022x_compat; break; } } free(name); return TRUE; } static int name_to_codepage(const char *name) { int i; if (*name == '\0' || strcmp(name, "char") == 0) return GetACP(); else if (strcmp(name, "wchar_t") == 0) return 1200; else if (_strnicmp(name, "cp", 2) == 0) return atoi(name + 2); /* CP123 */ else if ('0' <= name[0] && name[0] <= '9') return atoi(name); /* 123 */ else if (_strnicmp(name, "xx", 2) == 0) return atoi(name + 2); /* XX123 for debug */ for (i = 0; codepage_alias[i].name != NULL; ++i) if (_stricmp(name, codepage_alias[i].name) == 0) return codepage_alias[i].codepage; return -1; } /* * http://www.faqs.org/rfcs/rfc2781.html */ static uint utf16_to_ucs4(const ushort *wbuf) { uint wc = wbuf[0]; if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) wc = ((wbuf[0] & 0x3FF) << 10) + (wbuf[1] & 0x3FF) + 0x10000; return wc; } static void ucs4_to_utf16(uint wc, ushort *wbuf, int *wbufsize) { if (wc < 0x10000) { wbuf[0] = wc; *wbufsize = 1; } else { wc -= 0x10000; wbuf[0] = 0xD800 | ((wc >> 10) & 0x3FF); wbuf[1] = 0xDC00 | (wc & 0x3FF); *wbufsize = 2; } } /* * Check if codepage is one of those for which the dwFlags parameter * to MultiByteToWideChar() must be zero. Return zero or * MB_ERR_INVALID_CHARS. The docs in Platform SDK for for Windows * Server 2003 R2 claims that also codepage 65001 is one of these, but * that doesn't seem to be the case. The MSDN docs for MSVS2008 leave * out 65001 (UTF-8), and that indeed seems to be the case on XP, it * works fine to pass MB_ERR_INVALID_CHARS in dwFlags when converting * from UTF-8. */ static int mbtowc_flags(int codepage) { return (codepage == 50220 || codepage == 50221 || codepage == 50222 || codepage == 50225 || codepage == 50227 || codepage == 50229 || codepage == 52936 || codepage == 54936 || (codepage >= 57002 && codepage <= 57011) || codepage == 65000 || codepage == 42) ? 0 : MB_ERR_INVALID_CHARS; } /* * Check if codepage is one those for which the lpUsedDefaultChar * parameter to WideCharToMultiByte() must be NULL. The docs in * Platform SDK for Windows Server 2003 R2 claims that this is the * list below, while the MSDN docs for MSVS2008 claim that it is only * for 65000 (UTF-7) and 65001 (UTF-8). This time the earlier Platform * SDK seems to be correct, at least for XP. */ static int must_use_null_useddefaultchar(int codepage) { return (codepage == 65000 || codepage == 65001 || codepage == 50220 || codepage == 50221 || codepage == 50222 || codepage == 50225 || codepage == 50227 || codepage == 50229 || codepage == 52936 || codepage == 54936 || (codepage >= 57002 && codepage <= 57011) || codepage == 42); } static int seterror(int err) { errno = err; return -1; } static char * strrstr(const char *str, const char *token) { int len = strlen(token); const char *p = str + strlen(str); while (str <= --p) if (p[0] == token[0] && strncmp(p, token, len) == 0) return (char *)p; return NULL; } static int sbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) { return 1; } static int dbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) { int len = IsDBCSLeadByteEx(cv->codepage, buf[0]) ? 2 : 1; if (bufsize < len) return seterror(EINVAL); return len; } static int mbcs_mblen(csconv_t *cv, const uchar *buf, int bufsize) { int len = 0; if (cv->codepage == 54936) { if (buf[0] <= 0x7F) len = 1; else if (buf[0] >= 0x81 && buf[0] <= 0xFE && bufsize >= 2 && ((buf[1] >= 0x40 && buf[1] <= 0x7E) || (buf[1] >= 0x80 && buf[1] <= 0xFE))) len = 2; else if (buf[0] >= 0x81 && buf[0] <= 0xFE && bufsize >= 4 && buf[1] >= 0x30 && buf[1] <= 0x39) len = 4; else return seterror(EINVAL); return len; } else return seterror(EINVAL); } static int utf8_mblen(csconv_t *cv, const uchar *buf, int bufsize) { int len = 0; if (buf[0] < 0x80) len = 1; else if ((buf[0] & 0xE0) == 0xC0) len = 2; else if ((buf[0] & 0xF0) == 0xE0) len = 3; else if ((buf[0] & 0xF8) == 0xF0) len = 4; else if ((buf[0] & 0xFC) == 0xF8) len = 5; else if ((buf[0] & 0xFE) == 0xFC) len = 6; if (len == 0) return seterror(EILSEQ); else if (bufsize < len) return seterror(EINVAL); return len; } static int eucjp_mblen(csconv_t *cv, const uchar *buf, int bufsize) { if (buf[0] < 0x80) /* ASCII */ return 1; else if (buf[0] == 0x8E) /* JIS X 0201 */ { if (bufsize < 2) return seterror(EINVAL); else if (!(0xA1 <= buf[1] && buf[1] <= 0xDF)) return seterror(EILSEQ); return 2; } else if (buf[0] == 0x8F) /* JIS X 0212 */ { if (bufsize < 3) return seterror(EINVAL); else if (!(0xA1 <= buf[1] && buf[1] <= 0xFE) || !(0xA1 <= buf[2] && buf[2] <= 0xFE)) return seterror(EILSEQ); return 3; } else /* JIS X 0208 */ { if (bufsize < 2) return seterror(EINVAL); else if (!(0xA1 <= buf[0] && buf[0] <= 0xFE) || !(0xA1 <= buf[1] && buf[1] <= 0xFE)) return seterror(EILSEQ); return 2; } } static int kernel_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) { int len; len = cv->mblen(cv, buf, bufsize); if (len == -1) return -1; *wbufsize = MultiByteToWideChar(cv->codepage, mbtowc_flags (cv->codepage), (const char *)buf, len, (wchar_t *)wbuf, *wbufsize); if (*wbufsize == 0) return seterror(EILSEQ); return len; } static int kernel_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) { BOOL usedDefaultChar = 0; BOOL *p = NULL; int flags = 0; int len; if (bufsize == 0) return seterror(E2BIG); if (!must_use_null_useddefaultchar(cv->codepage)) { p = &usedDefaultChar; #ifdef WC_NO_BEST_FIT_CHARS /* http://msdn.microsoft.com/en-us/library/dd374130%28VS.85%29.aspx says this cannot be used for 65001 and 54936, but it also says 'for Vista only', and 65001 fails on XP. We definitely want this for ASCII, which is 20127. The current (05/12/2018) version of https://docs.microsoft.com/en-us/windows/desktop/api/stringapiset/nf-stringapiset-widechartomultibyte claims that WC_NO_BEST_FIT_CHARS can be used for all code pages for which the current version of must_use_null_useddefaultchar is FALSE (it is TRUE also for 65001), and that the API is supported since Windows 2000. */ if ( !(cv->flags & FLAG_TRANSLIT) && (cv->codepage == 20127 || R_WIN_ICONV_best_fit == 0 || (cv->flags & FLAG_NO_BEST_FIT)) ) flags |= WC_NO_BEST_FIT_CHARS; #endif } len = WideCharToMultiByte(cv->codepage, flags, (const wchar_t *)wbuf, wbufsize, (char *)buf, bufsize, NULL, p); if (len == 0) { if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) return seterror(E2BIG); return seterror(EILSEQ); } else if (usedDefaultChar && !(cv->flags & FLAG_TRANSLIT)) return seterror(EILSEQ); else if (cv->mblen(cv, buf, len) != len) /* validate result */ return seterror(EILSEQ); return len; } /* * It seems that the mode (cv->mode) is fixnum. * For example, when converting iso-2022-jp(cp50221) to unicode: * in ascii sequence: mode=0xC42C0000 * in jisx0208 sequence: mode=0xC42C0001 * "C42C" is same for each convert session. * It should be: ((codepage-1)<<16)|state */ static int mlang_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) { int len; int insize; HRESULT hr; len = cv->mblen(cv, buf, bufsize); if (len == -1) return -1; insize = len; hr = ConvertINetMultiByteToUnicode(&cv->mode, cv->codepage, (const char *)buf, &insize, (wchar_t *)wbuf, wbufsize); if (hr != S_OK || insize != len) return seterror(EILSEQ); return len; } static int mlang_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) { char tmpbuf[MB_CHAR_MAX]; /* enough room for one character */ int tmpsize = MB_CHAR_MAX; int insize = wbufsize; HRESULT hr; hr = ConvertINetUnicodeToMultiByte(&cv->mode, cv->codepage, (const wchar_t *)wbuf, &wbufsize, tmpbuf, &tmpsize); if (hr != S_OK || insize != wbufsize) return seterror(EILSEQ); else if (bufsize < tmpsize) return seterror(E2BIG); else if (cv->mblen(cv, (uchar *)tmpbuf, tmpsize) != tmpsize) return seterror(EILSEQ); memcpy(buf, tmpbuf, tmpsize); return tmpsize; } static int utf16_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) { int codepage = cv->codepage; /* swap endian: 1200 <-> 1201 */ if (cv->mode & UNICODE_MODE_SWAPPED) codepage ^= 1; if (bufsize < 2) return seterror(EINVAL); if (codepage == 1200) /* little endian */ wbuf[0] = (buf[1] << 8) | buf[0]; else if (codepage == 1201) /* big endian */ wbuf[0] = (buf[0] << 8) | buf[1]; if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { cv->mode |= UNICODE_MODE_BOM_DONE; if (wbuf[0] == 0xFFFE) { cv->mode |= UNICODE_MODE_SWAPPED; *wbufsize = 0; return 2; } else if (wbuf[0] == 0xFEFF) { *wbufsize = 0; return 2; } } if (0xDC00 <= wbuf[0] && wbuf[0] <= 0xDFFF) return seterror(EILSEQ); if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { if (bufsize < 4) return seterror(EINVAL); if (codepage == 1200) /* little endian */ wbuf[1] = (buf[3] << 8) | buf[2]; else if (codepage == 1201) /* big endian */ wbuf[1] = (buf[2] << 8) | buf[3]; if (!(0xDC00 <= wbuf[1] && wbuf[1] <= 0xDFFF)) return seterror(EILSEQ); *wbufsize = 2; return 4; } *wbufsize = 1; return 2; } static int utf16_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) { if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { int r; cv->mode |= UNICODE_MODE_BOM_DONE; if (bufsize < 2) return seterror(E2BIG); if (cv->codepage == 1200) /* little endian */ memcpy(buf, "\xFF\xFE", 2); else if (cv->codepage == 1201) /* big endian */ memcpy(buf, "\xFE\xFF", 2); r = utf16_wctomb(cv, wbuf, wbufsize, buf + 2, bufsize - 2); if (r == -1) return -1; return r + 2; } if (bufsize < 2) return seterror(E2BIG); if (cv->codepage == 1200) /* little endian */ { buf[0] = (wbuf[0] & 0x00FF); buf[1] = (wbuf[0] & 0xFF00) >> 8; } else if (cv->codepage == 1201) /* big endian */ { buf[0] = (wbuf[0] & 0xFF00) >> 8; buf[1] = (wbuf[0] & 0x00FF); } if (0xD800 <= wbuf[0] && wbuf[0] <= 0xDBFF) { if (bufsize < 4) return seterror(E2BIG); if (cv->codepage == 1200) /* little endian */ { buf[2] = (wbuf[1] & 0x00FF); buf[3] = (wbuf[1] & 0xFF00) >> 8; } else if (cv->codepage == 1201) /* big endian */ { buf[2] = (wbuf[1] & 0xFF00) >> 8; buf[3] = (wbuf[1] & 0x00FF); } return 4; } return 2; } static int utf32_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) { int codepage = cv->codepage; uint wc = 0 /* -Wall */; /* swap endian: 12000 <-> 12001 */ if (cv->mode & UNICODE_MODE_SWAPPED) codepage ^= 1; if (bufsize < 4) return seterror(EINVAL); if (codepage == 12000) /* little endian */ wc = (buf[3] << 24) | (buf[2] << 16) | (buf[1] << 8) | buf[0]; else if (codepage == 12001) /* big endian */ wc = (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { cv->mode |= UNICODE_MODE_BOM_DONE; if (wc == 0xFFFE0000) { cv->mode |= UNICODE_MODE_SWAPPED; *wbufsize = 0; return 4; } else if (wc == 0x0000FEFF) { *wbufsize = 0; return 4; } } if ((0xD800 <= wc && wc <= 0xDFFF) || 0x10FFFF < wc) return seterror(EILSEQ); ucs4_to_utf16(wc, wbuf, wbufsize); return 4; } static int utf32_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) { uint wc; if ((cv->flags & FLAG_USE_BOM) && !(cv->mode & UNICODE_MODE_BOM_DONE)) { int r; cv->mode |= UNICODE_MODE_BOM_DONE; if (bufsize < 4) return seterror(E2BIG); if (cv->codepage == 12000) /* little endian */ memcpy(buf, "\xFF\xFE\x00\x00", 4); else if (cv->codepage == 12001) /* big endian */ memcpy(buf, "\x00\x00\xFE\xFF", 4); r = utf32_wctomb(cv, wbuf, wbufsize, buf + 4, bufsize - 4); if (r == -1) return -1; return r + 4; } if (bufsize < 4) return seterror(E2BIG); wc = utf16_to_ucs4(wbuf); if (cv->codepage == 12000) /* little endian */ { buf[0] = wc & 0x000000FF; buf[1] = (wc & 0x0000FF00) >> 8; buf[2] = (wc & 0x00FF0000) >> 16; buf[3] = (wc & 0xFF000000) >> 24; } else if (cv->codepage == 12001) /* big endian */ { buf[0] = (wc & 0xFF000000) >> 24; buf[1] = (wc & 0x00FF0000) >> 16; buf[2] = (wc & 0x0000FF00) >> 8; buf[3] = wc & 0x000000FF; } return 4; } /* * 50220: ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS) * 50221: ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow * 1 byte Kana) * 50222: ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte * Kana - SO/SI) * * MultiByteToWideChar() and WideCharToMultiByte() behave differently * depending on Windows version. On XP, WideCharToMultiByte() doesn't * terminate result sequence with ascii escape. But Vista does. * Use MLang instead. */ #define ISO2022_MODE(cs, shift) (((cs) << 8) | (shift)) #define ISO2022_MODE_CS(mode) (((mode) >> 8) & 0xFF) #define ISO2022_MODE_SHIFT(mode) ((mode) & 0xFF) #define ISO2022_SI 0 #define ISO2022_SO 1 /* shift in */ static const char iso2022_SI_seq[] = "\x0F"; /* shift out */ static const char iso2022_SO_seq[] = "\x0E"; typedef struct iso2022_esc_t iso2022_esc_t; struct iso2022_esc_t { const char *esc; int esc_len; int len; int cs; }; #define ISO2022JP_CS_ASCII 0 #define ISO2022JP_CS_JISX0201_ROMAN 1 #define ISO2022JP_CS_JISX0201_KANA 2 #define ISO2022JP_CS_JISX0208_1978 3 #define ISO2022JP_CS_JISX0208_1983 4 #define ISO2022JP_CS_JISX0212 5 static iso2022_esc_t iso2022jp_esc[] = { {"\x1B\x28\x42", 3, 1, ISO2022JP_CS_ASCII}, {"\x1B\x28\x4A", 3, 1, ISO2022JP_CS_JISX0201_ROMAN}, {"\x1B\x28\x49", 3, 1, ISO2022JP_CS_JISX0201_KANA}, {"\x1B\x24\x40", 3, 2, ISO2022JP_CS_JISX0208_1983}, /* unify 1978 with 1983 */ {"\x1B\x24\x42", 3, 2, ISO2022JP_CS_JISX0208_1983}, {"\x1B\x24\x28\x44", 4, 2, ISO2022JP_CS_JISX0212}, {NULL, 0, 0, 0} }; static int iso2022jp_mbtowc(csconv_t *cv, const uchar *buf, int bufsize, ushort *wbuf, int *wbufsize) { iso2022_esc_t *iesc = iso2022jp_esc; char tmp[MB_CHAR_MAX]; int insize; HRESULT hr; DWORD dummy = 0; int len; int esc_len; int cs; int shift; int i; if (buf[0] == 0x1B) { for (i = 0; iesc[i].esc != NULL; ++i) { esc_len = iesc[i].esc_len; if (bufsize < esc_len) { if (strncmp((char *)buf, iesc[i].esc, bufsize) == 0) return seterror(EINVAL); } else { if (strncmp((char *)buf, iesc[i].esc, esc_len) == 0) { cv->mode = ISO2022_MODE(iesc[i].cs, ISO2022_SI); *wbufsize = 0; return esc_len; } } } /* not supported escape sequence */ return seterror(EILSEQ); } else if (buf[0] == iso2022_SO_seq[0]) { cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SO); *wbufsize = 0; return 1; } else if (buf[0] == iso2022_SI_seq[0]) { cv->mode = ISO2022_MODE(ISO2022_MODE_CS(cv->mode), ISO2022_SI); *wbufsize = 0; return 1; } cs = ISO2022_MODE_CS(cv->mode); shift = ISO2022_MODE_SHIFT(cv->mode); /* reset the mode for informal sequence */ if (buf[0] < 0x20) { cs = ISO2022JP_CS_ASCII; shift = ISO2022_SI; } len = iesc[cs].len; if (bufsize < len) return seterror(EINVAL); for (i = 0; i < len; ++i) if (!(buf[i] < 0x80)) return seterror(EILSEQ); esc_len = iesc[cs].esc_len; memcpy(tmp, iesc[cs].esc, esc_len); if (shift == ISO2022_SO) { memcpy(tmp + esc_len, iso2022_SO_seq, 1); esc_len += 1; } memcpy(tmp + esc_len, buf, len); if ((cv->codepage == 50220 || cv->codepage == 50221 || cv->codepage == 50222) && shift == ISO2022_SO) { /* XXX: shift-out cannot be used for mbtowc (both kernel and * mlang) */ esc_len = iesc[ISO2022JP_CS_JISX0201_KANA].esc_len; memcpy(tmp, iesc[ISO2022JP_CS_JISX0201_KANA].esc, esc_len); memcpy(tmp + esc_len, buf, len); } insize = len + esc_len; hr = ConvertINetMultiByteToUnicode(&dummy, cv->codepage, (const char *)tmp, &insize, (wchar_t *)wbuf, wbufsize); if (hr != S_OK || insize != len + esc_len) return seterror(EILSEQ); /* Check for conversion error. Assuming defaultChar is 0x3F. */ /* ascii should be converted from ascii */ if (wbuf[0] == buf[0] && cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) return seterror(EILSEQ); /* reset the mode for informal sequence */ if (cv->mode != ISO2022_MODE(cs, shift)) cv->mode = ISO2022_MODE(cs, shift); return len; } static int iso2022jp_wctomb(csconv_t *cv, ushort *wbuf, int wbufsize, uchar *buf, int bufsize) { iso2022_esc_t *iesc = iso2022jp_esc; char tmp[MB_CHAR_MAX]; int tmpsize = MB_CHAR_MAX; int insize = wbufsize; HRESULT hr; DWORD dummy = 0; int len; int esc_len; int cs; int shift; int i; /* * MultiByte = [escape sequence] + character + [escape sequence] * * Whether trailing escape sequence is added depends on which API is * used (kernel or MLang, and its version). */ hr = ConvertINetUnicodeToMultiByte(&dummy, cv->codepage, (const wchar_t *)wbuf, &wbufsize, tmp, &tmpsize); if (hr != S_OK || insize != wbufsize) return seterror(EILSEQ); else if (bufsize < tmpsize) return seterror(E2BIG); if (tmpsize == 1) { cs = ISO2022JP_CS_ASCII; esc_len = 0; } else { for (i = 1; iesc[i].esc != NULL; ++i) { esc_len = iesc[i].esc_len; if (strncmp(tmp, iesc[i].esc, esc_len) == 0) { cs = iesc[i].cs; break; } } if (iesc[i].esc == NULL) /* not supported escape sequence */ return seterror(EILSEQ); } shift = ISO2022_SI; if (tmp[esc_len] == iso2022_SO_seq[0]) { shift = ISO2022_SO; esc_len += 1; } len = iesc[cs].len; /* Check for converting error. Assuming defaultChar is 0x3F. */ /* ascii should be converted from ascii */ if (cs == ISO2022JP_CS_ASCII && !(wbuf[0] < 0x80)) return seterror(EILSEQ); else if (tmpsize < esc_len + len) return seterror(EILSEQ); if (cv->mode == ISO2022_MODE(cs, shift)) { /* remove escape sequence */ if (esc_len != 0) memmove(tmp, tmp + esc_len, len); esc_len = 0; } else { if (cs == ISO2022JP_CS_ASCII) { esc_len = iesc[ISO2022JP_CS_ASCII].esc_len; memmove(tmp + esc_len, tmp, len); memcpy(tmp, iesc[ISO2022JP_CS_ASCII].esc, esc_len); } if (ISO2022_MODE_SHIFT(cv->mode) == ISO2022_SO) { /* shift-in before changing to other mode */ memmove(tmp + 1, tmp, len + esc_len); memcpy(tmp, iso2022_SI_seq, 1); esc_len += 1; } } if (bufsize < len + esc_len) return seterror(E2BIG); memcpy(buf, tmp, len + esc_len); cv->mode = ISO2022_MODE(cs, shift); return len + esc_len; } static int iso2022jp_flush(csconv_t *cv, uchar *buf, int bufsize) { iso2022_esc_t *iesc = iso2022jp_esc; int esc_len; if (cv->mode != ISO2022_MODE(ISO2022JP_CS_ASCII, ISO2022_SI)) { esc_len = 0; if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) esc_len += 1; if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; if (bufsize < esc_len) return seterror(E2BIG); esc_len = 0; if (ISO2022_MODE_SHIFT(cv->mode) != ISO2022_SI) { memcpy(buf, iso2022_SI_seq, 1); esc_len += 1; } if (ISO2022_MODE_CS(cv->mode) != ISO2022JP_CS_ASCII) { memcpy(buf + esc_len, iesc[ISO2022JP_CS_ASCII].esc, iesc[ISO2022JP_CS_ASCII].esc_len); esc_len += iesc[ISO2022JP_CS_ASCII].esc_len; } return esc_len; } return 0; } // R addition void iconvlist (int (*do_one) (unsigned int namescount, const char * const * names, void* data), void* data) { size_t j = 0; for (j = 0; codepage_alias[j].codepage ; j++) if (do_one(1, &codepage_alias[j].name, data)) break; }