/* * R : A Computer Language for Statistical Data Analysis * Copyright (C) 2005-2017 The R Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, a copy is available at * https://www.R-project.org/Licenses/ */ /* The original version of this file was contributed by Ei-ji Nakama. * See also the comments in ../include/rlocale.h. * * It provides replacements for the wctype functions on * Windows (where they are not correct in e.g. Japanese) * AIX (missing) * macOS in CJK (where these just call the ctype functions) * * It also provides wc[s]width, where widths of CJK fonts are often * wrong in vendor-supplied versions and in Markus Kuhn's version * used for Windows in R 2.[12].x. */ #ifdef HAVE_CONFIG_H #include #endif #ifdef HAVE_VISIBILITY_ATTRIBUTE # define attribute_hidden __attribute__ ((visibility ("hidden"))) #else # define attribute_hidden #endif #include #include #define IN_RLOCALE_C 1 /* used in rlocale.h */ #include #include "rlocale_data.h" #include #include #include #include #include #include // This seems based on Markus Kuhn's function but with 1-based 'max' static int wcsearch(int wint, const struct interval *table, int max) { int min = 0; int mid; max--; if (wint < table[0].first || wint > table[max].last) return 0; while (max >= min) { mid = (min + max) / 2; if (wint > table[mid].last) min = mid + 1; else if (wint < table[mid].first) max = mid - 1; else return 1; } return 0; } static int wcwidthsearch(int wint, const struct interval_wcwidth *table, int max, int locale) { int min = 0; int mid; max--; /* This quickly gives one for ASCII characters since the table starts at 0xa0 */ if (wint < table[0].first || wint > table[max].last) return 1; while (max >= min) { mid = (min + max) / 2; if (wint > table[mid].last) min = mid + 1; else if (wint < table[mid].first) max = mid - 1; else{ return(table[mid].mb[locale]); } } return -1; } /* The idea here here has never been explained. See also the comments in ../include/rlocale.h. That does not explain the separate entries for Singapore (simplified) and Hong Kong/Macau (traditional) where it seems the Windows system font is not different from PRC/Taiwan respectively, nor what font was used for non-Windows, nor where the values came from. Except perhaps on macOS, the non-Windows locale names are for the default MBCS encodings (e.g. GBK, GB1312, BIG5, EUCJP, EUCKR). There are other non-UTF-8 encodings for those locales, e.g. ja_JP.SJIS, ko_KR.CP949, zh_CN.eucCN, zh_HK.Big5HKSCS. */ typedef struct { char *name; int locale; } cjk_locale_name_t; static cjk_locale_name_t cjk_locale_name[] = { // Windows locale names {"CHINESE(SINGAPORE)_SIGNAPORE", MB_zh_SG}, {"CHINESE_SIGNAPORE", MB_zh_SG}, {"CHINESE(PRC)_PEOPLE'S REPUBLIC OF CHINA", MB_zh_CN}, {"CHINESE_PEOPLE'S REPUBLIC OF CHINA", MB_zh_CN}, {"CHINESE_MACAU S.A.R.", MB_zh_HK}, {"CHINESE(PRC)_HONG KONG", MB_zh_HK}, {"CHINESE_HONG KONG S.A.R.", MB_zh_HK}, {"CHINESE(TAIWAN)_TAIWAN", MB_zh_TW}, {"CHINESE_TAIWAN", MB_zh_TW}, {"CHINESE-S", MB_zh_CN}, {"CHINESE-T", MB_zh_TW}, {"JAPANESE_JAPAN", MB_ja_JP}, {"JAPANESE", MB_ja_JP}, {"KOREAN_KOREA", MB_ko_KR}, {"KOREAN", MB_ko_KR}, // Other OSes, but only in default encodings. {"ZH_TW", MB_zh_TW}, {"ZH_CN", MB_zh_CN}, {"ZH_CN.BIG5", MB_zh_TW}, {"ZH_HK", MB_zh_HK}, {"ZH_SG", MB_zh_SG}, {"JA_JP", MB_ja_JP}, {"KO_KR", MB_ko_KR}, {"ZH", MB_zh_CN}, {"JA", MB_ja_JP}, {"KO", MB_ko_KR}, // Default, where all EA Ambiguous characters have width one. {"", MB_Default}, }; // used in character.c, ../gnuwin32/console.c , ../library/grDevices/src/devP*.c : int Ri18n_wcwidth(Rwchar_t c) { char lc_str[128]; unsigned int i, j; static char *lc_cache = ""; static int lc = 0; if (0 != strcmp(setlocale(LC_CTYPE, NULL), lc_cache)) { strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str)); lc_str[sizeof(lc_str) - 1] = '\0'; for (i = 0, j = (int) strlen(lc_str); i < j && i < sizeof(lc_str); i++) lc_str[i] = (char) toupper(lc_str[i]); for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t)); i++) { if (0 == strncmp(cjk_locale_name[i].name, lc_str, strlen(cjk_locale_name[i].name))) { lc = cjk_locale_name[i].locale; break; } } } int wd = wcwidthsearch(c, table_wcwidth, (sizeof(table_wcwidth)/sizeof(struct interval_wcwidth)), lc); if (wd >= 0) return wd; // currently all are 1 or 2. int zw = wcsearch(c, zero_width, zero_width_count); return zw ? 0 : 1; // assume unknown chars are width one. } /* Used in character.c, errors.c, ../gnuwin32/console.c */ attribute_hidden int Ri18n_wcswidth (const wchar_t *s, size_t n) { int rs = 0; while ((n-- > 0) && (*s != L'\0')) { int now = Ri18n_wcwidth (*s); if (now == -1) return -1; rs += now; s++; } return rs; } /********************************************************************* * macOS's wide character type functions are based on FreeBSD * and only work correctly for Latin-1 characters. * So we replace them. May also be needed on FreeBSD. ********************************************************************/ #if defined(__APPLE__) /* allow for both PowerPC and Intel platforms */ #ifdef WORDS_BIGENDIAN static const char UNICODE[] = "UCS-4BE"; #else static const char UNICODE[] = "UCS-4LE"; #endif /* in Defn.h which is not included here */ extern const char *locale2charset(const char *); #define ISWFUNC(ISWNAME) static int Ri18n_isw ## ISWNAME (wint_t wc) \ { \ char mb_buf[MB_LEN_MAX+1]; \ size_t mb_len; \ int ucs4_buf[2]; \ size_t wc_len; \ void *cd; \ char fromcode[128]; \ char *_mb_buf; \ char *_wc_buf; \ size_t rc ; \ \ strncpy(fromcode, locale2charset(NULL), sizeof(fromcode)); \ fromcode[sizeof(fromcode) - 1] = '\0'; \ if(0 == strcmp(fromcode, "UTF-8")) \ return wcsearch(wc,table_w ## ISWNAME , table_w ## ISWNAME ## _count);\ memset(mb_buf, 0, sizeof(mb_buf)); \ memset(ucs4_buf, 0, sizeof(ucs4_buf)); \ wcrtomb( mb_buf, wc, NULL); \ if((void *)(-1) != (cd = Riconv_open(UNICODE, fromcode))) { \ wc_len = sizeof(ucs4_buf); \ _wc_buf = (char *)ucs4_buf; \ mb_len = strlen(mb_buf); \ _mb_buf = (char *)mb_buf; \ rc = Riconv(cd, (const char **)&_mb_buf, (size_t *)&mb_len, \ (char **)&_wc_buf, (size_t *)&wc_len); \ Riconv_close(cd); \ wc = ucs4_buf[0]; \ return wcsearch(wc,table_w ## ISWNAME , table_w ## ISWNAME ## _count); \ } \ return(-1); \ } #endif // __APPLE__ /********************************************************************* * iswalpha etc. does not function correctly for Windows * iswalpha etc. does not function at all in AIX. * all locale wchar_t == UNICODE ********************************************************************/ #if defined(Win32) || defined(_AIX) #define ISWFUNC(ISWNAME) static int Ri18n_isw ## ISWNAME (wint_t wc) \ { \ return wcsearch(wc,table_w ## ISWNAME , table_w ## ISWNAME ## _count); \ } #endif /********************************************************************* * iswalpha etc. do function correctly for Linux ********************************************************************/ #ifndef ISWFUNC #define ISWFUNC(ISWNAME) static int Ri18n_isw ## ISWNAME (wint_t wc) \ { \ return isw ## ISWNAME (wc); \ } /* Solaris 8 was missing iswblank. Its man page was missing iswcntrl, but the function is there. MinGW used not to have iswblank until mingw-runtime-3.11. */ #ifndef HAVE_ISWBLANK #define iswblank(wc) iswctype(wc, wctype("blank")) #endif #endif /* These are the functions which C99 and POSIX define. However, not all are used elsewhere in R, but they are used in Ri18n_iswctype. */ ISWFUNC(upper) ISWFUNC(lower) ISWFUNC(alpha) ISWFUNC(digit) ISWFUNC(xdigit) ISWFUNC(space) ISWFUNC(print) ISWFUNC(graph) ISWFUNC(blank) ISWFUNC(cntrl) ISWFUNC(punct) /* defined below in terms of digit and alpha ISWFUNC(alnum) */ wctype_t Ri18n_wctype(const char *); int Ri18n_iswctype(wint_t, wctype_t); static int Ri18n_iswalnum (wint_t wc) { return (Ri18n_iswctype(wc, Ri18n_wctype("digit")) || Ri18n_iswctype(wc, Ri18n_wctype("alpha")) ); } /* * iswctype */ typedef struct { char * name; wctype_t wctype; int(*func)(wint_t); } Ri18n_wctype_func_l ; static const Ri18n_wctype_func_l Ri18n_wctype_func[] = { {"upper", 1<<0, Ri18n_iswupper}, {"lower", 1<<1, Ri18n_iswlower}, {"alpha", 1<<2, Ri18n_iswalpha}, {"digit", 1<<3, Ri18n_iswdigit}, {"xdigit", 1<<4, Ri18n_iswxdigit}, {"space", 1<<5, Ri18n_iswspace}, {"print", 1<<6, Ri18n_iswprint}, {"graph", 1<<7, Ri18n_iswgraph}, {"blank", 1<<8, Ri18n_iswblank}, {"cntrl", 1<<9, Ri18n_iswcntrl}, {"punct", 1<<10, Ri18n_iswpunct}, {"alnum", 1<<11, Ri18n_iswalnum}, {NULL, 0, NULL} }; /* These two used (via macros) in X11 dataentry */ wctype_t Ri18n_wctype(const char *name) { int i; for (i = 0 ; Ri18n_wctype_func[i].name != NULL && 0 != strcmp(Ri18n_wctype_func[i].name, name) ; i++ ); return Ri18n_wctype_func[i].wctype; } int Ri18n_iswctype(wint_t wc, wctype_t desc) { int i; for (i = 0 ; Ri18n_wctype_func[i].wctype != 0 && Ri18n_wctype_func[i].wctype != desc ; i++ ); return (*Ri18n_wctype_func[i].func)(wc); }