From c2f1df1d4af0c7e633528cb4c8caa79ef04b0b5a Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Thu, 10 Mar 2016 18:17:39 -0800 Subject: [PATCH] fix handling of non-ASCII chars in C locale The relevant standards allow the mbtowc/mbrtowc functions to reject non-ASCII characters (i.e., chars with the high bit set) when the locale is C or POSIX. The BSD libraries (e.g., on OS X) don't do this but the GNU libraries (e.g., on Linux) do. Like most programs we need the C/POSIX locales to allow arbitrary bytes. So explicitly check if we're in a single-byte locale (which would also include ISO-8859 variants) and simply pass-thru the chars without encoding or decoding. Fixes #2802. --- src/builtin.cpp | 47 ++++++++++++++-------------- src/common.cpp | 71 +++++++++++++++++++++++++++++------------- src/env.cpp | 7 ++--- src/fallback.cpp | 72 ++++++++++++++++++++++--------------------- src/fallback.h | 20 ++---------- src/history.cpp | 19 +++++++----- src/input_common.cpp | 11 ++++--- src/output.cpp | 58 +++++++++++++++++----------------- src/wutil.cpp | 29 +++++++---------- src/wutil.h | 6 ++-- tests/c-locale.err | 0 tests/c-locale.in | 35 +++++++++++++++++++++ tests/c-locale.out | 4 +++ tests/c-locale.status | 1 + 14 files changed, 215 insertions(+), 165 deletions(-) create mode 100644 tests/c-locale.err create mode 100644 tests/c-locale.in create mode 100644 tests/c-locale.out create mode 100644 tests/c-locale.status diff --git a/src/builtin.cpp b/src/builtin.cpp index 4597b18e8..f4de72c73 100644 --- a/src/builtin.cpp +++ b/src/builtin.cpp @@ -1907,18 +1907,18 @@ static int builtin_echo(parser_t &parser, io_streams_t &streams, wchar_t **argv) return STATUS_BUILTIN_OK; } -/** The pwd builtin. We don't respect -P to resolve symbolic links because we try to always resolve them. */ +// The pwd builtin. We don't respect -P to resolve symbolic links because we +// try to always resolve them. static int builtin_pwd(parser_t &parser, io_streams_t &streams, wchar_t **argv) { - wchar_t dir_path[4096]; - wchar_t *res = wgetcwd(dir_path, 4096); - if (res == NULL) + wcstring res = wgetcwd(); + if (res.empty()) { return STATUS_BUILTIN_ERROR; } else { - streams.out.append(dir_path); + streams.out.append(res); streams.out.push_back(L'\n'); return STATUS_BUILTIN_OK; } @@ -2699,9 +2699,8 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) while (1) { - int finished=0; - - wchar_t res=0; + int finished = 0; + wchar_t res = 0; mbstate_t state = {}; while (!finished) @@ -2713,24 +2712,26 @@ static int builtin_read(parser_t &parser, io_streams_t &streams, wchar_t **argv) break; } - size_t sz = mbrtowc(&res, &b, 1, &state); - - switch (sz) + if (MB_CUR_MAX == 1) // single-byte locale { - case (size_t)(-1): - memset(&state, '\0', sizeof(state)); - break; + res = (unsigned char)b; + finished = 1; + } + else { + size_t sz = mbrtowc(&res, &b, 1, &state); + switch (sz) + { + case (size_t)-1: + memset(&state, 0, sizeof(state)); + break; - case (size_t)(-2): - break; - case 0: - finished = 1; - break; - - default: - finished=1; - break; + case (size_t)-2: + break; + default: + finished = 1; + break; + } } } diff --git a/src/common.cpp b/src/common.cpp index 2aa76cc5c..a796baca1 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -103,8 +103,7 @@ int fgetws2(wcstring *s, FILE *f) { errno=0; - c = getwc(f); - + c = fgetwc(f); if (errno == EILSEQ || errno == EINTR) { continue; @@ -148,8 +147,19 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) wcstring result; result.reserve(in_len); - mbstate_t state = {}; size_t in_pos = 0; + + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + while (in_pos < in_len) + { + result.push_back((unsigned char)in[in_pos]); + in_pos++; + } + return result; + } + + mbstate_t state = {}; while (in_pos < in_len) { wchar_t wc = 0; @@ -165,12 +175,12 @@ static wcstring str2wcs_internal(const char *in, const size_t in_len) { use_encode_direct = true; } - else if (ret == (size_t)(-2)) + else if (ret == (size_t)-2) { /* Incomplete sequence */ use_encode_direct = true; } - else if (ret == (size_t)(-1)) + else if (ret == (size_t)-1) { /* Invalid data */ use_encode_direct = true; @@ -266,9 +276,7 @@ std::string wcs2string(const wcstring &input) std::string result; result.reserve(input.size()); - mbstate_t state; - memset(&state, 0, sizeof(state)); - + mbstate_t state = {}; char converted[MB_LEN_MAX + 1]; for (size_t i=0; i < input.size(); i++) @@ -276,12 +284,22 @@ std::string wcs2string(const wcstring &input) wchar_t wc = input[i]; if (wc == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((wc >= ENCODE_DIRECT_BASE) && - (wc < ENCODE_DIRECT_BASE+256)) + else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) { result.push_back(wc - ENCODE_DIRECT_BASE); } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (wc & ~0xFF) + { + wc = '?'; + } + converted[0] = wc; + result.append(converted, 1); + } else { memset(converted, 0, sizeof converted); @@ -311,38 +329,47 @@ std::string wcs2string(const wcstring &input) */ static char *wcs2str_internal(const wchar_t *in, char *out) { - size_t res=0; - size_t in_pos=0; - size_t out_pos = 0; - mbstate_t state; - CHECK(in, 0); CHECK(out, 0); - memset(&state, 0, sizeof(state)); + size_t in_pos = 0; + size_t out_pos = 0; + mbstate_t state = {}; while (in[in_pos]) { if (in[in_pos] == INTERNAL_SEPARATOR) { + // Do nothing. } - else if ((in[in_pos] >= ENCODE_DIRECT_BASE) && - (in[in_pos] < ENCODE_DIRECT_BASE+256)) + else if (in[in_pos] >= ENCODE_DIRECT_BASE && + in[in_pos] < ENCODE_DIRECT_BASE + 256) { out[out_pos++] = in[in_pos]- ENCODE_DIRECT_BASE; } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (in[in_pos] & ~0xFF) + { + out[out_pos++] = '?'; + } + else + { + out[out_pos++] = (unsigned char)in[in_pos]; + } + } else { - res = wcrtomb(&out[out_pos], in[in_pos], &state); - - if (res == (size_t)(-1)) + size_t len = wcrtomb(&out[out_pos], in[in_pos], &state); + if (len == (size_t)-1) { debug(1, L"Wide character %d has no narrow representation", in[in_pos]); memset(&state, 0, sizeof(state)); } else { - out_pos += res; + out_pos += len; } } in_pos++; diff --git a/src/env.cpp b/src/env.cpp index c24280bcc..56c9deb07 100644 --- a/src/env.cpp +++ b/src/env.cpp @@ -377,14 +377,13 @@ static void setup_path() int env_set_pwd() { - wchar_t dir_path[4096]; - wchar_t *res = wgetcwd(dir_path, 4096); - if (!res) + wcstring res = wgetcwd(); + if (res.empty()) { debug(0, _(L"Could not determine current working directory. Is your locale set correctly?")); return 0; } - env_set(L"PWD", dir_path, ENV_EXPORT | ENV_GLOBAL); + env_set(L"PWD", res.c_str(), ENV_EXPORT | ENV_GLOBAL); return 1; } diff --git a/src/fallback.cpp b/src/fallback.cpp index 7c5f84d23..ed29f5de9 100644 --- a/src/fallback.cpp +++ b/src/fallback.cpp @@ -606,7 +606,7 @@ static FILE *fw_data; static void fw_writer(wchar_t c) { - putwc(c, fw_data); + fputwc(c, fw_data); } /* @@ -648,33 +648,30 @@ int wprintf(const wchar_t *filter, ...) #endif #ifndef HAVE_FGETWC - wint_t fgetwc(FILE *stream) { - wchar_t res=0; - mbstate_t state; - memset(&state, '\0', sizeof(state)); + wchar_t res; + mbstate_t state = {}; while (1) { int b = fgetc(stream); - char bb; - - int sz; - if (b == EOF) + { return WEOF; + } - bb=b; - - sz = mbrtowc(&res, &bb, 1, &state); + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + return b; + } + char bb = b; + size_t sz = mbrtowc(&res, &bb, 1, &state); switch (sz) { case -1: - memset(&state, '\0', sizeof(state)); return WEOF; - case -2: break; case 0: @@ -683,35 +680,40 @@ wint_t fgetwc(FILE *stream) return res; } } - } - - -wint_t getwc(FILE *stream) -{ - return fgetwc(stream); -} - - #endif #ifndef HAVE_FPUTWC - wint_t fputwc(wchar_t wc, FILE *stream) { - int res; - char s[MB_CUR_MAX+1]; - memset(s, 0, MB_CUR_MAX+1); - wctomb(s, wc); - res = fputs(s, stream); - return res==EOF?WEOF:wc; -} + int res = 0; + mbstate_t state = {}; + char s[MB_CUR_MAX + 1] = {}; -wint_t putwc(wchar_t wc, FILE *stream) -{ - return fputwc(wc, stream); -} + if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (wc & ~0xFF) + { + wc = '?'; + } + s[0] = (char)wc; + res = fputs(s, stream); + } + else + { + size_t len = wcrtomb(s, wc, &state); + if (len == (size_t)-1) + { + debug(1, L"Wide character %d has no narrow representation", wc); + } + else { + res = fputs(s, stream); + } + } + return res == EOF ? WEOF : wc; +} #endif #ifndef HAVE_WCSTOK diff --git a/src/fallback.h b/src/fallback.h index e30444497..f3252c149 100644 --- a/src/fallback.h +++ b/src/fallback.h @@ -158,29 +158,13 @@ int vswprintf(wchar_t *out, size_t n, const wchar_t *filter, va_list va); #endif #ifndef HAVE_FGETWC -/** - Fallback implementation of fgetwc -*/ +// Fallback implementation of fgetwc. wint_t fgetwc(FILE *stream); - -/** - Fallback implementation of getwc -*/ -wint_t getwc(FILE *stream); - #endif #ifndef HAVE_FPUTWC - -/** - Fallback implementation of fputwc -*/ +// Fallback implementation of fputwc. wint_t fputwc(wchar_t wc, FILE *stream); -/** - Fallback implementation of putwc -*/ -wint_t putwc(wchar_t wc, FILE *stream); - #endif #ifndef HAVE_WCSTOK diff --git a/src/history.cpp b/src/history.cpp index 75e426387..a37d2b776 100644 --- a/src/history.cpp +++ b/src/history.cpp @@ -926,10 +926,9 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length) { const char *end = begin + length; - const char *pos=begin; - - bool was_backslash = 0; + const char *pos = begin; wcstring out; + bool was_backslash = false; bool first_char = true; bool timestamp_mode = false; time_t timestamp = 0; @@ -937,12 +936,18 @@ history_item_t history_t::decode_item_fish_1_x(const char *begin, size_t length) while (1) { wchar_t c; - mbstate_t state; size_t res; + mbstate_t state = {}; - memset(&state, 0, sizeof(state)); - - res = mbrtowc(&c, pos, end-pos, &state); + if (MB_CUR_MAX == 1) // single-byte locale + { + c = (unsigned char)*pos; + res = 1; + } + else + { + res = mbrtowc(&c, pos, end - pos, &state); + } if (res == (size_t)-1) { diff --git a/src/input_common.cpp b/src/input_common.cpp index ebe95fced..7c018c2d1 100644 --- a/src/input_common.cpp +++ b/src/input_common.cpp @@ -263,16 +263,17 @@ wchar_t input_common_readch(int timed) while (1) { wint_t b = readb(); - char bb; - size_t sz; + if (MB_CUR_MAX == 1) // single-byte locale, all values are legal + { + return (unsigned char)b; + } if ((b >= R_NULL) && (b < R_NULL + 1000)) return b; - bb=b; - - sz = mbrtowc(&res, &bb, 1, &state); + char bb = b; + size_t sz = mbrtowc(&res, &bb, 1, &state); switch (sz) { diff --git a/src/output.cpp b/src/output.cpp index 1707f4fb7..dbd97eb4b 100644 --- a/src/output.cpp +++ b/src/output.cpp @@ -386,32 +386,35 @@ int writeb(tputs_arg_t b) int writech(wint_t ch) { - mbstate_t state; - size_t i; char buff[MB_LEN_MAX+1]; - size_t bytes; + size_t len; - if ((ch >= ENCODE_DIRECT_BASE) && - (ch < ENCODE_DIRECT_BASE+256)) + if (ch >= ENCODE_DIRECT_BASE && ch < ENCODE_DIRECT_BASE + 256) { buff[0] = ch - ENCODE_DIRECT_BASE; - bytes=1; + len = 1; + } + else if (MB_CUR_MAX == 1) // single-byte locale (C/POSIX/ISO-8859) + { + // If `wc` contains a wide character we emit a question-mark. + if (ch & ~0xFF) + { + ch = '?'; + } + buff[0] = ch; + len = 1; } else { - memset(&state, 0, sizeof(state)); - bytes= wcrtomb(buff, ch, &state); - - switch (bytes) + mbstate_t state = {}; + len = wcrtomb(buff, ch, &state); + if (len == (size_t)-1) { - case (size_t)(-1): - { - return 1; - } + return 1; } } - for (i=0; i