Incorporate a modified UTF8 <-> wchar_t implementation from Alexey

Vatchenko (http://www.bsdua.org/libbsdua.html) in preparation for eliminating our dependency on iconv
2026-06-01 04:41:14 -03:00 · 2014-03-22 23:46:58 -07:00
parent 06eb271bda
commit aabed8279e
6 changed files with 861 additions and 27 deletions
--- a/fish_tests.cpp
+++ b/fish_tests.cpp
@@ -62,6 +62,7 @@
 #include "parse_util.h"
 #include "pager.h"
 #include "input.h"
+#include "utf8.h"

 static const char * const * s_arguments;
 static int s_test_run_count = 0;
@@ -140,17 +141,17 @@ static void err(const wchar_t *blah, ...)
    va_list va;
    va_start(va, blah);
    err_count++;
-    
+
    // show errors in red
    fputs("\x1b[31m", stdout);

    wprintf(L"Error: ");
    vwprintf(blah, va);
    va_end(va);
-    
+
    // return to normal color
    fputs("\x1b[0m", stdout);
-    
+
    wprintf(L"\n");
 }

@@ -857,6 +858,260 @@ static void test_utils()
    if (begin != a + wcslen(L"echo (echo (")) err(L"parse_util_cmdsubst_extent failed on line %ld", (long)__LINE__);
 }

+/* UTF8 tests taken from Alexey Vatchenko's utf8 library. See http://www.bsdua.org/libbsdua.html */
+
+static void test_utf82wchar(const char *src, size_t slen, const wchar_t *dst, size_t dlen,
+                            int flags, size_t res, const char *descr)
+{
+    size_t size;
+    wchar_t *mem = NULL;
+
+    /* Hack: if wchar is only UCS-2, and the UTF-8 input string contains astral characters, then tweak the expected size to 0 */
+    if (src != NULL && is_wchar_ucs2())
+    {
+        /* A UTF-8 code unit may represent an astral code point if it has 4 or more leading 1s */
+        const unsigned char astral_mask = 0xF0;
+        for (size_t i=0; i < slen; i++)
+        {
+            if ((src[i] & astral_mask) == astral_mask)
+            {
+                /* Astral char. We expect this conversion to just fail. */
+                res = 0;
+                break;
+            }
+        }
+    }
+
+    if (dst != NULL)
+    {
+        mem = (wchar_t *)malloc(dlen * sizeof(*mem));
+        if (mem == NULL)
+        {
+            err(L"u2w: %s: MALLOC FAILED\n", descr);
+            return;
+        }
+    }
+
+    do
+    {
+        size = utf8_to_wchar(src, slen, mem, dlen, flags);
+        if (res != size)
+        {
+            err(L"u2w: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
+            break;
+        }
+
+        if (mem == NULL)
+            break;		/* OK */
+
+        if (memcmp(mem, dst, size * sizeof(*mem)) != 0)
+        {
+            err(L"u2w: %s: BROKEN", descr);
+            break;
+        }
+
+    }
+    while (0);
+
+    free(mem);
+}
+
+static void test_wchar2utf8(const wchar_t *src, size_t slen, const char *dst, size_t dlen,
+                            int flags, size_t res, const char *descr)
+{
+    size_t size;
+    char *mem = NULL;
+
+    /* Hack: if wchar is simulating UCS-2, and the wchar_t input string contains astral characters, then tweak the expected size to 0 */
+    if (src != NULL && is_wchar_ucs2())
+    {
+        const uint32_t astral_mask = 0xFFFF0000U;
+        for (size_t i=0; i < slen; i++)
+        {
+            if ((src[i] & astral_mask) != 0)
+            {
+                /* astral char */
+                res = 0;
+                break;
+            }
+        }
+    }
+
+    if (dst != NULL)
+    {
+        mem = (char *)malloc(dlen);
+        if (mem == NULL)
+        {
+            err(L"w2u: %s: MALLOC FAILED", descr);
+            return;
+        }
+    }
+
+    do
+    {
+        size = wchar_to_utf8(src, slen, mem, dlen, flags);
+        if (res != size)
+        {
+            err(L"w2u: %s: FAILED (rv: %lu, must be %lu)", descr, size, res);
+            break;
+        }
+
+        if (mem == NULL)
+            break;		/* OK */
+
+        if (memcmp(mem, dst, size) != 0)
+        {
+            err(L"w2u: %s: BROKEN", descr);
+            break;
+        }
+
+    }
+    while (0);
+
+    if (mem != NULL);
+    free(mem);
+}
+
+static void test_utf8()
+{
+    wchar_t w1[] = {0x54, 0x65, 0x73, 0x74};
+    wchar_t w2[] = {0x0422, 0x0435, 0x0441, 0x0442};
+    wchar_t w3[] = {0x800, 0x1e80, 0x98c4, 0x9910, 0xff00};
+    wchar_t w4[] = {0x15555, 0xf7777, 0xa};
+    wchar_t w5[] = {0x255555, 0x1fa04ff, 0xddfd04, 0xa};
+    wchar_t w6[] = {0xf255555, 0x1dfa04ff, 0x7fddfd04, 0xa};
+    wchar_t wb[] = {-2, 0xa, 0xffffffff, 0x0441};
+    wchar_t wm[] = {0x41, 0x0441, 0x3042, 0xff67, 0x9b0d, 0x2e05da67};
+    wchar_t wb1[] = {0xa, 0x0422};
+    wchar_t wb2[] = {0xd800, 0xda00, 0x41, 0xdfff, 0xa};
+    wchar_t wbom[] = {0xfeff, 0x41, 0xa};
+    wchar_t wbom2[] = {0x41, 0xa};
+    wchar_t wbom22[] = {0xfeff, 0x41, 0xa};
+    char u1[] = {0x54, 0x65, 0x73, 0x74};
+    char u2[] = {0xd0, 0xa2, 0xd0, 0xb5, 0xd1, 0x81, 0xd1, 0x82};
+    char u3[] = {0xe0, 0xa0, 0x80, 0xe1, 0xba, 0x80, 0xe9, 0xa3, 0x84,
+                 0xe9, 0xa4, 0x90, 0xef, 0xbc, 0x80
+                };
+    char u4[] = {0xf0, 0x95, 0x95, 0x95, 0xf3, 0xb7, 0x9d, 0xb7, 0xa};
+    char u5[] = {0xf8, 0x89, 0x95, 0x95, 0x95, 0xf9, 0xbe, 0xa0, 0x93,
+                 0xbf, 0xf8, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
+                };
+    char u6[] = {0xfc, 0x8f, 0x89, 0x95, 0x95, 0x95, 0xfc, 0x9d, 0xbe,
+                 0xa0, 0x93, 0xbf, 0xfd, 0xbf, 0xb7, 0x9f, 0xb4, 0x84, 0x0a
+                };
+    char ub[] = {0xa, 0xd1, 0x81};
+    char um[] = {0x41, 0xd1, 0x81, 0xe3, 0x81, 0x82, 0xef, 0xbd, 0xa7,
+                 0xe9, 0xac, 0x8d, 0xfc, 0xae, 0x81, 0x9d, 0xa9, 0xa7
+                };
+    char ub1[] = {0xa, 0xff, 0xd0, 0xa2, 0xfe, 0x8f, 0xe0, 0x80};
+    char uc080[] = {0xc0, 0x80};
+    char ub2[] = {0xed, 0xa1, 0x8c, 0xed, 0xbe, 0xb4, 0xa};
+    char ubom[] = {0x41, 0xa};
+    char ubom2[] = {0xef, 0xbb, 0xbf, 0x41, 0xa};
+
+    /*
+     * UTF-8 -> UCS-4 string.
+     */
+    test_utf82wchar(ubom2, sizeof(ubom2), wbom2,
+                    sizeof(wbom2) / sizeof(*wbom2), UTF8_SKIP_BOM,
+                    sizeof(wbom2) / sizeof(*wbom2), "skip BOM");
+    test_utf82wchar(ubom2, sizeof(ubom2), wbom22,
+                    sizeof(wbom22) / sizeof(*wbom22), 0,
+                    sizeof(wbom22) / sizeof(*wbom22), "BOM");
+    test_utf82wchar(uc080, sizeof(uc080), NULL, 0, 0, 0,
+                    "c0 80 - forbitten by rfc3629");
+    test_utf82wchar(ub2, sizeof(ub2), NULL, 0, 0, is_wchar_ucs2() ? 0 : 3,
+                    "resulted in forbitten wchars (len)");
+    test_utf82wchar(ub2, sizeof(ub2), wb2, sizeof(wb2) / sizeof(*wb2), 0, 0,
+                    "resulted in forbitten wchars");
+    test_utf82wchar(ub2, sizeof(ub2), L"\x0a", 1, UTF8_IGNORE_ERROR,
+                    1, "resulted in ignored forbitten wchars");
+    test_utf82wchar(u1, sizeof(u1), w1, sizeof(w1) / sizeof(*w1), 0,
+                    sizeof(w1) / sizeof(*w1), "1 octet chars");
+    test_utf82wchar(u2, sizeof(u2), w2, sizeof(w2) / sizeof(*w2), 0,
+                    sizeof(w2) / sizeof(*w2), "2 octets chars");
+    test_utf82wchar(u3, sizeof(u3), w3, sizeof(w3) / sizeof(*w3), 0,
+                    sizeof(w3) / sizeof(*w3), "3 octets chars");
+    test_utf82wchar(u4, sizeof(u4), w4, sizeof(w4) / sizeof(*w4), 0,
+                    sizeof(w4) / sizeof(*w4), "4 octets chars");
+    test_utf82wchar(u5, sizeof(u5), w5, sizeof(w5) / sizeof(*w5), 0,
+                    sizeof(w5) / sizeof(*w5), "5 octets chars");
+    test_utf82wchar(u6, sizeof(u6), w6, sizeof(w6) / sizeof(*w6), 0,
+                    sizeof(w6) / sizeof(*w6), "6 octets chars");
+    test_utf82wchar("\xff", 1, NULL, 0, 0, 0, "broken utf-8 0xff symbol");
+    test_utf82wchar("\xfe", 1, NULL, 0, 0, 0, "broken utf-8 0xfe symbol");
+    test_utf82wchar("\x8f", 1, NULL, 0, 0, 0,
+                    "broken utf-8, start from 10 higher bits");
+    if (! is_wchar_ucs2()) test_utf82wchar(ub1, sizeof(ub1), wb1, sizeof(wb1) / sizeof(*wb1),
+                                               UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1), "ignore bad chars");
+    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm), 0,
+                    sizeof(wm) / sizeof(*wm), "mixed languages");
+    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) - 1, 0,
+                    0, "boundaries -1");
+    test_utf82wchar(um, sizeof(um), wm, sizeof(wm) / sizeof(*wm) + 1, 0,
+                    sizeof(wm) / sizeof(*wm), "boundaries +1");
+    test_utf82wchar(um, sizeof(um), NULL, 0, 0,
+                    sizeof(wm) / sizeof(*wm), "calculate length");
+    test_utf82wchar(ub1, sizeof(ub1), NULL, 0, 0,
+                    0, "calculate length of bad chars");
+    test_utf82wchar(ub1, sizeof(ub1), NULL, 0,
+                    UTF8_IGNORE_ERROR, sizeof(wb1) / sizeof(*wb1),
+                    "calculate length, ignore bad chars");
+    test_utf82wchar(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
+    test_utf82wchar(u1, 0, NULL, 0, 0, 0,
+                    "invalid params, src buf not NULL");
+    test_utf82wchar(NULL, 10, NULL, 0, 0, 0,
+                    "invalid params, src length is not 0");
+    test_utf82wchar(u1, sizeof(u1), w1, 0, 0, 0,
+                    "invalid params, dst is not NULL");
+
+    /*
+     * UCS-4 -> UTF-8 string.
+     */
+    test_wchar2utf8(wbom, sizeof(wbom) / sizeof(*wbom), ubom, sizeof(ubom),
+                    UTF8_SKIP_BOM, sizeof(ubom), "BOM");
+    test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0, 0,
+                    0, "prohibited wchars");
+    test_wchar2utf8(wb2, sizeof(wb2) / sizeof(*wb2), NULL, 0,
+                    UTF8_IGNORE_ERROR, 2, "ignore prohibited wchars");
+    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, sizeof(u1), 0,
+                    sizeof(u1), "1 octet chars");
+    test_wchar2utf8(w2, sizeof(w2) / sizeof(*w2), u2, sizeof(u2), 0,
+                    sizeof(u2), "2 octets chars");
+    test_wchar2utf8(w3, sizeof(w3) / sizeof(*w3), u3, sizeof(u3), 0,
+                    sizeof(u3), "3 octets chars");
+    test_wchar2utf8(w4, sizeof(w4) / sizeof(*w4), u4, sizeof(u4), 0,
+                    sizeof(u4), "4 octets chars");
+    test_wchar2utf8(w5, sizeof(w5) / sizeof(*w5), u5, sizeof(u5), 0,
+                    sizeof(u5), "5 octets chars");
+    test_wchar2utf8(w6, sizeof(w6) / sizeof(*w6), u6, sizeof(u6), 0,
+                    sizeof(u6), "6 octets chars");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub), 0,
+                    0, "bad chars");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), ub, sizeof(ub),
+                    UTF8_IGNORE_ERROR, sizeof(ub), "ignore bad chars");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um), 0,
+                    sizeof(um), "mixed languages");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) - 1, 0,
+                    0, "boundaries -1");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), um, sizeof(um) + 1, 0,
+                    sizeof(um), "boundaries +1");
+    test_wchar2utf8(wm, sizeof(wm) / sizeof(*wm), NULL, 0, 0,
+                    sizeof(um), "calculate length");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0, 0,
+                    0, "calculate length of bad chars");
+    test_wchar2utf8(wb, sizeof(wb) / sizeof(*wb), NULL, 0,
+                    UTF8_IGNORE_ERROR, sizeof(ub),
+                    "calculate length, ignore bad chars");
+    test_wchar2utf8(NULL, 0, NULL, 0, 0, 0, "invalid params, all 0");
+    test_wchar2utf8(w1, 0, NULL, 0, 0, 0,
+                    "invalid params, src buf not NULL");
+    test_wchar2utf8(NULL, 10, NULL, 0, 0, 0,
+                    "invalid params, src length is not 0");
+    test_wchar2utf8(w1, sizeof(w1) / sizeof(*w1), u1, 0, 0, 0,
+                    "invalid params, dst is not NULL");
+}
+
 static void test_escape_sequences(void)
 {
    say(L"Testing escape codes");
@@ -1110,9 +1365,9 @@ static void test_path()
 static void test_pager_navigation()
 {
    say(L"Testing pager navigation");
-    
+
    /* Generate 19 strings of width 10. There's 2 spaces between completions, and our term size is 80; these can therefore fit into 6 columns (6 * 12 - 2 = 70) or 5 columns (58) but not 7 columns (7 * 12 - 2 = 82).
-    
+
       You can simulate this test by creating 19 files named "file00.txt" through "file_18.txt".
    */
    completion_list_t completions;
@@ -1120,31 +1375,31 @@ static void test_pager_navigation()
    {
        append_completion(completions, L"abcdefghij");
    }
-    
+
    pager_t pager;
    pager.set_completions(completions);
    pager.set_term_size(80, 24);
    page_rendering_t render = pager.render();
-    
+
    if (render.term_width != 80)
        err(L"Wrong term width");
    if (render.term_height != 24)
        err(L"Wrong term height");
-    
+
    size_t rows = 4, cols = 5;
-    
+
    /* We have 19 completions. We can fit into 6 columns with 4 rows or 5 columns with 4 rows; the second one is better and so is what we ought to have picked. */
    if (render.rows != rows)
        err(L"Wrong row count");
    if (render.cols != cols)
        err(L"Wrong column count");
-    
+
    /* Initially expect to have no completion index */
    if (render.selected_completion_idx != (size_t)(-1))
    {
        err(L"Wrong initial selection");
    }
-    
+
    /* Here are navigation directions and where we expect the selection to be */
    const struct
    {
@@ -1155,31 +1410,31 @@ static void test_pager_navigation()
    {
        /* Tab completion to get into the list */
        {direction_next, 0},
-        
+
        /* Westward motion in upper left wraps along the top row */
        {direction_west, 16},
        {direction_east, 1},
-        
+
        /* "Next" motion goes down the column */
        {direction_next, 2},
        {direction_next, 3},
-        
+
        {direction_west, 18},
        {direction_east, 3},
        {direction_east, 7},
        {direction_east, 11},
        {direction_east, 15},
        {direction_east, 3},
-        
+
        {direction_west, 18},
        {direction_east, 3},
-        
+
        /* Eastward motion wraps along the bottom, westward goes to the prior column */
        {direction_east, 7},
        {direction_east, 11},
        {direction_east, 15},
        {direction_east, 3},
-        
+
        /* Column memory */
        {direction_west, 18},
        {direction_south, 15},
@@ -1197,7 +1452,7 @@ static void test_pager_navigation()
            err(L"For command %lu, expected selection %lu, but found instead %lu\n", i, cmds[i].sel, render.selected_completion_idx);
        }
    }
-    
+
 }

 enum word_motion_t
@@ -1536,14 +1791,14 @@ static void test_complete(void)
    completions.clear();
    complete(L"echo (builtin scuttlebut", completions, COMPLETION_REQUEST_DEFAULT);
    do_test(completions.size() == 0);
-    
+
    /* Trailing spaces (#1261) */
    complete_add(L"foobarbaz", false, 0, NULL, 0, NO_FILES, NULL, L"qux", NULL, COMPLETE_AUTO_SPACE);
    completions.clear();
    complete(L"foobarbaz ", completions, COMPLETION_REQUEST_DEFAULT);
    do_test(completions.size() == 1);
    do_test(completions.at(0).completion == L"qux");
-    
+
    /* Don't complete variable names in single quotes (#1023) */
    completions.clear();
    complete(L"echo '$Foo", completions, COMPLETION_REQUEST_DEFAULT);
@@ -1814,14 +2069,14 @@ static void test_input()
    wcstring desired_binding = prefix_binding + L'a';
    input_mapping_add(prefix_binding.c_str(), L"up-line");
    input_mapping_add(desired_binding.c_str(), L"down-line");
-    
+
    /* Push the desired binding on the stack (backwards!) */
    size_t idx = desired_binding.size();
    while (idx--)
    {
        input_unreadch(desired_binding.at(idx));
    }
-    
+
    /* Now test */
    wint_t c = input_readch();
    if (c != R_DOWN_LINE)
@@ -2748,7 +3003,7 @@ static void test_highlighting(void)
        {L"'single_quote", highlight_spec_error},
        {NULL, -1}
    };
-    
+
    const highlight_component_t components11[] =
    {
        {L"echo", highlight_spec_command},
@@ -2761,7 +3016,7 @@ static void test_highlighting(void)
        {L"]", highlight_spec_operator},
        {NULL, -1}
    };
-    
+
    const highlight_component_t components12[] =
    {
        {L"for", highlight_spec_command},
@@ -2867,6 +3122,7 @@ int main(int argc, char **argv)
    if (should_test_function("cancellation")) test_cancellation();
    if (should_test_function("indents")) test_indents();
    if (should_test_function("utils")) test_utils();
+    if (should_test_function("utf8")) test_utf8();
    if (should_test_function("escape_sequences")) test_escape_sequences();
    if (should_test_function("lru")) test_lru();
    if (should_test_function("expand")) test_expand();
@@ -2906,7 +3162,8 @@ int main(int argc, char **argv)
    event_destroy();
    proc_destroy();

-    if(err_count != 0) {
+    if (err_count != 0)
+    {
        return(1);
    }
 }