implement string unescape

Fixes #3543
2026-06-25 02:01:15 -03:00 · 2017-06-22 20:47:54 -07:00
parent 60bca14b37
commit f3cb625802
8 changed files with 319 additions and 34 deletions
--- a/src/common.cpp
+++ b/src/common.cpp
@@ -75,6 +75,38 @@ static void debug_shared(const wchar_t msg_level, const wcstring &msg);

 bool has_working_tty_timestamps = true;

+/// Convert a character to its integer equivalent if it is a valid character for the requested base.
+/// Return the integer value if it is valid else -1.
+long convert_digit(wchar_t d, int base) {
+    long res = -1;
+    if ((d <= L'9') && (d >= L'0')) {
+        res = d - L'0';
+    } else if ((d <= L'z') && (d >= L'a')) {
+        res = d + 10 - L'a';
+    } else if ((d <= L'Z') && (d >= L'A')) {
+        res = d + 10 - L'A';
+    }
+    if (res >= base) {
+        res = -1;
+    }
+
+    return res;
+}
+
+/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions.
+static bool is_hex_digit(int c) { return strchr("0123456789ABCDEF", c) != NULL; }
+
+/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase.
+long convert_hex_digit(wchar_t d) {
+    if ((d <= L'9') && (d >= L'0')) {
+        return d - L'0';
+    } else if ((d <= L'Z') && (d >= L'A')) {
+        return 10 + d - L'A';
+    }
+
+    return -1;
+}
+
 #ifdef HAVE_BACKTRACE_SYMBOLS
 // This function produces a stack backtrace with demangled function & method names. It is based on
 // https://gist.github.com/fmela/591333 but adapted to the style of the fish project.
@@ -765,12 +797,41 @@ static void escape_string_url(const wchar_t *orig_in, wcstring &out) {
    }
 }

-static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; }
+/// Reverse the effects of `escape_string_url()`. By definition the string consists of just ASCII
+/// chars.
+static bool unescape_string_url(const wchar_t *in, wcstring *out) {
+    std::string result;
+    result.reserve(out->size());
+    for (wchar_t c = *in; c; c = *++in) {
+        if (c > 0x7F) return false;  // invalid character means we can't decode the string
+        if (c == '%') {
+            int c1 = in[1];
+            if (c1 == 0) return false;  // found unexpected end of string
+            if (c1 == '%') {
+                result.push_back('%');
+                in++;
+            } else {
+                int c2 = in[2];
+                if (c2 == 0) return false;  // string ended prematurely
+                long d1 = convert_digit(c1, 16);
+                if (d1 < 0) return false;
+                long d2 = convert_digit(c2, 16);
+                if (d2 < 0) return false;
+                result.push_back(16 * d1 + d2);
+                in += 2;
+            }
+        } else {
+            result.push_back(c);
+        }
+    }
+
+    *out = str2wcstring(result);
+    return true;
+}

 /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
 static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
    bool prev_was_hex_encoded = false;
-    bool maybe_encode_next_char = false;
    const std::string &in = wcs2string(orig_in);
    for (auto c1 : in) {
        // This silliness is so we get the correct result whether chars are signed or unsigned.
@@ -799,6 +860,46 @@ static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
    }
 }

+/// Reverse the effects of `escape_string_var()`. By definition the string consists of just ASCII
+/// chars.
+static bool unescape_string_var(const wchar_t *in, wcstring *out) {
+    std::string result;
+    result.reserve(out->size());
+    bool prev_was_hex_encoded = false;
+    for (wchar_t c = *in; c; c = *++in) {
+        if (c > 0x7F) return false;  // invalid character means we can't decode the string
+        if (c == '_') {
+            int c1 = in[1];
+            if (c1 == 0) {
+                if (prev_was_hex_encoded) break;
+                return false;  // found unexpected escape char at end of string
+            }
+            if (c1 == '_') {
+                result.push_back('_');
+                in++;
+            } else if (is_hex_digit(c1)) {
+                int c2 = in[2];
+                if (c2 == 0) return false;  // string ended prematurely
+                long d1 = convert_hex_digit(c1);
+                if (d1 < 0) return false;
+                long d2 = convert_hex_digit(c2);
+                if (d2 < 0) return false;
+                result.push_back(16 * d1 + d2);
+                in += 2;
+                prev_was_hex_encoded = true;
+            }
+            // No "else" clause because if the first char after an underscore is not another
+            // underscore or a valid hex character then the underscore is there to improve
+            // readability after we've encoded a character not valid in a var name.
+        } else {
+            result.push_back(c);
+        }
+    }
+
+    *out = str2wcstring(result);
+    return true;
+}
+
 /// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
 static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
                                 escape_flags_t flags) {
@@ -1390,14 +1491,44 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) {
    return success;
 }

-bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special) {
-    bool success = unescape_string_internal(input, wcslen(input), output, escape_special);
+bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style) {
+    bool success;
+    switch (style) {
+        case STRING_STYLE_SCRIPT: {
+            success = unescape_string_internal(input, wcslen(input), output, escape_special);
+            break;
+        }
+        case STRING_STYLE_URL: {
+            success = unescape_string_url(input, output);
+            break;
+        }
+        case STRING_STYLE_VAR: {
+            success = unescape_string_var(input, output);
+            break;
+        }
+    }
    if (!success) output->clear();
    return success;
 }

-bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special) {
-    bool success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
+bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
+                     escape_string_style_t style) {
+    bool success;
+    switch (style) {
+        case STRING_STYLE_SCRIPT: {
+            success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
+            break;
+        }
+        case STRING_STYLE_URL: {
+            success = unescape_string_url(input.c_str(), output);
+            break;
+        }
+        case STRING_STYLE_VAR: {
+            success = unescape_string_var(input.c_str(), output);
+            break;
+        }
+    }
    if (!success) output->clear();
    return success;
 }
@@ -2023,22 +2154,6 @@ char **make_null_terminated_array(const std::vector<std::string> &lst) {
    return make_null_terminated_array_helper(lst);
 }

-long convert_digit(wchar_t d, int base) {
-    long res = -1;
-    if ((d <= L'9') && (d >= L'0')) {
-        res = d - L'0';
-    } else if ((d <= L'z') && (d >= L'a')) {
-        res = d + 10 - L'a';
-    } else if ((d <= L'Z') && (d >= L'A')) {
-        res = d + 10 - L'A';
-    }
-    if (res >= base) {
-        res = -1;
-    }
-
-    return res;
-}
-
 /// Test if the specified character is in a range that fish uses interally to store special tokens.
 ///
 /// NOTE: This is used when tokenizing the input. It is also used when reading input, before