From f3cb625802b719c6b212b1e5fbb4f34fe0ea2a60 Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Thu, 22 Jun 2017 20:47:54 -0700 Subject: [PATCH] implement `string unescape` Fixes #3543 --- CHANGELOG.md | 1 + doc_src/string.txt | 3 + src/builtin_string.cpp | 99 +++++++++++++++++++++++-- src/common.cpp | 159 +++++++++++++++++++++++++++++++++++------ src/common.h | 11 +-- tests/string.err | 4 +- tests/string.in | 43 +++++++++++ tests/string.out | 33 ++++++++- 8 files changed, 319 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d160c4d8..37ab0e3f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ - New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310). - Invalid array indexes are now silently ignored (#826, #4127). - `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150) +- `string unescape` has been implemented to reverse the effects of `string escape` (#3543) ## Other significant changes diff --git a/doc_src/string.txt b/doc_src/string.txt index f4d9c6f9b..4cb805c74 100644 --- a/doc_src/string.txt +++ b/doc_src/string.txt @@ -18,6 +18,7 @@ string sub [(-s | --start) START] [(-l | --length) LENGTH] [(-q | --quiet)] [STRING...] string trim [(-l | --left)] [(-r | --right)] [(-c | --chars CHARS)] [(-q | --quiet)] [STRING...] +string unescape [--style=xxx] [STRING...] string upper [(-q | --quiet)] [STRING...] \endfish @@ -42,6 +43,8 @@ The second is `--style=var` which ensures the string can be used as a variable n The third is `--style=url` which ensures the string can be used as a URL by hex encoding any character which is not legal in a URL. The string is first converted to UTF-8 before being encoded. +`string unescape` performs the inverse of the `string escape` command. If the string to be unescaped is not properly formatted it is ignored. 
For example, doing `string unescape --style=var (string escape --style=var $str)` will return the original string. + \subsection string-join "join" subcommand `string join` joins its STRING arguments into a single string separated by SEP, which can be an empty string. Exit status: 0 if at least one join was performed, or 1 otherwise. diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp index fa58bd728..df0ccefa5 100644 --- a/src/builtin_string.cpp +++ b/src/builtin_string.cpp @@ -484,6 +484,64 @@ static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_str return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; } +/// Unescape a string encoded so it can be used in fish script. +static int string_unescape_script(options_t &opts, int optind, wchar_t **argv, + io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + unescape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + wcstring result; + if (unescape_string(arg, &result, flags, STRING_STYLE_SCRIPT)) { + streams.out.append(result); + streams.out.append(L'\n'); + nesc++; + } + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +/// Unescape an encoded URL. +static int string_unescape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + unescape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + wcstring result; + if (unescape_string(arg, &result, flags, STRING_STYLE_URL)) { + streams.out.append(result); + streams.out.append(L'\n'); + nesc++; + } + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +/// Unescape an encoded var name. 
+static int string_unescape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + unescape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + wcstring result; + if (unescape_string(arg, &result, flags, STRING_STYLE_VAR)) { + streams.out.append(result); + streams.out.append(L'\n'); + nesc++; + } + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { options_t opts; opts.no_quoted_valid = true; @@ -507,6 +565,29 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha DIE("should never reach this statement"); } +static int string_unescape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { + options_t opts; + opts.no_quoted_valid = true; + opts.style_valid = true; + int optind; + int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams); + if (retval != STATUS_CMD_OK) return retval; + + switch (opts.escape_style) { + case STRING_STYLE_SCRIPT: { + return string_unescape_script(opts, optind, argv, streams); + } + case STRING_STYLE_URL: { + return string_unescape_url(opts, optind, argv, streams); + } + case STRING_STYLE_VAR: { + return string_unescape_var(opts, optind, argv, streams); + } + } + + DIE("should never reach this statement"); +} + static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { options_t opts; opts.quiet_valid = true; @@ -1277,11 +1358,19 @@ static const struct string_subcommand { wchar_t **argv); //!OCLINT(unused param) } -string_subcommands[] = { - {L"escape", &string_escape}, {L"join", &string_join}, {L"length", &string_length}, - {L"match", &string_match}, {L"replace", &string_replace}, {L"split", &string_split}, - {L"sub", &string_sub}, {L"trim", &string_trim}, {L"lower", &string_lower}, - {L"upper", &string_upper}, 
{L"repeat", &string_repeat}, {NULL, NULL}}; +string_subcommands[] = {{L"escape", &string_escape}, + {L"join", &string_join}, + {L"length", &string_length}, + {L"match", &string_match}, + {L"replace", &string_replace}, + {L"split", &string_split}, + {L"sub", &string_sub}, + {L"trim", &string_trim}, + {L"lower", &string_lower}, + {L"upper", &string_upper}, + {L"repeat", &string_repeat}, + {L"unescape", &string_unescape}, + {NULL, NULL}}; /// The string builtin, for manipulating strings. int builtin_string(parser_t &parser, io_streams_t &streams, wchar_t **argv) { diff --git a/src/common.cpp b/src/common.cpp index fb51791de..1bf691fe3 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -75,6 +75,38 @@ static void debug_shared(const wchar_t msg_level, const wcstring &msg); bool has_working_tty_timestamps = true; +/// Convert a character to its integer equivalent if it is a valid character for the requested base. +/// Return the integer value if it is valid else -1. +long convert_digit(wchar_t d, int base) { + long res = -1; + if ((d <= L'9') && (d >= L'0')) { + res = d - L'0'; + } else if ((d <= L'z') && (d >= L'a')) { + res = d + 10 - L'a'; + } else if ((d <= L'Z') && (d >= L'A')) { + res = d + 10 - L'A'; + } + if (res >= base) { + res = -1; + } + + return res; +} + +/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions. +static bool is_hex_digit(int c) { return c != 0 && strchr("0123456789ABCDEF", c) != NULL; } + +/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase. +long convert_hex_digit(wchar_t d) { + if ((d <= L'9') && (d >= L'0')) { + return d - L'0'; + } else if ((d <= L'F') && (d >= L'A')) { + return 10 + d - L'A'; + } + + return -1; +} + #ifdef HAVE_BACKTRACE_SYMBOLS // This function produces a stack backtrace with demangled function & method names. It is based on // https://gist.github.com/fmela/591333 but adapted to the style of the fish project.
@@ -765,12 +797,41 @@ static void escape_string_url(const wchar_t *orig_in, wcstring &out) { } } -static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; } +/// Reverse the effects of `escape_string_url()`. By definition the string must consist of just ASCII +/// chars. Returns false on a non-ASCII character or a malformed or truncated `%XX` escape. +static bool unescape_string_url(const wchar_t *in, wcstring *out) { + std::string result; + result.reserve(wcslen(in)); + for (wchar_t c = *in; c; c = *++in) { + if (c > 0x7F) return false; // invalid character means we can't decode the string + if (c == '%') { + int c1 = in[1]; + if (c1 == 0) return false; // found unexpected end of string + if (c1 == '%') { + result.push_back('%'); + in++; + } else { + int c2 = in[2]; + if (c2 == 0) return false; // string ended prematurely + long d1 = convert_digit(c1, 16); + if (d1 < 0) return false; + long d2 = convert_digit(c2, 16); + if (d2 < 0) return false; + result.push_back(16 * d1 + d2); + in += 2; + } + } else { + result.push_back(c); + } + } + + *out = str2wcstring(result); + return true; +} /// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str. static void escape_string_var(const wchar_t *orig_in, wcstring &out) { bool prev_was_hex_encoded = false; - bool maybe_encode_next_char = false; const std::string &in = wcs2string(orig_in); for (auto c1 : in) { // This silliness is so we get the correct result whether chars are signed or unsigned. @@ -799,6 +860,46 @@ static void escape_string_var(const wchar_t *orig_in, wcstring &out) { } } +/// Reverse the effects of `escape_string_var()`. By definition the string must consist of just ASCII +/// chars. Returns false on a non-ASCII character or a malformed escape sequence.
+static bool unescape_string_var(const wchar_t *in, wcstring *out) { + std::string result; + result.reserve(wcslen(in)); + bool prev_was_hex_encoded = false; + for (wchar_t c = *in; c; c = *++in) { + if (c > 0x7F) return false; // invalid character means we can't decode the string + if (c == '_') { + int c1 = in[1]; + if (c1 == 0) { + if (prev_was_hex_encoded) break; + return false; // found unexpected escape char at end of string + } + if (c1 == '_') { + result.push_back('_'); + in++; + } else if (is_hex_digit(c1)) { + int c2 = in[2]; + if (c2 == 0) return false; // string ended prematurely + long d1 = convert_hex_digit(c1); + if (d1 < 0) return false; + long d2 = convert_hex_digit(c2); + if (d2 < 0 || d2 > 15) return false; // second digit must also be a real hex digit + result.push_back(16 * d1 + d2); + in += 2; + prev_was_hex_encoded = true; + } + // No "else" clause because if the first char after an underscore is not another + // underscore or a valid hex character then the underscore is there to improve + // readability after we've encoded a character not valid in a var name. + } else { + result.push_back(c); + } + } + + *out = str2wcstring(result); + return true; +} + /// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out, escape_flags_t flags) { @@ -1390,14 +1491,44 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) { return success; } -bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special) { - bool success = unescape_string_internal(input, wcslen(input), output, escape_special); +bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special, + escape_string_style_t style) { + bool success = false; // fail closed if `style` matches none of the cases below + switch (style) { + case STRING_STYLE_SCRIPT: { + success = unescape_string_internal(input, wcslen(input), output, escape_special); + break; + } + case STRING_STYLE_URL: { + success = unescape_string_url(input, output); + break; + } + case STRING_STYLE_VAR: { + success = unescape_string_var(input, output); + break; + } + } if (!success) output->clear(); return success; } -bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special) { - bool success = unescape_string_internal(input.c_str(), input.size(), output, escape_special); +bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special, + escape_string_style_t style) { + bool success = false; // fail closed if `style` matches none of the cases below + switch (style) { + case STRING_STYLE_SCRIPT: { + success = unescape_string_internal(input.c_str(), input.size(), output, escape_special); + break; + } + case STRING_STYLE_URL: { + success = unescape_string_url(input.c_str(), output); + break; + } + case STRING_STYLE_VAR: { + success = unescape_string_var(input.c_str(), output); + break; + } + } if (!success) output->clear(); return success; } @@ -2023,22 +2154,6 @@ char **make_null_terminated_array(const std::vector &lst) { return make_null_terminated_array_helper(lst); } -long convert_digit(wchar_t d, int base) { - long res = -1; - if ((d <= L'9') && (d >= L'0')) { - res = d - L'0'; - } else if ((d <= L'z') && (d >= L'a')) { - res = d + 10 - L'a'; - } else if
((d <= L'Z') && (d >= L'A')) { - res = d + 10 - L'A'; - } - if (res >= base) { - res = -1; - } - - return res; -} - /// Test if the specified character is in a range that fish uses interally to store special tokens. /// /// NOTE: This is used when tokenizing the input. It is also used when reading input, before diff --git a/src/common.h b/src/common.h index e1ed13772..f633b24be 100644 --- a/src/common.h +++ b/src/common.h @@ -715,10 +715,13 @@ size_t read_unquoted_escape(const wchar_t *input, wcstring *result, bool allow_i /// indicates the string was unmodified. bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special); -/// Unescapes a string, returning the unescaped value by reference. On failure, the output is set to -/// an empty string. -bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special); -bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special); +/// Reverse the effects of calling `escape_string`. Returns the unescaped value by reference. On +/// failure, the output is set to an empty string. +bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special, + escape_string_style_t style = STRING_STYLE_SCRIPT); + +bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special, + escape_string_style_t style = STRING_STYLE_SCRIPT); /// Returns the width of the terminal window, so that not all functions that use these values /// continually have to keep track of it separately. 
diff --git a/tests/string.err b/tests/string.err index 0c14841dc..7092f30c2 100644 --- a/tests/string.err +++ b/tests/string.err @@ -5,7 +5,7 @@ string match: ^ # string invalidarg string: Subcommand 'invalidarg' is not valid -Standard input (line 215): +Standard input (line 258): string invalidarg; and echo "unexpected exit 0" >&2 ^ @@ -29,6 +29,6 @@ string repeat: Expected argument # string repeat -l fakearg 2>&1 string repeat: Unknown option '-l' -Standard input (line 316): +Standard input (line 359): string repeat -l fakearg ^ diff --git a/tests/string.in b/tests/string.in index 297c77679..4e66b7950 100644 --- a/tests/string.in +++ b/tests/string.in @@ -126,6 +126,49 @@ echo echo '# string escape --style=var -- -' string escape --style=var -- - +# The following tests verify that we can correctly unescape the same strings +# we tested escaping above. + +echo +echo '# set x (string unescape (echo \x07 | string escape))' +set x (string unescape (echo \x07 | string escape)) +test $x = \x07 +and echo success + +echo +echo '# string unescape --style=script (string escape --style=script \'a b#c"\\\'d\')' +string unescape --style=script (string escape --style=script 'a b#c"\'d') + +echo +echo '# string unescape --style=url (string escape --style=url \'a b#c"\\\'d\')' +string unescape --style=url (string escape --style=url 'a b#c"\'d') + +echo +echo '# string unescape --style=url (string escape --style=url \na\nb%c~d\n)' +string unescape --style=url (string escape --style=url \na\nb%c~d\n) + +echo +echo '# string unescape --style=var (string escape --style=var \'a b#c"\\\'d\')' +string unescape --style=var (string escape --style=var 'a b#c"\'d') + +echo +echo '# string unescape --style=var (string escape --style=var a\nghi_)' +string unescape --style=var (string escape --style=var a\nghi_) + +echo +echo '# string unescape --style=var (string escape --style=var \'abc\')' +string unescape --style=var (string escape --style=var 'abc') + +echo +echo '# string unescape 
--style=var (string escape --style=var \'_a_b_c_\')' +string unescape --style=var (string escape --style=var '_a_b_c_') + +echo +echo '# string unescape --style=var (string escape --style=var -- -)' +string unescape --style=var -- (string escape --style=var -- -) + +# The following tests verify that we can correctly match strings. + echo echo '# string match "?" a' string match "?" a diff --git a/tests/string.out b/tests/string.out index 1c24eec48..9c3dec49e 100644 --- a/tests/string.out +++ b/tests/string.out @@ -84,7 +84,7 @@ a%20b%23c%22%27d %0Aa%0Ab%25c~d%0A # string escape --style=var 'a b#c"\'d' -a_20_62_23_63_22_27_64_ +a_20_b_23_c_22_27_d # string escape --style=script a\nghi_ a_0A_ghi__ @@ -98,6 +98,37 @@ __a__b__c__ # string escape --style=var -- - _2D_ +# set x (string unescape (echo \x07 | string escape)) +success + +# string unescape --style=script (string escape --style=script 'a b#c"\'d') +a b#c"'d + +# string unescape --style=url (string escape --style=url 'a b#c"\'d') +a b#c"'d + +# string unescape --style=url (string escape --style=url \na\nb%c~d\n) + +a +b%c~d + + +# string unescape --style=var (string escape --style=var 'a b#c"\'d') +a b#c"'d + +# string unescape --style=var (string escape --style=var a\nghi_) +a +ghi_ + +# string unescape --style=var (string escape --style=var 'abc') +abc + +# string unescape --style=var (string escape --style=var '_a_b_c_') +_a_b_c_ + +# string unescape --style=var (string escape --style=var -- -) +- + # string match "?" a a