From 60bca14b3718840241f12164cbc4362ef70cb0c2 Mon Sep 17 00:00:00 2001 From: Kurtis Rader Date: Tue, 20 Jun 2017 21:55:16 -0700 Subject: [PATCH] implement `string escape --style=xxx` We need a way to encode arbitrary strings into valid fish variable names. It would also be nice if we could convert strings to valid URLs without using the slow and hard to understand `__fish_urlencode` function. In particular, eliminating the need to manipulate the locale. Fixes #4150 --- CHANGELOG.md | 1 + doc_src/string.txt | 13 ++++- src/builtin_string.cpp | 105 +++++++++++++++++++++++++++++++++++------ src/common.cpp | 101 ++++++++++++++++++++++++++++++++++----- src/common.h | 17 +++++-- tests/string.err | 4 +- tests/string.in | 32 +++++++++++++ tests/string.out | 24 ++++++++++ 8 files changed, 263 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index adfb6a955..2d160c4d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - The `COLUMNS` and `LINES` env vars are now correctly set the first time `fish_prompt` is run (#4141). - New `status is-breakpoint` command that is true when a prompt is displayed in response to a `breakpoint` command (#1310). - Invalid array indexes are now silently ignored (#826, #4127). +- `string escape` has a new `--style=xxx` flag where `xxx` can be `script`, `var`, or `url` (#4150) ## Other significant changes diff --git a/doc_src/string.txt b/doc_src/string.txt index f893f7fb6..f4d9c6f9b 100644 --- a/doc_src/string.txt +++ b/doc_src/string.txt @@ -2,7 +2,7 @@ \subsection string-synopsis Synopsis \fish{synopsis} -string escape [(-n | --no-quoted)] [STRING...] +string escape [(-n | --no-quoted)] [--style=xxx] [STRING...] string join [(-q | --quiet)] SEP [STRING...] string length [(-q | --quiet)] [STRING...] string lower [(-q | --quiet)] [STRING...] @@ -36,7 +36,11 @@ The following subcommands are available. 
\subsection string-escape "escape" subcommand -`string escape` escapes each STRING such that it can be passed back to `eval` to produce the original argument again. By default, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise. +`string escape` escapes each STRING in one of three ways. The first is `--style=script`, which is the default. It alters the string such that it can be passed back to `eval` to produce the original argument again. In this style, all special characters are escaped, and quotes are used to simplify the output when possible. If `-n` or `--no-quoted` is given, the simplifying quoted format is not used. Exit status: 0 if at least one string was escaped, or 1 otherwise. + +The second is `--style=var`, which ensures the string can be used as a variable name by hex encoding any non-alphanumeric characters (underscores are encoded by doubling them instead). The string is converted to UTF-8 before being encoded. + +The third is `--style=url`, which ensures the string can be used in a URL by percent-encoding every byte other than ASCII letters, ASCII digits, and `/`, `.`, `~`, `-` and `_`. The string is converted to UTF-8 before being encoded. 
\subsection string-join "join" subcommand @@ -159,6 +163,11 @@ In general, special characters are special by default, so `a+` matches one or mo cg \endfish +\fish{cli-dark} +>_ string escape --style=var 'a1 b2'\u6161 +a1_20_62_32_E6_85_A1_ +\endfish + \subsection string-example-match-glob Match Glob Examples \fish{cli-dark} diff --git a/src/builtin_string.cpp b/src/builtin_string.cpp index d2f8a6c26..fa58bd728 100644 --- a/src/builtin_string.cpp +++ b/src/builtin_string.cpp @@ -116,6 +116,7 @@ typedef struct { //!OCLINT(too many fields) bool regex_valid = false; bool right_valid = false; bool start_valid = false; + bool style_valid = false; bool all = false; bool entire = false; @@ -138,8 +139,34 @@ typedef struct { //!OCLINT(too many fields) const wchar_t *chars_to_trim = L" \f\n\r\t"; const wchar_t *arg1 = NULL; const wchar_t *arg2 = NULL; + + escape_string_style_t escape_style = STRING_STYLE_SCRIPT; } options_t; +/// This handles the `--style=xxx` flag. +static int handle_flag_1(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w, + options_t *opts) { + const wchar_t *cmd = argv[0]; + + if (opts->style_valid) { + if (wcscmp(w.woptarg, L"script") == 0) { + opts->escape_style = STRING_STYLE_SCRIPT; + } else if (wcscmp(w.woptarg, L"url") == 0) { + opts->escape_style = STRING_STYLE_URL; + } else if (wcscmp(w.woptarg, L"var") == 0) { + opts->escape_style = STRING_STYLE_VAR; + } + else { + string_error(streams, _(L"%ls: Invalid escape style '%ls'\n"), cmd, w.woptarg); + return STATUS_INVALID_ARGS; + } + return STATUS_CMD_OK; + } + + string_unknown_option(parser, streams, cmd, argv[w.woptind - 1]); + return STATUS_INVALID_ARGS; +} + static int handle_flag_N(wchar_t **argv, parser_t &parser, io_streams_t &streams, wgetopter_t &w, options_t *opts) { if (opts->no_newline_valid) { @@ -349,13 +376,14 @@ static const struct woption long_options[] = { {L"max", required_argument, NULL, 'm'}, {L"no-newline", no_argument, NULL, 'N'}, {L"no-quoted", no_argument, 
NULL, 'n'}, {L"quiet", no_argument, NULL, 'q'}, {L"regex", no_argument, NULL, 'r'}, {L"right", no_argument, NULL, 'r'}, - {L"start", required_argument, NULL, 's'}, {NULL, 0, NULL, 0}}; + {L"start", required_argument, NULL, 's'}, {L"style", required_argument, NULL, 1}, + {NULL, 0, NULL, 0}}; static std::map flag_to_function = { {'N', handle_flag_N}, {'a', handle_flag_a}, {'c', handle_flag_c}, {'e', handle_flag_e}, {'f', handle_flag_f}, {'i', handle_flag_i}, {'l', handle_flag_l}, {'m', handle_flag_m}, {'n', handle_flag_n}, {'q', handle_flag_q}, {'r', handle_flag_r}, {'s', handle_flag_s}, - {'v', handle_flag_v}}; + {'v', handle_flag_v}, {1, handle_flag_1}}; /// Parse the arguments for flags recognized by a specific string subcommand. static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wchar_t **argv, @@ -408,21 +436,15 @@ static int parse_opts(options_t *opts, int *optind, int n_req_args, int argc, wc return STATUS_CMD_OK; } -static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { - options_t opts; - opts.no_quoted_valid = true; - int optind; - int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams); - if (retval != STATUS_CMD_OK) return retval; - +/// Escape a string so that it can be used in a fish script without further word splitting. 
+static int string_escape_script(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + wcstring storage; + int nesc = 0; escape_flags_t flags = ESCAPE_ALL; if (opts.no_quoted) flags |= ESCAPE_NO_QUOTED; - int nesc = 0; - wcstring storage; - const wchar_t *arg; - while ((arg = string_get_arg(&optind, argv, &storage, streams)) != 0) { - streams.out.append(escape_string(arg, flags)); + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_SCRIPT)); streams.out.append(L'\n'); nesc++; } @@ -430,6 +452,61 @@ static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wcha return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; } +/// Escape a string so that it can be used as a URL. +static int string_escape_url(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + escape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_URL)); + streams.out.append(L'\n'); + nesc++; + } + + return nesc > 0 ? STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +/// Escape a string so that it can be used as a fish var name. +static int string_escape_var(options_t &opts, int optind, wchar_t **argv, io_streams_t &streams) { + UNUSED(opts); + wcstring storage; + int nesc = 0; + escape_flags_t flags = 0; + + while (const wchar_t *arg = string_get_arg(&optind, argv, &storage, streams)) { + streams.out.append(escape_string(arg, flags, STRING_STYLE_VAR)); + streams.out.append(L'\n'); + nesc++; + } + + return nesc > 0 ? 
STATUS_CMD_OK : STATUS_CMD_ERROR; +} + +static int string_escape(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { + options_t opts; + opts.no_quoted_valid = true; + opts.style_valid = true; + int optind; + int retval = parse_opts(&opts, &optind, 0, argc, argv, parser, streams); + if (retval != STATUS_CMD_OK) return retval; + + switch (opts.escape_style) { + case STRING_STYLE_SCRIPT: { + return string_escape_script(opts, optind, argv, streams); + } + case STRING_STYLE_URL: { + return string_escape_url(opts, optind, argv, streams); + } + case STRING_STYLE_VAR: { + return string_escape_var(opts, optind, argv, streams); + } + } + + DIE("should never reach this statement"); +} + static int string_join(parser_t &parser, io_streams_t &streams, int argc, wchar_t **argv) { options_t opts; opts.quiet_valid = true; diff --git a/src/common.cpp b/src/common.cpp index d57654e7c..fb51791de 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -1,6 +1,7 @@ // Various functions, mostly string utilities, that are used by most parts of fish. #include "config.h" +#include #include #include #include @@ -745,11 +746,62 @@ wcstring reformat_for_screen(const wcstring &msg) { return buff; } -/// Escape a string, storing the result in out_str. -static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstring *out_str, - escape_flags_t flags) { - assert(orig_in != NULL); +/// Escape a string in a fashion suitable for using as a URL. Store the result in out_str. +static void escape_string_url(const wchar_t *orig_in, wcstring &out) { + const std::string &in = wcs2string(orig_in); + for (auto c1 : in) { + // This silliness is so we get the correct result whether chars are signed or unsigned. + unsigned int c2 = (unsigned int)c1 & 0xFF; + if (!(c2 & 0x80) && + (isalnum(c2) || c2 == '/' || c2 == '.' || c2 == '~' || c2 == '-' || c2 == '_')) { + // The above characters don't need to be encoded. 
+ out.push_back((wchar_t)c2); + } else { + // All other chars need to have their UTF-8 representation encoded in hex. + wchar_t buf[4]; + swprintf(buf, sizeof buf / sizeof buf[0], L"%%%02X", c2); + out.append(buf); + } + } +} +static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; } + +/// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str. +static void escape_string_var(const wchar_t *orig_in, wcstring &out) { + bool prev_was_hex_encoded = false; + bool maybe_encode_next_char = false; + const std::string &in = wcs2string(orig_in); + for (auto c1 : in) { + // This silliness is so we get the correct result whether chars are signed or unsigned. + unsigned int c2 = (unsigned int)c1 & 0xFF; + if (!(c2 & 0x80) && isalnum(c2) && (!prev_was_hex_encoded || !is_hex_digit(c2))) { + // ASCII alphanumerics don't need to be encoded. + if (prev_was_hex_encoded) { + out.push_back(L'_'); + prev_was_hex_encoded = false; + } + out.push_back((wchar_t)c2); + } else if (c2 == '_') { + // Underscores are encoded by doubling them. + out.append(L"__"); + prev_was_hex_encoded = false; + } else { + // All other chars need to have their UTF-8 representation encoded in hex. + wchar_t buf[4]; + swprintf(buf, sizeof buf / sizeof buf[0], L"_%02X", c2); + out.append(buf); + prev_was_hex_encoded = true; + } + } + if (prev_was_hex_encoded) { + out.push_back(L'_'); + } +} + +/// Escape a string in a fashion suitable for using in fish script. Store the result in out_str. +static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out, + escape_flags_t flags) { const wchar_t *in = orig_in; bool escape_all = static_cast(flags & ESCAPE_ALL); bool no_quoted = static_cast(flags & ESCAPE_NO_QUOTED); @@ -758,9 +810,6 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri int need_escape = 0; int need_complex_escape = 0; - // Avoid dereferencing all over the place. 
- wcstring &out = *out_str; - if (!no_quoted && in_len == 0) { out.assign(L"''"); return; @@ -903,15 +952,45 @@ static void escape_string_internal(const wchar_t *orig_in, size_t in_len, wcstri } } -wcstring escape_string(const wchar_t *in, escape_flags_t flags) { +wcstring escape_string(const wchar_t *in, escape_flags_t flags, escape_string_style_t style) { wcstring result; - escape_string_internal(in, wcslen(in), &result, flags); + + switch (style) { + case STRING_STYLE_SCRIPT: { + escape_string_script(in, wcslen(in), result, flags); + break; + } + case STRING_STYLE_URL: { + escape_string_url(in, result); + break; + } + case STRING_STYLE_VAR: { + escape_string_var(in, result); + break; + } + } + return result; } -wcstring escape_string(const wcstring &in, escape_flags_t flags) { +wcstring escape_string(const wcstring &in, escape_flags_t flags, escape_string_style_t style) { wcstring result; - escape_string_internal(in.c_str(), in.size(), &result, flags); + + switch (style) { + case STRING_STYLE_SCRIPT: { + escape_string_script(in.c_str(), in.size(), result, flags); + break; + } + case STRING_STYLE_URL: { + escape_string_url(in.c_str(), result);  // Same encoder as the wchar_t* overload. + break; + } + case STRING_STYLE_VAR: { + escape_string_var(in.c_str(), result); + break; + } + } + return result; } diff --git a/src/common.h b/src/common.h index a1c2e0841..e1ed13772 100644 --- a/src/common.h +++ b/src/common.h @@ -89,6 +89,12 @@ typedef std::vector wcstring_list_t; #define INPUT_COMMON_BASE (wchar_t)0xF700 #define INPUT_COMMON_END (INPUT_COMMON_BASE + 64) +enum escape_string_style_t { + STRING_STYLE_SCRIPT, + STRING_STYLE_URL, + STRING_STYLE_VAR +}; + // Flags for unescape_string functions. enum { UNESCAPE_DEFAULT = 0, // default behavior @@ -97,15 +103,14 @@ enum { }; typedef unsigned int unescape_flags_t; -// Flags for the escape_string() and escape_string() functions. +// Flags for the escape_string() functions. 
These are only applicable when the +// escape style is "script" (i.e., STRING_STYLE_SCRIPT). enum { /// Escape all characters, including magic characters like the semicolon. ESCAPE_ALL = 1 << 0, - /// Do not try to use 'simplified' quoted escapes, and do not use empty quotes as the empty /// string. ESCAPE_NO_QUOTED = 1 << 1, - /// Do not escape tildes. ESCAPE_NO_TILDE = 1 << 2 }; @@ -692,8 +697,10 @@ ssize_t read_loop(int fd, void *buff, size_t count); /// \param in The string to be escaped /// \param flags Flags to control the escaping /// \return The escaped string -wcstring escape_string(const wchar_t *in, escape_flags_t flags); -wcstring escape_string(const wcstring &in, escape_flags_t flags); +wcstring escape_string(const wchar_t *in, escape_flags_t flags, + escape_string_style_t style=STRING_STYLE_SCRIPT); +wcstring escape_string(const wcstring &in, escape_flags_t flags, + escape_string_style_t style=STRING_STYLE_SCRIPT); /// Expand backslashed escapes and substitute them with their unescaped counterparts. 
Also /// optionally change the wildcards, the tilde character and a few more into constants which are diff --git a/tests/string.err b/tests/string.err index 33e9e847f..0c14841dc 100644 --- a/tests/string.err +++ b/tests/string.err @@ -5,7 +5,7 @@ string match: ^ # string invalidarg string: Subcommand 'invalidarg' is not valid -Standard input (line 183): +Standard input (line 215): string invalidarg; and echo "unexpected exit 0" >&2 ^ @@ -29,6 +29,6 @@ string repeat: Expected argument # string repeat -l fakearg 2>&1 string repeat: Unknown option '-l' -Standard input (line 284): +Standard input (line 316): string repeat -l fakearg ^ diff --git a/tests/string.in b/tests/string.in index ef4fa1940..297c77679 100644 --- a/tests/string.in +++ b/tests/string.in @@ -94,6 +94,38 @@ echo echo '# echo \x07 | string escape' echo \x07 | string escape +echo +echo '# string escape --style=script \'a b#c"\\\'d\'' +string escape --style=script 'a b#c"\'d' + +echo +echo '# string escape --style=url \'a b#c"\\\'d\'' +string escape --style=url 'a b#c"\'d' + +echo +echo '# string escape --style=url \\na\\nb%c~d\\n' +string escape --style=url \na\nb%c~d\n + +echo +echo '# string escape --style=var \'a b#c"\\\'d\'' +string escape --style=var 'a b#c"\'d' + +echo +echo '# string escape --style=var a\nghi_' +string escape --style=var a\nghi_ + +echo +echo '# string escape --style=var \'abc\'' +string escape --style=var 'abc' + +echo +echo '# string escape --style=var \'_a_b_c_\'' +string escape --style=var '_a_b_c_' + +echo +echo '# string escape --style=var -- -' +string escape --style=var -- - + echo echo '# string match "?" a' string match "?" 
a diff --git a/tests/string.out b/tests/string.out index 8425591d8..1c24eec48 100644 --- a/tests/string.out +++ b/tests/string.out @@ -74,6 +74,30 @@ zan # echo \x07 | string escape \cg +# string escape --style=script 'a b#c"\'d' +a\ b\#c\"\'d + +# string escape --style=url 'a b#c"\'d' +a%20b%23c%22%27d + +# string escape --style=url \na\nb%c~d\n +%0Aa%0Ab%25c~d%0A + +# string escape --style=var 'a b#c"\'d' +a_20_62_23_63_22_27_64_ + +# string escape --style=var a\nghi_ +a_0A_ghi__ + +# string escape --style=var 'abc' +abc + +# string escape --style=var '_a_b_c_' +__a__b__c__ + +# string escape --style=var -- - +_2D_ + # string match "?" a a