implement string unescape

Fixes #3543
This commit is contained in:
Kurtis Rader
2017-06-22 20:47:54 -07:00
parent 60bca14b37
commit f3cb625802
8 changed files with 319 additions and 34 deletions

View File

@@ -75,6 +75,38 @@ static void debug_shared(const wchar_t msg_level, const wcstring &msg);
bool has_working_tty_timestamps = true;
/// Convert a character to its integer equivalent if it is a valid character for the requested base.
/// Return the integer value if it is valid else -1.
long convert_digit(wchar_t d, int base) {
long res = -1;
if ((d <= L'9') && (d >= L'0')) {
res = d - L'0';
} else if ((d <= L'z') && (d >= L'a')) {
res = d + 10 - L'a';
} else if ((d <= L'Z') && (d >= L'A')) {
res = d + 10 - L'A';
}
if (res >= base) {
res = -1;
}
return res;
}
/// Test whether the char is a valid hex digit as used by the `escape_string_*()` functions.
static bool is_hex_digit(int c) { return strchr("0123456789ABCDEF", c) != NULL; }
/// This is a specialization of `convert_digit()` that only handles base 16 and only uppercase.
long convert_hex_digit(wchar_t d) {
if ((d <= L'9') && (d >= L'0')) {
return d - L'0';
} else if ((d <= L'Z') && (d >= L'A')) {
return 10 + d - L'A';
}
return -1;
}
#ifdef HAVE_BACKTRACE_SYMBOLS
// This function produces a stack backtrace with demangled function & method names. It is based on
// https://gist.github.com/fmela/591333 but adapted to the style of the fish project.
@@ -765,12 +797,41 @@ static void escape_string_url(const wchar_t *orig_in, wcstring &out) {
}
}
static bool is_hex_digit(int c) { return strchr("0123456789abcdefABCDEF", c) != NULL; }
/// Reverse the effects of `escape_string_url()`. By definition the string has consist of just ASCII
/// chars.
static bool unescape_string_url(const wchar_t *in, wcstring *out) {
std::string result;
result.reserve(out->size());
for (wchar_t c = *in; c; c = *++in) {
if (c > 0x7F) return false; // invalid character means we can't decode the string
if (c == '%') {
int c1 = in[1];
if (c1 == 0) return false; // found unexpected end of string
if (c1 == '%') {
result.push_back('%');
in++;
} else {
int c2 = in[2];
if (c2 == 0) return false; // string ended prematurely
long d1 = convert_digit(c1, 16);
if (d1 < 0) return false;
long d2 = convert_digit(c2, 16);
if (d2 < 0) return false;
result.push_back(16 * d1 + d2);
in += 2;
}
} else {
result.push_back(c);
}
}
*out = str2wcstring(result);
return true;
}
/// Escape a string in a fashion suitable for using as a fish var name. Store the result in out_str.
static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
bool prev_was_hex_encoded = false;
bool maybe_encode_next_char = false;
const std::string &in = wcs2string(orig_in);
for (auto c1 : in) {
// This silliness is so we get the correct result whether chars are signed or unsigned.
@@ -799,6 +860,46 @@ static void escape_string_var(const wchar_t *orig_in, wcstring &out) {
}
}
/// Reverse the effects of `escape_string_var()`. By definition the string has consist of just ASCII
/// chars.
static bool unescape_string_var(const wchar_t *in, wcstring *out) {
std::string result;
result.reserve(out->size());
bool prev_was_hex_encoded = false;
for (wchar_t c = *in; c; c = *++in) {
if (c > 0x7F) return false; // invalid character means we can't decode the string
if (c == '_') {
int c1 = in[1];
if (c1 == 0) {
if (prev_was_hex_encoded) break;
return false; // found unexpected escape char at end of string
}
if (c1 == '_') {
result.push_back('_');
in++;
} else if (is_hex_digit(c1)) {
int c2 = in[2];
if (c2 == 0) return false; // string ended prematurely
long d1 = convert_hex_digit(c1);
if (d1 < 0) return false;
long d2 = convert_hex_digit(c2);
if (d2 < 0) return false;
result.push_back(16 * d1 + d2);
in += 2;
prev_was_hex_encoded = true;
}
// No "else" clause because if the first char after an underscore is not another
// underscore or a valid hex character then the underscore is there to improve
// readability after we've encoded a character not valid in a var name.
} else {
result.push_back(c);
}
}
*out = str2wcstring(result);
return true;
}
/// Escape a string in a fashion suitable for using in fish script. Store the result in out_str.
static void escape_string_script(const wchar_t *orig_in, size_t in_len, wcstring &out,
escape_flags_t flags) {
@@ -1390,14 +1491,44 @@ bool unescape_string_in_place(wcstring *str, unescape_flags_t escape_special) {
return success;
}
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special) {
bool success = unescape_string_internal(input, wcslen(input), output, escape_special);
bool unescape_string(const wchar_t *input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style) {
bool success;
switch (style) {
case STRING_STYLE_SCRIPT: {
success = unescape_string_internal(input, wcslen(input), output, escape_special);
break;
}
case STRING_STYLE_URL: {
success = unescape_string_url(input, output);
break;
}
case STRING_STYLE_VAR: {
success = unescape_string_var(input, output);
break;
}
}
if (!success) output->clear();
return success;
}
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special) {
bool success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
bool unescape_string(const wcstring &input, wcstring *output, unescape_flags_t escape_special,
escape_string_style_t style) {
bool success;
switch (style) {
case STRING_STYLE_SCRIPT: {
success = unescape_string_internal(input.c_str(), input.size(), output, escape_special);
break;
}
case STRING_STYLE_URL: {
success = unescape_string_url(input.c_str(), output);
break;
}
case STRING_STYLE_VAR: {
success = unescape_string_var(input.c_str(), output);
break;
}
}
if (!success) output->clear();
return success;
}
@@ -2023,22 +2154,6 @@ char **make_null_terminated_array(const std::vector<std::string> &lst) {
return make_null_terminated_array_helper(lst);
}
long convert_digit(wchar_t d, int base) {
long res = -1;
if ((d <= L'9') && (d >= L'0')) {
res = d - L'0';
} else if ((d <= L'z') && (d >= L'a')) {
res = d + 10 - L'a';
} else if ((d <= L'Z') && (d >= L'A')) {
res = d + 10 - L'A';
}
if (res >= base) {
res = -1;
}
return res;
}
/// Test if the specified character is in a range that fish uses interally to store special tokens.
///
/// NOTE: This is used when tokenizing the input. It is also used when reading input, before