From 517d53dc4611b5b341b194048f63bb8b01995f6d Mon Sep 17 00:00:00 2001 From: Johannes Altmanninger Date: Mon, 30 Jan 2023 21:23:01 +0100 Subject: [PATCH] Port util.cpp to Rust The original implementation without the test took me 3 hours (first time seriously looking into this) The functions take "wcharz_t" for smooth integration with existing C++ callers. This is at the expense of Rust callers, which would prefer "&wstr". Would be nice to declare a function parameter that accepts both but I don't think that really works since "wcharz_t" drops the lifetime annotation. --- CMakeLists.txt | 4 +- fish-rust/build.rs | 1 + fish-rust/src/lib.rs | 1 + fish-rust/src/util.rs | 315 ++++++++++++++++++++++++++++++++++++++++++ src/fish_tests.cpp | 57 -------- src/util.cpp | 199 -------------------------- src/util.h | 41 ++---- 7 files changed, 328 insertions(+), 290 deletions(-) create mode 100644 fish-rust/src/util.rs delete mode 100644 src/util.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 935438924..35c5d5605 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,8 +126,8 @@ set(FISH_SRCS src/parser.cpp src/parser_keywords.cpp src/path.cpp src/postfork.cpp src/proc.cpp src/re.cpp src/reader.cpp src/redirection.cpp src/screen.cpp src/signals.cpp src/termsize.cpp src/timer.cpp src/tinyexpr.cpp - src/tokenizer.cpp src/trace.cpp src/utf8.cpp src/util.cpp - src/wait_handle.cpp src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp + src/tokenizer.cpp src/trace.cpp src/utf8.cpp + src/wait_handle.cpp src/wcstringutil.cpp src/wgetopt.cpp src/wildcard.cpp src/wutil.cpp src/fds.cpp src/rustffi.cpp ) diff --git a/fish-rust/build.rs b/fish-rust/build.rs index 35d16d12e..e3c7c7f10 100644 --- a/fish-rust/build.rs +++ b/fish-rust/build.rs @@ -24,6 +24,7 @@ fn main() -> miette::Result<()> { "src/ffi_tests.rs", "src/smoke.rs", "src/topic_monitor.rs", + "src/util.rs", "src/builtins/shared.rs", ]; cxx_build::bridges(&source_files) diff --git a/fish-rust/src/lib.rs b/fish-rust/src/lib.rs index cb8dafcc9..1f044fee6 100644 --- a/fish-rust/src/lib.rs +++ b/fish-rust/src/lib.rs @@ -15,6 +15,7 @@ mod signal; mod smoke; mod topic_monitor; +mod util; mod wchar; mod wchar_ext; mod wchar_ffi; diff --git a/fish-rust/src/util.rs b/fish-rust/src/util.rs new file mode 100644 index 000000000..e54fd09d5 --- /dev/null +++ b/fish-rust/src/util.rs @@ -0,0 +1,315 @@ +//! Generic utilities library. + +use crate::ffi::wcharz_t; +use crate::wchar::wstr; +use std::time; + +#[cxx::bridge] +mod ffi { + extern "C++" { + include!("wutil.h"); + type wcharz_t = super::wcharz_t; + } + + extern "Rust" { + fn wcsfilecmp(a: wcharz_t, b: wcharz_t) -> i32; + fn wcsfilecmp_glob(a: wcharz_t, b: wcharz_t) -> i32; + fn get_time() -> i64; + } +} + +/// Compares two wide character strings with an (arguably) intuitive ordering. This function tries +/// to order strings in a way which is intuitive to humans with regards to sorting strings +/// containing numbers. +/// +/// Most sorting functions would sort the strings 'file1.txt' 'file5.txt' and 'file12.txt' as: +/// +/// file1.txt +/// file12.txt +/// file5.txt +/// +/// This function regards any sequence of digits as a single entity when performing comparisons, so +/// the output is instead: +/// +/// file1.txt +/// file5.txt +/// file12.txt +/// +/// Which most people would find more intuitive. +/// +/// This won't return the optimum results for numbers in bases higher than ten, such as hexadecimal, +/// but at least a stable sort order will result. +/// +/// This function performs a two-tiered sort, where difference in case and in number of leading +/// zeroes in numbers only have effect if no other differences between strings are found. This way, +/// a 'file1' and 'File1' will not be considered identical, and hence their internal sort order is +/// not arbitrary, but the names 'file1', 'File2' and 'file3' will still be sorted in the order +/// given above. +pub fn wcsfilecmp(a: wcharz_t, b: wcharz_t) -> i32 { + // TODO This should return `std::cmp::Ordering`. + let a: &wstr = a.into(); + let b: &wstr = b.into(); + let mut retval = 0; + let mut ai = 0; + let mut bi = 0; + while ai < a.len() && bi < b.len() { + let ac = a.as_char_slice()[ai]; + let bc = b.as_char_slice()[bi]; + if ac.is_ascii_digit() && bc.is_ascii_digit() { + let (ad, bd); + (retval, ad, bd) = wcsfilecmp_leading_digits(&a[ai..], &b[bi..]); + ai += ad; + bi += bd; + if retval != 0 || ai == a.len() || bi == b.len() { + break; + } + continue; + } + + // Fast path: Skip towupper. + if ac == bc { + ai += 1; + bi += 1; + continue; + } + + // Sort dashes after Z - see #5634 + let mut acl = if ac == '-' { '[' } else { ac }; + let mut bcl = if bc == '-' { '[' } else { bc }; + // TODO Compare the tail (enabled by Rust's Unicode support). + acl = acl.to_uppercase().next().unwrap(); + bcl = bcl.to_uppercase().next().unwrap(); + + if acl < bcl { + retval = -1; + break; + } else if acl > bcl { + retval = 1; + break; + } else { + ai += 1; + bi += 1; + } + } + + if retval != 0 { + return retval; // we already know the strings aren't logically equal + } + + if ai == a.len() { + if bi == b.len() { + // The strings are logically equal. They may or may not be the same length depending on + // whether numbers were present but that doesn't matter. Disambiguate strings that + // differ by letter case or length. We don't bother optimizing the case where the file + // names are literally identical because that won't occur given how this function is + // used. And even if it were to occur (due to being reused in some other context) it + // would be so rare that it isn't worth optimizing for. + match a.cmp(b) { + std::cmp::Ordering::Less => -1, + std::cmp::Ordering::Equal => 0, + std::cmp::Ordering::Greater => 1, + } + } else { + -1 // string a is a prefix of b and b is longer + } + } else { + assert!(bi == b.len()); + return 1; // string b is a prefix of a and a is longer + } +} + +/// wcsfilecmp, but frozen in time for glob usage. +pub fn wcsfilecmp_glob(a: wcharz_t, b: wcharz_t) -> i32 { + // TODO This should return `std::cmp::Ordering`. + let a: &wstr = a.into(); + let b: &wstr = b.into(); + let mut retval = 0; + let mut ai = 0; + let mut bi = 0; + while ai < a.len() && bi < b.len() { + let ac = a.as_char_slice()[ai]; + let bc = b.as_char_slice()[bi]; + if ac.is_ascii_digit() && bc.is_ascii_digit() { + let (ad, bd); + (retval, ad, bd) = wcsfilecmp_leading_digits(&a[ai..], &b[bi..]); + ai += ad; + bi += bd; + // If we know the strings aren't logically equal or we've reached the end of one or both + // strings we can stop iterating over the chars in each string. + if retval != 0 || ai == a.len() || bi == b.len() { + break; + } + continue; + } + + // Fast path: Skip towlower. + if ac == bc { + ai += 1; + bi += 1; + continue; + } + + // TODO Compare the tail (enabled by Rust's Unicode support). + let acl = ac.to_lowercase().next().unwrap(); + let bcl = bc.to_lowercase().next().unwrap(); + if acl < bcl { + retval = -1; + break; + } else if acl > bcl { + retval = 1; + break; + } else { + ai += 1; + bi += 1; + } + } + + if retval != 0 { + return retval; // we already know the strings aren't logically equal + } + + if ai == a.len() { + if bi == b.len() { + // The strings are logically equal. They may or may not be the same length depending on + // whether numbers were present but that doesn't matter. Disambiguate strings that + // differ by letter case or length. We don't bother optimizing the case where the file + // names are literally identical because that won't occur given how this function is + // used. And even if it were to occur (due to being reused in some other context) it + // would be so rare that it isn't worth optimizing for. + match a.cmp(b) { + std::cmp::Ordering::Less => -1, + std::cmp::Ordering::Equal => 0, + std::cmp::Ordering::Greater => 1, + } + } else { + -1 // string a is a prefix of b and b is longer + } + } else { + assert!(bi == b.len()); + return 1; // string b is a prefix of a and a is longer + } +} + +/// Get the current time in microseconds since Jan 1, 1970. +pub fn get_time() -> i64 { + match time::SystemTime::now().duration_since(time::UNIX_EPOCH) { + Ok(difference) => difference.as_micros() as i64, + Err(until_epoch) => -(until_epoch.duration().as_micros() as i64), + } +} + +// Compare the strings to see if they begin with an integer that can be compared and return the +// result of that comparison. +fn wcsfilecmp_leading_digits(a: &wstr, b: &wstr) -> (i32, usize, usize) { + // Ignore leading 0s. + let mut ai = a.as_char_slice().iter().take_while(|c| **c == '0').count(); + let mut bi = b.as_char_slice().iter().take_while(|c| **c == '0').count(); + + let mut ret = 0; + loop { + let ac = a.as_char_slice().get(ai).unwrap_or(&'\0'); + let bc = b.as_char_slice().get(bi).unwrap_or(&'\0'); + if ac.is_ascii_digit() && bc.is_ascii_digit() { + // We keep the cmp value for the + // first differing digit. + // + // If the numbers have the same length, that's the value. + if ret == 0 { + // Comparing the string value is the same as numerical + // for wchar_t digits! + if ac > bc { + ret = 1; + } + if bc > ac { + ret = -1; + } + } + } else { + // We don't have negative numbers and we only allow ints, + // and we have already skipped leading zeroes, + // so the longer number is larger automatically. + if ac.is_ascii_digit() { + ret = 1; + } + if bc.is_ascii_digit() { + ret = -1; + } + break; + } + ai += 1; + bi += 1; + } + + // For historical reasons, we skip trailing whitespace + // like fish_wcstol does! + // This is used in sorting globs, and that's supposed to be stable. + ai += a + .as_char_slice() + .iter() + .skip(ai) + .take_while(|c| c.is_whitespace()) + .count(); + bi += b + .as_char_slice() + .iter() + .skip(bi) + .take_while(|c| c.is_whitespace()) + .count(); + (ret, ai, bi) +} + +/// Verify the behavior of the `wcsfilecmp()` function. +#[test] +fn test_wcsfilecmp() { + use crate::wchar::L; + use crate::wchar_ffi::wcharz; + + macro_rules! validate { + ($str1:expr, $str2:expr, $expected_rc:expr) => { + assert_eq!( + wcsfilecmp(wcharz!(L!($str1)), wcharz!(L!($str2))), + $expected_rc + ) + }; + } + + // Not using L as suffix because the macro munges error locations. + validate!("", "", 0); + validate!("", "def", -1); + validate!("abc", "", 1); + validate!("abc", "def", -1); + validate!("abc", "DEF", -1); + validate!("DEF", "abc", 1); + validate!("abc", "abc", 0); + validate!("ABC", "ABC", 0); + validate!("AbC", "abc", -1); + validate!("AbC", "ABC", 1); + validate!("def", "abc", 1); + validate!("1ghi", "1gHi", 1); + validate!("1ghi", "2ghi", -1); + validate!("1ghi", "01ghi", 1); + validate!("1ghi", "02ghi", -1); + validate!("01ghi", "1ghi", -1); + validate!("1ghi", "002ghi", -1); + validate!("002ghi", "1ghi", 1); + validate!("abc01def", "abc1def", -1); + validate!("abc1def", "abc01def", 1); + validate!("abc12", "abc5", 1); + validate!("51abc", "050abc", 1); + validate!("abc5", "abc12", -1); + validate!("5abc", "12ABC", -1); + validate!("abc0789", "abc789", -1); + validate!("abc0xA789", "abc0xA0789", 1); + validate!("abc002", "abc2", -1); + validate!("abc002g", "abc002", 1); + validate!("abc002g", "abc02g", -1); + validate!("abc002.txt", "abc02.txt", -1); + validate!("abc005", "abc012", -1); + validate!("abc02", "abc002", 1); + validate!("abc002.txt", "abc02.txt", -1); + validate!("GHI1abc2.txt", "ghi1abc2.txt", -1); + validate!("a0", "a00", -1); + validate!("a00b", "a0b", -1); + validate!("a0b", "a00b", 1); + validate!("a-b", "azb", 1); +} diff --git a/src/fish_tests.cpp b/src/fish_tests.cpp index 49d39855c..26b2c405d 100644 --- a/src/fish_tests.cpp +++ b/src/fish_tests.cpp @@ -1589,62 +1589,6 @@ static void test_parse_util_cmdsubst_extent() { } } -static struct wcsfilecmp_test { - const wchar_t *str1; - const wchar_t *str2; - int expected_rc; -} wcsfilecmp_tests[] = {{L"", L"", 0}, - {L"", L"def", -1}, - {L"abc", L"", 1}, - {L"abc", L"def", -1}, - {L"abc", L"DEF", -1}, - {L"DEF", L"abc", 1}, - {L"abc", L"abc", 0}, - {L"ABC", L"ABC", 0}, - {L"AbC", L"abc", -1}, - {L"AbC", L"ABC", 1}, - {L"def", L"abc", 1}, - {L"1ghi", L"1gHi", 1}, - {L"1ghi", L"2ghi", -1}, - {L"1ghi", L"01ghi", 1}, - {L"1ghi", L"02ghi", -1}, - {L"01ghi", L"1ghi", -1}, - {L"1ghi", L"002ghi", -1}, - {L"002ghi", L"1ghi", 1}, - {L"abc01def", L"abc1def", -1}, - {L"abc1def", L"abc01def", 1}, - {L"abc12", L"abc5", 1}, - {L"51abc", L"050abc", 1}, - {L"abc5", L"abc12", -1}, - {L"5abc", L"12ABC", -1}, - {L"abc0789", L"abc789", -1}, - {L"abc0xA789", L"abc0xA0789", 1}, - {L"abc002", L"abc2", -1}, - {L"abc002g", L"abc002", 1}, - {L"abc002g", L"abc02g", -1}, - {L"abc002.txt", L"abc02.txt", -1}, - {L"abc005", L"abc012", -1}, - {L"abc02", L"abc002", 1}, - {L"abc002.txt", L"abc02.txt", -1}, - {L"GHI1abc2.txt", L"ghi1abc2.txt", -1}, - {L"a0", L"a00", -1}, - {L"a00b", L"a0b", -1}, - {L"a0b", L"a00b", 1}, - {L"a-b", L"azb", 1}, - {nullptr, nullptr, 0}}; - -/// Verify the behavior of the `wcsfilecmp()` function. -static void test_wcsfilecmp() { - for (auto test = wcsfilecmp_tests; test->str1; test++) { - int rc = wcsfilecmp(test->str1, test->str2); - if (rc != test->expected_rc) { - err(L"New failed on line %lu: [\"%ls\" <=> \"%ls\"]: " - L"expected return code %d but got %d", - __LINE__, test->str1, test->str2, test->expected_rc, rc); - } - } -} - static void test_const_strlen() { do_test(const_strlen("") == 0); do_test(const_strlen(L"") == 0); @@ -1788,7 +1732,6 @@ void test_dir_iter() { static void test_utility_functions() { say(L"Testing utility functions"); - test_wcsfilecmp(); test_parse_util_cmdsubst_extent(); test_const_strlen(); test_const_strcmp(); diff --git a/src/util.cpp b/src/util.cpp deleted file mode 100644 index fae9d9e15..000000000 --- a/src/util.cpp +++ /dev/null @@ -1,199 +0,0 @@ -// Generic utilities library. -#include "config.h" // IWYU pragma: keep - -#include "util.h" - -#include -#include -#include - -#include - -#include "common.h" -#include "fallback.h" // IWYU pragma: keep -#include "wutil.h" // IWYU pragma: keep - -// Compare the strings to see if they begin with an integer that can be compared and return the -// result of that comparison. -static int wcsfilecmp_leading_digits(const wchar_t **a, const wchar_t **b) { - const wchar_t *a1 = *a; - const wchar_t *b1 = *b; - - // Ignore leading 0s. - while (*a1 == L'0') a1++; - while (*b1 == L'0') b1++; - - int ret = 0; - - while (true) { - if (iswdigit(*a1) && iswdigit(*b1)) { - // We keep the cmp value for the - // first differing digit. - // - // If the numbers have the same length, that's the value. - if (ret == 0) { - // Comparing the string value is the same as numerical - // for wchar_t digits! - if (*a1 > *b1) ret = 1; - if (*b1 > *a1) ret = -1; - } - } else { - // We don't have negative numbers and we only allow ints, - // and we have already skipped leading zeroes, - // so the longer number is larger automatically. - if (iswdigit(*a1)) ret = 1; - if (iswdigit(*b1)) ret = -1; - break; - } - a1++; - b1++; - } - - // For historical reasons, we skip trailing whitespace - // like fish_wcstol does! - // This is used in sorting globs, and that's supposed to be stable. - while (iswspace(*a1)) a1++; - while (iswspace(*b1)) b1++; - *a = a1; - *b = b1; - return ret; -} - -/// Compare two strings, representing file names, using "natural" ordering. This means that letter -/// case is ignored. It also means that integers in each string are compared based on the decimal -/// value rather than the string representation. It only handles base 10 integers and they can -/// appear anywhere in each string, including multiple integers. This means that a file name like -/// "0xAF0123" is treated as the literal "0xAF" followed by the integer 123. -/// -/// The intent is to ensure that file names like "file23" and "file5" are sorted so that the latter -/// appears before the former. -/// -/// This does not handle esoterica like Unicode combining characters. Nor does it use collating -/// sequences. Which means that an ASCII "A" will be less than an equivalent character with a higher -/// Unicode code point. In part because doing so is really hard without the help of something like -/// the ICU library. But also because file names might be in a different encoding than is used by -/// the current fish process which results in weird situations. This is basically a best effort -/// implementation that will do the right thing 99.99% of the time. -/// -/// Returns: -1 if a < b, 0 if a == b, 1 if a > b. -int wcsfilecmp(const wchar_t *a, const wchar_t *b) { - assert(a && b && "Null parameter"); - const wchar_t *orig_a = a; - const wchar_t *orig_b = b; - int retval = 0; // assume the strings will be equal - - while (*a && *b) { - if (iswdigit(*a) && iswdigit(*b)) { - retval = wcsfilecmp_leading_digits(&a, &b); - // If we know the strings aren't logically equal or we've reached the end of one or both - // strings we can stop iterating over the chars in each string. - if (retval || *a == 0 || *b == 0) break; - } - - // Fast path: Skip towupper. - if (*a == *b) { - a++; - b++; - continue; - } - - wint_t al = towupper(*a); - wint_t bl = towupper(*b); - // Sort dashes after Z - see #5634 - if (al == L'-') al = L'['; - if (bl == L'-') bl = L'['; - - if (al < bl) { - retval = -1; - break; - } else if (al > bl) { - retval = 1; - break; - } else { - a++; - b++; - } - } - - if (retval != 0) return retval; // we already know the strings aren't logically equal - - if (*a == 0) { - if (*b == 0) { - // The strings are logically equal. They may or may not be the same length depending on - // whether numbers were present but that doesn't matter. Disambiguate strings that - // differ by letter case or length. We don't bother optimizing the case where the file - // names are literally identical because that won't occur given how this function is - // used. And even if it were to occur (due to being reused in some other context) it - // would be so rare that it isn't worth optimizing for. - retval = std::wcscmp(orig_a, orig_b); - return retval < 0 ? -1 : retval == 0 ? 0 : 1; - } - return -1; // string a is a prefix of b and b is longer - } - - assert(*b == 0); - return 1; // string b is a prefix of a and a is longer -} - -/// wcsfilecmp, but frozen in time for glob usage. -int wcsfilecmp_glob(const wchar_t *a, const wchar_t *b) { - assert(a && b && "Null parameter"); - const wchar_t *orig_a = a; - const wchar_t *orig_b = b; - int retval = 0; // assume the strings will be equal - - while (*a && *b) { - if (iswdigit(*a) && iswdigit(*b)) { - retval = wcsfilecmp_leading_digits(&a, &b); - // If we know the strings aren't logically equal or we've reached the end of one or both - // strings we can stop iterating over the chars in each string. - if (retval || *a == 0 || *b == 0) break; - } - - // Fast path: Skip towlower. - if (*a == *b) { - a++; - b++; - continue; - } - - wint_t al = towlower(*a); - wint_t bl = towlower(*b); - if (al < bl) { - retval = -1; - break; - } else if (al > bl) { - retval = 1; - break; - } else { - a++; - b++; - } - } - - if (retval != 0) return retval; // we already know the strings aren't logically equal - - if (*a == 0) { - if (*b == 0) { - // The strings are logically equal. They may or may not be the same length depending on - // whether numbers were present but that doesn't matter. Disambiguate strings that - // differ by letter case or length. We don't bother optimizing the case where the file - // names are literally identical because that won't occur given how this function is - // used. And even if it were to occur (due to being reused in some other context) it - // would be so rare that it isn't worth optimizing for. - retval = wcscmp(orig_a, orig_b); - return retval < 0 ? -1 : retval == 0 ? 0 : 1; - } - return -1; // string a is a prefix of b and b is longer - } - - assert(*b == 0); - return 1; // string b is a prefix of a and a is longer -} - -/// Return microseconds since the epoch. -long long get_time() { - struct timeval time_struct; - gettimeofday(&time_struct, nullptr); - return 1000000LL * time_struct.tv_sec + time_struct.tv_usec; -} diff --git a/src/util.h b/src/util.h index 5cfb71270..fcb9996b8 100644 --- a/src/util.h +++ b/src/util.h @@ -1,40 +1,17 @@ -// Generic utilities library. #ifndef FISH_UTIL_H #define FISH_UTIL_H -/// Compares two wide character strings with an (arguably) intuitive ordering. This function tries -/// to order strings in a way which is intuitive to humans with regards to sorting strings -/// containing numbers. -/// -/// Most sorting functions would sort the strings 'file1.txt' 'file5.txt' and 'file12.txt' as: -/// -/// file1.txt -/// file12.txt -/// file5.txt -/// -/// This function regards any sequence of digits as a single entity when performing comparisons, so -/// the output is instead: -/// -/// file1.txt -/// file5.txt -/// file12.txt -/// -/// Which most people would find more intuitive. -/// -/// This won't return the optimum results for numbers in bases higher than ten, such as hexadecimal, -/// but at least a stable sort order will result. -/// -/// This function performs a two-tiered sort, where difference in case and in number of leading -/// zeroes in numbers only have effect if no other differences between strings are found. This way, -/// a 'file1' and 'File1' will not be considered identical, and hence their internal sort order is -/// not arbitrary, but the names 'file1', 'File2' and 'file3' will still be sorted in the order -/// given above. +#if INCLUDE_RUST_HEADERS + +#include "util.rs.h" + +#else + +// Hacks to allow us to compile without Rust headers. int wcsfilecmp(const wchar_t *a, const wchar_t *b); - -/// wcsfilecmp, but frozen in time for glob usage. int wcsfilecmp_glob(const wchar_t *a, const wchar_t *b); - -/// Get the current time in microseconds since Jan 1, 1970. long long get_time(); #endif + +#endif