fish-shell/src/util.rs

//! Generic utilities library.

use crate::wchar::prelude::*;
use rand::{SeedableRng, rngs::SmallRng};
use std::cmp::Ordering;
use std::time;
use std::time::{SystemTime, UNIX_EPOCH};

/// Orders two wide character strings the way a person would expect file names to be ordered.
///
/// A run of ASCII digits is treated as one number rather than as individual characters, so
/// `file1.txt`, `file5.txt` and `file12.txt` sort as
///
/// ```text
/// file1.txt
/// file5.txt
/// file12.txt
/// ```
///
/// instead of the plain character-by-character result
///
/// ```text
/// file1.txt
/// file12.txt
/// file5.txt
/// ```
///
/// Numbers in bases above ten (for example hexadecimal) are not recognized as such, so their
/// order may not be ideal, but it is still stable.
///
/// The comparison has two tiers. Letter case and the number of leading zeroes in a number are
/// ignored in the first tier and only consulted when the strings are otherwise equivalent.
/// Consequently `file1` and `File1` never compare as equal, yet `file1`, `File2` and `file3`
/// still sort in that order.
pub fn wcsfilecmp(a: &wstr, b: &wstr) -> Ordering {
    // First-tier key for a single non-digit character.
    // Sort dashes after Z - see #5634: '-' is replaced by '[', which directly follows 'Z'.
    // TODO Compare the tail (enabled by Rust's Unicode support).
    let sort_key = |c: char| {
        let c = if c == '-' { '[' } else { c };
        c.to_uppercase().next().unwrap()
    };

    let a_chars = a.as_char_slice();
    let b_chars = b.as_char_slice();
    let mut a_pos = 0;
    let mut b_pos = 0;
    while a_pos < a.len() && b_pos < b.len() {
        let a_ch = a_chars[a_pos];
        let b_ch = b_chars[b_pos];

        if a_ch.is_ascii_digit() && b_ch.is_ascii_digit() {
            // Both sides start a number here: compare the two numbers as a whole.
            let (order, a_used, b_used) = wcsfilecmp_leading_digits(&a[a_pos..], &b[b_pos..]);
            if order != Ordering::Equal {
                return order; // the strings aren't logically equal
            }
            a_pos += a_used;
            b_pos += b_used;
            // If either string is exhausted now, the loop condition ends the scan.
            continue;
        }

        // Identical characters need no folding; only differing ones are compared by key.
        if a_ch != b_ch {
            let order = sort_key(a_ch).cmp(&sort_key(b_ch));
            if order != Ordering::Equal {
                return order; // the strings aren't logically equal
            }
        }
        a_pos += 1;
        b_pos += 1;
    }

    match (a_pos == a.len(), b_pos == b.len()) {
        // The strings are logically equal. Their lengths may still differ if numbers with
        // leading zeroes were present. Fall back to the exact comparison so that strings
        // differing only in letter case or length get a well-defined order. Literally identical
        // names are not special-cased: they are not expected given how this function is used,
        // and would be too rare to be worth optimizing for.
        (true, true) => a.cmp(b),
        // string a is a prefix of b and b is longer
        (true, false) => Ordering::Less,
        // string b is a prefix of a and a is longer
        (false, b_done) => {
            assert!(b_done);
            Ordering::Greater
        }
    }
}

/// The [`wcsfilecmp`] ordering, frozen in time for glob usage.
///
/// Digit runs are compared as whole numbers via `wcsfilecmp_leading_digits`. Other characters
/// are compared by the first character of their lowercase mapping, and dashes get no special
/// treatment here. Strings that are equivalent under those rules are ordered by their exact
/// comparison.
pub fn wcsfilecmp_glob(a: &wstr, b: &wstr) -> Ordering {
    let a_chars = a.as_char_slice();
    let b_chars = b.as_char_slice();
    let mut a_pos = 0;
    let mut b_pos = 0;
    while a_pos < a.len() && b_pos < b.len() {
        let a_ch = a_chars[a_pos];
        let b_ch = b_chars[b_pos];

        if a_ch.is_ascii_digit() && b_ch.is_ascii_digit() {
            let (order, a_used, b_used) = wcsfilecmp_leading_digits(&a[a_pos..], &b[b_pos..]);
            // Once we know the strings aren't logically equal there is nothing left to scan.
            if order != Ordering::Equal {
                return order;
            }
            a_pos += a_used;
            b_pos += b_used;
            // Reaching the end of one or both strings is caught by the loop condition.
            continue;
        }

        // Identical characters skip the case folding entirely.
        if a_ch != b_ch {
            // TODO Compare the tail (enabled by Rust's Unicode support).
            let a_folded = a_ch.to_lowercase().next().unwrap();
            let b_folded = b_ch.to_lowercase().next().unwrap();
            let order = a_folded.cmp(&b_folded);
            if order != Ordering::Equal {
                return order; // the strings aren't logically equal
            }
        }
        a_pos += 1;
        b_pos += 1;
    }

    if a_pos != a.len() {
        assert!(b_pos == b.len());
        return Ordering::Greater; // string b is a prefix of a and a is longer
    }
    if b_pos != b.len() {
        return Ordering::Less; // string a is a prefix of b and b is longer
    }

    // The strings are logically equal. Their lengths may still differ if numbers with leading
    // zeroes were present. Use the exact comparison to disambiguate strings that differ by
    // letter case or length. Literally identical names are not special-cased: they are not
    // expected given how this function is used, and would be too rare to be worth optimizing
    // for.
    a.cmp(b)
}

/// Returns the current system time as microseconds relative to the Unix epoch
/// (Jan 1, 1970).
///
/// The value is negative when the system clock reports a time before the epoch.
pub fn get_time() -> i64 {
    let now = time::SystemTime::now();
    match now.duration_since(time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_micros() as i64,
        Err(err) => {
            // The clock is set before the epoch; the error carries the distance to it.
            let micros_before_epoch = err.duration().as_micros() as i64;
            -micros_before_epoch
        }
    }
}

/// Helper producing a seed for a small RNG: the nanoseconds elapsed since the Unix epoch.
///
/// Yields 0 if the system clock reports a time before the epoch.
pub fn get_rng_seed() -> u128 {
    // Note we use an explicit seed to avoid the "getrandom" crate, which uses `getentropy()` on
    // macOS which is not available before macOS 10.12.
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_nanos())
}

/// Helper to build a small RNG from a 128-bit seed.
///
/// The seed is folded down to 64 bits by XOR-ing its upper and lower halves.
pub fn get_seeded_rng(seed: u128) -> SmallRng {
    let upper_half = (seed >> 64) as u64;
    let lower_half = seed as u64;
    SmallRng::seed_from_u64(upper_half ^ lower_half)
}

/// Helper to build a small RNG seeded from the current time.
pub fn get_rng() -> SmallRng {
    let seed = get_rng_seed();
    get_seeded_rng(seed)
}

// Compares the integers formed by the ASCII digits at the start of `a` and `b`.
//
// Returns the result of that comparison together with the number of characters consumed from
// `a` and from `b` (leading zeroes, digits, and any whitespace directly after the digits).
fn wcsfilecmp_leading_digits(a: &wstr, b: &wstr) -> (Ordering, usize, usize) {
    let a_chars = a.as_char_slice();
    let b_chars = b.as_char_slice();

    // Leading 0s are ignored.
    let mut a_pos = a_chars.iter().take_while(|&&c| c == '0').count();
    let mut b_pos = b_chars.iter().take_while(|&&c| c == '0').count();

    let mut order = Ordering::Equal;
    loop {
        // `None` means "no digit here": either a non-digit character or the end of the string.
        let a_digit = a_chars.get(a_pos).filter(|c| c.is_ascii_digit());
        let b_digit = b_chars.get(b_pos).filter(|c| c.is_ascii_digit());
        match (a_digit, b_digit) {
            (Some(x), Some(y)) => {
                // Remember the comparison of the first differing digit pair; it decides the
                // result if both numbers turn out to have the same length. Comparing the
                // digit characters gives the same answer as comparing their numeric values.
                order = order.then(x.cmp(y));
            }
            // There are no negative numbers, only integers are handled, and leading zeroes
            // were already skipped, so the number with more digits is the larger one.
            (Some(_), None) => {
                order = Ordering::Greater;
                break;
            }
            (None, Some(_)) => {
                order = Ordering::Less;
                break;
            }
            // Both numbers ended together: the remembered digit comparison stands.
            (None, None) => break,
        }
        a_pos += 1;
        b_pos += 1;
    }

    // For historical reasons, trailing whitespace is skipped like fish_wcstol does!
    // This is used in sorting globs, and that's supposed to be stable.
    a_pos += a_chars
        .iter()
        .skip(a_pos)
        .take_while(|c| c.is_whitespace())
        .count();
    b_pos += b_chars
        .iter()
        .skip(b_pos)
        .take_while(|c| c.is_whitespace())
        .count();

    (order, a_pos, b_pos)
}

/// Searches `haystack` for the first occurrence of `needle` as a contiguous run of elements and
/// returns the index at which that occurrence starts, or `None` if there is none.
///
/// An empty `needle` is found at index 0 of any `haystack`, including an empty one.
///
/// # Examples
///
/// ```
/// use fish::util::find_subslice;
/// let haystack = b"ABC ABCDAB ABCDABCDABDE";
///
/// assert_eq!(find_subslice(b"ABCDABD", haystack), Some(15));
/// assert_eq!(find_subslice(b"ABCDE", haystack), None);
/// ```
pub fn find_subslice<T: PartialEq>(
    needle: impl AsRef<[T]>,
    haystack: impl AsRef<[T]>,
) -> Option<usize> {
    let pattern = needle.as_ref();
    // Handle the empty needle up front: `windows` cannot be asked for zero-sized windows.
    if pattern.is_empty() {
        return Some(0);
    }

    let text = haystack.as_ref();
    // A haystack shorter than the needle yields no windows at all, so the loop is skipped.
    for (start, window) in text.windows(pattern.len()).enumerate() {
        if window == pattern {
            return Some(start);
        }
    }
    None
}

#[cfg(test)]
mod tests {
    use super::{wcsfilecmp, wcsfilecmp_glob};
    use crate::wchar::prelude::*;
    use std::cmp::Ordering;

    /// Verify the behavior of the `wcsfilecmp()` function.
    #[test]
    fn test_wcsfilecmp() {
        macro_rules! validate {
            ($str1:expr, $str2:expr, $expected_rc:expr) => {
                assert_eq!(wcsfilecmp(L!($str1), L!($str2)), $expected_rc)
            };
        }

        // Not using L as suffix because the macro munges error locations.

        // Empty strings and plain text, including case-only differences.
        validate!("", "", Ordering::Equal);
        validate!("", "def", Ordering::Less);
        validate!("abc", "", Ordering::Greater);
        validate!("abc", "def", Ordering::Less);
        validate!("abc", "DEF", Ordering::Less);
        validate!("DEF", "abc", Ordering::Greater);
        validate!("abc", "abc", Ordering::Equal);
        validate!("ABC", "ABC", Ordering::Equal);
        validate!("AbC", "abc", Ordering::Less);
        validate!("AbC", "ABC", Ordering::Greater);
        validate!("def", "abc", Ordering::Greater);

        // Embedded numbers, with and without leading zeroes.
        validate!("1ghi", "1gHi", Ordering::Greater);
        validate!("1ghi", "2ghi", Ordering::Less);
        validate!("1ghi", "01ghi", Ordering::Greater);
        validate!("1ghi", "02ghi", Ordering::Less);
        validate!("01ghi", "1ghi", Ordering::Less);
        validate!("1ghi", "002ghi", Ordering::Less);
        validate!("002ghi", "1ghi", Ordering::Greater);
        validate!("abc01def", "abc1def", Ordering::Less);
        validate!("abc1def", "abc01def", Ordering::Greater);
        validate!("abc12", "abc5", Ordering::Greater);
        validate!("51abc", "050abc", Ordering::Greater);
        validate!("abc5", "abc12", Ordering::Less);
        validate!("5abc", "12ABC", Ordering::Less);
        validate!("abc0789", "abc789", Ordering::Less);
        validate!("abc0xA789", "abc0xA0789", Ordering::Greater);
        validate!("abc002", "abc2", Ordering::Less);
        validate!("abc002g", "abc002", Ordering::Greater);
        validate!("abc002g", "abc02g", Ordering::Less);
        validate!("abc002.txt", "abc02.txt", Ordering::Less);
        validate!("abc005", "abc012", Ordering::Less);
        validate!("abc02", "abc002", Ordering::Greater);
        // Mirror of the "abc002.txt" / "abc02.txt" case above (previously listed twice).
        validate!("abc02.txt", "abc002.txt", Ordering::Greater);
        validate!("GHI1abc2.txt", "ghi1abc2.txt", Ordering::Less);
        validate!("a0", "a00", Ordering::Less);
        validate!("a00b", "a0b", Ordering::Less);
        validate!("a0b", "a00b", Ordering::Greater);

        // Dashes sort after letters.
        validate!("a-b", "azb", Ordering::Greater);
    }

    /// Pin the behavior of the `wcsfilecmp_glob()` function, which must stay stable.
    #[test]
    fn test_wcsfilecmp_glob() {
        macro_rules! validate {
            ($str1:expr, $str2:expr, $expected_rc:expr) => {
                assert_eq!(wcsfilecmp_glob(L!($str1), L!($str2)), $expected_rc)
            };
        }

        validate!("", "", Ordering::Equal);
        validate!("", "def", Ordering::Less);
        validate!("abc", "", Ordering::Greater);
        validate!("abc", "DEF", Ordering::Less);
        validate!("AbC", "abc", Ordering::Less);
        validate!("abc5", "abc12", Ordering::Less);
        validate!("abc01def", "abc1def", Ordering::Less);
        // Unlike `wcsfilecmp()`, the glob variant does not move dashes after letters.
        validate!("a-b", "azb", Ordering::Less);
    }
}