Fix fuzzy matching for multi-character lowercase sequences

Same as 8668ce336c (Fix common::wcscasecmp() for multi-byte lowercase strings,
2023-05-02).
This commit is contained in:
Johannes Altmanninger
2024-11-17 08:11:41 +01:00
parent 81f1ab75d0
commit d234ebf484
2 changed files with 30 additions and 23 deletions

View File

@@ -141,8 +141,12 @@ pub fn fish_mkstemp_cloexec(name_template: CString) -> Result<(File, CString), E
}
}
/// Case-insensitive ordering of two wide strings.
///
/// Delegates to `wcscasecmp_fuzzy` with the identity function as the canonicalizer, so no
/// chars are remapped before the comparison.
pub fn wcscasecmp(lhs: &wstr, rhs: &wstr) -> cmp::Ordering {
    let keep_as_is: fn(char) -> char = std::convert::identity;
    wcscasecmp_fuzzy(lhs, rhs, keep_as_is)
}
/// Compare two wide strings in a case-insensitive fashion, passing every char through
/// `canonicalize` before it is lowercased.
pub fn wcscasecmp_fuzzy(lhs: &wstr, rhs: &wstr, canonicalize: fn(char) -> char) -> cmp::Ordering {
use std::char::ToLowercase;
use widestring::utfstr::CharsUtf32;
@@ -151,12 +155,12 @@ pub fn wcscasecmp(lhs: &wstr, rhs: &wstr) -> cmp::Ordering {
/// `char::to_lowercase()` returns an iterator of chars and we sometimes need to cmp the last
/// char of one char's `to_lowercase()` with the first char of the other char's
/// `to_lowercase()`. This makes that possible.
struct ToLowerBuffer<'a> {
struct ToLowerBuffer<'a, Canonicalize: Fn(char) -> char> {
current: ToLowercase,
chars: CharsUtf32<'a>,
chars: std::iter::Map<CharsUtf32<'a>, Canonicalize>,
}
impl<'a> Iterator for ToLowerBuffer<'a> {
impl<'a, Canonicalize: Fn(char) -> char> Iterator for ToLowerBuffer<'a, Canonicalize> {
type Item = char;
fn next(&mut self) -> Option<Self::Item> {
@@ -169,9 +173,8 @@ fn next(&mut self) -> Option<Self::Item> {
}
}
impl<'a> ToLowerBuffer<'a> {
pub fn from(w: &'a wstr) -> Self {
let mut chars = w.chars();
impl<'a, Canonicalize: Fn(char) -> char> ToLowerBuffer<'a, Canonicalize> {
pub fn new(mut chars: std::iter::Map<CharsUtf32<'a>, Canonicalize>) -> Self {
Self {
current: chars.next().map(|c| c.to_lowercase()).unwrap_or_else(|| {
let mut empty = 'a'.to_lowercase();
@@ -184,8 +187,8 @@ pub fn from(w: &'a wstr) -> Self {
}
}
let lhs = ToLowerBuffer::from(lhs);
let rhs = ToLowerBuffer::from(rhs);
let lhs = ToLowerBuffer::new(lhs.chars().map(canonicalize));
let rhs = ToLowerBuffer::new(rhs.chars().map(canonicalize));
lhs.cmp(rhs)
}

View File

@@ -2,7 +2,7 @@
use crate::common::{get_ellipsis_char, get_ellipsis_str};
use crate::expand::INTERNAL_SEPARATOR;
use crate::fallback::{fish_wcwidth, wcscasecmp};
use crate::fallback::{fish_wcwidth, wcscasecmp, wcscasecmp_fuzzy};
use crate::flog::FLOGF;
use crate::libc::MB_CUR_MAX;
use crate::wchar::{decode_byte_from_char, prelude::*};
@@ -95,21 +95,25 @@ pub fn ifind(haystack: &wstr, needle: &wstr, fuzzy: bool /* = false */) -> Optio
.as_char_slice()
.windows(needle.len())
.position(|window| {
for (l, r) in window.iter().zip(needle.chars()) {
// In fuzzy matching treat `-` and `_` as equal (#3584).
if fuzzy && ['-', '_'].contains(l) && ['-', '_'].contains(&r) {
continue;
}
// TODO Decide what to do for different lengths.
let l = l.to_lowercase();
let r = r.to_lowercase();
for (l, r) in l.zip(r) {
if l != r {
return false;
}
// Fuzzy matching considers `-` and `_` equal (#3584): fold `_` onto `-` and pass every
// other char through unchanged.
fn fuzzy_canonicalize(c: char) -> char {
    match c {
        '_' => '-',
        other => other,
    }
}
true
wcscasecmp_fuzzy(
wstr::from_char_slice(window),
needle,
if fuzzy {
fuzzy_canonicalize
} else {
std::convert::identity
},
)
.is_eq()
})
}