feroxbuster/src/utils.rs

use anyhow::{bail, Context, Result};
use console::{strip_ansi_codes, style, user_attended};
use indicatif::ProgressBar;
use regex::Regex;
use reqwest::{Client, Method, Response, StatusCode, Url};
#[cfg(not(target_os = "windows"))]
use rlimit::{getrlimit, setrlimit, Resource};
use std::{
    error::Error,
    fs,
    io::{self, BufWriter, Write},
    sync::Arc,
    time::Duration,
    time::{SystemTime, UNIX_EPOCH},
};
use tokio::sync::{mpsc::UnboundedSender, oneshot};

use crate::{
    config::Configuration,
    config::OutputLevel,
    event_handlers::{
        Command::{self, AddError, AddStatus},
        Handles,
    },
    progress::PROGRESS_PRINTER,
    response::FeroxResponse,
    send_command,
    statistics::StatError::{Certificate, Connection, Other, Redirection, Request, Timeout},
    traits::FeroxSerialize,
    USER_AGENTS,
};

/// simple counter for grabbing 'random' user agents
///
/// `make_request` bumps this counter for every request issued while `config.random_agent` is
/// set and uses the new value, modulo `USER_AGENTS.len()`, to pick the next user agent
///
/// NOTE(review): being a `static mut`, every access requires `unsafe`, and the code touching it
/// is responsible for avoiding data races when requests are issued concurrently
static mut USER_AGENT_CTR: usize = 0;

/// decides whether a failed request looks like it was caused by a TLS/certificate problem
///
/// the check is purely textual; lowercased text is searched for known substrings in three
/// places, in this order:
///   1. the error's `Display` message
///   2. the error's `Debug` representation
///   3. the `Display` message of every error in the `source()` chain
///
/// returns `true` as soon as any pattern is found, `false` otherwise
fn is_certificate_error(error: &reqwest::Error) -> bool {
    /// substrings searched for in the top-level error message
    const MESSAGE_PATTERNS: [&str; 5] = [
        "certificate verify failed",
        "self-signed certificate",
        "certificate has expired",
        "hostname mismatch",
        "certificate",
    ];

    /// substrings (including OpenSSL-style ones) searched for in the debug representation
    const DEBUG_PATTERNS: [&str; 8] = [
        "ssl routines",
        "certificate verify failed",
        "self-signed certificate",
        "certificate has expired",
        "hostname mismatch",
        "tls_post_process_server_certificate",
        "certificate",
        "cert",
    ];

    /// substrings searched for in each underlying error of the source chain
    const SOURCE_PATTERNS: [&str; 13] = [
        "ssl routines",
        "certificate verify failed",
        "self-signed certificate",
        "certificate has expired",
        "hostname mismatch",
        "unable to get local issuer certificate",
        "certificate is not yet valid",
        "invalid certificate",
        "unknown ca",
        "certificate",
        "cert",
        "tls",
        "ssl",
    ];

    /// true when `haystack` contains at least one of `needles`
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(needle))
    }

    let display_text = error.to_string().to_lowercase();
    if contains_any(&display_text, &MESSAGE_PATTERNS) {
        return true;
    }

    let debug_text = format!("{error:?}").to_lowercase();
    if contains_any(&debug_text, &DEBUG_PATTERNS) {
        return true;
    }

    // walk the chain of underlying causes, stopping at the first one that matches
    std::iter::successors(error.source(), |&cause| cause.source())
        .any(|cause| contains_any(&cause.to_string().to_lowercase(), &SOURCE_PATTERNS))
}

/// Given the path to a file, open the file in append mode (create it if it doesn't exist) and
/// return a reference to the buffered file
pub fn open_file(filename: &str) -> Result<BufWriter<fs::File>> {
    log::trace!("enter: open_file({filename})");

    let file = fs::OpenOptions::new() // std fs
        .create(true)
        .append(true)
        .open(filename)
        .with_context(|| fmt_err(&format!("Could not open {filename}")))?;

    let writer = BufWriter::new(file); // std io

    log::trace!("exit: open_file -> {writer:?}");
    Ok(writer)
}

/// Takes in a string and examines the first character to return a color version of the same string
pub fn status_colorizer(status: &str) -> String {
    match status.chars().next() {
        Some('1') => style(status).blue().to_string(), // informational
        Some('2') => style(status).green().to_string(), // success
        Some('3') => style(status).yellow().to_string(), // redirects
        Some('4') => style(status).red().to_string(),  // client error
        Some('5') => style(status).red().to_string(),  // server error
        Some('W') => style(status).cyan().to_string(), // wildcard
        Some('E') => style(status).red().to_string(),  // error
        _ => status.to_string(),                       // ¯\_(ツ)_/¯
    }
}

/// simple wrapper to stay DRY
pub fn fmt_err(msg: &str) -> String {
    format!("{}: {}", status_colorizer("ERROR"), msg)
}

/// returns the current system time as fractional seconds elapsed since the unix epoch
///
/// if the system clock reports a time before the epoch, `0.0` is returned
pub fn timestamp() -> f64 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or(Duration::ZERO)
        // whole seconds plus the sub-second nanoseconds expressed as a fraction
        .as_secs_f64()
}

/// sends the given FeroxResponse to the scan handler as a `TryRecursion` command, then sends a
/// `Sync` command and waits for the answer on its oneshot channel
///
/// lives in utils so that both the extractor and the scanner can call it
pub(crate) async fn send_try_recursion_command(
    handles: Arc<Handles>,
    mut response: FeroxResponse,
) -> Result<()> {
    // the body text is dropped before the response is shipped over the mpsc
    response.drop_text();

    let boxed_response = Box::new(response);
    handles.send_scan_command(Command::TryRecursion(boxed_response))?;

    let (sync_tx, sync_rx) = oneshot::channel::<bool>();
    handles.send_scan_command(Command::Sync(sync_tx))?;
    sync_rx.await?;

    Ok(())
}

/// Returns `modname` styled in cyan via console::style
///
/// kept as its own function so the module color can be changed in exactly one place
pub fn module_colorizer(modname: &str) -> String {
    let colored = style(modname).cyan();
    colored.to_string()
}

/// Prints `msg`, choosing the output path based on whether a terminal is attached.
///
/// Without an attached terminal, `msg` is written to stdout with its ansi color codes stripped.
///
/// With an attached terminal, progress bars are visible, so the message is handed to `bar`,
/// which knows how to print above itself without jacking up the progress bar output.
///
/// also serves as the single place to handle future printing options (no color, etc)
pub fn ferox_print(msg: &str, bar: &ProgressBar) {
    if !user_attended() {
        println!("{}", strip_ansi_codes(msg));
        return;
    }

    bar.println(msg);
}

/// wrapper around make_request that reports outcomes to FeroxScans, for per-scan stats tracking
/// of information related to auto-tune/bail
///
/// - a 429 or 403 response increments the status code counter for `url`
/// - a failed request is logged and increments the error counter for `url`
///
/// # Errors
///
/// Returns an error when the scans handle can't be retrieved or when the request itself fails
pub async fn logged_request(
    url: &Url,
    method: &str,
    data: Option<&[u8]>,
    handles: Arc<Handles>,
) -> Result<Response> {
    let outcome = make_request(
        &handles.config.client,
        url,
        method,
        data,
        handles.config.output_level,
        &handles.config,
        handles.stats.tx.clone(),
    )
    .await;

    let scans = handles.ferox_scans()?;

    let resp = match outcome {
        Ok(resp) => resp,
        Err(e) => {
            log::warn!("err: {e:?}");
            scans.increment_error(url.as_str());
            bail!(e)
        }
    };

    let status = resp.status();

    if matches!(
        status,
        StatusCode::TOO_MANY_REQUESTS | StatusCode::FORBIDDEN
    ) {
        scans.increment_status_code(url.as_str(), status);
    }

    Ok(resp)
}

/// Initiate request to the given `Url` using `Client`
///
/// - `method` is converted to a `Method`; an invalid method name is returned as an error
/// - `data`, when present, is sent as the request body
/// - when `config.random_agent` is set, the `User-Agent` header is taken from `USER_AGENTS`,
///   rotating through the list one entry per request
/// - every outcome is reported over `tx_stats`: `AddStatus` for a response, `AddError` with the
///   matching error category for a failure
///
/// # Errors
///
/// Returns an error when the request can't be sent or fails. Certificate related failures are
/// reported with a `:SSL: ` prefix on the error message.
pub async fn make_request(
    client: &Client,
    url: &Url,
    method: &str,
    mut data: Option<&[u8]>,
    output_level: OutputLevel,
    config: &Configuration,
    tx_stats: UnboundedSender<Command>,
) -> Result<Response> {
    log::trace!(
        "enter: make_request(Configuration::Client, {url}, {output_level:?}, {tx_stats:?})"
    );
    let tmp_workaround: Option<&[u8]> = Some(&[0xd_u8, 0xa]); // \r\n

    let mut request = client.request(Method::from_bytes(method.as_bytes())?, url.to_owned());

    if (!config.proxy.is_empty() || !config.replay_proxy.is_empty())
        && data.is_none()
        && ["post", "put", "patch"].contains(&method.to_ascii_lowercase().as_str())
    {
        // either --proxy or --replay-proxy was specified
        // AND
        // --data wasn't used
        // AND
        // the method is either post/put/patch (case insensitive)
        //
        // this combination of factors results in requests that are delayed for 10 seconds before
        // being issued. The tracking issues are
        //   https://github.com/epi052/feroxbuster/issues/501
        //   https://github.com/seanmonstar/reqwest/issues/1474
        //
        // as a (hopefully temporary) workaround, we'll add \r\n to the body so that there's no
        // delay
        data = tmp_workaround;
    }

    if let Some(body_data) = data {
        request = request.body(body_data.to_vec());
    }

    if config.random_agent {
        // requests are issued concurrently, so a plain read-modify-write of the `static mut`
        // counter would be a data race; instead, view the same storage as an atomic
        //
        // SAFETY: `USER_AGENT_CTR` is a `usize` static, so the pointer is non-null, properly
        // aligned and valid for the whole life of the program. This assumes the atomic view
        // created here is the only way the counter is accessed (the static is private to this
        // module), so there are no conflicting non-atomic accesses.
        let counter = unsafe {
            std::sync::atomic::AtomicUsize::from_ptr(std::ptr::addr_of_mut!(USER_AGENT_CTR))
        };

        // fetch_add returns the previous value and wraps on overflow; the +1 keeps the original
        // behavior of selecting with the post-increment value
        let index = counter
            .fetch_add(1, std::sync::atomic::Ordering::Relaxed)
            .wrapping_add(1)
            % USER_AGENTS.len();

        let user_agent = USER_AGENTS[index];

        request = request.header("User-Agent", user_agent);
    }

    match request.send().await {
        Err(e) => {
            log::trace!("exit: make_request -> {e}");

            if e.is_timeout() {
                send_command!(tx_stats, AddError(Timeout));
            } else if e.is_redirect() {
                if let Some(last_redirect) = e.url() {
                    // get where we were headed (last_redirect) and where we came from (url)
                    let fancy_message = format!(
                        "{} !=> {} ({})",
                        url,
                        last_redirect,
                        style("too many redirects").red(),
                    );

                    let msg_status = match e.status() {
                        Some(status) => status.to_string(),
                        None => "ERR".to_string(),
                    };

                    let report = create_report_string(
                        &msg_status,
                        method,
                        "-1",
                        "-1",
                        "-1",
                        &fancy_message,
                        output_level,
                    );

                    send_command!(tx_stats, AddError(Redirection));

                    ferox_print(&report, &PROGRESS_PRINTER)
                };
            } else if is_certificate_error(&e) {
                // certificate problems get their own error message so callers can tell them
                // apart from every other failure
                log::warn!("Certificate error detected: {e}");
                send_command!(tx_stats, AddError(Certificate));
                bail!(":SSL: {e}");
            } else if e.is_connect() {
                send_command!(tx_stats, AddError(Connection));
            } else if e.is_request() {
                send_command!(tx_stats, AddError(Request));
            } else {
                send_command!(tx_stats, AddError(Other));
            }

            log::warn!("Error while making request: {e}");
            bail!("{}", e)
        }
        Ok(resp) => {
            log::trace!("exit: make_request -> {resp:?}");
            send_command!(tx_stats, AddStatus(resp.status()));
            Ok(resp)
        }
    }
}

/// Builds the standard, newline-terminated line used for output to file/terminal
///
/// - with `OutputLevel::Silent`, the line is just the url
/// - when `status` contains `MSG`, the three count columns are printed without unit suffixes
/// - otherwise the counts carry their `l`/`w`/`c` suffixes
///
/// example output:
/// 200      127l      283w     4134c http://localhost/faq
pub fn create_report_string(
    status: &str,
    method: &str,
    line_count: &str,
    word_count: &str,
    content_length: &str,
    url: &str,
    output_level: OutputLevel,
) -> String {
    // --silent used, just need the url
    if matches!(output_level, OutputLevel::Silent) {
        return format!("{url}\n");
    }

    // normal printing with status and sizes
    let color_status = status_colorizer(status);
    let is_message = status.contains("MSG");

    if is_message {
        return format!(
            "{color_status} {method:>8} {line_count:>9} {word_count:>9} {content_length:>9} {url}\n"
        );
    }

    format!(
        "{color_status} {method:>8} {line_count:>8}l {word_count:>8}w {content_length:>8}c {url}\n"
    )
}

/// Attempts to set the soft limit for the RLIMIT_NOFILE resource
///
/// RLIMIT_NOFILE is the maximum number of file descriptors that can be opened by this process
///
/// The soft limit is the value that the kernel enforces for the corresponding resource.
/// The hard limit acts as a ceiling for the soft limit: an unprivileged process may set only its
/// soft limit to a value in the range from 0 up to the hard limit, and (irreversibly) lower its
/// hard limit.
///
/// A child process created via fork(2) inherits its parent's resource limits. Resource limits are
/// per-process attributes that are shared by all of the threads in a process.
///
/// Based on the above information, no attempt is made to restore the limit to its pre-scan value
/// as the adjustment made here is only valid for the scan itself (and any child processes, of which
/// there are none).
///
/// Returns `true` when a new soft limit was set (either `limit` or, when the hard limit is not
/// above `limit`, the hard limit itself) and `false` when nothing could be changed.
#[cfg(not(target_os = "windows"))]
pub fn set_open_file_limit(limit: u64) -> bool {
    log::trace!("enter: set_open_file_limit");

    if let Ok((soft, hard)) = getrlimit(Resource::NOFILE) {
        let new_soft = if hard > limit {
            // our default open file limit is less than the current hard limit, this means we can
            // set the soft limit to our default
            Some(limit)
        } else if soft != hard {
            // hard limit is lower than our default, the next best option is to set the soft limit
            // as high as the hard limit will allow
            Some(hard)
        } else {
            // soft limit already sits at the hard limit, there's nothing left to raise
            None
        };

        if let Some(new_soft) = new_soft {
            if setrlimit(Resource::NOFILE, new_soft, hard).is_ok() {
                // report the value that was actually applied, which isn't always `limit`
                log::debug!("set open file descriptor limit to {new_soft}");

                log::trace!("exit: set_open_file_limit -> {}", true);
                return true;
            }
        }
    }

    // failed to set a new limit, as limit adjustments are a 'nice to have', we'll just log
    // and move along
    log::warn!("could not set open file descriptor limit to {limit}");

    log::trace!("exit: set_open_file_limit -> {}", false);
    false
}

/// Given a serializable value and a reference to a buffered file, write the value's contents
/// (with ansi codes stripped) and flush the buffer to disk.
///
/// `convert_to_json` selects between the value's `as_json` and `as_str` representations.
/// Nothing is written or flushed when the resulting contents are empty.
///
/// # Errors
///
/// Returns an error when json serialization fails or when writing/flushing the file fails
pub fn write_to<T>(
    value: &T,
    file: &mut io::BufWriter<fs::File>,
    convert_to_json: bool,
) -> Result<()>
where
    T: FeroxSerialize,
{
    // note to future self: adding logging of anything other than error to this function
    // is a bad idea. we call this function while processing records generated by the logger.
    // If we then call log::... while already processing some logging output, it results in
    // the second log entry being injected into the first.

    let contents = if convert_to_json {
        value.as_json()?
    } else {
        value.as_str()
    };

    let contents = strip_ansi_codes(&contents);

    if contents.is_empty() {
        return Ok(());
    }

    // write_all instead of write: a single write call is allowed to accept only part of the
    // buffer, which would silently truncate large records
    file.write_all(contents.as_bytes())?;

    // this function is used within async functions/loops, so i'm flushing so that in
    // the event of a ctrl+c or w/e results seen so far are saved instead of left lying
    // around in the buffer
    file.flush()?;

    Ok(())
}

/// decides whether `url_to_test` is denied by the absolute deny-url `denier`
///
/// a url is denied when it equals the denier, or when it shares the denier's host and its path
/// starts with the denier's path, unless an active scan on the same host sits between the
/// two (the scan's path starts with the denier's path and the tested path starts with the
/// scan's path), in which case the scan takes precedence and the url is allowed
///
/// # Errors
///
/// Returns an error when the scans handle can't be retrieved or an active scan's url can't be
/// parsed
fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>) -> Result<bool> {
    log::trace!(
        "enter: should_deny_absolute({}, {:?})",
        url_to_test.as_str(),
        denier.as_str(),
    );

    // simplest case is an exact match, check for it first
    if url_to_test == denier {
        log::trace!("exit: should_deny_absolute -> true");
        return Ok(true);
    }

    // .host() will return an enum with ipv4|6 or domain and is comparable
    // whereas .domain() returns None for ip addresses
    let (Some(tested_host), Some(denier_host)) = (url_to_test.host(), denier.host()) else {
        // one or the other couldn't determine the host value, which probably means
        // it's not suitable for further comparison
        return Ok(false);
    };

    if tested_host != denier_host {
        // domains don't even match
        return Ok(false);
    }

    // at this point, we have a matching set of ips or domain names. now we can process the
    // url path. The goal is to determine whether the given url's path is a subpath of any
    // url in the deny list, for example
    //    GIVEN URL                        URL DENY LIST               USER-SPECIFIED URLS TO SCAN
    //    http://some.domain/stuff/things, [http://some.domain/stuff], [http://some.domain] => true
    //    http://some.domain/stuff/things, [http://some.domain/stuff/things], [http://some.domain] => true
    //    http://some.domain/stuff/things, [http://some.domain/api], [http://some.domain] => false
    // the examples above are all pretty obvious, the kicker comes when the blocking url's
    // path is a parent to a scanned url
    //    http://some.domain/stuff/things, [http://some.domain/], [http://some.domain/stuff] => false
    //    http://some.domain/api, [http://some.domain/], [http://some.domain/stuff] => true
    // we want to deny all children of the parent, unless that child is a child of a scan
    // we specified through -u(s) or --stdin

    let deny_path = denier.path();
    let tested_path = url_to_test.path();

    if !tested_path.starts_with(deny_path) {
        // not a sub-path of the deny-url, nothing to deny
        log::trace!("exit: should_deny_absolute -> false");
        return Ok(false);
    }

    // the tested path is a sub-path of the current deny-url, now we just need to check to see
    // if this deny-url is a parent to a scanned url that is also a parent of the given url
    for ferox_scan in handles.ferox_scans()?.get_active_scans() {
        let scanner = parse_url_with_raw_path(ferox_scan.url().trim_end_matches('/'))
            .with_context(|| format!("Could not parse {ferox_scan} as a url"))?;

        // parse_url_with_raw_path only succeeds for urls that have an authority and a host, so
        // the else branch should never be hit. it stays in place so that, if that assumption
        // turns out to be mistaken, we'll know about it and can address it
        let Some(scan_host) = scanner.host() else {
            // couldn't process .host from scanner
            unreachable!("should_deny_absolute: scanner.host() returned None, which shouldn't be possible");
        };

        // same domain/ip check we perform on the denier above
        if tested_host != scan_host {
            // domains don't even match, keep on keepin' on...
            continue;
        }

        let scan_path = scanner.path();

        let scan_is_under_denier = scan_path.starts_with(deny_path);

        if scan_is_under_denier && tested_path.starts_with(scan_path) {
            // user-specified scan url is a sub-path of the deny-urls's path AND the
            // url to check is a sub-path of the user-specified scan url
            //
            // the assumption is the user knew what they wanted and we're going to give
            // the scanned url precedence, even though it's a sub-path
            log::trace!("exit: should_deny_absolute -> false");
            return Ok(false);
        }
    }

    log::trace!("exit: should_deny_absolute -> true");
    Ok(true)
}

/// determine if a url should be denied based on the given regular expression
///
/// the regex is matched against the url's complete string form (`Url::as_str`), i.e. scheme,
/// host, port, path, etc., so a pattern may target any part of the url, not only its path
fn should_deny_regex(url_to_test: &Url, denier: &Regex) -> bool {
    let candidate = url_to_test.as_str();

    log::trace!("enter: should_deny_regex({}, {})", candidate, denier,);

    let result = denier.is_match(candidate);

    log::trace!("exit: should_deny_regex -> {result}");
    result
}

/// determines whether or not a given url should be denied based on the user-supplied --dont-scan
/// flag
///
/// the url is normalized (trailing `/` removed) and then checked against every entry of
/// `url_denylist` followed by every entry of `regex_denylist`; the first hit denies the url.
/// an absolute-url check that fails with an error is treated as "not denied by that entry"
///
/// # Errors
///
/// Returns an error when the normalized url can't be parsed
pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {
    log::trace!(
        "enter: should_deny_url({}, {:?}, {:?})",
        url.as_str(),
        handles.config.url_denylist,
        handles.ferox_scans()?
    );

    // normalization for comparison is to remove the trailing / if one exists, this is done for
    // the given url and any url to which it's compared
    let normed_url = parse_url_with_raw_path(url.to_string().trim_end_matches('/'))?;

    // note to self: it may seem as though we can use regex only for --dont-scan, however, in
    // doing so, we lose the ability to block a parent directory while scanning a child
    let denied_by_url = handles.config.url_denylist.iter().any(|denier| {
        matches!(
            should_deny_absolute(&normed_url, denier, handles.clone()),
            Ok(true)
        )
    });

    if denied_by_url {
        return Ok(true);
    }

    let denied_by_regex = handles
        .config
        .regex_denylist
        .iter()
        .any(|denier| should_deny_regex(&normed_url, denier));

    if denied_by_regex {
        return Ok(true);
    }

    // made it to the end of the deny lists unscathed, return false, indicating we should not deny
    // this particular url
    log::trace!("exit: should_deny_url -> false");
    Ok(false)
}

/// given a url and filename-suffix, return a unique filename comprised of the slugified url,
/// current unix timestamp and suffix
///
/// ex: ferox-http_telsa_com-1606947491.state
pub fn slugify_filename(url: &str, prefix: &str, suffix: &str) -> String {
    log::trace!("enter: slugify({url:?}, {prefix:?}, {suffix:?})");

    let ts = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_else(|_| Duration::from_secs(0))
        .as_secs();

    let altered_prefix = if !prefix.is_empty() {
        format!("{prefix}-")
    } else {
        String::new()
    };

    let slug = url.replace("://", "_").replace(['/', '.', ':'], "_");

    let filename = format!("{altered_prefix}{slug}-{ts}.{suffix}");

    log::trace!("exit: slugify -> {filename}");
    filename
}

/// This function takes a url string and returns a `url::Url`
///
/// It is primarily used to detect url paths that `url::Url::parse` will
/// silently transform, such as /path/../file.html -> /file.html
///
/// # Warning
///
/// In the instance of a url with encoded path traversal strings, such as
/// /path/%2e%2e/file.html, the underlying `url::Url::parse` will
/// further encode the %-signs and return /path/%252e%252e/file.html
pub fn parse_url_with_raw_path(url: &str) -> Result<Url> {
    log::trace!("enter: parse_url_with_raw_path({url})");

    let parsed = Url::parse(url)?;

    if !parsed.has_authority() {
        // parsed correctly, but no authority, meaning mailto: or tel: or
        // some other url that we don't care about
        bail!("url to parse has no authority and is therefore invalid");
    }

    // thanks to @devx00: the possibility exists for Url to return true for
    // has_authority, but not have a host/port, so we'll check for that
    // and bail if it's the case
    if parsed.host().is_none() {
        bail!("url to parse doesn't have a host");
    }

    // we have a valid url, the next step is to check the path and see if it's
    // something that url::Url::parse would silently transform
    //
    // i.e. if the path is /path/../file.html, url::Url::parse will transform it
    // to /file.html, which is not what we want

    let farthest_right_authority_part;

    // we want to find the farthest right authority component, which is the
    // component that is the furthest right in the url that is part of the
    // authority
    //
    // per RFC 3986, the authority is defined as:
    // - authority = [ userinfo "@" ] host [ ":" port ]
    //
    // so the farthest right authority component is either the port or the host
    //
    // i.e. in http://example.com:80/path/file.html, the farthest right authority
    // component is :80
    //
    // in http://example.com/path/file.html, the farthest right authority component
    // is example.com
    //
    // the farthest right authority component is used to split the url into two
    // parts: the part before the authority and the part after the authority
    if let Some(port) = parsed.port() {
        // if the url has a port, then the farthest right authority component is
        // the port
        farthest_right_authority_part = format!(":{port}");
    } else if parsed.has_host() {
        // if the url has a host, then the farthest right authority component is
        // the host
        farthest_right_authority_part = parsed.host_str().unwrap().to_owned();
    } else {
        // if the url has neither a port nor a host, then the url is invalid
        // and we can't do anything with it, but i don't think this is possible
        unreachable!("url has an authority, but has neither a port nor a host");
    }

    // split the original url string into two parts: the part before the authority and the part
    // after the authority (i.e. the path + query + fragment)

    let Some((_, after_authority)) = url.split_once(&farthest_right_authority_part) else {
        // if we can't split the url string into two parts, then the url doesn't conform to our
        // expectations, and we can't continue processing it, so we'll return the parsed url
        return Ok(parsed);
    };

    // when there is a port, but it matches the default port for the scheme,
    // url::Url::parse will mark the port as None, giving us a
    // `after_authority` that looks something like this:
    // - :80/path/file.html
    let after_authority = after_authority
        .replacen(":80", "", 1)
        .replacen(":443", "", 1);

    // snippets from rfc-3986:
    //
    //          foo://example.com:8042/over/there?name=ferret#nose
    //          \_/   \______________/\_________/ \_________/ \__/
    //           |           |            |            |        |
    //        scheme     authority       path        query   fragment
    //
    // The path component is terminated
    //    by the first question mark ("?") or number sign ("#") character, or
    //    by the end of the URI.
    //
    // The query component is indicated by the first question
    //    mark ("?") character and terminated by a number sign ("#") character
    //    or by the end of the URI.
    let (path, _discarded) = after_authority
        .split_once('?')
        // if there isn't a '?', try to remove a fragment
        .unwrap_or_else(|| {
            // if there isn't a '#', return (original, empty)
            after_authority
                .split_once('#')
                .unwrap_or((&after_authority, ""))
        });

    // at this point, we have the path, all by itself

    // each of the following is a string that we can expect url::Url::parse to
    // transform. The variety is to ensure we cover most common path traversal
    // encodings
    let transformation_detectors = [
        // ascii
        "..",
        // single url encoded
        "%2e%2e",
        // double url encoded
        "%25%32%65%25%32%65",
        // utf-8 encoded
        "%c0%ae%c0%ae",
        "%e0%40%ae%e0%40%ae",
        "%c0ae%c0ae",
        // 16 bit shenanigans
        "%uff0e%uff0e",
        "%u002e%u002e",
    ];

    let parsing_will_transform_path = transformation_detectors
        .iter()
        .any(|detector| path.to_lowercase().contains(detector));

    if !parsing_will_transform_path {
        // there's no string in the path of the url that will trigger a transformation
        // so, we can return it as-is
        return Ok(parsed);
    }

    // if we reach this point, the path contains a string that will trigger a transformation
    // so we need to manually create a Url that doesn't have the transformation
    // and return that
    //
    // special thanks to github user @lavafroth for this workaround

    let mut hacked_url = if path.ends_with('/') {
        // from_file_path silently strips trailing slashes, and
        // from_directory_path adds them, so we'll choose the appropriate
        // constructor based on the presence of a path's trailing slash

        // according to from_file_path docs:
        //   from_file_path returns `Err` if the given path is not absolute or,
        //   on Windows, if the prefix is not a disk prefix (e.g. `C:`) or a UNC prefix (`\\`).
        //
        // since we parsed out a valid url path, we know it is absolute, so on non-windows
        // platforms, we can safely unwrap. On windows, we need to fix up the path
        #[cfg(target_os = "windows")]
        {
            let path = format!("\\/IGNOREME{path}");
            Url::from_directory_path(path).unwrap()
        }
        #[cfg(not(target_os = "windows"))]
        Url::from_directory_path(path).unwrap()
    } else {
        #[cfg(target_os = "windows")]
        {
            let path = format!("\\/IGNOREME{path}");
            Url::from_file_path(path).unwrap()
        }
        #[cfg(not(target_os = "windows"))]
        Url::from_file_path(path).unwrap()
    };

    // host must be set first, otherwise multiple components may return Err
    hacked_url.set_host(parsed.host_str())?;
    // scheme/port/username/password can fail, but in this instance, we know they won't
    hacked_url.set_scheme(parsed.scheme()).unwrap();
    hacked_url.set_port(parsed.port()).unwrap();
    hacked_url.set_username(parsed.username()).unwrap();
    hacked_url.set_password(parsed.password()).unwrap();
    // query/fragment can't fail
    hacked_url.set_query(parsed.query());
    hacked_url.set_fragment(parsed.fragment());

    log::trace!("exit: parse_url_with_raw_path -> {hacked_url}");
    Ok(hacked_url)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Configuration;
    use crate::scan_manager::{FeroxScans, ScanOrder};

    #[test]
    /// parse_url_with_raw_path with javascript:// must return the missing-host error rather
    /// than hitting an unimplemented! panic
    fn utils_parse_url_with_raw_path_javascript() {
        let error = parse_url_with_raw_path("javascript://")
            .expect_err("a url without a host should be rejected");

        let message = error.to_string();
        assert!(message.contains("url to parse doesn't have a host"));
    }

    #[test]
    /// multiple tests for parse_url_with_raw_path
    fn utils_parse_url_with_raw_path() {
        // ../.. is preserved
        let url = "https://www.google.com/../../stuff";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.as_str(), url);

        // ../.. is preserved as well as the trailing slash
        let url = "https://www.google.com/../../stuff/";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.as_str(), url);

        // no trailing slash is preserved
        let url = "https://www.google.com/stuff";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.as_str(), url);

        // trailing slash is preserved
        let url = "https://www.google.com/stuff/";
        let parsed: Url = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.as_str(), url);

        // mailto is an error
        let url = "mailto:user@example.com";
        let parsed = parse_url_with_raw_path(url);
        assert!(parsed.is_err());

        // relative url is an error
        let url = "../../stuff";
        let parsed = parse_url_with_raw_path(url);
        assert!(parsed.is_err());

        // absolute without host is an error
        let url = "/../../stuff";
        let parsed = parse_url_with_raw_path(url);
        assert!(parsed.is_err());

        // default ports are parsed correctly
        for url in [
            "http://example.com:80/path/file.html",
            "https://example.com:443/path/file.html",
        ] {
            let parsed = parse_url_with_raw_path(url).unwrap();
            assert!(parsed.port().is_none());
            assert_eq!(parsed.host().unwrap().to_string().as_str(), "example.com");
        }

        // non-default ports are parsed correctly
        for url in [
            "http://example.com:8080/path/file.html",
            "https://example.com:4433/path/file.html",
        ] {
            let parsed = parse_url_with_raw_path(url).unwrap();
            assert!(parsed.port().is_some());
            assert_eq!(parsed.as_str(), url);
        }

        // different encodings are respected if found in doubles
        //
        // note that the % sign is encoded as %25...
        let url = "http://user:pass@example.com/%2e%2e/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%252e%252e/stuff.php"
        );

        let url = "http://user:pass@example.com/%25%32%65%25%32%65/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%2525%2532%2565%2525%2532%2565/stuff.php"
        );

        let url = "http://user:pass@example.com/%c0%ae%c0%ae/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%25c0%25ae%25c0%25ae/stuff.php"
        );

        let url = "http://user:pass@example.com/%e0%40%ae%e0%40%ae/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%25e0%2540%25ae%25e0%2540%25ae/stuff.php"
        );

        let url = "http://user:pass@example.com/%c0ae%c0ae/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%25c0ae%25c0ae/stuff.php"
        );

        let url = "http://user:pass@example.com/%uff0e%uff0e/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%25uff0e%25uff0e/stuff.php"
        );

        let url = "http://user:pass@example.com/%u002e%u002e/stuff.php";
        let parsed = parse_url_with_raw_path(url).unwrap();
        assert_eq!(parsed.username(), "user");
        assert_eq!(parsed.password().unwrap(), "pass");
        assert_eq!(
            parsed.as_str(),
            "http://user:pass@example.com/%25u002e%25u002e/stuff.php"
        );
    }

    #[cfg(not(target_os = "windows"))]
    mod nix_only_tests {
        use super::*;

        #[test]
        /// set_open_file_limit with a low requested limit succeeds
        fn utils_set_open_file_limit_with_low_requested_limit() {
            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
            let lower_limit = hard - 1;
            assert!(set_open_file_limit(lower_limit));
        }

        #[test]
        /// set_open_file_limit with a high requested limit succeeds
        fn utils_set_open_file_limit_with_high_requested_limit() {
            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
            let higher_limit = hard + 1;
            // calculate a new soft to ensure soft != hard and hit that logic branch
            let new_soft = hard - 1;
            setrlimit(Resource::NOFILE, new_soft, hard).unwrap();
            assert!(set_open_file_limit(higher_limit));
        }

        #[test]
        /// set_open_file_limit should fail when hard == soft
        fn utils_set_open_file_limit_with_fails_when_both_limits_are_equal() {
            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
            // calculate a new soft to ensure soft == hard and hit the failure logic branch
            setrlimit(Resource::NOFILE, hard, hard).unwrap();
            assert!(!set_open_file_limit(hard)); // returns false
        }
    }

    #[test]
    /// status colorizer uses red for 500s
    fn status_colorizer_uses_red_for_500s() {
        assert_eq!(status_colorizer("500"), style("500").red().to_string());
    }

    #[test]
    /// status colorizer uses red for 400s
    fn status_colorizer_uses_red_for_400s() {
        assert_eq!(status_colorizer("400"), style("400").red().to_string());
    }

    #[test]
    /// status colorizer uses red for errors
    fn status_colorizer_uses_red_for_errors() {
        assert_eq!(status_colorizer("ERROR"), style("ERROR").red().to_string());
    }

    #[test]
    /// status colorizer uses cyan for wildcards
    fn status_colorizer_uses_cyan_for_wildcards() {
        assert_eq!(status_colorizer("WLD"), style("WLD").cyan().to_string());
    }

    #[test]
    /// status colorizer uses blue for 100s
    fn status_colorizer_uses_blue_for_100s() {
        assert_eq!(status_colorizer("100"), style("100").blue().to_string());
    }

    #[test]
    /// status colorizer uses green for 200s
    fn status_colorizer_uses_green_for_200s() {
        assert_eq!(status_colorizer("200"), style("200").green().to_string());
    }

    #[test]
    /// status colorizer uses yellow for 300s
    fn status_colorizer_uses_yellow_for_300s() {
        assert_eq!(status_colorizer("300"), style("300").yellow().to_string());
    }

    #[test]
    /// status colorizer doesnt color anything else
    fn status_colorizer_returns_as_is() {
        assert_eq!(status_colorizer("farfignewton"), "farfignewton".to_string());
    }

    #[test]
    /// provide a url that should be blocked where the denier is an exact match for the tested url
    /// expect true
    fn should_deny_url_blocks_when_denier_is_exact_match() {
        let scan_url = "https://testdomain.com/";
        let deny_url = "https://testdomain.com/denied";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a url that has a different host than the denier but the same path, expect false
    fn should_deny_url_doesnt_compare_mismatched_domains() {
        let scan_url = "https://testdomain.com/";
        let deny_url = "https://dev.testdomain.com/denied";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier from which we can't check a host, which results in no comparison, expect false
    fn should_deny_url_doesnt_compare_non_domains() {
        let scan_url = "https://testdomain.com/";
        let deny_url = "unix:/run/foo.socket";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a url that has a different host than the denier but the same path, expect false
    /// because the denier is a parent to the tested, even tho the scanned doesn't compare, it
    /// still returns true
    fn should_deny_url_doesnt_compare_mismatched_domains_in_scanned() {
        let deny_url = "https://testdomain.com/";
        let scan_url = "https://dev.testdomain.com/denied";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier from which we can't check a host, which results in no comparison, expect false
    /// because the denier is a parent to the tested, even tho the scanned doesn't compare, it
    /// still returns true
    ///
    /// note: adding parse_url_with_raw_path changed the behavior of this test, it used to return
    /// true, now it returns false. see my note in should_deny_absolute and the unreachable!
    /// call block to see why
    ///
    /// leaving this test here to document the behavior change and to catch regressions in the
    /// new expected behavior
    fn should_deny_url_doesnt_compare_non_domains_in_scanned() {
        let deny_url = "https://testdomain.com/";
        let scan_url = "unix:/run/foo.socket";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);
        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is a sub-path and the scanned url is not, expect true
    fn should_deny_url_blocks_child() {
        let scan_url = "https://testdomain.com/";
        let deny_url = "https://testdomain.com/api";
        let tested_url = Url::parse("https://testdomain.com/api/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is not a sub-path and the scanned url is not, expect false
    fn should_deny_url_doesnt_block_non_child() {
        let scan_url = "https://testdomain.com/";
        let deny_url = "https://testdomain.com/api";
        let tested_url = Url::parse("https://testdomain.com/not-denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is a sub-path and the scanned url is not, expect true
    fn should_deny_url_blocks_child_when_scan_url_isnt_parent() {
        let scan_url = "https://testdomain.com/api";
        let deny_url = "https://testdomain.com/";
        let tested_url = Url::parse("https://testdomain.com/stuff/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is not a sub-path and the scanned url is not, expect false
    fn should_deny_url_doesnt_block_child_when_scan_url_is_parent() {
        let scan_url = "https://testdomain.com/api";
        let deny_url = "https://testdomain.com/";
        let tested_url = Url::parse("https://testdomain.com/api/not-denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.url_denylist = vec![Url::parse(deny_url).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is matched against a regular expression in the path
    /// of the url
    fn should_deny_url_blocks_urls_based_on_regex_in_path() {
        let scan_url = "https://testdomain.com/";
        let deny_pattern = "/deni.*";
        let tested_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.regex_denylist = vec![Regex::new(deny_pattern).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
    /// provide a denier where the tested url is matched against a regular expression in the scheme
    /// of the url
    fn should_deny_url_blocks_urls_based_on_regex_in_scheme() {
        let scan_url = "https://testdomain.com/";
        let deny_pattern = "http:";
        let tested_http_url = Url::parse("http://testdomain.com/denied/").unwrap();
        let tested_https_url = Url::parse("https://testdomain.com/denied/").unwrap();

        let scans = Arc::new(FeroxScans::default());
        scans.add_directory_scan(
            scan_url,
            ScanOrder::Initial,
            Arc::new(Handles::for_testing(None, None).0),
        );

        let mut config = Configuration::new().unwrap();
        config.regex_denylist = vec![Regex::new(deny_pattern).unwrap()];
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);

        assert!(!should_deny_url(&tested_https_url, handles.clone()).unwrap());
        assert!(should_deny_url(&tested_http_url, handles).unwrap());
    }
}