Merge pull request #884 from epi052/878-support-raw-urls

878 support raw urls
2026-06-05 00:01:12 -03:00 · 2023-04-26 06:59:04 -05:00
parent ec78ec3049 9876759606
commit 1cf37e38a2
13 changed files with 392 additions and 65 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -839,7 +839,7 @@ dependencies = [

 [[package]]
 name = "feroxbuster"
-version = "2.9.4"
+version = "2.9.5"
 dependencies = [
 "anyhow",
 "assert_cmd",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "feroxbuster"
-version = "2.9.4"
+version = "2.9.5"
 authors = ["Ben 'epi' Risher (@epi052)"]
 license = "MIT"
 edition = "2021"
--- a/shell_completions/_feroxbuster
+++ b/shell_completions/_feroxbuster
@@ -24,8 +24,8 @@ _feroxbuster() {
 '--replay-proxy=[Send only unfiltered requests through a Replay Proxy, instead of all requests]:REPLAY_PROXY:_urls' \
 '*-R+[Status Codes to send through a Replay Proxy when found (default: --status-codes value)]:REPLAY_CODE: ' \
 '*--replay-codes=[Status Codes to send through a Replay Proxy when found (default: --status-codes value)]:REPLAY_CODE: ' \
-'-a+[Sets the User-Agent (default: feroxbuster/2.9.4)]:USER_AGENT: ' \
-'--user-agent=[Sets the User-Agent (default: feroxbuster/2.9.4)]:USER_AGENT: ' \
+'-a+[Sets the User-Agent (default: feroxbuster/2.9.5)]:USER_AGENT: ' \
+'--user-agent=[Sets the User-Agent (default: feroxbuster/2.9.5)]:USER_AGENT: ' \
 '*-x+[File extension(s) to search for (ex: -x php -x pdf js)]:FILE_EXTENSION: ' \
 '*--extensions=[File extension(s) to search for (ex: -x php -x pdf js)]:FILE_EXTENSION: ' \
 '*-m+[Which HTTP request method(s) should be sent (default: GET)]:HTTP_METHODS: ' \
--- a/shell_completions/_feroxbuster.ps1
+++ b/shell_completions/_feroxbuster.ps1
@@ -30,8 +30,8 @@ Register-ArgumentCompleter -Native -CommandName 'feroxbuster' -ScriptBlock {
            [CompletionResult]::new('--replay-proxy', 'replay-proxy', [CompletionResultType]::ParameterName, 'Send only unfiltered requests through a Replay Proxy, instead of all requests')
            [CompletionResult]::new('-R', 'R', [CompletionResultType]::ParameterName, 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)')
            [CompletionResult]::new('--replay-codes', 'replay-codes', [CompletionResultType]::ParameterName, 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)')
-            [CompletionResult]::new('-a', 'a', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.4)')
-            [CompletionResult]::new('--user-agent', 'user-agent', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.4)')
+            [CompletionResult]::new('-a', 'a', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.5)')
+            [CompletionResult]::new('--user-agent', 'user-agent', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.5)')
            [CompletionResult]::new('-x', 'x', [CompletionResultType]::ParameterName, 'File extension(s) to search for (ex: -x php -x pdf js)')
            [CompletionResult]::new('--extensions', 'extensions', [CompletionResultType]::ParameterName, 'File extension(s) to search for (ex: -x php -x pdf js)')
            [CompletionResult]::new('-m', 'm', [CompletionResultType]::ParameterName, 'Which HTTP request method(s) should be sent (default: GET)')
--- a/shell_completions/feroxbuster.elv
+++ b/shell_completions/feroxbuster.elv
@@ -27,8 +27,8 @@ set edit:completion:arg-completer[feroxbuster] = {|@words|
            cand --replay-proxy 'Send only unfiltered requests through a Replay Proxy, instead of all requests'
            cand -R 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)'
            cand --replay-codes 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)'
-            cand -a 'Sets the User-Agent (default: feroxbuster/2.9.4)'
-            cand --user-agent 'Sets the User-Agent (default: feroxbuster/2.9.4)'
+            cand -a 'Sets the User-Agent (default: feroxbuster/2.9.5)'
+            cand --user-agent 'Sets the User-Agent (default: feroxbuster/2.9.5)'
            cand -x 'File extension(s) to search for (ex: -x php -x pdf js)'
            cand --extensions 'File extension(s) to search for (ex: -x php -x pdf js)'
            cand -m 'Which HTTP request method(s) should be sent (default: GET)'
--- a/src/banner/container.rs
+++ b/src/banner/container.rs
@@ -2,12 +2,11 @@ use super::entry::BannerEntry;
 use crate::{
    config::Configuration,
    event_handlers::Handles,
-    utils::{logged_request, status_colorizer},
+    utils::{logged_request, parse_url_with_raw_path, status_colorizer},
    DEFAULT_IGNORED_EXTENSIONS, DEFAULT_METHOD, DEFAULT_STATUS_CODES, VERSION,
 };
 use anyhow::{bail, Result};
 use console::{style, Emoji};
-use reqwest::Url;
 use serde_json::Value;
 use std::{io::Write, sync::Arc};

@@ -478,7 +477,7 @@ by Ben "epi" Risher {}                 ver: {}"#,
    pub async fn check_for_updates(&mut self, url: &str, handles: Arc<Handles>) -> Result<()> {
        log::trace!("enter: needs_update({}, {:?})", url, handles);

-        let api_url = Url::parse(url)?;
+        let api_url = parse_url_with_raw_path(url)?;

        let result = logged_request(&api_url, DEFAULT_METHOD, None, handles.clone()).await?;
        let body = result.text().await?;
--- a/src/config/container.rs
+++ b/src/config/container.rs
@@ -6,7 +6,10 @@ use super::utils::{
 use crate::config::determine_output_level;
 use crate::config::utils::determine_requester_policy;
 use crate::{
-    client, parser, scan_manager::resume_scan, traits::FeroxSerialize, utils::fmt_err,
+    client, parser,
+    scan_manager::resume_scan,
+    traits::FeroxSerialize,
+    utils::{fmt_err, parse_url_with_raw_path},
    DEFAULT_CONFIG_NAME,
 };
 use anyhow::{anyhow, Context, Result};
@@ -673,7 +676,7 @@ impl Configuration {
            for denier in arg {
                // could be an absolute url or a regex, need to determine which and populate the
                // appropriate vector
-                match Url::parse(denier.trim_end_matches('/')) {
+                match parse_url_with_raw_path(denier.trim_end_matches('/')) {
                    Ok(absolute) => {
                        // denier is an absolute url and can be parsed as such
                        config.url_denylist.push(absolute);
--- a/src/event_handlers/scans.rs
+++ b/src/event_handlers/scans.rs
@@ -16,7 +16,7 @@ use crate::{
 use super::command::Command::AddToUsizeField;
 use super::*;
 use crate::statistics::StatField;
-use reqwest::Url;
+use crate::utils::parse_url_with_raw_path;
 use tokio::time::Duration;

 #[derive(Debug)]
@@ -325,7 +325,9 @@ impl ScanHandler {
                self.data.add_directory_scan(&target, order).1 // add the new target; return FeroxScan
            };

-            if should_test_deny && should_deny_url(&Url::parse(&target)?, self.handles.clone())? {
+            if should_test_deny
+                && should_deny_url(&parse_url_with_raw_path(&target)?, self.handles.clone())?
+            {
                // response was caught by a user-provided deny list
                // checking this last, since it's most susceptible to longer runtimes due to what
                // input is received
--- a/src/extractor/container.rs
+++ b/src/extractor/container.rs
@@ -11,7 +11,10 @@ use crate::{
        StatField::{LinksExtracted, TotalExpected},
    },
    url::FeroxUrl,
-    utils::{logged_request, make_request, send_try_recursion_command, should_deny_url},
+    utils::{
+        logged_request, make_request, parse_url_with_raw_path, send_try_recursion_command,
+        should_deny_url,
+    },
    ExtractionResult, DEFAULT_METHOD,
 };
 use anyhow::{bail, Context, Result};
@@ -122,7 +125,7 @@ impl<'a> Extractor<'a> {
    ) -> Result<()> {
        log::trace!("enter: parse_url_and_add_subpaths({:?})", links);

-        match Url::parse(url_to_parse) {
+        match parse_url_with_raw_path(url_to_parse) {
            Ok(absolute) => {
                if absolute.domain() != original_url.domain()
                    || absolute.host() != original_url.host()
@@ -475,7 +478,7 @@ impl<'a> Extractor<'a> {
            ExtractionTarget::ResponseBody | ExtractionTarget::DirectoryListing => {
                self.response.unwrap().url().clone()
            }
-            ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
+            ExtractionTarget::RobotsTxt => match parse_url_with_raw_path(&self.url) {
                Ok(u) => u,
                Err(e) => {
                    bail!("Could not parse {}: {}", self.url, e);
@@ -524,7 +527,7 @@ impl<'a> Extractor<'a> {

        for capture in self.robots_regex.captures_iter(body) {
            if let Some(new_path) = capture.name("url_path") {
-                let mut new_url = Url::parse(&self.url)?;
+                let mut new_url = parse_url_with_raw_path(&self.url)?;

                new_url.set_path(new_path.as_str());

@@ -654,7 +657,7 @@ impl<'a> Extractor<'a> {
            &client
        };

-        let mut url = Url::parse(&self.url)?;
+        let mut url = parse_url_with_raw_path(&self.url)?;
        url.set_path(location); // overwrite existing path

        // purposefully not using logged_request here due to using the special client
--- a/src/filters/utils.rs
+++ b/src/filters/utils.rs
@@ -4,11 +4,10 @@ use crate::event_handlers::Handles;
 use crate::filters::similarity::SIM_HASHER;
 use crate::nlp::preprocess;
 use crate::response::FeroxResponse;
-use crate::utils::logged_request;
+use crate::utils::{logged_request, parse_url_with_raw_path};
 use crate::DEFAULT_METHOD;
 use anyhow::Result;
 use regex::Regex;
-use reqwest::Url;
 use std::sync::Arc;

 /// wrapper around logic necessary to create a SimilarityFilter
@@ -23,7 +22,7 @@ pub(crate) async fn create_similarity_filter(
    handles: Arc<Handles>,
 ) -> Result<SimilarityFilter> {
    // url as-is based on input, ignores user-specified url manipulation options (add-slash etc)
-    let url = Url::parse(similarity_filter)?;
+    let url = parse_url_with_raw_path(similarity_filter)?;

    // attempt to request the given url
    let resp = logged_request(&url, DEFAULT_METHOD, None, handles.clone()).await?;
--- a/src/response.rs
+++ b/src/response.rs
@@ -21,7 +21,7 @@ use crate::{
    event_handlers::{Command, Handles},
    traits::FeroxSerialize,
    url::FeroxUrl,
-    utils::{self, fmt_err, status_colorizer},
+    utils::{self, fmt_err, parse_url_with_raw_path, status_colorizer},
    CommandSender,
 };

@@ -140,7 +140,7 @@ impl FeroxResponse {

    /// Set `FeroxResponse`'s `url` attribute, has no affect if an error occurs
    pub fn set_url(&mut self, url: &str) {
-        match Url::parse(url) {
+        match parse_url_with_raw_path(url) {
            Ok(url) => {
                self.url = url;
            }
@@ -599,7 +599,7 @@ impl<'de> Deserialize<'de> for FeroxResponse {
            match key.as_str() {
                "url" => {
                    if let Some(url) = value.as_str() {
-                        if let Ok(parsed) = Url::parse(url) {
+                        if let Ok(parsed) = parse_url_with_raw_path(url) {
                            response.url = parsed;
                        }
                    }
--- a/src/url.rs
+++ b/src/url.rs
@@ -1,3 +1,4 @@
+use crate::utils::parse_url_with_raw_path;
 use crate::{event_handlers::Handles, statistics::StatError::UrlFormat, Command::AddError};
 use anyhow::{anyhow, bail, Result};
 use reqwest::Url;
@@ -142,19 +143,19 @@ impl FeroxUrl {
            word = word.trim_start_matches('/').to_string();
        };

-        let base_url = Url::parse(&url)?;
-        let joined = base_url.join(&word)?;
+        let base_url = parse_url_with_raw_path(&url)?;
+        let mut joined = base_url.join(&word)?;

-        if self.handles.config.queries.is_empty() {
-            // no query params to process
-            log::trace!("exit: format -> {}", joined);
-            Ok(joined)
-        } else {
-            let with_params =
-                Url::parse_with_params(joined.as_str(), &self.handles.config.queries)?;
-            log::trace!("exit: format_url -> {}", with_params);
-            Ok(with_params) // request with params attached
+        if !self.handles.config.queries.is_empty() {
+            // query_pairs_mut() adds a '?' to the url as soon as it's called, whether or not
+            // any pairs follow, so it's only called when there are queries to be added
+            joined
+                .query_pairs_mut()
+                .extend_pairs(self.handles.config.queries.iter());
        }
+
+        log::trace!("exit: format_url -> {}", joined);
+        Ok(joined)
    }

    /// Simple helper to abstract away adding a forward-slash to a url if not present
@@ -189,7 +190,7 @@ impl FeroxUrl {

        let target = self.normalize();

-        let parsed = Url::parse(&target)?;
+        let parsed = parse_url_with_raw_path(&target)?;
        let parts = parsed
            .path_segments()
            .ok_or_else(|| anyhow!("No path segments found"))?;
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -425,9 +425,14 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
        // current deny-url, now we just need to check to see if this deny-url is a parent
        // to a scanned url that is also a parent of the given url
        for ferox_scan in handles.ferox_scans()?.get_active_scans() {
-            let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
+            let scanner = parse_url_with_raw_path(ferox_scan.url().trim_end_matches('/'))
                .with_context(|| format!("Could not parse {ferox_scan} as a url"))?;

+            // by calling the new parse_url_with_raw_path, and reaching this point without an
+            // error, we know we have an authority and therefore a host. leaving the code
+            // below, but we should never hit the else condition. leaving it in so if we find
+            // a case where i'm mistaken, we'll know about it and can address it
+
            if let Some(scan_host) = scanner.host() {
                // same domain/ip check we perform on the denier above
                if tested_host != scan_host {
@@ -436,7 +441,7 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
                }
            } else {
                // couldn't process .host from scanner
-                continue;
+                unreachable!("should_deny_absolute: scanner.host() returned None, which shouldn't be possible");
            };

            let scan_path = scanner.path();
@@ -487,7 +492,7 @@ pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {

    // normalization for comparison is to remove the trailing / if one exists, this is done for
    // the given url and any url to which it's compared
-    let normed_url = Url::parse(url.to_string().trim_end_matches('/'))?;
+    let normed_url = parse_url_with_raw_path(url.to_string().trim_end_matches('/'))?;

    for denier in &handles.config.url_denylist {
        // note to self: it may seem as though we can use regex only for --dont-scan, however, in
@@ -537,6 +542,187 @@ pub fn slugify_filename(url: &str, prefix: &str, suffix: &str) -> String {
    filename
 }

+/// Parses `url` into a `url::Url` while keeping path traversal sequences intact
+///
+/// `url::Url::parse` silently normalizes paths, i.e. /path/../file.html becomes
+/// /file.html. When the raw path contains a traversal sequence (plain or one of
+/// several common encodings), the `Url` is rebuilt around the raw path so that
+/// requests go to the path that was actually asked for.
+///
+/// If the rebuild isn't possible, the normally parsed (i.e. normalized) `Url` is
+/// returned instead
+///
+/// # Errors
+///
+/// Returns an error when `url` can't be parsed, or when it parses but has no
+/// authority or no host (mailto:, tel:, unix:/run/foo.socket, file:///tmp/foo, etc)
+///
+/// # Warning
+///
+/// In the instance of a url with encoded path traversal strings, such as
+/// /path/%2e%2e/file.html, the underlying `url::Url` constructors will
+/// further encode the %-signs and return /path/%252e%252e/file.html
+pub fn parse_url_with_raw_path(url: &str) -> Result<Url> {
+    log::trace!("enter: parse_url_with_raw_path({})", url);
+
+    let parsed = Url::parse(url)?;
+
+    if !parsed.has_authority() {
+        // parsed correctly, but no authority, meaning mailto: or tel: or
+        // some other url that we don't care about
+        bail!("url to parse has no authority and is therefore invalid");
+    }
+
+    if !parsed.has_host() {
+        // an authority can be present but empty (i.e. file:///tmp/foo); such urls come
+        // from user input and extracted links, so this is an error and not a panic
+        bail!("url to parse has no host and is therefore invalid");
+    }
+
+    // we have a valid url, the next step is to check the path and see if it's
+    // something that url::Url::parse would silently transform
+    //
+    // i.e. if the path is /path/../file.html, url::Url::parse will transform it
+    // to /file.html, which is not what we want
+    //
+    // to do that, we need the path exactly as it was given to us, so we locate it in
+    // the original string by position instead of searching for the normalized host or
+    // port. searching breaks when the host was given in uppercase, when the userinfo
+    // contains the same text as the host/port, or when the path contains something
+    // that looks like a default port (i.e. /foo:80/bar)
+
+    // url::Url::parse ignores leading/trailing whitespace and control characters
+    let trimmed = url.trim_matches(|c: char| c <= ' ');
+
+    // a scheme can't contain a ':', so the first one is always the scheme delimiter
+    let Some((_scheme, after_scheme)) = trimmed.split_once(':') else {
+        // if we can't split the url string into two parts, then the url doesn't conform to our
+        // expectations, and we can't continue processing it, so we'll return the parsed url
+        return Ok(parsed);
+    };
+
+    // special schemes (http, https, ...) accept any mix of '/' and '\' before the authority
+    let is_slash = |c: char| c == '/' || c == '\\';
+    let from_authority = after_scheme.trim_start_matches(is_slash);
+
+    // snippets from rfc-3986:
+    //
+    //          foo://example.com:8042/over/there?name=ferret#nose
+    //          \_/   \______________/\_________/ \_________/ \__/
+    //           |           |            |            |        |
+    //        scheme     authority       path        query   fragment
+    //
+    // The authority component is preceded by a double slash ("//") and is
+    //    terminated by the next slash ("/"), question mark ("?"), or number
+    //    sign ("#") character, or by the end of the URI.
+    //
+    // The path component is terminated
+    //    by the first question mark ("?") or number sign ("#") character, or
+    //    by the end of the URI.
+    let after_authority = from_authority
+        .find(|c: char| is_slash(c) || c == '?' || c == '#')
+        .map_or("", |authority_end| &from_authority[authority_end..]);
+
+    // whichever of '?' or '#' comes first ends the path; checking for '?' first and '#'
+    // second would leave the fragment attached to the path in /path#frag?not-a-query
+    let path = after_authority
+        .find(|c: char| c == '?' || c == '#')
+        .map_or(after_authority, |path_end| &after_authority[..path_end]);
+
+    // at this point, we have the path, all by itself
+
+    // each of the following is a string that we can expect url::Url::parse to
+    // transform. The variety is to ensure we cover most common path traversal
+    // encodings
+    const TRANSFORMATION_DETECTORS: [&str; 8] = [
+        // ascii
+        "..",
+        // single url encoded
+        "%2e%2e",
+        // double url encoded
+        "%25%32%65%25%32%65",
+        // utf-8 encoded
+        "%c0%ae%c0%ae",
+        "%e0%40%ae%e0%40%ae",
+        "%c0ae%c0ae",
+        // 16 bit shenanigans
+        "%uff0e%uff0e",
+        "%u002e%u002e",
+    ];
+
+    // lowercase once, instead of once per detector
+    let lowered_path = path.to_lowercase();
+    let parsing_will_transform_path = TRANSFORMATION_DETECTORS
+        .iter()
+        .any(|detector| lowered_path.contains(detector));
+
+    if !parsing_will_transform_path {
+        // there's no string in the path of the url that will trigger a transformation
+        // so, we can return it as-is
+        return Ok(parsed);
+    }
+
+    // if we reach this point, the path contains a string that will trigger a transformation
+    // so we need to manually create a Url that doesn't have the transformation
+    // and return that
+    //
+    // special thanks to github user @lavafroth for this workaround
+    //
+    // the workaround isn't possible for every url (i.e. a path written with backslashes
+    // or a scheme that a file:// url can't be converted to); when that happens, the
+    // normally parsed url is returned, same as any other url we can't process
+    let with_raw_path = rebuild_with_raw_path(&parsed, path).unwrap_or(parsed);
+
+    log::trace!("exit: parse_url_with_raw_path -> {}", with_raw_path);
+    Ok(with_raw_path)
+}
+
+/// Builds a copy of `parsed` whose path is the untransformed `path`
+///
+/// Returns `None` if `path` isn't absolute or if any component of `parsed` can't be
+/// applied to the rebuilt url
+fn rebuild_with_raw_path(parsed: &Url, path: &str) -> Option<Url> {
+    if !path.starts_with('/') {
+        // only a '/'-rooted path can stand in for a file path; anything else, such as
+        // the '\'-separated paths accepted for http(s), is left to url::Url::parse
+        return None;
+    }
+
+    // from_file_path silently strips trailing slashes, and
+    // from_directory_path adds them, so we'll choose the appropriate
+    // constructor based on the presence of a path's trailing slash
+    let is_directory = path.ends_with('/');
+
+    // according to from_file_path docs:
+    //   from_file_path returns `Err` if the given path is not absolute or,
+    //   on Windows, if the prefix is not a disk prefix (e.g. `C:`) or a UNC prefix (`\\`).
+    //
+    // on non-windows platforms the path can be used as-is; on windows, we need to fix
+    // up the path first
+    #[cfg(target_os = "windows")]
+    let path = format!("\\/IGNOREME{path}");
+
+    let mut hacked_url = if is_directory {
+        Url::from_directory_path(path).ok()?
+    } else {
+        Url::from_file_path(path).ok()?
+    };
+
+    // host must be set first, otherwise multiple components may return Err
+    hacked_url.set_host(parsed.host_str()).ok()?;
+    // scheme/port/username/password fail for some schemes, i.e. a special scheme like
+    // file can't become a non-special one such as irc, so no unwrapping here
+    hacked_url.set_scheme(parsed.scheme()).ok()?;
+    hacked_url.set_port(parsed.port()).ok()?;
+    hacked_url.set_username(parsed.username()).ok()?;
+    hacked_url.set_password(parsed.password()).ok()?;
+    // query/fragment can't fail
+    hacked_url.set_query(parsed.query());
+    hacked_url.set_fragment(parsed.fragment());
+
+    Some(hacked_url)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -544,31 +730,159 @@ mod tests {
    use crate::scan_manager::{FeroxScans, ScanOrder};

    #[test]
-    /// set_open_file_limit with a low requested limit succeeds
-    fn utils_set_open_file_limit_with_low_requested_limit() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        let lower_limit = hard - 1;
-        assert!(set_open_file_limit(lower_limit));
+    /// multiple tests for parse_url_with_raw_path
+    fn utils_parse_url_with_raw_path() {
+        // ../.. is preserved
+        let url = "https://www.google.com/../../stuff";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // ../.. is preserved as well as the trailing slash
+        let url = "https://www.google.com/../../stuff/";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // no trailing slash is preserved
+        let url = "https://www.google.com/stuff";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // trailing slash is preserved
+        let url = "https://www.google.com/stuff/";
+        let parsed: Url = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // mailto is an error
+        let url = "mailto:user@example.com";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // relative url is an error
+        let url = "../../stuff";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // absolute without host is an error
+        let url = "/../../stuff";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // default ports are parsed correctly
+        for url in [
+            "http://example.com:80/path/file.html",
+            "https://example.com:443/path/file.html",
+        ] {
+            let parsed = parse_url_with_raw_path(url).unwrap();
+            assert!(parsed.port().is_none());
+            assert_eq!(parsed.host().unwrap().to_string().as_str(), "example.com");
+        }
+
+        // non-default ports are parsed correctly
+        for url in [
+            "http://example.com:8080/path/file.html",
+            "https://example.com:4433/path/file.html",
+        ] {
+            let parsed = parse_url_with_raw_path(url).unwrap();
+            assert!(parsed.port().is_some());
+            assert_eq!(parsed.as_str(), url);
+        }
+
+        // different encodings are respected if found in doubles
+        //
+        // note that the % sign is encoded as %25...
+        let url = "http://user:pass@example.com/%2e%2e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%252e%252e/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%25%32%65%25%32%65/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%2525%2532%2565%2525%2532%2565/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%c0%ae%c0%ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25c0%25ae%25c0%25ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%e0%40%ae%e0%40%ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25e0%2540%25ae%25e0%2540%25ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%c0ae%c0ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25c0ae%25c0ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%uff0e%uff0e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25uff0e%25uff0e/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%u002e%u002e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25u002e%25u002e/stuff.php"
+        );
    }

-    #[test]
-    /// set_open_file_limit with a high requested limit succeeds
-    fn utils_set_open_file_limit_with_high_requested_limit() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        let higher_limit = hard + 1;
-        // calculate a new soft to ensure soft != hard and hit that logic branch
-        let new_soft = hard - 1;
-        setrlimit(Resource::NOFILE, new_soft, hard).unwrap();
-        assert!(set_open_file_limit(higher_limit));
-    }
+    #[cfg(not(target_os = "windows"))]
+    mod nix_only_tests {
+        use super::*;

-    #[test]
-    /// set_open_file_limit should fail when hard == soft
-    fn utils_set_open_file_limit_with_fails_when_both_limits_are_equal() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        // calculate a new soft to ensure soft == hard and hit the failure logic branch
-        setrlimit(Resource::NOFILE, hard, hard).unwrap();
-        assert!(!set_open_file_limit(hard)); // returns false
+        #[test]
+        /// set_open_file_limit with a low requested limit succeeds
+        fn utils_set_open_file_limit_with_low_requested_limit() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            let lower_limit = hard - 1;
+            assert!(set_open_file_limit(lower_limit));
+        }
+
+        #[test]
+        /// set_open_file_limit with a high requested limit succeeds
+        fn utils_set_open_file_limit_with_high_requested_limit() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            let higher_limit = hard + 1;
+            // calculate a new soft to ensure soft != hard and hit that logic branch
+            let new_soft = hard - 1;
+            setrlimit(Resource::NOFILE, new_soft, hard).unwrap();
+            assert!(set_open_file_limit(higher_limit));
+        }
+
+        #[test]
+        /// set_open_file_limit should fail when hard == soft
+        fn utils_set_open_file_limit_with_fails_when_both_limits_are_equal() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            // calculate a new soft to ensure soft == hard and hit the failure logic branch
+            setrlimit(Resource::NOFILE, hard, hard).unwrap();
+            assert!(!set_open_file_limit(hard)); // returns false
+        }
    }

    #[test]
@@ -702,6 +1016,13 @@ mod tests {
    /// provide a denier from which we can't check a host, which results in no comparison, expect false
    /// because the denier is a parent to the tested, even tho the scanned doesn't compare, it
    /// still returns true
+    ///
+    /// note: adding parse_url_with_raw_path changed the behavior of this test, it used to return
+    /// true, now it returns false. see my note in should_deny_absolute and the unreachable!
+    /// call block to see why
+    ///
+    /// leaving this test here to document the behavior change and to catch regressions in the
+    /// new expected behavior
    fn should_deny_url_doesnt_compare_non_domains_in_scanned() {
        let deny_url = "https://testdomain.com/";
        let scan_url = "unix:/run/foo.socket";
@@ -715,8 +1036,7 @@ mod tests {
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);
-
-        assert!(should_deny_url(&tested_url, handles).unwrap());
+        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]