broke absolute path denial into its own function; tests pass

2026-06-09 03:51:13 -03:00 · 2021-10-01 06:23:44 -05:00
parent 28fa90b093
commit 28769b5028
1 changed files with 197 additions and 72 deletions
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,6 +1,7 @@
 use anyhow::{bail, Context, Result};
 use console::{strip_ansi_codes, style, user_attended};
 use indicatif::ProgressBar;
+use regex::Regex;
 use reqwest::{Client, Response, StatusCode, Url};
 #[cfg(not(target_os = "windows"))]
 use rlimit::{getrlimit, setrlimit, Resource};
@@ -288,6 +289,94 @@ where
    Ok(())
 }

+/// todo
+fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>) -> Result<bool> {
+    log::trace!(
+        "enter: should_deny_absolute({}, {:?})",
+        url_to_test.as_str(),
+        denier.as_str(),
+    );
+
+    // simplest case is an exact match, check for it first
+    if url_to_test == denier {
+        log::trace!("exit: should_deny_absolute -> true");
+        return Ok(true);
+    }
+
+    match (url_to_test.host(), denier.host()) {
+        // .host() will return an enum with ipv4|6 or domain and is comparable
+        // whereas .domain() returns None for ip addresses
+        (Some(normed_host), Some(denier_host)) => {
+            if normed_host != denier_host {
+                // domains don't even match
+                return Ok(false);
+            }
+        }
+        _ => {
+            // one or the other couldn't determine the host value, which probably means
+            // it's not suitable for further comparison
+            return Ok(false);
+        }
+    }
+
+    let tested_host = url_to_test.host().unwrap(); // match above will catch errors
+
+    // at this point, we have a matching set of ips or domain names. now we can process the
+    // url path. The goal is to determine whether the given url's path is a subpath of any
+    // url in the deny list, for example
+    //    GIVEN URL                        URL DENY LIST               USER-SPECIFIED URLS TO SCAN
+    //    http://some.domain/stuff/things, [http://some.domain/stuff], [http://some.domain] => true
+    //    http://some.domain/stuff/things, [http://some.domain/stuff/things], [http://some.domain] => true
+    //    http://some.domain/stuff/things, [http://some.domain/api], [http://some.domain] => false
+    // the examples above are all pretty obvious, the kicker comes when the blocking url's
+    // path is a parent to a scanned url
+    //    http://some.domain/stuff/things, [http://some.domain/], [http://some.domain/stuff] => false
+    //    http://some.domain/api, [http://some.domain/], [http://some.domain/stuff] => true
+    // we want to deny all children of the parent, unless that child is a child of a scan
+    // we specified through -u(s) or --stdin
+
+    let deny_path = denier.path();
+    let tested_path = url_to_test.path();
+
+    if tested_path.starts_with(deny_path) {
+        // at this point, we know that the given normalized path is a sub-path of the
+        // current deny-url, now we just need to check to see if this deny-url is a parent
+        // to a scanned url that is also a parent of the given url
+        for ferox_scan in handles.ferox_scans()?.get_active_scans() {
+            let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
+                .with_context(|| format!("Could not parse {} as a url", ferox_scan))?;
+
+            if let Some(scan_host) = scanner.host() {
+                // same domain/ip check we perform on the denier above
+                if tested_host != scan_host {
+                    // domains don't even match, keep on keepin' on...
+                    continue;
+                }
+            } else {
+                // couldn't process .host from scanner
+                continue;
+            };
+
+            let scan_path = scanner.path();
+
+            if scan_path.starts_with(deny_path) && tested_path.starts_with(scan_path) {
+                // user-specified scan url is a sub-path of the deny-urls's path AND the
+                // url to check is a sub-path of the user-specified scan url
+                //
+                // the assumption is the user knew what they wanted and we're going to give
+                // the scanned url precedence, even though it's a sub-path
+                log::trace!("exit: should_deny_absolute -> false");
+                return Ok(false);
+            }
+        }
+        log::trace!("exit: should_deny_absolute -> true");
+        return Ok(true);
+    }
+
+    log::trace!("exit: should_deny_absolute -> false");
+    Ok(false)
+}
+
 /// determines whether or not a given url should be denied based on the user-supplied --dont-scan
 /// flag
 pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {
@@ -297,91 +386,127 @@ pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {
        handles.config.url_denylist,
        handles.ferox_scans()?
    );
+
    // normalization for comparison is to remove the trailing / if one exists, this is done for
    // the given url and any url to which it's compared
    let normed_url = Url::parse(url.to_string().trim_end_matches('/'))?;

    for deny_url in &handles.config.url_denylist {
-        // parse the denying url for easier comparison
-        let denier = Url::parse(deny_url.trim_end_matches('/'))
-            .with_context(|| format!("Could not parse {} as a url", deny_url))?;
-
-        // simplest case is an exact match, check for it first
-        if normed_url == denier {
-            log::trace!("exit: should_deny_url -> true");
-            return Ok(true);
-        }
-
-        match (normed_url.host(), denier.host()) {
-            // .host() will return an enum with ipv4|6 or domain and is comparable
-            // whereas .domain() returns None for ip addresses
-            (Some(normed_host), Some(denier_host)) => {
-                if normed_host != denier_host {
-                    // domains don't even match, keep on keepin' on...
-                    continue;
+        let denier = match Url::parse(deny_url.trim_end_matches('/')) {
+            // todo consider breaking this out into a function, it's reused in
+            // todo /home/epi/PycharmProjects/feroxbuster/src/extractor/container.rs:L157
+            Ok(abs_denier) => {
+                // url is an absolute url and can be parsed as such
+                match should_deny_absolute(&normed_url, &abs_denier, handles.clone()) {
+                    // should_deny_absolute returns bool, if it's true, we found should deny
+                    // this url and no further processing is needed; otherwise, need to continue
+                    // to the next item in the deny_list
+                    Ok(true) => return Ok(true),
+                    _ => continue,
                }
            }
-            _ => {
-                // one or the other couldn't determine the host value, which probably means
-                // it's not suitable for further comparison
-                continue;
-            }
-        }
-
-        let normed_host = normed_url.host().unwrap(); // match above will catch errors
-
-        // at this point, we have a matching set of ips or domain names. now we can process the
-        // url path. The goal is to determine whether the given url's path is a subpath of any
-        // url in the deny list, for example
-        //    GIVEN URL                        URL DENY LIST               USER-SPECIFIED URLS TO SCAN
-        //    http://some.domain/stuff/things, [http://some.domain/stuff], [http://some.domain] => true
-        //    http://some.domain/stuff/things, [http://some.domain/stuff/things], [http://some.domain] => true
-        //    http://some.domain/stuff/things, [http://some.domain/api], [http://some.domain] => false
-        // the examples above are all pretty obvious, the kicker comes when the blocking url's
-        // path is a parent to a scanned url
-        //    http://some.domain/stuff/things, [http://some.domain/], [http://some.domain/stuff] => false
-        //    http://some.domain/api, [http://some.domain/], [http://some.domain/stuff] => true
-        // we want to deny all children of the parent, unless that child is a child of a scan
-        // we specified through -u(s) or --stdin
-
-        let deny_path = denier.path();
-        let norm_path = normed_url.path();
-
-        if norm_path.starts_with(deny_path) {
-            // at this point, we know that the given normalized path is a sub-path of the
-            // current deny-url, now we just need to check to see if this deny-url is a parent
-            // to a scanned url that is also a parent of the given url
-            for ferox_scan in handles.ferox_scans()?.get_active_scans() {
-                let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
-                    .with_context(|| format!("Could not parse {} as a url", ferox_scan))?;
-
-                if let Some(scan_host) = scanner.host() {
-                    // same domain/ip check we perform on the denier above
-                    if normed_host != scan_host {
-                        // domains don't even match, keep on keepin' on...
-                        continue;
+            Err(e) => {
+                // this is the expected error that happens when we try to parse a url fragment
+                //     ex: Url::parse("/login") -> Err("relative URL without a base")
+                if e.to_string().contains("relative URL without a base") {
+                    // url without a base from the --dont-scan flag; we're going to assume
+                    // that the input is a regular expression to be parsed. The possibility
+                    // exists that the user rolled their face across the keyboard and we're
+                    // dealing with the results, but we'll attempt to work with what we're given
+                    if let Ok(regex_denier) = Regex::new(deny_url) {
+                        // todo figure out a way to split out normal url comparisons from regex
                    }
                } else {
-                    // couldn't process .host from scanner
+                    // unexpected error has occurred
+                    log::warn!("Could not parse given url denier: {}", e);
+                    handles.stats.send(AddError(Other)).unwrap_or_default();
                    continue;
-                };
-
-                let scan_path = scanner.path();
-
-                if scan_path.starts_with(deny_path) && norm_path.starts_with(scan_path) {
-                    // user-specified scan url is a sub-path of the deny-urls's path AND the
-                    // url to check is a sub-path of the user-specified scan url
-                    //
-                    // the assumption is the user knew what they wanted and we're going to give
-                    // the scanned url precedence, even though it's a sub-path
-                    log::trace!("exit: should_deny_url -> false");
-                    return Ok(false);
                }
            }
-            log::trace!("exit: should_deny_url -> true");
-            return Ok(true);
-        }
+        };
+
+        // parse the denying url for easier comparison
+        // let denier = Url::parse(deny_url.trim_end_matches('/'))
+        //     .with_context(|| format!("Could not parse {} as a url", deny_url))?;
+
+        // // simplest case is an exact match, check for it first
+        // if normed_url == denier {
+        //     log::trace!("exit: should_deny_url -> true");
+        //     return Ok(true);
+        // }
+        //
+        // match (normed_url.host(), denier.host()) {
+        //     // .host() will return an enum with ipv4|6 or domain and is comparable
+        //     // whereas .domain() returns None for ip addresses
+        //     (Some(normed_host), Some(denier_host)) => {
+        //         if normed_host != denier_host {
+        //             // domains don't even match, keep on keepin' on...
+        //             continue;
+        //         }
+        //     }
+        //     _ => {
+        //         // one or the other couldn't determine the host value, which probably means
+        //         // it's not suitable for further comparison
+        //         continue;
+        //     }
+        // }
+        //
+        // let normed_host = normed_url.host().unwrap(); // match above will catch errors
+
+        // // at this point, we have a matching set of ips or domain names. now we can process the
+        // // url path. The goal is to determine whether the given url's path is a subpath of any
+        // // url in the deny list, for example
+        // //    GIVEN URL                        URL DENY LIST               USER-SPECIFIED URLS TO SCAN
+        // //    http://some.domain/stuff/things, [http://some.domain/stuff], [http://some.domain] => true
+        // //    http://some.domain/stuff/things, [http://some.domain/stuff/things], [http://some.domain] => true
+        // //    http://some.domain/stuff/things, [http://some.domain/api], [http://some.domain] => false
+        // // the examples above are all pretty obvious, the kicker comes when the blocking url's
+        // // path is a parent to a scanned url
+        // //    http://some.domain/stuff/things, [http://some.domain/], [http://some.domain/stuff] => false
+        // //    http://some.domain/api, [http://some.domain/], [http://some.domain/stuff] => true
+        // // we want to deny all children of the parent, unless that child is a child of a scan
+        // // we specified through -u(s) or --stdin
+        //
+        // let deny_path = denier.path();
+        // let norm_path = normed_url.path();
+        //
+        // if norm_path.starts_with(deny_path) {
+        //     // at this point, we know that the given normalized path is a sub-path of the
+        //     // current deny-url, now we just need to check to see if this deny-url is a parent
+        //     // to a scanned url that is also a parent of the given url
+        //     for ferox_scan in handles.ferox_scans()?.get_active_scans() {
+        //         let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
+        //             .with_context(|| format!("Could not parse {} as a url", ferox_scan))?;
+        //
+        //         if let Some(scan_host) = scanner.host() {
+        //             // same domain/ip check we perform on the denier above
+        //             if normed_host != scan_host {
+        //                 // domains don't even match, keep on keepin' on...
+        //                 continue;
+        //             }
+        //         } else {
+        //             // couldn't process .host from scanner
+        //             continue;
+        //         };
+        //
+        //         let scan_path = scanner.path();
+        //
+        //         if scan_path.starts_with(deny_path) && norm_path.starts_with(scan_path) {
+        //             // user-specified scan url is a sub-path of the deny-urls's path AND the
+        //             // url to check is a sub-path of the user-specified scan url
+        //             //
+        //             // the assumption is the user knew what they wanted and we're going to give
+        //             // the scanned url precedence, even though it's a sub-path
+        //             log::trace!("exit: should_deny_url -> false");
+        //             return Ok(false);
+        //         }
+        //     }
+        //     log::trace!("exit: should_deny_url -> true");
+        //     return Ok(true);
+        // }
    }
+    // made it to the end of the deny list unscathed, return false, indicating we should not deny
+    // this particular url
    log::trace!("exit: should_deny_url -> false");
    Ok(false)
 }