Merge pull request #884 from epi052/878-support-raw-urls

878 support raw urls
nitpickery
2026-05-27 00:11:11 -03:00 · 2023-04-26 06:59:04 -05:00 · 2023-04-26 06:45:13 -05:00 · 2023-04-26 06:33:43 -05:00 · 2023-04-25 07:10:48 -05:00 · 2023-04-25 07:09:56 -05:00
30 changed files with 1431 additions and 714 deletions
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -562,6 +562,24 @@
      "contributions": [
        "bug"
      ]
+    },
+    {
+      "login": "acut3",
+      "name": "Nicolas Christin",
+      "avatar_url": "https://avatars.githubusercontent.com/u/17295243?v=4",
+      "profile": "https://acut3.github.io/",
+      "contributions": [
+        "bug"
+      ]
+    },
+    {
+      "login": "DrorDvash",
+      "name": "DrDv",
+      "avatar_url": "https://avatars.githubusercontent.com/u/8413651?v=4",
+      "profile": "https://github.com/DrorDvash",
+      "contributions": [
+        "bug"
+      ]
    }
  ],
  "contributorsPerLine": 7,
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -27,13 +27,13 @@ jobs:
          - type: armv7
            os: ubuntu-latest
            target: armv7-unknown-linux-gnueabihf
-            name: armv7-feroxbuster
+            name: armv7-linux-feroxbuster
            path: target/armv7-unknown-linux-gnueabihf/release/feroxbuster
            pkg_config_path: /usr/lib/x86_64-linux-gnu/pkgconfig
          - type: aarch64
            os: ubuntu-latest
            target: aarch64-unknown-linux-gnu
-            name: aarch64-feroxbuster
+            name: aarch64-linux-feroxbuster
            path: target/aarch64-unknown-linux-gnu/release/feroxbuster
            pkg_config_path: /usr/lib/x86_64-linux-gnu/pkgconfig
    steps:
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "feroxbuster"
-version = "2.9.2"
+version = "2.9.5"
 authors = ["Ben 'epi' Risher (@epi052)"]
 license = "MIT"
 edition = "2021"
@@ -22,47 +22,52 @@ build = "build.rs"
 maintenance = { status = "actively-developed" }

 [build-dependencies]
-clap = { version = "4.1.8", features = ["wrap_help", "cargo"] }
-clap_complete = "4.1.4"
-regex = "1.5.5"
-lazy_static = "1.4.0"
-dirs = "4.0.0"
+clap = { version = "4.2", features = ["wrap_help", "cargo"] }
+clap_complete = "4.1"
+regex = "1.5"
+lazy_static = "1.4"
+dirs = "5.0"

 [dependencies]
-scraper = "0.15.0"
-futures = "0.3.26"
-tokio = { version = "1.26.0", features = ["full"] }
-tokio-util = { version = "0.7.7", features = ["codec"] }
-log = "0.4.17"
-env_logger = "0.10.0"
-reqwest = { version = "0.11.10", features = ["socks"] }
+scraper = "0.16"
+futures = "0.3"
+tokio = { version = "1.26", features = ["full"] }
+tokio-util = { version = "0.7", features = ["codec"] }
+log = "0.4"
+env_logger = "0.10"
+reqwest = { version = "0.11", features = ["socks"] }
 # uses feature unification to add 'serde' to reqwest::Url
-url = { version = "2.2.2", features = ["serde"] }
-serde_regex = "1.1.0"
-clap = { version = "4.1.8", features = ["wrap_help", "cargo"] }
-lazy_static = "1.4.0"
-toml = "0.7.2"
-serde = { version = "1.0.137", features = ["derive", "rc"] }
-serde_json = "1.0.94"
-uuid = { version = "1.3.0", features = ["v4"] }
-indicatif = "0.15"
-console = "0.15.2"
+url = { version = "2.2", features = ["serde"] }
+serde_regex = "1.1"
+clap = { version = "4.2", features = ["wrap_help", "cargo"] }
+lazy_static = "1.4"
+toml = "0.7"
+serde = { version = "1.0", features = ["derive", "rc"] }
+serde_json = "1.0"
+uuid = { version = "1.3", features = ["v4"] }
+indicatif = "0.17"
+console = "0.15"
 openssl = { version = "0.10", features = ["vendored"] }
-dirs = "4.0.0"
-regex = "1.5.5"
-crossterm = "0.26.0"
-rlimit = "0.9.1"
-ctrlc = "3.2.2"
-anyhow = "1.0.69"
-leaky-bucket = "0.12.1"
-gaoya = "0.1.2"
-self_update = {version = "0.36.0", features = ["archive-tar", "compression-flate2", "archive-zip", "compression-zip-deflate"]}
+dirs = "5.0"
+regex = "1.5"
+crossterm = "0.26"
+rlimit = "0.9"
+ctrlc = "3.2"
+anyhow = "1.0"
+leaky-bucket = "0.12"
+gaoya = "0.1"
+self_update = { version = "0.36", features = [
+    "archive-tar",
+    "compression-flate2",
+    "archive-zip",
+    "compression-zip-deflate",
+] }

 [dev-dependencies]
-tempfile = "3.3.0"
-httpmock = "0.6.6"
-assert_cmd = "2.0.4"
-predicates = "3.0.1"
+tempfile = "3.3"
+httpmock = "0.6"
+assert_cmd = "2.0"
+predicates = "3.0"

 [profile.release]
 lto = true
--- a/Makefile.toml
+++ b/Makefile.toml
@@ -11,7 +11,7 @@ rm ferox-*.state
 # dependency management
 [tasks.upgrade-deps]
 command = "cargo"
-args = ["upgrade", "--exclude", "indicatif"]
+args = ["upgrade"]

 [tasks.update]
 command = "cargo"
--- a/README.md
+++ b/README.md
@@ -97,8 +97,14 @@ sudo apt update && sudo apt install -y feroxbuster

 #### Linux (32 and 64-bit) & MacOS

+Install to a particular directory
 ```
-curl -sL https://raw.githubusercontent.com/epi052/feroxbuster/master/install-nix.sh | bash
+curl -sL https://raw.githubusercontent.com/epi052/feroxbuster/main/install-nix.sh | bash -s $HOME/.local/bin
+```
+
+Install to current working directory
+```
+curl -sL https://raw.githubusercontent.com/epi052/feroxbuster/main/install-nix.sh | bash
 ```

 #### MacOS via Homebrew 
@@ -278,6 +284,8 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
      <td align="center" valign="top" width="14.28%"><a href="https://github.com/Luoooio"><img src="https://avatars.githubusercontent.com/u/26653157?v=4?s=100" width="100px;" alt="Luoooio"/><br /><sub><b>Luoooio</b></sub></a><br /><a href="#ideas-Luoooio" title="Ideas, Planning, & Feedback">🤔</a></td>
      <td align="center" valign="top" width="14.28%"><a href="https://petruknisme.com"><img src="https://avatars.githubusercontent.com/u/6284204?v=4?s=100" width="100px;" alt="Aan"/><br /><sub><b>Aan</b></sub></a><br /><a href="https://github.com/epi052/feroxbuster/commits?author=aancw" title="Code">💻</a> <a href="#infra-aancw" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a> <a href="#ideas-aancw" title="Ideas, Planning, & Feedback">🤔</a></td>
      <td align="center" valign="top" width="14.28%"><a href="https://github.com/imBigo"><img src="https://avatars.githubusercontent.com/u/54672433?v=4?s=100" width="100px;" alt="Simon"/><br /><sub><b>Simon</b></sub></a><br /><a href="https://github.com/epi052/feroxbuster/issues?q=author%3AimBigo" title="Bug reports">🐛</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://acut3.github.io/"><img src="https://avatars.githubusercontent.com/u/17295243?v=4?s=100" width="100px;" alt="Nicolas Christin"/><br /><sub><b>Nicolas Christin</b></sub></a><br /><a href="https://github.com/epi052/feroxbuster/issues?q=author%3Aacut3" title="Bug reports">🐛</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/DrorDvash"><img src="https://avatars.githubusercontent.com/u/8413651?v=4?s=100" width="100px;" alt="DrDv"/><br /><sub><b>DrDv</b></sub></a><br /><a href="https://github.com/epi052/feroxbuster/issues?q=author%3ADrorDvash" title="Bug reports">🐛</a></td>
    </tr>
  </tbody>
 </table>
--- a/install-nix.sh
+++ b/install-nix.sh
@@ -13,13 +13,13 @@ LIN64_URL="$BASE_URL/$LIN64_ZIP"

 EMOJI_URL=https://gist.github.com/epi052/8196b550ea51d0907ad4b93751b1b57d/raw/6112c9f32ae07922983fdc549c54fd3fb9a38e4c/NotoColorEmoji.ttf

-echo "[+] Installing feroxbuster!"
+INSTALL_DIR="${1:-$(pwd)}"
+
+echo "[+] Installing feroxbuster to ${INSTALL_DIR}!"

 which unzip &>/dev/null
-if [ "$?" = "0" ]; then
-  echo "[+] unzip found"
-else
-  echo "[ ] unzip not found, exiting. "
+if [ "$?" != "0" ]; then
+  echo "[!] unzip not found, exiting. "
  exit -1
 fi

@@ -27,20 +27,20 @@ if [[ "$(uname)" == "Darwin" ]]; then
  echo "[=] Found MacOS, downloading from $MAC_URL"

  curl -sLO "$MAC_URL"
-  unzip -o "$MAC_ZIP" >/dev/null
+  unzip -o "$MAC_ZIP" -d "${INSTALL_DIR}" >/dev/null
  rm "$MAC_ZIP"
 elif [[ "$(expr substr $(uname -s) 1 5)" == "Linux" ]]; then
  if [[ $(getconf LONG_BIT) == 32 ]]; then
    echo "[=] Found 32-bit Linux, downloading from $LIN32_URL"

    curl -sLO "$LIN32_URL"
-    unzip -o "$LIN32_ZIP" >/dev/null
+    unzip -o "$LIN32_ZIP" -d "${INSTALL_DIR}" >/dev/null
    rm "$LIN32_ZIP"
  else
    echo "[=] Found 64-bit Linux, downloading from $LIN64_URL"

    curl -sLO "$LIN64_URL"
-    unzip -o "$LIN64_ZIP" >/dev/null
+    unzip -o "$LIN64_ZIP" -d "${INSTALL_DIR}" >/dev/null
    rm "$LIN64_ZIP"
  fi

@@ -60,6 +60,8 @@ elif [[ "$(expr substr $(uname -s) 1 5)" == "Linux" ]]; then
  fi
 fi

-chmod +x ./feroxbuster
+chmod +x "${INSTALL_DIR}/feroxbuster"

-echo "[+] Installed feroxbuster version $(./feroxbuster -V)"
+echo "[+] Installed feroxbuster"
+echo "  [-] path: ${INSTALL_DIR}/feroxbuster"
+echo "  [-] version: $("${INSTALL_DIR}/feroxbuster" -V | awk '{print $2}')" # binary path is quoted: INSTALL_DIR may contain spaces
--- a/shell_completions/_feroxbuster
+++ b/shell_completions/_feroxbuster
@@ -24,8 +24,8 @@ _feroxbuster() {
 '--replay-proxy=[Send only unfiltered requests through a Replay Proxy, instead of all requests]:REPLAY_PROXY:_urls' \
 '*-R+[Status Codes to send through a Replay Proxy when found (default: --status-codes value)]:REPLAY_CODE: ' \
 '*--replay-codes=[Status Codes to send through a Replay Proxy when found (default: --status-codes value)]:REPLAY_CODE: ' \
-'-a+[Sets the User-Agent (default: feroxbuster/2.9.2)]:USER_AGENT: ' \
-'--user-agent=[Sets the User-Agent (default: feroxbuster/2.9.2)]:USER_AGENT: ' \
+'-a+[Sets the User-Agent (default: feroxbuster/2.9.5)]:USER_AGENT: ' \
+'--user-agent=[Sets the User-Agent (default: feroxbuster/2.9.5)]:USER_AGENT: ' \
 '*-x+[File extension(s) to search for (ex: -x php -x pdf js)]:FILE_EXTENSION: ' \
 '*--extensions=[File extension(s) to search for (ex: -x php -x pdf js)]:FILE_EXTENSION: ' \
 '*-m+[Which HTTP request method(s) should be sent (default: GET)]:HTTP_METHODS: ' \
--- a/shell_completions/_feroxbuster.ps1
+++ b/shell_completions/_feroxbuster.ps1
@@ -30,8 +30,8 @@ Register-ArgumentCompleter -Native -CommandName 'feroxbuster' -ScriptBlock {
            [CompletionResult]::new('--replay-proxy', 'replay-proxy', [CompletionResultType]::ParameterName, 'Send only unfiltered requests through a Replay Proxy, instead of all requests')
            [CompletionResult]::new('-R', 'R', [CompletionResultType]::ParameterName, 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)')
            [CompletionResult]::new('--replay-codes', 'replay-codes', [CompletionResultType]::ParameterName, 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)')
-            [CompletionResult]::new('-a', 'a', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.2)')
-            [CompletionResult]::new('--user-agent', 'user-agent', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.2)')
+            [CompletionResult]::new('-a', 'a', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.5)')
+            [CompletionResult]::new('--user-agent', 'user-agent', [CompletionResultType]::ParameterName, 'Sets the User-Agent (default: feroxbuster/2.9.5)')
            [CompletionResult]::new('-x', 'x', [CompletionResultType]::ParameterName, 'File extension(s) to search for (ex: -x php -x pdf js)')
            [CompletionResult]::new('--extensions', 'extensions', [CompletionResultType]::ParameterName, 'File extension(s) to search for (ex: -x php -x pdf js)')
            [CompletionResult]::new('-m', 'm', [CompletionResultType]::ParameterName, 'Which HTTP request method(s) should be sent (default: GET)')
--- a/shell_completions/feroxbuster.elv
+++ b/shell_completions/feroxbuster.elv
@@ -27,8 +27,8 @@ set edit:completion:arg-completer[feroxbuster] = {|@words|
            cand --replay-proxy 'Send only unfiltered requests through a Replay Proxy, instead of all requests'
            cand -R 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)'
            cand --replay-codes 'Status Codes to send through a Replay Proxy when found (default: --status-codes value)'
-            cand -a 'Sets the User-Agent (default: feroxbuster/2.9.2)'
-            cand --user-agent 'Sets the User-Agent (default: feroxbuster/2.9.2)'
+            cand -a 'Sets the User-Agent (default: feroxbuster/2.9.5)'
+            cand --user-agent 'Sets the User-Agent (default: feroxbuster/2.9.5)'
            cand -x 'File extension(s) to search for (ex: -x php -x pdf js)'
            cand --extensions 'File extension(s) to search for (ex: -x php -x pdf js)'
            cand -m 'Which HTTP request method(s) should be sent (default: GET)'
--- a/src/banner/container.rs
+++ b/src/banner/container.rs
@@ -2,12 +2,11 @@ use super::entry::BannerEntry;
 use crate::{
    config::Configuration,
    event_handlers::Handles,
-    utils::{logged_request, status_colorizer},
+    utils::{logged_request, parse_url_with_raw_path, status_colorizer},
    DEFAULT_IGNORED_EXTENSIONS, DEFAULT_METHOD, DEFAULT_STATUS_CODES, VERSION,
 };
 use anyhow::{bail, Result};
 use console::{style, Emoji};
-use reqwest::Url;
 use serde_json::Value;
 use std::{io::Write, sync::Arc};

@@ -478,7 +477,7 @@ by Ben "epi" Risher {}                 ver: {}"#,
    pub async fn check_for_updates(&mut self, url: &str, handles: Arc<Handles>) -> Result<()> {
        log::trace!("enter: needs_update({}, {:?})", url, handles);

-        let api_url = Url::parse(url)?;
+        let api_url = parse_url_with_raw_path(url)?;

        let result = logged_request(&api_url, DEFAULT_METHOD, None, handles.clone()).await?;
        let body = result.text().await?;
--- a/src/config/container.rs
+++ b/src/config/container.rs
@@ -6,7 +6,10 @@ use super::utils::{
 use crate::config::determine_output_level;
 use crate::config::utils::determine_requester_policy;
 use crate::{
-    client, parser, scan_manager::resume_scan, traits::FeroxSerialize, utils::fmt_err,
+    client, parser,
+    scan_manager::resume_scan,
+    traits::FeroxSerialize,
+    utils::{fmt_err, parse_url_with_raw_path},
    DEFAULT_CONFIG_NAME,
 };
 use anyhow::{anyhow, Context, Result};
@@ -673,7 +676,7 @@ impl Configuration {
            for denier in arg {
                // could be an absolute url or a regex, need to determine which and populate the
                // appropriate vector
-                match Url::parse(denier.trim_end_matches('/')) {
+                match parse_url_with_raw_path(denier.trim_end_matches('/')) {
                    Ok(absolute) => {
                        // denier is an absolute url and can be parsed as such
                        config.url_denylist.push(absolute);
--- a/src/event_handlers/outputs.rs
+++ b/src/event_handlers/outputs.rs
@@ -242,14 +242,6 @@ impl TermOutHandler {
        log::trace!("enter: process_response({:?}, {:?})", resp, call_type);

        async move {
-            let should_filter = self
-                .handles
-                .as_ref()
-                .unwrap()
-                .filters
-                .data
-                .should_filter_response(&resp, tx_stats.clone());
-
            let contains_sentry = if !self.config.filter_status.is_empty() {
                // -C was used, meaning -s was not and we should ignore the defaults
                // https://github.com/epi052/feroxbuster/issues/535
@@ -261,7 +253,7 @@ impl TermOutHandler {
            };

            let unknown_sentry = !RESPONSES.contains(&resp); // !contains == unknown
-            let should_process_response = contains_sentry && unknown_sentry && !should_filter;
+            let should_process_response = contains_sentry && unknown_sentry;

            if should_process_response {
                // print to stdout
--- a/src/event_handlers/scans.rs
+++ b/src/event_handlers/scans.rs
@@ -16,7 +16,7 @@ use crate::{
 use super::command::Command::AddToUsizeField;
 use super::*;
 use crate::statistics::StatField;
-use reqwest::Url;
+use crate::utils::parse_url_with_raw_path;
 use tokio::time::Duration;

 #[derive(Debug)]
@@ -266,7 +266,7 @@ impl ScanHandler {
                        let bar = scan.progress_bar();

                        // (4000 - 3000) / 2 => 500 words left to send
-                        let length = bar.length();
+                        let length = bar.length().unwrap_or(1);
                        let num_words_left = (length - bar.position()) / divisor;

                        // accumulate each bar's increment value for incrementing the total bar
@@ -325,7 +325,9 @@ impl ScanHandler {
                self.data.add_directory_scan(&target, order).1 // add the new target; return FeroxScan
            };

-            if should_test_deny && should_deny_url(&Url::parse(&target)?, self.handles.clone())? {
+            if should_test_deny
+                && should_deny_url(&parse_url_with_raw_path(&target)?, self.handles.clone())?
+            {
                // response was caught by a user-provided deny list
                // checking this last, since it's most susceptible to longer runtimes due to what
                // input is received
--- a/src/event_handlers/statistics.rs
+++ b/src/event_handlers/statistics.rs
@@ -147,7 +147,7 @@ impl StatsHandler {
            self.stats.errors(),
        );

-        self.bar.set_message(&msg);
+        self.bar.set_message(msg);

        if self.bar.position() < self.stats.total_expected() as u64 {
            // don't run off the end when we're a few requests over the expected total
--- a/src/extractor/container.rs
+++ b/src/extractor/container.rs
@@ -11,16 +11,60 @@ use crate::{
        StatField::{LinksExtracted, TotalExpected},
    },
    url::FeroxUrl,
-    utils::{logged_request, make_request, send_try_recursion_command, should_deny_url},
+    utils::{
+        logged_request, make_request, parse_url_with_raw_path, send_try_recursion_command,
+        should_deny_url,
+    },
    ExtractionResult, DEFAULT_METHOD,
 };
 use anyhow::{bail, Context, Result};
-use reqwest::{Client, StatusCode, Url};
+use futures::StreamExt;
+use reqwest::{Client, Response, StatusCode, Url};
 use scraper::{Html, Selector};
 use std::{borrow::Cow, collections::HashSet};

+/// Request a single extracted link, unless it was already scanned or is denied
+///   - create a new Url object based on cli options/args
+///   - bail (Err) if the new Url already has a scan or `should_deny_url` rejects it
+///   - otherwise make a `DEFAULT_METHOD` request to the new Url -> Ok(response)
+pub(super) async fn request_link(url: &str, handles: Arc<Handles>) -> Result<Response> {
+    log::trace!("enter: request_link({})", url);
+
+    let ferox_url = FeroxUrl::from_string(url, handles.clone());
+
+    // create a url based on the given command line options
+    let new_url = ferox_url.format("", None)?;
+
+    let scanned_urls = handles.ferox_scans()?;
+
+    if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() {
+        // the url already has an associated scan; bail instead of requesting it again
+        log::trace!("exit: request_link -> None");
+        bail!("previously seen url");
+    }
+
+    if (!handles.config.url_denylist.is_empty() || !handles.config.regex_denylist.is_empty())
+        && should_deny_url(&new_url, handles.clone())?
+    {
+        // deny lists are only consulted when at least one is populated; a denied url is never requested
+        bail!(
+            "prevented request to {} due to {:?} || {:?}",
+            url,
+            handles.config.url_denylist,
+            handles.config.regex_denylist,
+        );
+    }
+
+    // make the request; a failed request is returned to the caller as Err via `?`
+    let new_response = logged_request(&new_url, DEFAULT_METHOD, None, handles.clone()).await?;
+
+    log::trace!("exit: request_link -> {:?}", new_response);
+
+    Ok(new_response)
+}
+
 /// Whether an active scan is recursive or not
-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
 enum RecursionStatus {
    /// Scan is recursive
    Recursive,
@@ -81,7 +125,7 @@ impl<'a> Extractor<'a> {
    ) -> Result<()> {
        log::trace!("enter: parse_url_and_add_subpaths({:?})", links);

-        match Url::parse(url_to_parse) {
+        match parse_url_with_raw_path(url_to_parse) {
            Ok(absolute) => {
                if absolute.domain() != original_url.domain()
                    || absolute.host() != original_url.host()
@@ -121,91 +165,140 @@ impl<'a> Extractor<'a> {

    /// given a set of links from a normal http body response, task the request handler to make
    /// the requests
-    pub async fn request_links(&mut self, links: HashSet<String>) -> Result<()> {
+    pub async fn request_links(
+        &mut self,
+        links: HashSet<String>,
+    ) -> Result<Option<tokio::task::JoinHandle<()>>> {
        log::trace!("enter: request_links({:?})", links);

        if links.is_empty() {
-            return Ok(());
+            return Ok(None);
        }

+        self.update_stats(links.len())?;
+
+        // clone everything the async move block needs up front, so that the block doesn't have to borrow self
+        let cloned_scanned_urls = self.handles.ferox_scans()?;
+        let cloned_handles = self.handles.clone();
+        let cloned_url = self.url.clone();
+        let threads = self.handles.config.threads;
        let recursive = if self.handles.config.no_recursion {
            RecursionStatus::NotRecursive
        } else {
            RecursionStatus::Recursive
        };

-        let scanned_urls = self.handles.ferox_scans()?;
-        self.update_stats(links.len())?;
+        let link_request_task = tokio::spawn(async move {
+            let producers = futures::stream::iter(links.into_iter())
+                .map(|link| {
+                    // another clone to satisfy the async move block
+                    let inner_clone = cloned_handles.clone();

-        for link in links {
-            let mut resp = match self.request_link(&link).await {
-                Ok(resp) => resp,
-                Err(_) => continue,
-            };
+                    (
+                        tokio::spawn(async move { request_link(&link, inner_clone).await }),
+                        cloned_handles.clone(),
+                        cloned_scanned_urls.clone(),
+                        recursive,
+                        cloned_url.clone(),
+                    )
+                })
+                .for_each_concurrent(
+                    threads,
+                    |(join_handle, c_handles, c_scanned_urls, c_recursive, og_url)| async move {
+                        match join_handle.await {
+                            Ok(Ok(reqwest_response)) => {
+                                let mut resp = FeroxResponse::from(
+                                    reqwest_response,
+                                    &og_url,
+                                    DEFAULT_METHOD,
+                                    c_handles.config.output_level,
+                                )
+                                .await;

-            // filter if necessary
-            if self
-                .handles
-                .filters
-                .data
-                .should_filter_response(&resp, self.handles.stats.tx.clone())
-            {
-                continue;
-            }
+                                // filter if necessary
+                                if c_handles
+                                    .filters
+                                    .data
+                                    .should_filter_response(&resp, c_handles.stats.tx.clone())
+                                {
+                                    return;
+                                }

-            // request and report assumed file
-            if resp.is_file() || !resp.is_directory() {
-                log::debug!("Extracted File: {}", resp);
+                                // request and report assumed file
+                                if resp.is_file() || !resp.is_directory() {
+                                    log::debug!("Extracted File: {}", resp);

-                scanned_urls.add_file_scan(resp.url().as_str(), ScanOrder::Latest);
+                                    c_scanned_urls
+                                        .add_file_scan(resp.url().as_str(), ScanOrder::Latest);

-                if self.handles.config.collect_extensions {
-                    resp.parse_extension(self.handles.clone())?;
-                }
+                                    if c_handles.config.collect_extensions {
+                                        // no real reason this should fail
+                                        resp.parse_extension(c_handles.clone()).unwrap();
+                                    }

-                if let Err(e) = resp.send_report(self.handles.output.tx.clone()) {
-                    log::warn!("Could not send FeroxResponse to output handler: {}", e);
-                }
+                                    if let Err(e) = resp.send_report(c_handles.output.tx.clone()) {
+                                        log::warn!(
+                                            "Could not send FeroxResponse to output handler: {}",
+                                            e
+                                        );
+                                    }

-                continue;
-            }
+                                    return;
+                                }

-            if matches!(recursive, RecursionStatus::Recursive) {
-                log::debug!("Extracted Directory: {}", resp);
+                                if matches!(c_recursive, RecursionStatus::Recursive) {
+                                    log::debug!("Extracted Directory: {}", resp);

-                if !resp.url().as_str().ends_with('/')
-                    && (resp.status().is_success()
-                        || matches!(resp.status(), &StatusCode::FORBIDDEN))
-                {
-                    // if the url doesn't end with a /
-                    // and the response code is either a 2xx or 403
+                                    if !resp.url().as_str().ends_with('/')
+                                        && (resp.status().is_success()
+                                            || matches!(resp.status(), &StatusCode::FORBIDDEN))
+                                    {
+                                        // if the url doesn't end with a /
+                                        // and the response code is either a 2xx or 403

-                    // since all of these are 2xx or 403, recursion is only attempted if the
-                    // url ends in a /. I am actually ok with adding the slash and not
-                    // adding it, as both have merit.  Leaving it in for now to see how
-                    // things turn out (current as of: v1.1.0)
-                    resp.set_url(&format!("{}/", resp.url()));
-                }
+                                        // since all of these are 2xx or 403, recursion is only attempted if the
+                                        // url ends in a /. I am actually ok with adding the slash and not
+                                        // adding it, as both have merit.  Leaving it in for now to see how
+                                        // things turn out (current as of: v1.1.0)
+                                        resp.set_url(&format!("{}/", resp.url()));
+                                    }
+
+                                    if c_handles.config.filter_status.is_empty() {
+                                        // -C wasn't used, so -s is the only 'filter' left to account for
+                                        if c_handles
+                                            .config
+                                            .status_codes
+                                            .contains(&resp.status().as_u16())
+                                        {
+                                            send_try_recursion_command(c_handles.clone(), resp)
+                                                .await
+                                                .unwrap_or_default();
+                                        }
+                                    } else {
+                                        // -C was used, that means the filters above would have removed
+                                        // those responses, and anything else should be let through
+                                        send_try_recursion_command(c_handles.clone(), resp)
+                                            .await
+                                            .unwrap_or_default();
+                                    }
+                                }
+                            }
+                            Ok(Err(err)) => {
+                                log::warn!("Error during link extraction: {}", err);
+                            }
+                            Err(err) => {
+                                log::warn!("JoinError during link extraction: {}", err);
+                            }
+                        }
+                    },
+                );
+
+            // wait for the requests to finish
+            producers.await;
+        });

-                if self.handles.config.filter_status.is_empty() {
-                    // -C wasn't used, so -s is the only 'filter' left to account for
-                    if self
-                        .handles
-                        .config
-                        .status_codes
-                        .contains(&resp.status().as_u16())
-                    {
-                        send_try_recursion_command(self.handles.clone(), resp).await?;
-                    }
-                } else {
-                    // -C was used, that means the filters above would have removed
-                    // those responses, and anything else should be let through
-                    send_try_recursion_command(self.handles.clone(), resp).await?;
-                }
-            }
-        }
        log::trace!("exit: request_links");
-        Ok(())
+        Ok(Some(link_request_task))
    }

    /// wrapper around link extraction via html attributes
@@ -385,7 +478,7 @@ impl<'a> Extractor<'a> {
            ExtractionTarget::ResponseBody | ExtractionTarget::DirectoryListing => {
                self.response.unwrap().url().clone()
            }
-            ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
+            ExtractionTarget::RobotsTxt => match parse_url_with_raw_path(&self.url) {
                Ok(u) => u,
                Err(e) => {
                    bail!("Could not parse {}: {}", self.url, e);
@@ -415,56 +508,6 @@ impl<'a> Extractor<'a> {
        Ok(())
    }

-    /// Wrapper around link extraction logic
-    ///   - create a new Url object based on cli options/args
-    ///   - check if the new Url has already been seen/scanned -> None
-    ///   - make a request to the new Url ? -> Some(response) : None
-    pub(super) async fn request_link(&self, url: &str) -> Result<FeroxResponse> {
-        log::trace!("enter: request_link({})", url);
-
-        let ferox_url = FeroxUrl::from_string(url, self.handles.clone());
-
-        // create a url based on the given command line options
-        let new_url = ferox_url.format("", None)?;
-
-        let scanned_urls = self.handles.ferox_scans()?;
-
-        if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() {
-            //we've seen the url before and don't need to scan again
-            log::trace!("exit: request_link -> None");
-            bail!("previously seen url");
-        }
-
-        if (!self.handles.config.url_denylist.is_empty()
-            || !self.handles.config.regex_denylist.is_empty())
-            && should_deny_url(&new_url, self.handles.clone())?
-        {
-            // can't allow a denied url to be requested
-            bail!(
-                "prevented request to {} due to {:?} || {:?}",
-                url,
-                self.handles.config.url_denylist,
-                self.handles.config.regex_denylist,
-            );
-        }
-
-        // make the request and store the response
-        let new_response =
-            logged_request(&new_url, DEFAULT_METHOD, None, self.handles.clone()).await?;
-
-        let new_ferox_response = FeroxResponse::from(
-            new_response,
-            url,
-            DEFAULT_METHOD,
-            self.handles.config.output_level,
-        )
-        .await;
-
-        log::trace!("exit: request_link -> {:?}", new_ferox_response);
-
-        Ok(new_ferox_response)
-    }
-
    /// Entry point to perform link extraction from robots.txt
    ///
    /// `base_url` can have paths and subpaths, however robots.txt will be requested from the
@@ -484,7 +527,7 @@ impl<'a> Extractor<'a> {

        for capture in self.robots_regex.captures_iter(body) {
            if let Some(new_path) = capture.name("url_path") {
-                let mut new_url = Url::parse(&self.url)?;
+                let mut new_url = parse_url_with_raw_path(&self.url)?;

                new_url.set_path(new_path.as_str());

@@ -614,7 +657,7 @@ impl<'a> Extractor<'a> {
            &client
        };

-        let mut url = Url::parse(&self.url)?;
+        let mut url = parse_url_with_raw_path(&self.url)?;
        url.set_path(location); // overwrite existing path

        // purposefully not using logged_request here due to using the special client
--- a/src/extractor/tests.rs
+++ b/src/extractor/tests.rs
@@ -1,4 +1,5 @@
 use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX, URL_CHARS_REGEX};
+use super::container::request_link;
 use super::*;
 use crate::config::{Configuration, OutputLevel};
 use crate::scan_manager::ScanOrder;
@@ -360,13 +361,13 @@ async fn request_link_happy_path() -> Result<()> {
        then.status(200).body("this is a test");
    });

-    let r_resp = ROBOTS_EXT.request_link(&srv.url("/login.php")).await?;
-    let b_resp = BODY_EXT.request_link(&srv.url("/login.php")).await?;
+    let r_resp = request_link(&srv.url("/login.php"), ROBOTS_EXT.handles.clone()).await?;
+    let b_resp = request_link(&srv.url("/login.php"), BODY_EXT.handles.clone()).await?;

-    assert!(matches!(r_resp.status(), &StatusCode::OK));
-    assert!(matches!(b_resp.status(), &StatusCode::OK));
-    assert_eq!(r_resp.content_length(), 14);
-    assert_eq!(b_resp.content_length(), 14);
+    assert!(matches!(r_resp.status(), StatusCode::OK));
+    assert!(matches!(b_resp.status(), StatusCode::OK));
+    assert_eq!(r_resp.content_length().unwrap(), 14);
+    assert_eq!(b_resp.content_length().unwrap(), 14);
    assert_eq!(mock.hits(), 2);
    Ok(())
 }
@@ -390,8 +391,8 @@ async fn request_link_bails_on_seen_url() -> Result<()> {
    let robots = setup_extractor(ExtractionTarget::RobotsTxt, scans.clone());
    let body = setup_extractor(ExtractionTarget::ResponseBody, scans);

-    let r_resp = robots.request_link(&served).await;
-    let b_resp = body.request_link(&served).await;
+    let r_resp = request_link(&served, robots.handles.clone()).await;
+    let b_resp = request_link(&served, body.handles.clone()).await;

    assert!(r_resp.is_err());
    assert!(b_resp.is_err());
--- a/src/filters/utils.rs
+++ b/src/filters/utils.rs
@@ -4,11 +4,10 @@ use crate::event_handlers::Handles;
 use crate::filters::similarity::SIM_HASHER;
 use crate::nlp::preprocess;
 use crate::response::FeroxResponse;
-use crate::utils::logged_request;
+use crate::utils::{logged_request, parse_url_with_raw_path};
 use crate::DEFAULT_METHOD;
 use anyhow::Result;
 use regex::Regex;
-use reqwest::Url;
 use std::sync::Arc;

 /// wrapper around logic necessary to create a SimilarityFilter
@@ -23,7 +22,7 @@ pub(crate) async fn create_similarity_filter(
    handles: Arc<Handles>,
 ) -> Result<SimilarityFilter> {
    // url as-is based on input, ignores user-specified url manipulation options (add-slash etc)
-    let url = Url::parse(similarity_filter)?;
+    let url = parse_url_with_raw_path(similarity_filter)?;

    // attempt to request the given url
    let resp = logged_request(&url, DEFAULT_METHOD, None, handles.clone()).await?;
--- a/src/heuristics.rs
+++ b/src/heuristics.rs
@@ -1,6 +1,8 @@
+use std::collections::HashMap;
 use std::sync::Arc;

 use anyhow::{bail, Result};
+use futures::future;
 use scraper::{Html, Selector};
 use uuid::Uuid;

@@ -276,138 +278,185 @@ impl HeuristicTests {
            None
        };

-        // 4 is due to the array in the nested for loop below
-        let mut responses = Vec::with_capacity(4);
+        // no matter what, we want an empty extension for the base case
+        let mut extensions = vec!["".to_string()];
+
+        // and then we want to add any extensions that were specified
+        // or have since been added to the running config
+        for ext in &self.handles.config.extensions {
+            extensions.push(format!(".{}", ext));
+        }

        // for every method, attempt to id its 404 response
        //
        // a good example of one where the GET/POST differ is on hackthebox:
        // - http://prd.m.rendering-api.interface.htb/api
+        //
+        // a good example of one where the heuristics return a 403 and a 404 (apache)
+        // as well as return two different types of 404s based on the file extension
+        // - http://10.10.11.198 (Encoding box in normal labs)
+        //
+        // both methods and extensions can elicit different responses from a given
+        // server, so both are considered when building auto-filter rules
        for method in self.handles.config.methods.iter() {
-            for (prefix, length) in [("", 1), ("", 3), (".htaccess", 1), ("admin", 1)] {
-                let path = format!("{prefix}{}", self.unique_string(length));
+            for extension in extensions.iter() {
+                // build out the 6 paths we'll use
+                let paths = [
+                    ("", 1),
+                    ("", 3),
+                    (".htaccess", 1),
+                    (".htaccess", 3),
+                    ("admin", 1),
+                    ("admin", 3),
+                ]
+                .map(|(prefix, length)| {
+                    format!("{prefix}{}{extension}", self.unique_string(length))
+                });

-                let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone());
+                // allow all 6 requests to fly asynchronously
+                let responses = future::join_all(paths.into_iter().map(|path| async move {
+                    let ferox_url = FeroxUrl::from_string(target_url, self.handles.clone());

-                let nonexistent_url = ferox_url.format(&path, slash)?;
+                    let Ok(nonexistent_url) = ferox_url.format(&path, slash) else {
+                        return None;
+                    };

-                // example requests:
-                // - http://localhost/2fc1077836ad43ab98b7a31c2ca28fea
-                // - http://localhost/92969beae6bf4beb855d1622406d87e395c87387a9ad432e8a11245002b709b03cf609d471004154b83bcc1c6ec49f6f
-                // - http://localhost/.htaccessa005a2131e68449aa26e99029c914c09
-                // - http://localhost/adminf1d2541e73c44dcb9d1fb7d93334b280
-                let response =
-                    logged_request(&nonexistent_url, method, data, self.handles.clone()).await;
+                    // example requests:
+                    // - http://localhost/2fc1077836ad43ab98b7a31c2ca28fea
+                    // - http://localhost/92969beae6bf4beb855d1622406d87e395c87387a9ad432e8a11245002b709b03cf609d471004154b83bcc1c6ec49f6f
+                    // - http://localhost/.htaccessa005a2131e68449aa26e99029c914c09
+                    // - http://localhost/.htaccess92969beae6bf4beb855d1622406d87e395c87387a9ad432e8a11245002b709b03cf609d471004154b83bcc1c6ec49f6f
+                    // - http://localhost/adminf1d2541e73c44dcb9d1fb7d93334b280
+                    // - http://localhost/admin92969beae6bf4beb855d1622406d87e395c87387a9ad432e8a11245002b709b03cf609d471004154b83bcc1c6ec49f6f
+                    let Ok(response) =
+                        logged_request(&nonexistent_url, method, data, self.handles.clone())
+                            .await else {
+                                return None;
+                            };

-                req_counter += 1;
+                    if !self
+                        .handles
+                        .config
+                        .status_codes
+                        .contains(&response.status().as_u16())
+                    {
+                        // if the response code isn't one that's accepted via -s values, then skip to the next
+                        //
+                        // the default value for -s is all status codes, so unless the user says otherwise
+                        // this won't fire
+                        return None;
+                    }

-                // continue to next on error
-                let response = skip_fail!(response);
+                    Some(
+                        FeroxResponse::from(
+                            response,
+                            &ferox_url.target,
+                            method,
+                            self.handles.config.output_level,
+                        )
+                        .await,
+                    )
+                }))
+                .await // await gives vector of options containing feroxresponses
+                .into_iter()
+                .flatten() // strip out the none values
+                .collect::<Vec<_>>();

-                if !self
-                    .handles
-                    .config
-                    .status_codes
-                    .contains(&response.status().as_u16())
-                {
-                    // if the response code isn't one that's accepted via -s values, then skip to the next
-                    //
-                    // the default value for -s is all status codes, so unless the user says otherwise
-                    // this won't fire
+                if responses.len() < 2 {
+                    // don't have enough responses to make a determination, continue to next extension
+                    log::debug!("not enough responses to make a determination");
                    continue;
                }

-                let ferox_response = FeroxResponse::from(
-                    response,
-                    &ferox_url.target,
-                    method,
+                // check the responses for similarities on which we can filter, multiple may be returned
+                let Some((wildcard_filters, wildcard_responses)) = self.examine_404_like_responses(&responses) else {
+                    // no match was found during analysis of responses
+                    log::warn!("no match found for 404 responses");
+                    continue;
+                };
+
+                // report to the user, if appropriate
+                if matches!(
                    self.handles.config.output_level,
-                )
-                .await;
+                    OutputLevel::Default | OutputLevel::Quiet
+                ) {
+                    // sentry value to control whether or not to print the filter
+                    // used because we only want to print the same filter once
+                    let mut print_sentry;

-                responses.push(ferox_response);
-            }
+                    if let Ok(filters) = self.handles.filters.data.filters.read() {
+                        for new_wildcard in &wildcard_filters {
+                            // reset the sentry for every new wildcard produced by examine_404_like_responses
+                            print_sentry = true;

-            if responses.len() < 2 {
-                // don't have enough responses to make a determination, continue to next method
-                responses.clear();
-                continue;
-            }
+                            for other in filters.iter() {
+                                if let Some(other_wildcard) =
+                                    other.as_any().downcast_ref::<WildcardFilter>()
+                                {
+                                    // check the new wildcard against all existing wildcards, if it was added
+                                    // on the cli or by a previous directory, don't print it
+                                    if new_wildcard.as_ref() == other_wildcard {
+                                        print_sentry = false;
+                                        break;
+                                    }
+                                }
+                            }

-            // Command::AddFilter, &str (bytes/words/lines), usize (i.e. length associated with the type)
-            let Some(filter) = self.examine_404_like_responses(&responses) else {
-                // no match was found during analysis of responses
-                responses.clear();
-                continue;
-            };
-
-            // report to the user, if appropriate
-            if matches!(
-                self.handles.config.output_level,
-                OutputLevel::Default | OutputLevel::Quiet
-            ) {
-                // sentry value to control whether or not to print the filter
-                // used because we only want to print the same filter once
-                let mut print_sentry = true;
-
-                if let Ok(filters) = self.handles.filters.data.filters.read() {
-                    for other in filters.iter() {
-                        if let Some(other_wildcard) =
-                            other.as_any().downcast_ref::<WildcardFilter>()
-                        {
-                            if &*filter == other_wildcard {
-                                print_sentry = false;
-                                break;
+                            // if we're here, we've found a new wildcard that we didn't previously display, print it
+                            if print_sentry {
+                                ferox_print(&format!("{}", new_wildcard), &PROGRESS_PRINTER);
                            }
                        }
                    }
                }

-                if print_sentry {
-                    ferox_print(&format!("{}", filter), &PROGRESS_PRINTER);
+                // create the new filters
+                for wildcard in wildcard_filters {
+                    self.handles.filters.send(Command::AddFilter(wildcard))?;
+                }
+
+                // if we're here, we've detected a 404-like response pattern, and we're already filtering for size/word/line
+                //
+                // in addition, we'll create a similarity filter as a fallback
+                for resp in wildcard_responses {
+                    let hash = SIM_HASHER.create_signature(preprocess(resp.text()).iter());
+
+                    let sim_filter = SimilarityFilter {
+                        hash,
+                        original_url: resp.url().to_string(),
+                    };
+
+                    self.handles
+                        .filters
+                        .send(Command::AddFilter(Box::new(sim_filter)))?;
+
+                    if resp.is_directory() {
+                        // response is either a 3XX with a Location header that matches url + '/'
+                        // or it's a 2XX that ends with a '/'
+                        // or it's a 403 that ends with a '/'
+
+                        // set the wildcard flag to true, so we can check it when preventing
+                        // recursion in event_handlers/scans.rs
+
+                        // we'd need to clone the response to give ownership to the global list anyway
+                        // so we'll also use that clone to set the wildcard flag
+                        let mut cloned_resp = resp.clone();
+
+                        cloned_resp.set_wildcard(true);
+
+                        // add the response to the global list of responses
+                        RESPONSES.insert(cloned_resp);
+
+                        // function-internal magic number, indicates that we've detected a wildcard directory
+                        req_counter += 100;
+                    }
                }
            }
-
-            // create the new filter
-            self.handles.filters.send(Command::AddFilter(filter))?;
-
-            // if we're here, we've detected a 404-like response pattern, and we're already filtering for size/word/line
-            //
-            // in addition, we'll create a similarity filter as a fallback
-            let hash = SIM_HASHER.create_signature(preprocess(responses[0].text()).iter());
-
-            let sim_filter = SimilarityFilter {
-                hash,
-                original_url: responses[0].url().to_string(),
-            };
-
-            self.handles
-                .filters
-                .send(Command::AddFilter(Box::new(sim_filter)))?;
-
-            if responses[0].is_directory() {
-                // response is either a 3XX with a Location header that matches url + '/'
-                // or it's a 2XX that ends with a '/'
-                // or it's a 403 that ends with a '/'
-
-                // set the wildcard flag to true, so we can check it when preventing
-                // recursion in event_handlers/scans.rs
-                responses[0].set_wildcard(true);
-
-                // add the response to the global list of responses
-                RESPONSES.insert(responses[0].clone());
-
-                // function-internal magic number, indicates that we've detected a wildcard directory
-                req_counter += 100;
-            }
-
-            // reset the responses for the next method, if it exists
-            responses.clear();
        }

        log::trace!("exit: detect_404_like_responses");

-        let retval = if req_counter > 100 {
+        let retval = if req_counter >= 100 {
            WildcardResult::WildcardDirectory(req_counter)
        } else {
            WildcardResult::FourOhFourLike(req_counter)
@@ -416,96 +465,138 @@ impl HeuristicTests {
        Ok(Some(retval))
    }

-    /// for all responses, examine chars/words/lines
-    /// if all responses respective lengths match each other, we can assume
-    /// that will remain true for subsequent non-existent urls
+    /// for all responses, group them by status code, then examine chars/words/lines.
+    /// if all responses' respective lengths within a status code grouping match
+    /// each other, we can assume that will remain true for subsequent non-existent urls
    ///
-    /// values are examined from most to least specific (content length, word count, line count)
-    fn examine_404_like_responses(
+    /// within a status code grouping, values are examined from most to
+    /// least specific (content length, word count, line count)
+    #[allow(clippy::vec_box)] // the box is needed in the caller and i dont feel like changing it
+    fn examine_404_like_responses<'a>(
        &self,
-        responses: &[FeroxResponse],
-    ) -> Option<Box<WildcardFilter>> {
+        responses: &'a [FeroxResponse],
+    ) -> Option<(Vec<Box<WildcardFilter>>, Vec<&'a FeroxResponse>)> {
+        // aside from word/line/byte counts, additional discriminators are status code,
+        // extension, and request method. The request method and extension are handled by
+        // the caller, since they're part of the request and make up the nested for loops
+        // in detect_404_like_responses.
+        //
+        // The status code is handled here, since it's part of the response to catch cases
+        // where we have something like a 403 and a 404
+
        let mut size_sentry = true;
        let mut word_sentry = true;
        let mut line_sentry = true;

-        let method = responses[0].method();
-        let status_code = responses[0].status();
-        let content_length = responses[0].content_length();
-        let word_count = responses[0].word_count();
-        let line_count = responses[0].line_count();
+        // returned vec of boxed wildcard filters
+        let mut wildcards = Vec::new();

-        for response in &responses[1..] {
-            // if any of the responses differ in length, that particular
-            // response length type is no longer a candidate for filtering
-            if response.content_length() != content_length {
-                size_sentry = false;
-            }
+        // returned vec of ferox responses that are needed for additional
+        // analysis
+        let mut wild_responses = Vec::new();

-            if response.word_count() != word_count {
-                word_sentry = false;
-            }
+        // mapping of status code to its group of responses
+        let mut grouped_responses = HashMap::new();

-            if response.line_count() != line_count {
-                line_sentry = false;
-            }
+        // iterate over all responses and add each response to its
+        // corresponding status code group
+        for response in responses {
+            grouped_responses
+                .entry(response.status())
+                .or_insert_with(Vec::new)
+                .push(response);
        }

-        if !size_sentry && !word_sentry && !line_sentry {
-            // none of the response lengths match, so we can't filter on any of them
-            return None;
+        // iterate over each grouped response and determine the most specific
+        // filter that can be applied to all responses in the group, i.e.
+        // start from byte count and work 'out' to line count
+        for response_group in grouped_responses.values() {
+            if response_group.len() < 2 {
+                // not enough responses to make a determination
+                continue;
+            }
+
+            let method = response_group[0].method();
+            let status_code = response_group[0].status();
+            let content_length = response_group[0].content_length();
+            let word_count = response_group[0].word_count();
+            let line_count = response_group[0].line_count();
+
+            for response in &response_group[1..] {
+                // if any of the responses differ in length, that particular
+                // response length type is no longer a candidate for filtering
+                if response.content_length() != content_length {
+                    size_sentry = false;
+                }
+
+                if response.word_count() != word_count {
+                    word_sentry = false;
+                }
+
+                if response.line_count() != line_count {
+                    line_sentry = false;
+                }
+            }
+
+            if !size_sentry && !word_sentry && !line_sentry {
+                // none of the response lengths match, so we can't filter on any of them
+                continue;
+            }
+
+            let mut wildcard = WildcardFilter {
+                content_length: None,
+                line_count: None,
+                word_count: None,
+                method: method.to_string(),
+                status_code: status_code.as_u16(),
+                dont_filter: self.handles.config.dont_filter,
+            };
+
+            match (size_sentry, word_sentry, line_sentry) {
+                (true, true, true) => {
+                    // all three types of length match, so we can filter on all of them
+                    wildcard.content_length = Some(content_length);
+                    wildcard.word_count = Some(word_count);
+                    wildcard.line_count = Some(line_count);
+                }
+                (true, true, false) => {
+                    // content length and word count match, so we can filter on either
+                    wildcard.content_length = Some(content_length);
+                    wildcard.word_count = Some(word_count);
+                }
+                (true, false, true) => {
+                    // content length and line count match, so we can filter on either
+                    wildcard.content_length = Some(content_length);
+                    wildcard.line_count = Some(line_count);
+                }
+                (false, true, true) => {
+                    // word count and line count match, so we can filter on either
+                    wildcard.word_count = Some(word_count);
+                    wildcard.line_count = Some(line_count);
+                }
+                (true, false, false) => {
+                    // content length matches, so we can filter on that
+                    wildcard.content_length = Some(content_length);
+                }
+                (false, true, false) => {
+                    // word count matches, so we can filter on that
+                    wildcard.word_count = Some(word_count);
+                }
+                (false, false, true) => {
+                    // line count matches, so we can filter on that
+                    wildcard.line_count = Some(line_count);
+                }
+                (false, false, false) => {
+                    // none of the length types match, so we can't filter on any of them
+                    unreachable!("no wildcard size matches; handled by the if statement above");
+                }
+            };
+
+            wild_responses.push(response_group[0]);
+            wildcards.push(Box::new(wildcard));
        }

-        let mut wildcard = WildcardFilter {
-            content_length: None,
-            line_count: None,
-            word_count: None,
-            method: method.to_string(),
-            status_code: status_code.as_u16(),
-            dont_filter: self.handles.config.dont_filter,
-        };
-
-        match (size_sentry, word_sentry, line_sentry) {
-            (true, true, true) => {
-                // all three types of length match, so we can't filter on any of them
-                wildcard.content_length = Some(content_length);
-                wildcard.word_count = Some(word_count);
-                wildcard.line_count = Some(line_count);
-            }
-            (true, true, false) => {
-                // content length and word count match, so we can filter on either
-                wildcard.content_length = Some(content_length);
-                wildcard.word_count = Some(word_count);
-            }
-            (true, false, true) => {
-                // content length and line count match, so we can filter on either
-                wildcard.content_length = Some(content_length);
-                wildcard.line_count = Some(line_count);
-            }
-            (false, true, true) => {
-                // word count and line count match, so we can filter on either
-                wildcard.word_count = Some(word_count);
-                wildcard.line_count = Some(line_count);
-            }
-            (true, false, false) => {
-                // content length matches, so we can filter on that
-                wildcard.content_length = Some(content_length);
-            }
-            (false, true, false) => {
-                // word count matches, so we can filter on that
-                wildcard.word_count = Some(word_count);
-            }
-            (false, false, true) => {
-                // line count matches, so we can filter on that
-                wildcard.line_count = Some(line_count);
-            }
-            (false, false, false) => {
-                // none of the length types match, so we can't filter on any of them
-                unreachable!("no wildcard size matches; handled by the if statement above");
-            }
-        };
-
-        Some(Box::new(wildcard))
+        Some((wildcards, wild_responses))
    }
 }

--- a/src/main.rs
+++ b/src/main.rs
@@ -31,7 +31,7 @@ use feroxbuster::{
        TermOutHandler, SCAN_COMPLETE,
    },
    filters, heuristics, logger,
-    progress::{PROGRESS_BAR, PROGRESS_PRINTER},
+    progress::PROGRESS_PRINTER,
    scan_manager::{self, ScanType},
    scanner,
    utils::{fmt_err, slugify_filename},
@@ -220,7 +220,6 @@ async fn wrapped_main(config: Arc<Configuration>) -> Result<()> {
        // PROGRESS_PRINTER and PROGRESS_BAR have been used at least once.  This call satisfies
        // that constraint
        PROGRESS_PRINTER.println("");
-        PROGRESS_BAR.join().unwrap();
    });

    // check if update_app is true
--- a/src/progress.rs
+++ b/src/progress.rs
@@ -1,4 +1,6 @@
-use indicatif::{MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle};
+use std::time::Duration;
+
+use indicatif::{HumanDuration, MultiProgress, ProgressBar, ProgressDrawTarget, ProgressStyle};
 use lazy_static::lazy_static;

 lazy_static! {
@@ -31,30 +33,68 @@ pub enum BarType {
 /// Add an [indicatif::ProgressBar](https://docs.rs/indicatif/latest/indicatif/struct.ProgressBar.html)
 /// to the global [PROGRESS_BAR](../config/struct.PROGRESS_BAR.html)
 pub fn add_bar(prefix: &str, length: u64, bar_type: BarType) -> ProgressBar {
-    let mut style = ProgressStyle::default_bar().progress_chars("#>-");
+    let mut style = ProgressStyle::default_bar()
+        .progress_chars("#>-")
+        .with_key(
+            "smoothed_per_sec",
+            |state: &indicatif::ProgressState, w: &mut dyn std::fmt::Write| match (
+                state.pos(),
+                state.elapsed().as_millis(),
+            ) {
+                // https://github.com/console-rs/indicatif/issues/394#issuecomment-1309971049
+                //
+                // indicatif released a change to how they reported eta/per_sec
+                // and the results looked really weird based on how we use the progress
+                // bars. this fixes that
+                (pos, elapsed_ms) if elapsed_ms > 0 => {
+                    write!(w, "{:.0}/s", pos as f64 * 1000_f64 / elapsed_ms as f64).unwrap()
+                }
+                _ => write!(w, "-").unwrap(),
+            },
+        )
+        .with_key(
+            "smoothed_eta",
+            |state: &indicatif::ProgressState, w: &mut dyn std::fmt::Write| match (
+                state.pos(),
+                state.len(),
+            ) {
+                (pos, Some(len)) => write!(
+                    w,
+                    "{:#}",
+                    HumanDuration(Duration::from_millis(
+                        (state.elapsed().as_millis()
+                            * ((len as u128).checked_sub(pos as u128).unwrap_or(1))
+                                .checked_div(pos as u128)
+                                .unwrap_or(1)) as u64
+                    ))
+                )
+                .unwrap(),
+                _ => write!(w, "-").unwrap(),
+            },
+        );

    style = match bar_type {
-        BarType::Hidden => style.template(""),
-        BarType::Default => style.template(
-            "[{bar:.cyan/blue}] - {elapsed:<4} {pos:>7}/{len:7} {per_sec:7} {prefix} {msg}",
-        ),
-        BarType::Message => style.template(&format!(
+        BarType::Hidden => style.template("").unwrap(),
+        BarType::Default => style
+            .template("[{bar:.cyan/blue}] - {elapsed:<4} {pos:>7}/{len:7} {smoothed_per_sec:7} {prefix} {msg}")
+            .unwrap(),
+        BarType::Message => style
+            .template(&format!(
            "[{{bar:.cyan/blue}}] - {{elapsed:<4}} {{pos:>7}}/{{len:7}} {:7} {{prefix}} {{msg}}",
            "-"
-        )),
-        BarType::Total => {
-            style.template("[{bar:.yellow/blue}] - {elapsed:<4} {pos:>7}/{len:7} {eta:7} {msg}")
-        }
-        BarType::Quiet => style.template("Scanning: {prefix}"),
+        ))
+            .unwrap(),
+        BarType::Total => style
+            .template("[{bar:.yellow/blue}] - {elapsed:<4} {pos:>7}/{len:7} {smoothed_eta:7} {msg}")
+            .unwrap(),
+        BarType::Quiet => style.template("Scanning: {prefix}").unwrap(),
    };

-    let progress_bar = PROGRESS_BAR.add(ProgressBar::new(length));
-
-    progress_bar.set_style(style);
-
-    progress_bar.set_prefix(prefix);
-
-    progress_bar
+    PROGRESS_BAR.add(
+        ProgressBar::new(length)
+            .with_style(style)
+            .with_prefix(prefix.to_string()),
+    )
 }

 #[cfg(test)]
--- a/src/response.rs
+++ b/src/response.rs
@@ -21,7 +21,7 @@ use crate::{
    event_handlers::{Command, Handles},
    traits::FeroxSerialize,
    url::FeroxUrl,
-    utils::{self, fmt_err, status_colorizer},
+    utils::{self, fmt_err, parse_url_with_raw_path, status_colorizer},
    CommandSender,
 };

@@ -140,7 +140,7 @@ impl FeroxResponse {

    /// Set `FeroxResponse`'s `url` attribute, has no affect if an error occurs
    pub fn set_url(&mut self, url: &str) {
-        match Url::parse(url) {
+        match parse_url_with_raw_path(url) {
            Ok(url) => {
                self.url = url;
            }
@@ -170,7 +170,8 @@ impl FeroxResponse {

    /// free the `text` data, reducing memory usage
    pub fn drop_text(&mut self) {
-        self.text = String::new();
+        self.text.clear(); // length is set to 0
+        self.text.shrink_to_fit(); // allocated capacity shrinks to reflect the new size
    }

    /// Make a reasonable guess at whether the response is a file or not
@@ -394,7 +395,14 @@ impl FeroxResponse {
    pub fn send_report(self, report_sender: CommandSender) -> Result<()> {
        log::trace!("enter: send_report({:?}", report_sender);

-        report_sender.send(Command::Report(Box::new(self)))?;
+        // there's no reason to send the response body across the mpsc
+        //
+        // the only possible reason is for filtering on the body, but both `send_report`
+        // calls are gated behind checks for `should_filter_response`
+        let mut me = self;
+        me.drop_text();
+
+        report_sender.send(Command::Report(Box::new(me)))?;

        log::trace!("exit: send_report");
        Ok(())
@@ -591,7 +599,7 @@ impl<'de> Deserialize<'de> for FeroxResponse {
            match key.as_str() {
                "url" => {
                    if let Some(url) = value.as_str() {
-                        if let Ok(parsed) = Url::parse(url) {
+                        if let Ok(parsed) = parse_url_with_raw_path(url) {
                            response.url = parsed;
                        }
                    }
--- a/src/scan_manager/scan.rs
+++ b/src/scan_manager/scan.rs
@@ -159,7 +159,7 @@ impl FeroxScan {
                if pb.position() > self.num_requests {
                    pb.finish()
                } else {
-                    pb.finish_at_current_pos()
+                    pb.abandon()
                }
            }
        }
--- a/src/scan_manager/scan_container.rs
+++ b/src/scan_manager/scan_container.rs
@@ -379,7 +379,7 @@ impl FeroxScans {
                    .unwrap_or_else(|e| log::warn!("Could not cancel task: {}", e));

                let pb = selected.progress_bar();
-                num_cancelled += pb.length() as usize - pb.position() as usize;
+                num_cancelled += pb.length().unwrap_or(0) as usize - pb.position() as usize;
            } else {
                self.menu.println("Ok, doing nothing...");
            }
--- a/src/scan_manager/tests.rs
+++ b/src/scan_manager/tests.rs
@@ -72,7 +72,7 @@ fn add_url_to_list_of_scanned_urls_with_known_url() {
        url,
        ScanType::Directory,
        ScanOrder::Latest,
-        pb.length(),
+        pb.length().unwrap(),
        OutputLevel::Default,
        Some(pb),
    );
@@ -94,7 +94,7 @@ fn stop_progress_bar_stops_bar() {
        url,
        ScanType::Directory,
        ScanOrder::Latest,
-        pb.length(),
+        pb.length().unwrap(),
        OutputLevel::Default,
        Some(pb),
    );
@@ -152,7 +152,7 @@ async fn call_display_scans() {
        url,
        ScanType::Directory,
        ScanOrder::Latest,
-        pb.length(),
+        pb.length().unwrap(),
        OutputLevel::Default,
        Some(pb),
    );
@@ -160,7 +160,7 @@ async fn call_display_scans() {
        url_two,
        ScanType::Directory,
        ScanOrder::Latest,
-        pb_two.length(),
+        pb_two.length().unwrap(),
        OutputLevel::Default,
        Some(pb_two),
    );
--- a/src/scanner/ferox_scanner.rs
+++ b/src/scanner/ferox_scanner.rs
@@ -203,6 +203,9 @@ impl FeroxScanner {
        log::info!("Starting scan against: {}", self.target_url);

        let mut scan_timer = Instant::now();
+        // every time we extract links we'll need to await the task to make sure
+        // it completes before the scan ends
+        let mut extraction_tasks = Vec::new();

        if self.handles.config.extract_links && matches!(self.order, ScanOrder::Initial) {
            // check for robots.txt (cannot be in sub-directories, so limited to Initial)
@@ -213,7 +216,7 @@ impl FeroxScanner {
                .build()?;

            let result = extractor.extract().await?;
-            extractor.request_links(result).await?;
+            extraction_tasks.push(extractor.request_links(result).await?)
        }

        let scanned_urls = self.handles.ferox_scans()?;
@@ -265,7 +268,7 @@ impl FeroxScanner {

                        let result = extractor.extract_from_dir_listing().await?;

-                        extractor.request_links(result).await?;
+                        extraction_tasks.push(extractor.request_links(result).await?);

                        log::trace!("exit: scan_url -> Directory listing heuristic");

@@ -276,19 +279,27 @@ impl FeroxScanner {

                        self.handles.stats.send(SubtractFromUsizeField(
                            TotalExpected,
-                            progress_bar.length() as usize,
+                            progress_bar.length().unwrap_or(0) as usize,
                        ))?;
                    }

                    let mut message = format!("=> {}", style("Directory listing").blue().bright());

                    if !self.handles.config.extract_links {
-                        write!(message, " (add {} to scan)", style("-e").bright().yellow())?;
+                        write!(
+                            message,
+                            " (remove {} to scan)",
+                            style("--dont-extract-links").bright().yellow()
+                        )?;
                    }

                    if !self.handles.config.force_recursion {
+                        for handle in extraction_tasks.into_iter().flatten() {
+                            _ = handle.await;
+                        }
+
                        progress_bar.reset_eta();
-                        progress_bar.finish_with_message(&message);
+                        progress_bar.finish_with_message(message);

                        ferox_scan.finish()?;

@@ -313,7 +324,7 @@ impl FeroxScanner {
                        style("Wildcard").blue().bright(),
                        style("stopped").red()
                    );
-                    progress_bar.set_message(&message);
+                    progress_bar.set_message(message);
                    progress_bar.inc(num_reqs as u64);
                }
                Some(WildcardResult::FourOhFourLike(num_reqs)) => {
@@ -340,7 +351,7 @@ impl FeroxScanner {
            let new_words = TF_IDF.read().unwrap().all_words();
            let new_words_len = new_words.len();

-            let cur_length = progress_bar.length();
+            let cur_length = progress_bar.length().unwrap_or(0);
            let new_length = cur_length + new_words_len as u64;

            progress_bar.set_length(new_length);
@@ -370,6 +381,10 @@ impl FeroxScanner {
            scan_timer.elapsed().as_secs_f64(),
        ))?;

+        for handle in extraction_tasks.into_iter().flatten() {
+            _ = handle.await;
+        }
+
        ferox_scan.finish()?;

        log::trace!("exit: scan_url");
--- a/src/scanner/requester.rs
+++ b/src/scanner/requester.rs
@@ -217,7 +217,7 @@ impl Requester {

                    self.ferox_scan
                        .progress_bar()
-                        .set_message(&format!("=> 🚦 {styled_direction} scan speed",));
+                        .set_message(format!("=> 🚦 {styled_direction} scan speed",));
                }
                self.policy_data.set_errors(scan_errors);
            } else {
@@ -230,7 +230,7 @@ impl Requester {

                self.ferox_scan
                    .progress_bar()
-                    .set_message(&format!("=> 🚦 {styled_direction} scan speed",));
+                    .set_message(format!("=> 🚦 {styled_direction} scan speed",));
            }
        }

@@ -286,7 +286,7 @@ impl Requester {
            self.set_rate_limiter(Some(new_limit)).await?;
            self.ferox_scan
                .progress_bar()
-                .set_message(&format!("=> 🚦 set rate limit ({new_limit}/s)"));
+                .set_message(format!("=> 🚦 set rate limit ({new_limit}/s)"));
        }

        self.adjust_limit(trigger, true).await?;
@@ -321,11 +321,11 @@ impl Requester {

            // figure out how many requests are skipped as a result
            let pb = self.ferox_scan.progress_bar();
-            let num_skipped = pb.length().saturating_sub(pb.position()) as usize;
+            let num_skipped = pb.length().unwrap_or(0).saturating_sub(pb.position()) as usize;

            let styled_trigger = style(format!("{trigger:?}")).red();

-            pb.set_message(&format!(
+            pb.set_message(format!(
                "=> 💀 too many {} ({}) 💀 bailing",
                styled_trigger,
                self.ferox_scan.num_errors(trigger),
@@ -490,6 +490,7 @@ impl Requester {
                        .target(ExtractionTarget::ResponseBody)
                        .response(&ferox_response)
                        .handles(self.handles.clone())
+                        .url(self.ferox_scan.url())
                        .build()?;

                    let new_links: HashSet<_>;
@@ -513,7 +514,11 @@ impl Requester {
                    }

                    if !new_links.is_empty() {
-                        extractor.request_links(new_links).await?;
+                        let extraction_task = extractor.request_links(new_links).await?;
+
+                        if let Some(task) = extraction_task {
+                            _ = task.await;
+                        }
                    }
                }

--- a/src/url.rs
+++ b/src/url.rs
@@ -1,3 +1,4 @@
+use crate::utils::parse_url_with_raw_path;
 use crate::{event_handlers::Handles, statistics::StatError::UrlFormat, Command::AddError};
 use anyhow::{anyhow, bail, Result};
 use reqwest::Url;
@@ -142,19 +143,19 @@ impl FeroxUrl {
            word = word.trim_start_matches('/').to_string();
        };

-        let base_url = Url::parse(&url)?;
-        let joined = base_url.join(&word)?;
+        let base_url = parse_url_with_raw_path(&url)?;
+        let mut joined = base_url.join(&word)?;

-        if self.handles.config.queries.is_empty() {
-            // no query params to process
-            log::trace!("exit: format -> {}", joined);
-            Ok(joined)
-        } else {
-            let with_params =
-                Url::parse_with_params(joined.as_str(), &self.handles.config.queries)?;
-            log::trace!("exit: format_url -> {}", with_params);
-            Ok(with_params) // request with params attached
+        if !self.handles.config.queries.is_empty() {
+            // query_pairs_mut() adds a '?' to the url as soon as it's called, whether or not any
+            // pairs are appended, so it's only called when there are queries to add
+            joined
+                .query_pairs_mut()
+                .extend_pairs(self.handles.config.queries.iter());
        }
+
+        log::trace!("exit: format_url -> {}", joined);
+        Ok(joined)
    }

    /// Simple helper to abstract away adding a forward-slash to a url if not present
@@ -189,7 +190,7 @@ impl FeroxUrl {

        let target = self.normalize();

-        let parsed = Url::parse(&target)?;
+        let parsed = parse_url_with_raw_path(&target)?;
        let parts = parsed
            .path_segments()
            .ok_or_else(|| anyhow!("No path segments found"))?;
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -75,7 +75,12 @@ pub(crate) async fn send_try_recursion_command(
    handles: Arc<Handles>,
    response: FeroxResponse,
 ) -> Result<()> {
-    handles.send_scan_command(Command::TryRecursion(Box::new(response.clone())))?;
+    // make the response mutable so we can drop the body before
+    // sending it over the mpsc
+    let mut response = response;
+    response.drop_text();
+
+    handles.send_scan_command(Command::TryRecursion(Box::new(response)))?;
    let (tx, rx) = oneshot::channel::<bool>();
    handles.send_scan_command(Command::Sync(tx))?;
    rx.await?;
@@ -420,9 +425,14 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
        // current deny-url, now we just need to check to see if this deny-url is a parent
        // to a scanned url that is also a parent of the given url
        for ferox_scan in handles.ferox_scans()?.get_active_scans() {
-            let scanner = Url::parse(ferox_scan.url().trim_end_matches('/'))
+            let scanner = parse_url_with_raw_path(ferox_scan.url().trim_end_matches('/'))
                .with_context(|| format!("Could not parse {ferox_scan} as a url"))?;

+            // by calling the new parse_url_with_raw_path, and reaching this point without an
+            // error, we know we have an authority and therefore a host. leaving the code
+            // below, but we should never hit the else condition. leaving it in so if we find
+            // a case where i'm mistaken, we'll know about it and can address it
+
            if let Some(scan_host) = scanner.host() {
                // same domain/ip check we perform on the denier above
                if tested_host != scan_host {
@@ -431,7 +441,7 @@ fn should_deny_absolute(url_to_test: &Url, denier: &Url, handles: Arc<Handles>)
                }
            } else {
                // couldn't process .host from scanner
-                continue;
+                unreachable!("should_deny_absolute: scanner.host() returned None, which shouldn't be possible");
            };

            let scan_path = scanner.path();
@@ -482,7 +492,7 @@ pub fn should_deny_url(url: &Url, handles: Arc<Handles>) -> Result<bool> {

    // normalization for comparison is to remove the trailing / if one exists, this is done for
    // the given url and any url to which it's compared
-    let normed_url = Url::parse(url.to_string().trim_end_matches('/'))?;
+    let normed_url = parse_url_with_raw_path(url.to_string().trim_end_matches('/'))?;

    for denier in &handles.config.url_denylist {
        // note to self: it may seem as though we can use regex only for --dont-scan, however, in
@@ -532,6 +542,187 @@ pub fn slugify_filename(url: &str, prefix: &str, suffix: &str) -> String {
    filename
 }

+/// This function takes a url string and returns a `url::Url`
+///
+/// It is primarily used to detect url paths that `url::Url::parse` will
+/// silently transform, such as /path/../file.html -> /file.html
+///
+/// When the raw path can't be isolated from `url`, or the un-normalized url
+/// can't be rebuilt, the normally parsed (normalized) url is returned instead
+///
+/// # Errors
+///
+/// Returns an error if `url` can't be parsed, or if it parses but has no
+/// authority or no host (mailto:, tel:, relative urls, etc...)
+///
+/// # Warning
+///
+/// In the instance of a url with encoded path traversal strings, such as
+/// /path/%2e%2e/file.html, the %-signs are encoded a second time and the
+/// returned url's path is /path/%252e%252e/file.html
+pub fn parse_url_with_raw_path(url: &str) -> Result<Url> {
+    log::trace!("enter: parse_url_with_raw_path({})", url);
+
+    let parsed = Url::parse(url)?;
+
+    if !parsed.has_authority() {
+        // parsed correctly, but no authority, meaning mailto: or tel: or
+        // some other url that we don't care about
+        bail!("url to parse has no authority and is therefore invalid");
+    }
+
+    // we have a valid url, the next step is to check the path and see if it's
+    // something that url::Url::parse would silently transform
+    //
+    // i.e. if the path is /path/../file.html, url::Url::parse will transform it
+    // to /file.html, which is not what we want
+
+    // per RFC 3986, the authority is defined as:
+    // - authority = [ userinfo "@" ] host [ ":" port ]
+    //
+    // so the farthest right authority component is either the port or the host
+    //
+    // i.e. in http://example.com:8080/path/file.html, the farthest right authority
+    // component is :8080
+    //
+    // in http://example.com/path/file.html, the farthest right authority component
+    // is example.com
+    //
+    // the farthest right authority component is used to split the url into two
+    // parts: the part before the authority and the part after the authority
+    let farthest_right_authority_part = if let Some(port) = parsed.port() {
+        // the url has a port, so the port is the farthest right authority component
+        format!(":{}", port)
+    } else if let Some(host) = parsed.host_str() {
+        // no port, so the host is the farthest right authority component
+        host.to_owned()
+    } else {
+        // an authority can be present while the host is absent (presumably
+        // something like file:///path), so report it instead of panicking
+        bail!("url to parse has no host and is therefore invalid");
+    };
+
+    // split the original url string into two parts: the part before the authority and the part
+    // after the authority (i.e. the path + query + fragment)
+    let Some((_, after_authority)) = url.split_once(&farthest_right_authority_part) else {
+        // if we can't split the url string into two parts, then the url doesn't conform to our
+        // expectations, and we can't continue processing it, so we'll return the parsed url
+        return Ok(parsed);
+    };
+
+    // when there is a port, but it matches the default port for the scheme,
+    // url::Url::parse will mark the port as None, giving us a
+    // `after_authority` that looks something like this:
+    // - :80/path/file.html
+    //
+    // only a port at the very start is removed; a ':80' or ':443' found later
+    // on belongs to the path/query/fragment and has to be left alone
+    let after_authority = match after_authority.strip_prefix(':') {
+        Some(rest) => rest.trim_start_matches(|c: char| c.is_ascii_digit()),
+        None => after_authority,
+    };
+
+    // snippets from rfc-3986:
+    //
+    //          foo://example.com:8042/over/there?name=ferret#nose
+    //          \_/   \______________/\_________/ \_________/ \__/
+    //           |           |            |            |        |
+    //        scheme     authority       path        query   fragment
+    //
+    // The path component is terminated
+    //    by the first question mark ("?") or number sign ("#") character, or
+    //    by the end of the URI.
+    //
+    // The query component is indicated by the first question
+    //    mark ("?") character and terminated by a number sign ("#") character
+    //    or by the end of the URI.
+    //
+    // so the path ends at whichever of '?' or '#' shows up first; checking for
+    // '?' alone would pull a fragment into the path for urls like /a/..#b?c
+    let path = after_authority
+        .split(|c: char| matches!(c, '?' | '#'))
+        .next()
+        .unwrap_or(after_authority);
+
+    // each of the following is a string that we can expect url::Url::parse to
+    // transform. The variety is to ensure we cover most common path traversal
+    // encodings
+    let transformation_detectors = [
+        // ascii
+        "..",
+        // single url encoded
+        "%2e%2e",
+        // double url encoded
+        "%25%32%65%25%32%65",
+        // utf-8 encoded
+        "%c0%ae%c0%ae",
+        "%e0%40%ae%e0%40%ae",
+        "%c0ae%c0ae",
+        // 16 bit shenanigans
+        "%uff0e%uff0e",
+        "%u002e%u002e",
+    ];
+
+    // lowercase the path once, rather than once per detector
+    let lowered_path = path.to_lowercase();
+    let parsing_will_transform_path = transformation_detectors
+        .iter()
+        .any(|detector| lowered_path.contains(detector));
+
+    if !parsing_will_transform_path {
+        // there's no string in the path of the url that will trigger a transformation
+        // so, we can return it as-is
+        return Ok(parsed);
+    }
+
+    // the path contains a string that will trigger a transformation, so we need to
+    // manually create a Url that doesn't have the transformation and return that
+    //
+    // special thanks to github user @lavafroth for this workaround
+
+    // according to from_file_path docs:
+    //   from_file_path returns `Err` if the given path is not absolute or,
+    //   on Windows, if the prefix is not a disk prefix (e.g. `C:`) or a UNC prefix (`\\`).
+    //
+    // on windows, the path is fixed up with a placeholder prefix before conversion
+    #[cfg(target_os = "windows")]
+    let fs_path = format!("\\/IGNOREME{path}");
+    #[cfg(not(target_os = "windows"))]
+    let fs_path = path;
+
+    // from_file_path silently strips trailing slashes, and from_directory_path adds them,
+    // so we'll choose the constructor based on the presence of a path's trailing slash
+    let rebuilt = if path.ends_with('/') {
+        Url::from_directory_path(fs_path)
+    } else {
+        Url::from_file_path(fs_path)
+    };
+
+    // the raw path isn't guaranteed to be absolute, so a failed conversion
+    // falls back to the parsed url instead of panicking
+    let Ok(mut hacked_url) = rebuilt else {
+        return Ok(parsed);
+    };
+
+    // host must be set first, otherwise multiple components may return Err
+    hacked_url.set_host(parsed.host_str())?;
+    // scheme/port/username/password can fail; fall back to the parsed url if one does
+    let components_set = hacked_url.set_scheme(parsed.scheme()).is_ok()
+        && hacked_url.set_port(parsed.port()).is_ok()
+        && hacked_url.set_username(parsed.username()).is_ok()
+        && hacked_url.set_password(parsed.password()).is_ok();
+    if !components_set {
+        return Ok(parsed);
+    }
+
+    // query/fragment can't fail
+    hacked_url.set_query(parsed.query());
+    hacked_url.set_fragment(parsed.fragment());
+
+    log::trace!("exit: parse_url_with_raw_path -> {}", hacked_url);
+    Ok(hacked_url)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -539,31 +730,159 @@ mod tests {
    use crate::scan_manager::{FeroxScans, ScanOrder};

    #[test]
-    /// set_open_file_limit with a low requested limit succeeds
-    fn utils_set_open_file_limit_with_low_requested_limit() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        let lower_limit = hard - 1;
-        assert!(set_open_file_limit(lower_limit));
+    /// multiple tests for parse_url_with_raw_path
+    fn utils_parse_url_with_raw_path() {
+        // ../.. is preserved
+        let url = "https://www.google.com/../../stuff";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // ../.. is preserved as well as the trailing slash
+        let url = "https://www.google.com/../../stuff/";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // no trailing slash is preserved
+        let url = "https://www.google.com/stuff";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // trailing slash is preserved
+        let url = "https://www.google.com/stuff/";
+        let parsed: Url = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.as_str(), url);
+
+        // mailto is an error
+        let url = "mailto:user@example.com";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // relative url is an error
+        let url = "../../stuff";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // absolute without host is an error
+        let url = "/../../stuff";
+        let parsed = parse_url_with_raw_path(url);
+        assert!(parsed.is_err());
+
+        // default ports are parsed correctly
+        for url in [
+            "http://example.com:80/path/file.html",
+            "https://example.com:443/path/file.html",
+        ] {
+            let parsed = parse_url_with_raw_path(url).unwrap();
+            assert!(parsed.port().is_none());
+            assert_eq!(parsed.host().unwrap().to_string().as_str(), "example.com");
+        }
+
+        // non-default ports are parsed correctly
+        for url in [
+            "http://example.com:8080/path/file.html",
+            "https://example.com:4433/path/file.html",
+        ] {
+            let parsed = parse_url_with_raw_path(url).unwrap();
+            assert!(parsed.port().is_some());
+            assert_eq!(parsed.as_str(), url);
+        }
+
+        // different encodings are respected if found in doubles
+        //
+        // note that the % sign is encoded as %25...
+        let url = "http://user:pass@example.com/%2e%2e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%252e%252e/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%25%32%65%25%32%65/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%2525%2532%2565%2525%2532%2565/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%c0%ae%c0%ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25c0%25ae%25c0%25ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%e0%40%ae%e0%40%ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25e0%2540%25ae%25e0%2540%25ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%c0ae%c0ae/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25c0ae%25c0ae/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%uff0e%uff0e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25uff0e%25uff0e/stuff.php"
+        );
+
+        let url = "http://user:pass@example.com/%u002e%u002e/stuff.php";
+        let parsed = parse_url_with_raw_path(url).unwrap();
+        assert_eq!(parsed.username(), "user");
+        assert_eq!(parsed.password().unwrap(), "pass");
+        assert_eq!(
+            parsed.as_str(),
+            "http://user:pass@example.com/%25u002e%25u002e/stuff.php"
+        );
    }

-    #[test]
-    /// set_open_file_limit with a high requested limit succeeds
-    fn utils_set_open_file_limit_with_high_requested_limit() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        let higher_limit = hard + 1;
-        // calculate a new soft to ensure soft != hard and hit that logic branch
-        let new_soft = hard - 1;
-        setrlimit(Resource::NOFILE, new_soft, hard).unwrap();
-        assert!(set_open_file_limit(higher_limit));
-    }
+    #[cfg(not(target_os = "windows"))]
+    mod nix_only_tests {
+        use super::*;

-    #[test]
-    /// set_open_file_limit should fail when hard == soft
-    fn utils_set_open_file_limit_with_fails_when_both_limits_are_equal() {
-        let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
-        // calculate a new soft to ensure soft == hard and hit the failure logic branch
-        setrlimit(Resource::NOFILE, hard, hard).unwrap();
-        assert!(!set_open_file_limit(hard)); // returns false
+        #[test]
+        /// set_open_file_limit with a low requested limit succeeds
+        fn utils_set_open_file_limit_with_low_requested_limit() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            let lower_limit = hard - 1;
+            assert!(set_open_file_limit(lower_limit));
+        }
+
+        #[test]
+        /// set_open_file_limit with a high requested limit succeeds
+        fn utils_set_open_file_limit_with_high_requested_limit() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            let higher_limit = hard + 1;
+            // calculate a new soft to ensure soft != hard and hit that logic branch
+            let new_soft = hard - 1;
+            setrlimit(Resource::NOFILE, new_soft, hard).unwrap();
+            assert!(set_open_file_limit(higher_limit));
+        }
+
+        #[test]
+        /// set_open_file_limit should fail when hard == soft
+        fn utils_set_open_file_limit_with_fails_when_both_limits_are_equal() {
+            let (_, hard) = getrlimit(Resource::NOFILE).unwrap();
+            // calculate a new soft to ensure soft == hard and hit the failure logic branch
+            setrlimit(Resource::NOFILE, hard, hard).unwrap();
+            assert!(!set_open_file_limit(hard)); // returns false
+        }
    }

    #[test]
@@ -697,6 +1016,13 @@ mod tests {
    /// provide a denier from which we can't check a host, which results in no comparison, expect false
    /// because the denier is a parent to the tested, even tho the scanned doesn't compare, it
    /// still returns true
+    ///
+    /// note: adding parse_url_with_raw_path changed the behavior of this test, it used to return
+    /// true, now it returns false. see my note in should_deny_absolute and the unreachable!
+    /// call block to see why
+    ///
+    /// leaving this test here to document the behavior change and to catch regressions in the
+    /// new expected behavior
    fn should_deny_url_doesnt_compare_non_domains_in_scanned() {
        let deny_url = "https://testdomain.com/";
        let scan_url = "unix:/run/foo.socket";
@@ -710,8 +1036,7 @@ mod tests {
        let config = Arc::new(config);

        let handles = Arc::new(Handles::for_testing(Some(scans), Some(config)).0);
-
-        assert!(should_deny_url(&tested_url, handles).unwrap());
+        assert!(!should_deny_url(&tested_url, handles).unwrap());
    }

    #[test]
--- a/tests/test_heuristics.rs
+++ b/tests/test_heuristics.rs
@@ -164,7 +164,7 @@ fn test_static_wildcard_request_found() -> Result<(), Box<dyn std::error::Error>

    let mock = srv.mock(|when, then| {
        when.method(GET)
-            .path_matches(Regex::new("/[a-zA-Z0-9]{32}/").unwrap());
+            .path_matches(Regex::new("/[.a-zA-Z0-9]{32,}/").unwrap());
        then.status(200).body("this is a test");
    });

@@ -188,7 +188,8 @@ fn test_static_wildcard_request_found() -> Result<(), Box<dyn std::error::Error>
            .and(predicate::str::contains("1l")),
    );

-    assert_eq!(mock.hits(), 1);
+    assert_eq!(mock.hits(), 6);
+
    Ok(())
 }

@@ -305,11 +306,67 @@ fn heuristics_wildcard_test_with_two_static_wildcards_with_silent_enabled(
        .success()
        .stdout(predicate::str::contains(srv.url("/")));

-    assert_eq!(mock.hits(), 4);
+    assert_eq!(mock.hits(), 6);
    assert_eq!(mock2.hits(), 1);
    Ok(())
 }

+#[test]
+/// test finds a 404-like response that returns a 403 and a 403 directory should still be allowed
+/// to be tested for recursion
+fn heuristics_wildcard_test_that_auto_filtering_403s_still_allows_for_recursion_into_403_directories(
+) -> Result<(), Box<dyn std::error::Error>> {
+    let srv = MockServer::start();
+
+    let super_long = String::from("92969beae6bf4beb855d1622406d87e395c87387a9ad432e8a11245002b709b03cf609d471004154b83bcc1c6ec49f6f09d471004154b83bcc1c6ec49f6f");
+
+    let (tmp_dir, file) =
+        setup_tmp_directory(&["LICENSE".to_string(), super_long.clone()], "wordlist")?;
+
+    srv.mock(|when, then| {
+        when.method(GET)
+            .path_matches(Regex::new("/.?[a-zA-Z0-9]{32,103}").unwrap());
+        then.status(403)
+            .body("this is a testAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
+    });
+
+    srv.mock(|when, then| {
+        when.method(GET).path("/LICENSE/");
+        then.status(403)
+            .body("this is a testAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
+    });
+
+    srv.mock(|when, then| {
+        when.method(GET).path(format!("/LICENSE/{}", super_long));
+        then.status(200);
+    });
+
+    let cmd = Command::cargo_bin("feroxbuster")
+        .unwrap()
+        .arg("--url")
+        .arg(srv.url("/"))
+        .arg("--wordlist")
+        .arg(file.as_os_str())
+        .arg("--add-slash")
+        .unwrap();
+
+    teardown_tmp_directory(tmp_dir);
+
+    cmd.assert().success().stdout(
+        predicate::str::contains("GET")
+            .and(predicate::str::contains(
+                "Auto-filtering found 404-like response and created new filter",
+            ))
+            .and(predicate::str::contains("403"))
+            .and(predicate::str::contains("1l"))
+            .and(predicate::str::contains("4w"))
+            .and(predicate::str::contains("46c"))
+            .and(predicate::str::contains(srv.url("/LICENSE/LICENSE/"))),
+    );
+
+    Ok(())
+}
+
 // #[test]
 // /// test finds a static wildcard and reports as much to stdout and a file
 // fn heuristics_wildcard_test_with_two_static_wildcards_and_output_to_file() {
Author	SHA1	Message	Date
epi	1cf37e38a2	Merge pull request #884 from epi052/878-support-raw-urls 878 support raw urls	2023-04-26 06:59:04 -05:00
epi	9876759606	nitpickery	2023-04-26 06:45:13 -05:00
epi	4150b61a42	fixed windows logic	2023-04-26 06:33:43 -05:00
epi	16d34bbee0	bumped version to 2.9.5	2023-04-25 07:10:48 -05:00
epi	f1fd2fc379	updated Url::parse callsites to use the new utility function	2023-04-25 07:09:56 -05:00
epi	3dd070a0db	fmt	2023-04-24 06:20:14 -05:00
epi	a3dc6c97a0	added workaround to add partial support for raw urls	2023-04-24 06:19:21 -05:00
epi	ec78ec3049	added ability to specify install directory for install-nix.sh	2023-04-19 17:15:50 -05:00
epi	960536e918	Merge pull request #879 from epi052/all-contributors/add-DrorDvash docs: add DrorDvash as a contributor for bug	2023-04-19 08:05:15 -05:00
allcontributors[bot]	fdae9aa9d6	docs: update .all-contributorsrc [skip ci]	2023-04-19 13:03:50 +00:00
allcontributors[bot]	5c73c3fb23	docs: update README.md [skip ci]	2023-04-19 13:03:49 +00:00
epi	02ef6d7e3f	Merge pull request #877 from epi052/update-indicatif-finally Random improvements	2023-04-19 07:59:47 -05:00
epi	3378246820	updated arm release names for --update fix	2023-04-19 07:46:43 -05:00
epi	692db93048	clippy/tests and added logic to wait for link extraction if done	2023-04-19 06:57:36 -05:00
epi	233cf99907	made link extraction req/resp async	2023-04-19 06:56:52 -05:00
epi	8cd9918b76	upgraded deps	2023-04-19 06:55:23 -05:00
epi	66bcbfc2f2	bumped version to 2.9.4	2023-04-19 06:51:35 -05:00
epi	8b127c0093	made 404-like req/resp async	2023-04-17 06:37:28 -05:00
epi	94de58d855	removed response body from mpsc traversal	2023-04-17 06:36:47 -05:00
epi	2b95b7be69	updated indicatif to 0.17.3	2023-04-17 06:26:59 -05:00
epi	e77c1314b1	Merge pull request #869 from epi052/auto-filtering-account-for-extensions added extensions and status codes into auto filtering decision calculus	2023-04-11 19:07:53 -05:00
epi	1ced3b5d77	modified msg when dir listing is found with dont-extract	2023-04-11 18:48:18 -05:00
epi	b5472f5341	updated deps	2023-04-11 18:39:28 -05:00
epi	ea81600850	clippy	2023-04-11 18:36:37 -05:00
epi	4f679592b8	bumped version to 2.9.3	2023-04-11 18:34:02 -05:00
epi	b375893461	nitpickery	2023-04-11 18:32:56 -05:00
epi	e110f86f39	added extensions and status codes into auto filtering decision calculus	2023-04-11 18:29:12 -05:00
epi	c7498a7695	Merge pull request #839 from epi052/all-contributors/add-acut3 docs: add acut3 as a contributor for bug	2023-03-18 12:23:34 -05:00
allcontributors[bot]	f973baaba8	docs: update .all-contributorsrc [skip ci]	2023-03-18 17:23:25 +00:00
allcontributors[bot]	148982cdc4	docs: update README.md [skip ci]	2023-03-18 17:23:24 +00:00