even more cleanup; extraction looking mostly complete

2026-06-06 00:41:13 -03:00 · 2020-10-19 19:47:03 -05:00
parent 6fa542ecc5
commit 74aa5e8047
2 changed files with 157 additions and 38 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -120,15 +120,38 @@ impl FeroxResponse {
        };
    }

+    /// Make a reasonable guess at whether the response is a file or not
+    ///
+    /// Examines the last part of a path to determine if it has an obvious extension
+    /// i.e. http://localhost/some/path/stuff.js where stuff.js indicates a file
+    ///
+    /// Additionally, inspects query parameters, as they're also often indicative of a file
+    pub fn is_file(&self) -> bool {
+        let has_extension = match self.url.path_segments() {
+            Some(path) => {
+                if let Some(last) = path.last() {
+                    last.contains('.') // last segment has some sort of extension, probably
+                } else {
+                    false
+                }
+            }
+            None => false,
+        };
+
+        self.url.query_pairs().count() > 0 || has_extension
+    }
+
    /// Create a new `FeroxResponse` from the given `Response`
    pub async fn from(response: Response) -> Self {
        let url = response.url().clone();
-        let status = response.status().clone();
+        let status = response.status();
        let headers = response.headers().clone();
        let content_length = response.content_length().unwrap_or(0);

        let text = if CONFIGURATION.extract_links {
            // .text() consumes the response, must be called last
+            // additionally, --extract-links is currently the only place we use the body of the
+            // response, so we forego the processing if not performing extraction
            match response.text().await {
                // await the response's body
                Ok(text) => text,
--- a/src/scanner.rs
+++ b/src/scanner.rs
@@ -1,11 +1,12 @@
 use crate::config::{CONFIGURATION, PROGRESS_BAR};
+use crate::extractor::get_links;
 use crate::heuristics::WildcardFilter;
 use crate::utils::{format_url, get_current_depth, get_url_path_length, make_request};
 use crate::{heuristics, progress, FeroxChannel, FeroxResponse};
 use futures::future::{BoxFuture, FutureExt};
 use futures::{stream, StreamExt};
 use lazy_static::lazy_static;
-use reqwest::{Response, Url};
+use reqwest::Url;
 use std::collections::HashSet;
 use std::convert::TryInto;
 use std::ops::Deref;
@@ -160,7 +161,7 @@ fn create_urls(target_url: &str, word: &str, extensions: &[String]) -> Vec<Url>
 ///
 /// handles 2xx and 3xx responses by either checking if the url ends with a / (2xx)
 /// or if the Location header is present and matches the base url + / (3xx)
-fn response_is_directory(response: &Response) -> bool {
+fn response_is_directory(response: &FeroxResponse) -> bool {
    log::trace!("enter: is_directory({:?})", response);

    if response.status().is_redirection() {
@@ -240,7 +241,7 @@ fn reached_max_depth(url: &Url, base_depth: usize, max_depth: usize) -> bool {
 ///
 /// When a recursion opportunity is found, the new url is sent across the recursion channel
 async fn try_recursion(
-    response: &Response,
+    response: &FeroxResponse,
    base_depth: usize,
    transmitter: UnboundedSender<String>,
 ) {
@@ -290,6 +291,45 @@ async fn try_recursion(
    log::trace!("exit: try_recursion");
 }

+/// Simple helper to stay DRY; determines whether or not a given `FeroxResponse` should be reported
+/// to the user or not.
+fn should_filter_response(
+    content_len: &u64,
+    filter: Arc<WildcardFilter>,
+    response: &FeroxResponse,
+) -> bool {
+    if CONFIGURATION.sizefilters.contains(content_len) {
+        // filtered value from --sizefilters, move on to the next url
+        log::debug!("size filter: filtered out {}", response.url());
+        return true;
+    }
+
+    if filter.size > 0 && filter.size == *content_len && !CONFIGURATION.dontfilter {
+        // static wildcard size found during testing
+        // size isn't default, size equals response length, and auto-filter is on
+        log::debug!("static wildcard: filtered out {}", response.url());
+        return true;
+    }
+
+    if filter.dynamic > 0 && !CONFIGURATION.dontfilter {
+        // dynamic wildcard offset found during testing
+
+        // I'm about to manually split this url path instead of using reqwest::Url's
+        // builtin parsing. The reason is that they call .split() on the url path
+        // except that I don't want an empty string taking up the last index in the
+        // event that the url ends with a forward slash.  It's ugly enough to be split
+        // into its own function for readability.
+        let url_len = get_url_path_length(&response.url());
+
+        if url_len + filter.dynamic == *content_len {
+            log::debug!("dynamic wildcard: filtered out {}", response.url());
+            return true;
+        }
+    }
+
+    false
+}
+
 /// Wrapper for [make_request](fn.make_request.html)
 ///
 /// Handles making multiple requests based on the presence of extensions
@@ -316,63 +356,119 @@ async fn make_requests(

    for url in urls {
        if let Ok(response) = make_request(&CONFIGURATION.client, &url).await {
-            // response came back without error
+            // response came back without error, convert it to FeroxResponse
+            let ferox_response = FeroxResponse::from(response).await;

            // do recursion if appropriate
-            if !CONFIGURATION.norecursion && response_is_directory(&response) {
-                try_recursion(&response, base_depth, dir_chan.clone()).await;
+            if !CONFIGURATION.norecursion {
+                try_recursion(&ferox_response, base_depth, dir_chan.clone()).await;
            }

            // purposefully doing recursion before filtering. the thought process is that
            // even though this particular url is filtered, subsequent urls may not

-            let content_len = &response.content_length().unwrap_or(0);
+            let content_len = &ferox_response.content_length();

-            if CONFIGURATION.sizefilters.contains(content_len) {
-                // filtered value from --sizefilters, move on to the next url
-                log::debug!("size filter: filtered out {}", response.url());
+            if should_filter_response(content_len, filter.clone(), &ferox_response) {
                continue;
            }

-            if filter.size > 0 && filter.size == *content_len && !CONFIGURATION.dontfilter {
-                // static wildcard size found during testing
-                // size isn't default, size equals response length, and auto-filter is on
-                log::debug!("static wildcard: filtered out {}", response.url());
-                continue;
-            }
+            if CONFIGURATION.extract_links && !ferox_response.status().is_redirection() {
+                let new_links = get_links(&ferox_response).await;

-            if filter.dynamic > 0 && !CONFIGURATION.dontfilter {
-                // dynamic wildcard offset found during testing
+                for new_link in new_links {
+                    let unknown = add_url_to_list_of_scanned_urls(&new_link, &SCANNED_URLS);

-                // I'm about to manually split this url path instead of using reqwest::Url's
-                // builtin parsing. The reason is that they call .split() on the url path
-                // except that I don't want an empty string taking up the last index in the
-                // event that the url ends with a forward slash.  It's ugly enough to be split
-                // into its own function for readability.
-                let url_len = get_url_path_length(&response.url());
+                    if !unknown {
+                        // not unknown, i.e. we've seen the url before and don't need to scan again
+                        continue;
+                    }

-                if url_len + filter.dynamic == *content_len {
-                    log::debug!("dynamic wildcard: filtered out {}", response.url());
-                    continue;
+                    // create a url based on the given command line options, continue on error
+                    let new_url = match format_url(
+                        &new_link,
+                        &"",
+                        CONFIGURATION.addslash,
+                        &CONFIGURATION.queries,
+                        None,
+                    ) {
+                        Ok(url) => url,
+                        Err(_) => continue,
+                    };
+
+                    // make the request and store the response
+                    let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
+                        Ok(resp) => resp,
+                        Err(_) => continue,
+                    };
+
+                    let mut new_ferox_response = FeroxResponse::from(new_response).await;
+
+                    // filter if necessary
+                    if should_filter_response(
+                        &new_ferox_response.content_length(),
+                        filter.clone(),
+                        &new_ferox_response,
+                    ) {
+                        continue;
+                    }
+
+                    if new_ferox_response.is_file() {
+                        // very likely a file, simply request and report
+                        log::debug!(
+                            "Singular extraction: {} ({})",
+                            new_ferox_response.url(),
+                            new_ferox_response.status().as_str(),
+                        );
+
+                        send_report(report_chan.clone(), new_ferox_response);
+
+                        continue;
+                    }
+
+                    if !CONFIGURATION.norecursion {
+                        log::debug!(
+                            "Recursive extraction: {} ({})",
+                            new_ferox_response.url(),
+                            new_ferox_response.status().as_str()
+                        );
+
+                        if new_ferox_response.status().is_success()
+                            && !new_ferox_response.url().as_str().ends_with('/')
+                        {
+                            // since all of these are 2xx, recursion is only attempted if the
+                            // url ends in a /. I am actually ok with adding the slash and not
+                            // adding it, as both have merit.  Leaving it in for now to see how
+                            // things turn out (current as of: v1.1.0)
+                            new_ferox_response.set_url(&format!("{}/", new_ferox_response.url()));
+                        }
+
+                        try_recursion(&new_ferox_response, base_depth, dir_chan.clone()).await;
+                    }
                }
            }

-            let ferox_response = FeroxResponse::new(response).await;
-
            // everything else should be reported
-            match report_chan.send(ferox_response) {
-                Ok(_) => {
-                    log::debug!("sent {}/{} over reporting channel", &target_url, &word);
-                }
-                Err(e) => {
-                    log::error!("wtf: {}", e);
-                }
-            }
+            send_report(report_chan.clone(), ferox_response);
        }
    }
    log::trace!("exit: make_requests");
 }

+/// Simple helper to send a `FeroxResponse` over the tx side of an `mpsc::unbounded_channel`
+fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: FeroxResponse) {
+    log::trace!("enter: send_report({:?}, {:?}", report_sender, response);
+
+    match report_sender.send(response) {
+        Ok(_) => {}
+        Err(e) => {
+            log::error!("{}", e);
+        }
+    }
+
+    log::trace!("exit: send_report");
+}
+
 /// Scan a given url using a given wordlist
 ///
 /// This is the primary entrypoint for the scanner