added robots.txt extraction

2026-06-05 00:01:12 -03:00 · 2020-12-19 07:30:24 -06:00
parent 77a450195c
commit f7ef202849
3 changed files with 154 additions and 48 deletions
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,4 +1,10 @@
-use crate::{client, config::Configuration, utils::make_request, FeroxResponse};
+use crate::{
+    client,
+    config::{Configuration, CONFIGURATION},
+    scanner::SCANNED_URLS,
+    utils::{format_url, make_request},
+    FeroxResponse,
+};
 use lazy_static::lazy_static;
 use regex::Regex;
 use reqwest::Url;
@@ -12,11 +18,12 @@ const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[
 /// Regular expression to pull url paths from robots.txt
 ///
 /// ref: https://developers.google.com/search/reference/robots_txt
-const ROBOTS_TXT_REGEX: &str = r#"^ *(Allow|Disallow): *(?P<Url>.*?)$"#;
+const ROBOTS_TXT_REGEX: &str =
+    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)\r?$"#; // multi-line (?m); \r? accepts CRLF

 lazy_static! {
    /// `LINKFINDER_REGEX` as a regex::Regex type
-    static ref REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
+    static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();

    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
    static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
@@ -98,7 +105,7 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {

    let body = response.text();

-    for capture in REGEX.captures_iter(&body) {
+    for capture in LINKS_REGEX.captures_iter(&body) {
        // remove single & double quotes from both ends of the capture
        // capture[0] is the entire match, additional capture groups start at [1]
        let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
@@ -113,27 +120,14 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
                    continue;
                }

-                for sub_path in get_sub_paths_from_path(absolute.path()) {
-                    // take a url fragment like homepage/assets/img/icons/handshake.svg and
-                    // incrementally add
-                    //     - homepage/assets/img/icons/
-                    //     - homepage/assets/img/
-                    //     - homepage/assets/
-                    //     - homepage/
-                    log::debug!("Adding {} to {:?}", sub_path, links);
-                    add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
-                }
+                add_all_sub_paths(absolute.path(), &response, &mut links);
            }
            Err(e) => {
                // this is the expected error that happens when we try to parse a url fragment
                //     ex: Url::parse("/login") -> Err("relative URL without a base")
                // while this is technically an error, these are good results for us
                if e.to_string().contains("relative URL without a base") {
-                    for sub_path in get_sub_paths_from_path(link) {
-                        // incrementally save all sub-paths that led to the relative url's resource
-                        log::debug!("Adding {} to {:?}", sub_path, links);
-                        add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
-                    }
+                    add_all_sub_paths(link, &response, &mut links);
                } else {
                    // unexpected error has occurred
                    log::error!("Could not parse given url: {}", e);
@@ -147,6 +141,79 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
    links
 }

+/// Passes each sub-path of `url_path`, with `response`'s url, to `add_link_to_set_of_links`.
+/// ex: homepage/assets/img/icons/handshake.svg incrementally adds
+///     - homepage/assets/img/icons/
+///     - homepage/assets/img/
+///     - homepage/assets/
+///     - homepage/
+fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet<String>) {
+    log::trace!(
+        "enter: add_all_sub_paths({}, {}, {:?})",
+        url_path,
+        response,
+        links
+    );
+
+    for sub_path in get_sub_paths_from_path(url_path) {
+        log::debug!("Adding {} to {:?}", sub_path, links);
+        add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
+    }
+
+    log::trace!("exit: add_all_sub_paths");
+}
+
+/// Wrapper around link extraction logic
+/// currently used in two places:
+///   - links from response bodies
+///   - links from robots.txt responses
+///
+/// general steps taken:
+///   - create a new Url object based on cli options/args -> None on error
+///   - check if the new Url has already been seen/scanned -> None
+///   - make a request to the new Url -> Some(response) on success, None on error
+pub async fn request_feroxresponse_from_new_link(url: &str) -> Option<FeroxResponse> {
+    log::trace!("enter: request_feroxresponse_from_new_link({})", url);
+
+    // create a url based on the given command line options, return None on error
+    let new_url = match format_url(
+        &url,
+        &"",
+        CONFIGURATION.add_slash,
+        &CONFIGURATION.queries,
+        None,
+    ) {
+        Ok(url) => url,
+        Err(_) => {
+            log::trace!("exit: request_feroxresponse_from_new_link -> None");
+            return None;
+        }
+    };
+
+    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
+        // we've seen the url before and don't need to scan again
+        log::trace!("exit: request_feroxresponse_from_new_link -> None");
+        return None;
+    }
+
+    // make the request and store the response, return None if the request fails
+    let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
+        Ok(resp) => resp,
+        Err(_) => {
+            log::trace!("exit: request_feroxresponse_from_new_link -> None");
+            return None;
+        }
+    };
+
+    let new_ferox_response = FeroxResponse::from(new_response, true).await;
+
+    log::trace!(
+        "exit: request_feroxresponse_from_new_link -> {:?}",
+        new_ferox_response
+    );
+    Some(new_ferox_response)
+}
+
 /// helper function that simply requests /robots.txt on the given url's base url
 ///
 /// example:
@@ -192,6 +259,33 @@ pub async fn request_robots_txt(base_url: &str, config: &Configuration) -> Optio
    None
 }

+/// Entry point to perform link extraction from robots.txt
+///
+/// `base_url` can have paths and subpaths; however, robots.txt will be requested from the
+/// root of the url
+/// given the url:
+///     http://localhost/stuff/things
+/// this function requests:
+///     http://localhost/robots.txt
+pub async fn extract_robots_txt(base_url: &str, config: &Configuration) -> HashSet<String> {
+    log::trace!("enter: extract_robots_txt({}, CONFIGURATION)", base_url);
+    let mut links = HashSet::new(); // returned empty when request_robots_txt yields None
+
+    if let Some(response) = request_robots_txt(&base_url, &config).await {
+        for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
+            if let Some(new_path) = capture.name("url_path") {
+                if let Ok(mut new_url) = Url::parse(base_url) {
+                    new_url.set_path(new_path.as_str()); // replace base_url's path
+                    add_all_sub_paths(new_url.path(), &response, &mut links);
+                }
+            }
+        }
+    }
+
+    log::trace!("exit: extract_robots_txt -> {:?}", links);
+    links
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,9 +1,11 @@
 use crossterm::event::{self, Event, KeyCode};
-use feroxbuster::progress::add_bar;
 use feroxbuster::{
    banner,
    config::{CONFIGURATION, PROGRESS_BAR, PROGRESS_PRINTER},
-    heuristics, logger, reporter,
+    extractor::{extract_robots_txt, request_feroxresponse_from_new_link},
+    heuristics, logger,
+    progress::add_bar,
+    reporter,
    scan_manager::{self, PAUSE_SCAN},
    scanner::{self, scan_url, RESPONSES, SCANNED_URLS},
    utils::{ferox_print, get_current_depth, module_colorizer, status_colorizer},
@@ -97,7 +99,7 @@ fn get_unique_words_from_wordlist(path: &str) -> FeroxResult<Arc<HashSet<String>

 /// Determine whether it's a single url scan or urls are coming from stdin, then scan as needed
 async fn scan(
-    targets: Vec<String>,
+    mut targets: Vec<String>,
    tx_term: UnboundedSender<FeroxResponse>,
    tx_file: UnboundedSender<FeroxResponse>,
 ) -> FeroxResult<()> {
@@ -142,6 +144,35 @@ async fn scan(
        }
    }

+    if CONFIGURATION.extract_links {
+        for target in targets.clone() {
+            // modifying the targets vector, so we can't have a reference to it while we borrow
+            // it as mutable; thus the clone
+            let robots_links = extract_robots_txt(&target, &CONFIGURATION).await;
+
+            for robot_link in robots_links {
+                // request the link; skip it when that yields no response (e.g. already scanned)
+                let ferox_response = match request_feroxresponse_from_new_link(&robot_link).await {
+                    Some(resp) => resp,
+                    None => continue,
+                };
+
+                let (unknown, _) = if ferox_response.is_file() {
+                    SCANNED_URLS.add_file_scan(&robot_link)
+                } else {
+                    SCANNED_URLS.add_directory_scan(&robot_link)
+                };
+
+                if !unknown {
+                    // not unknown, i.e. we've seen the url before and don't need to scan again
+                    continue;
+                }
+
+                targets.push(robot_link);
+            }
+        }
+    }
+
    let mut tasks = vec![];

    for target in targets {
--- a/src/scanner.rs
+++ b/src/scanner.rs
@@ -1,6 +1,6 @@
 use crate::{
    config::{Configuration, CONFIGURATION},
-    extractor::get_links,
+    extractor::{get_links, request_feroxresponse_from_new_link},
    filters::{
        FeroxFilter, LinesFilter, RegexFilter, SizeFilter, StatusCodeFilter, WildcardFilter,
        WordsFilter,
@@ -385,30 +385,11 @@ async fn make_requests(
                let new_links = get_links(&ferox_response).await;

                for new_link in new_links {
-                    // create a url based on the given command line options, continue on error
-                    let new_url = match format_url(
-                        &new_link,
-                        &"",
-                        CONFIGURATION.add_slash,
-                        &CONFIGURATION.queries,
-                        None,
-                    ) {
-                        Ok(url) => url,
-                        Err(_) => continue,
-                    };
-
-                    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
-                        //we've seen the url before and don't need to scan again
-                        continue;
-                    }
-
-                    // make the request and store the response
-                    let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
-                        Ok(resp) => resp,
-                        Err(_) => continue,
-                    };
-
-                    let mut new_ferox_response = FeroxResponse::from(new_response, true).await;
+                    let mut new_ferox_response =
+                        match request_feroxresponse_from_new_link(&new_link).await {
+                            Some(resp) => resp,
+                            None => continue,
+                        };

                    // filter if necessary
                    if should_filter_response(&new_ferox_response) {
@@ -419,7 +400,7 @@ async fn make_requests(
                        // very likely a file, simply request and report
                        log::debug!("Singular extraction: {}", new_ferox_response);

-                        SCANNED_URLS.add_file_scan(&new_url.to_string());
+                        SCANNED_URLS.add_file_scan(&new_ferox_response.url().to_string());

                        send_report(report_chan.clone(), new_ferox_response);