Merge pull request #163 from epi052/137-extract-robots-txt

add robots.txt extraction to increase scan coverage
2026-05-31 03:51:12 -03:00 · 2020-12-19 10:58:53 -06:00
parent b10c4caefb 4e492939c1
commit 7b3540e13f
6 changed files with 325 additions and 55 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "feroxbuster"
-version = "1.10.1"
+version = "1.10.2"
 authors = ["Ben 'epi' Risher <epibar052@gmail.com>"]
 license = "MIT"
 edition = "2018"
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ This attack is also known as Predictable Resource Location, File Enumeration, Di
    - [Progress bars print one line at a time](#progress-bars-print-one-line-at-a-time)
    - [What do each of the numbers beside the URL mean?](#what-do-each-of-the-numbers-beside-the-url-mean)
    - [Connection closed before message completed](#connection-closed-before-message-completed)
+    - [SSL Error routines:tls_process_server_certificate:certificate verify failed](#ssl-error-routinestls_process_server_certificatecertificate-verify-failed)

 ## 💿 Installation

@@ -651,6 +652,12 @@ A valid time_spec can be passed to `--time-limit` in order to force a shutdown a

 ![time-limit](img/time-limit.gif)

+### Extract Links from robots.txt (New in `v1.10.2`)
+
+In addition to [extracting links from the response body](#extract-links-from-response-body-new-in-v110), using 
+`--extract-links` makes a request to `/robots.txt` and examines all `Allow` and `Disallow` entries.  Directory entries 
+are added to the scan queue, while file entries are requested and then reported if appropriate.  
+
 ## 🧐 Comparison w/ Similar Tools

 There are quite a few similar tools for forced browsing/content discovery.  Burp Suite Pro, Dirb, Dirbuster, etc... 
@@ -694,6 +701,7 @@ a few of the use-cases in which feroxbuster may be a better fit:
 | filter out responses by regular expression (`v1.8.0`)                        | ✔ |   | ✔ |
 | save scan's state to disk (can pick up where it left off) (`v1.9.0`)         | ✔ |   |   |
 | maximum run time limit (`v1.10.0`)                                           | ✔ |   | ✔ |
+| use robots.txt to increase scan coverage (`v1.10.2`)                         | ✔ |   |   |
 | **huge** number of other options                                             |   |   | ✔ |

 Of note, there's another written-in-rust content discovery tool, [rustbuster](https://github.com/phra/rustbuster). I 
@@ -795,3 +803,17 @@ This isn't a bug. Simply slow down the scan. A `-t` value of 50 was chosen as a
 > This is just due to the racy nature of networking.
 > 
 > hyper has a connection pool of idle connections, and it selected one to send your request. Most of the time, hyper will receive the server's FIN and drop the dead connection from its pool. But occasionally, a connection will be selected from the pool and written to at the same time the server is deciding to close the connection. Since hyper already wrote some of the request, it can't really retry it automatically on a new connection, since the server may have acted already.
+
+### SSL Error routines:tls_process_server_certificate:certificate verify failed
+
+In the event you see an error similar to 
+
+![self-signed](img/insecure.png)
+
+```
+error trying to connect: error:1416F086:SSL routines:tls_process_server_certificate:certificate verify failed:ssl/statem/statem_clnt.c:1913: (self signed certificate)
+```
+
+You just need to add the `-k|--insecure` flag to your command.
+
+`feroxbuster` rejects self-signed certs and other "insecure" certificates/site configurations by default. You can choose to scan these services anyway by telling `feroxbuster` to ignore insecure server certs.
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -1,4 +1,10 @@
-use crate::FeroxResponse;
+use crate::{
+    client,
+    config::{Configuration, CONFIGURATION},
+    scanner::SCANNED_URLS,
+    utils::{format_url, make_request},
+    FeroxResponse,
+};
 use lazy_static::lazy_static;
 use regex::Regex;
 use reqwest::Url;
@@ -9,9 +15,18 @@ use std::collections::HashSet;
 /// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
 const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;

+/// Regular expression to pull url paths from robots.txt
+///
+/// ref: https://developers.google.com/search/reference/robots_txt
+const ROBOTS_TXT_REGEX: &str =
+    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
+
 lazy_static! {
    /// `LINKFINDER_REGEX` as a regex::Regex type
-    static ref REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
+    static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
+
+    /// `ROBOTS_TXT_REGEX` as a regex::Regex type
+    static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
 }

 /// Iterate over a given path, return a list of every sub-path found
@@ -90,7 +105,7 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {

    let body = response.text();

-    for capture in REGEX.captures_iter(&body) {
+    for capture in LINKS_REGEX.captures_iter(&body) {
        // remove single & double quotes from both ends of the capture
        // capture[0] is the entire match, additional capture groups start at [1]
        let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
@@ -105,27 +120,14 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
                    continue;
                }

-                for sub_path in get_sub_paths_from_path(absolute.path()) {
-                    // take a url fragment like homepage/assets/img/icons/handshake.svg and
-                    // incrementally add
-                    //     - homepage/assets/img/icons/
-                    //     - homepage/assets/img/
-                    //     - homepage/assets/
-                    //     - homepage/
-                    log::debug!("Adding {} to {:?}", sub_path, links);
-                    add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
-                }
+                add_all_sub_paths(absolute.path(), &response, &mut links);
            }
            Err(e) => {
                // this is the expected error that happens when we try to parse a url fragment
                //     ex: Url::parse("/login") -> Err("relative URL without a base")
                // while this is technically an error, these are good results for us
                if e.to_string().contains("relative URL without a base") {
-                    for sub_path in get_sub_paths_from_path(link) {
-                        // incrementally save all sub-paths that led to the relative url's resource
-                        log::debug!("Adding {} to {:?}", sub_path, links);
-                        add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
-                    }
+                    add_all_sub_paths(link, &response, &mut links);
                } else {
                    // unexpected error has occurred
                    log::error!("Could not parse given url: {}", e);
@@ -135,6 +137,152 @@ pub async fn get_links(response: &FeroxResponse) -> HashSet<String> {
    }

    log::trace!("exit: get_links -> {:?}", links);
+
+    links
+}
+
+/// take a url fragment like homepage/assets/img/icons/handshake.svg and
+/// incrementally add
+///     - homepage/assets/img/icons/
+///     - homepage/assets/img/
+///     - homepage/assets/
+///     - homepage/
+fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet<String>) {
+    log::trace!(
+        "enter: add_all_sub_paths({}, {}, {:?})",
+        url_path,
+        response,
+        links
+    );
+
+    for sub_path in get_sub_paths_from_path(url_path) {
+        log::debug!("Adding {} to {:?}", sub_path, links);
+        add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
+    }
+
+    log::trace!("exit: add_all_sub_paths");
+}
+
+/// Wrapper around link extraction logic
+/// currently used in two places:
+///   - links from response bodies
+///   - links from robots.txt responses
+///
+/// general steps taken:
+///   - create a new Url object based on cli options/args
+///   - check if the new Url has already been seen/scanned -> None
+///   - make a request to the new Url ? -> Some(response) : None
+pub async fn request_feroxresponse_from_new_link(url: &str) -> Option<FeroxResponse> {
+    log::trace!("enter: request_feroxresponse_from_new_link({})", url);
+
+    // create a url based on the given command line options, return None on error
+    let new_url = match format_url(
+        &url,
+        &"",
+        CONFIGURATION.add_slash,
+        &CONFIGURATION.queries,
+        None,
+    ) {
+        Ok(url) => url,
+        Err(_) => {
+            log::trace!("exit: request_feroxresponse_from_new_link -> None");
+            return None;
+        }
+    };
+
+    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
+        //we've seen the url before and don't need to scan again
+        log::trace!("exit: request_feroxresponse_from_new_link -> None");
+        return None;
+    }
+
+    // make the request and store the response
+    let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
+        Ok(resp) => resp,
+        Err(_) => {
+            log::trace!("exit: request_feroxresponse_from_new_link -> None");
+            return None;
+        }
+    };
+
+    let new_ferox_response = FeroxResponse::from(new_response, true).await;
+
+    log::trace!(
+        "exit: request_feroxresponse_from_new_link -> {:?}",
+        new_ferox_response
+    );
+    Some(new_ferox_response)
+}
+
+/// helper function that simply requests /robots.txt on the given url's base url
+///
+/// example:
+///     http://localhost/api/users -> http://localhost/robots.txt
+///     
+/// The length of the given path has no effect on what's requested; it's always
+/// base url + /robots.txt
+pub async fn request_robots_txt(base_url: &str, config: &Configuration) -> Option<FeroxResponse> {
+    log::trace!("enter: get_robots_file({})", base_url);
+
+    // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
+    // similar; to account for that, create a client that will follow redirects, regardless of
+    // what the user specified for the scanning client. Other than redirects, it will respect
+    // all other user specified settings
+    let follow_redirects = true;
+
+    let proxy = if config.proxy.is_empty() {
+        None
+    } else {
+        Some(config.proxy.as_str())
+    };
+
+    let client = client::initialize(
+        config.timeout,
+        &config.user_agent,
+        follow_redirects,
+        config.insecure,
+        &config.headers,
+        proxy,
+    );
+
+    if let Ok(mut url) = Url::parse(base_url) {
+        url.set_path("/robots.txt"); // overwrite existing path with /robots.txt
+
+        if let Ok(response) = make_request(&client, &url).await {
+            let ferox_response = FeroxResponse::from(response, true).await;
+
+            log::trace!("exit: get_robots_file -> {}", ferox_response);
+            return Some(ferox_response);
+        }
+    }
+
+    None
+}
+
+/// Entry point to perform link extraction from robots.txt
+///
+/// `base_url` can have paths and subpaths; however, robots.txt will always be requested from the
+/// root of the url
+/// given the url:
+///     http://localhost/stuff/things
+/// this function requests:
+///     http://localhost/robots.txt
+pub async fn extract_robots_txt(base_url: &str, config: &Configuration) -> HashSet<String> {
+    log::trace!("enter: extract_robots_txt({}, CONFIGURATION)", base_url);
+    let mut links = HashSet::new();
+
+    if let Some(response) = request_robots_txt(&base_url, &config).await {
+        for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
+            if let Some(new_path) = capture.name("url_path") {
+                if let Ok(mut new_url) = Url::parse(base_url) {
+                    new_url.set_path(new_path.as_str());
+                    add_all_sub_paths(new_url.path(), &response, &mut links);
+                }
+            }
+        }
+    }
+
+    log::trace!("exit: extract_robots_txt -> {:?}", links);
    links
 }

@@ -266,4 +414,27 @@ mod tests {
        assert_eq!(mock.hits(), 1);
        Ok(())
    }
+
+    #[tokio::test(core_threads = 1)]
+    /// test that /robots.txt is correctly requested given a base url (happy path)
+    async fn request_robots_txt_with_and_without_proxy() {
+        let srv = MockServer::start();
+
+        let mock = srv.mock(|when, then| {
+            when.method(GET).path("/robots.txt");
+            then.status(200).body("this is a test");
+        });
+
+        let mut config = Configuration::default();
+
+        request_robots_txt(&srv.url("/api/users/stuff/things"), &config).await;
+
+        // note: the proxy doesn't actually do anything other than hit a different code branch
+        // in this unit test; it would however have an effect on an integration test
+        config.proxy = srv.url("/ima-proxy");
+
+        request_robots_txt(&srv.url("/api/different/path"), &config).await;
+
+        assert_eq!(mock.hits(), 2);
+    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,11 +1,13 @@
 use crossterm::event::{self, Event, KeyCode};
-use feroxbuster::progress::add_bar;
 use feroxbuster::{
    banner,
    config::{CONFIGURATION, PROGRESS_BAR, PROGRESS_PRINTER},
-    heuristics, logger, reporter,
+    extractor::{extract_robots_txt, request_feroxresponse_from_new_link},
+    heuristics, logger,
+    progress::add_bar,
+    reporter,
    scan_manager::{self, PAUSE_SCAN},
-    scanner::{self, scan_url, RESPONSES, SCANNED_URLS},
+    scanner::{self, scan_url, send_report, RESPONSES, SCANNED_URLS},
    utils::{ferox_print, get_current_depth, module_colorizer, status_colorizer},
    FeroxError, FeroxResponse, FeroxResult, FeroxSerialize, SLEEP_DURATION, VERSION,
 };
@@ -97,7 +99,7 @@ fn get_unique_words_from_wordlist(path: &str) -> FeroxResult<Arc<HashSet<String>

 /// Determine whether it's a single url scan or urls are coming from stdin, then scan as needed
 async fn scan(
-    targets: Vec<String>,
+    mut targets: Vec<String>,
    tx_term: UnboundedSender<FeroxResponse>,
    tx_file: UnboundedSender<FeroxResponse>,
 ) -> FeroxResult<()> {
@@ -142,6 +144,37 @@ async fn scan(
        }
    }

+    if CONFIGURATION.extract_links {
+        for target in targets.clone() {
+            // modifying the targets vector, so we can't have a reference to it while we borrow
+            // it as mutable; thus the clone
+            let robots_links = extract_robots_txt(&target, &CONFIGURATION).await;
+
+            for robot_link in robots_links {
+                // create a url based on the given command line options, continue on error
+                let ferox_response = match request_feroxresponse_from_new_link(&robot_link).await {
+                    Some(resp) => resp,
+                    None => continue,
+                };
+
+                if ferox_response.is_file() {
+                    SCANNED_URLS.add_file_scan(&robot_link);
+                    send_report(tx_term.clone(), ferox_response);
+                } else {
+                    let (unknown, _) = SCANNED_URLS.add_directory_scan(&robot_link);
+
+                    if !unknown {
+                        // known directory; can skip (unlikely)
+                        continue;
+                    }
+
+                    // unknown directory; add to targets for scanning
+                    targets.push(robot_link);
+                }
+            }
+        }
+    }
+
    let mut tasks = vec![];

    for target in targets {
--- a/src/scanner.rs
+++ b/src/scanner.rs
@@ -1,6 +1,6 @@
 use crate::{
    config::{Configuration, CONFIGURATION},
-    extractor::get_links,
+    extractor::{get_links, request_feroxresponse_from_new_link},
    filters::{
        FeroxFilter, LinesFilter, RegexFilter, SizeFilter, StatusCodeFilter, WildcardFilter,
        WordsFilter,
@@ -385,30 +385,11 @@ async fn make_requests(
                let new_links = get_links(&ferox_response).await;

                for new_link in new_links {
-                    // create a url based on the given command line options, continue on error
-                    let new_url = match format_url(
-                        &new_link,
-                        &"",
-                        CONFIGURATION.add_slash,
-                        &CONFIGURATION.queries,
-                        None,
-                    ) {
-                        Ok(url) => url,
-                        Err(_) => continue,
-                    };
-
-                    if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
-                        //we've seen the url before and don't need to scan again
-                        continue;
-                    }
-
-                    // make the request and store the response
-                    let new_response = match make_request(&CONFIGURATION.client, &new_url).await {
-                        Ok(resp) => resp,
-                        Err(_) => continue,
-                    };
-
-                    let mut new_ferox_response = FeroxResponse::from(new_response, true).await;
+                    let mut new_ferox_response =
+                        match request_feroxresponse_from_new_link(&new_link).await {
+                            Some(resp) => resp,
+                            None => continue,
+                        };

                    // filter if necessary
                    if should_filter_response(&new_ferox_response) {
@@ -419,7 +400,7 @@ async fn make_requests(
                        // very likely a file, simply request and report
                        log::debug!("Singular extraction: {}", new_ferox_response);

-                        SCANNED_URLS.add_file_scan(&new_url.to_string());
+                        SCANNED_URLS.add_file_scan(&new_ferox_response.url().to_string());

                        send_report(report_chan.clone(), new_ferox_response);

@@ -452,7 +433,7 @@ async fn make_requests(
 }

 /// Simple helper to send a `FeroxResponse` over the tx side of an `mpsc::unbounded_channel`
-fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: FeroxResponse) {
+pub fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: FeroxResponse) {
    log::trace!("enter: send_report({:?}, {}", report_sender, response);

    match report_sender.send(response) {
--- a/tests/test_extractor.rs
+++ b/tests/test_extractor.rs
@@ -163,15 +163,14 @@ fn extractor_finds_same_relative_url_twice() {
    cmd.assert().success().stdout(
        predicate::str::contains("/LICENSE")
            .and(predicate::str::contains("200"))
-            .and(predicate::str::contains(
-                "/homepage/assets/img/icons/handshake.svg",
-            )),
+            // .count(1) asserts that we only see the endpoint reported once, even though there
+            // is the potential to request the same url twice
+            .and(predicate::str::contains("/homepage/assets/img/icons/handshake.svg").count(1)),
    );

    assert_eq!(mock.hits(), 1);
    assert_eq!(mock_two.hits(), 1);
-    assert!(mock_three.hits() <= 2); // todo: sometimes this is 2 instead of 1
-                                     // the expectation is one, suggesting a race condition... investigate and fix
+    assert!(mock_three.hits() <= 2);
    teardown_tmp_directory(tmp_dir);
 }

@@ -220,3 +219,67 @@ fn extractor_finds_filtered_content() -> Result<(), Box<dyn std::error::Error>>
    teardown_tmp_directory(tmp_dir);
    Ok(())
 }
+
+#[test]
+/// serve a robots.txt with a file and a folder link contained within it. ferox should
+/// find both links and request each one. Additionally, a scan should start with the directory
+/// link found, meaning the wordlist will be thrown at the sub directory
+fn extractor_finds_robots_txt_links_and_displays_files_or_scans_directories() {
+    let srv = MockServer::start();
+    let (tmp_dir, file) = setup_tmp_directory(&["LICENSE".to_string()], "wordlist").unwrap();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/LICENSE");
+        then.status(200).body("im a little teapot"); // 18
+    });
+
+    let mock_two = srv.mock(|when, then| {
+        when.method(GET).path("/robots.txt");
+        then.status(200).body(
+            r#"
+            User-agent: *
+            Crawl-delay: 10
+            # CSS, JS, Images
+            Allow: /misc/*.css$
+            Disallow: /misc/stupidfile.php
+               Disallow: /disallowed-subdir/
+            "#,
+        );
+    });
+
+    let mock_file = srv.mock(|when, then| {
+        when.method(GET).path("/misc/stupidfile.php");
+        then.status(200).body("im a little teapot too"); // 22
+    });
+
+    let mock_dir = srv.mock(|when, then| {
+        when.method(GET).path("/disallowed-subdir/LICENSE");
+        then.status(200).body("i too, am a container for tea"); // 29
+    });
+
+    let cmd = Command::cargo_bin("feroxbuster")
+        .unwrap()
+        .arg("--url")
+        .arg(srv.url("/"))
+        .arg("--wordlist")
+        .arg(file.as_os_str())
+        .arg("--extract-links")
+        .unwrap();
+
+    cmd.assert().success().stdout(
+        predicate::str::contains("/LICENSE") // 2 directories contain LICENSE
+            .count(2)
+            .and(predicate::str::contains("18c"))
+            .and(predicate::str::contains("/misc/stupidfile.php"))
+            .and(predicate::str::contains("22c"))
+            .and(predicate::str::contains("/disallowed-subdir/LICENSE"))
+            .and(predicate::str::contains("29c"))
+            .and(predicate::str::contains("200").count(3)),
+    );
+
+    assert_eq!(mock.hits(), 1);
+    assert_eq!(mock_dir.hits(), 1);
+    assert_eq!(mock_two.hits(), 1);
+    assert_eq!(mock_file.hits(), 1);
+    teardown_tmp_directory(tmp_dir);
+}