diff --git a/.gitignore b/.gitignore index cfcc7f9..b550b63 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ target/ # jetbrains metadata folder .idea/ +# vscode metadata folder +.vscode/ + # personal feroxbuster config for testing ferox-config.toml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b270d90..a38d254 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -166,7 +166,7 @@ primarily related to continuous integration and release deployment. feroxbuster uses the [`clippy`](https://rust-lang.github.io/rust-clippy/) code linter. -The command that will ultimately be used in the CI pipeline for linting is `cargo clippy --all-targets --all-features -- -D warnings -A clippy::unnecessary_unwrap`. +The command that will ultimately be used in the CI pipeline for linting is `cargo clippy --all-targets --all-features -- -D warnings -A clippy::mutex-atomic`. Before submitting a Pull Request, the above command should be run. Please do not ignore any linting errors in code you write or modify, as they are meant to **help** by ensuring a clean and simple code base. 
diff --git a/Cargo.lock b/Cargo.lock index f7dfb8b..2048c4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -291,6 +291,12 @@ version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4a45a46ab1f2412e53d3a0ade76ffad2025804294569aae387231a0cd6e0899" +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "bytes" version = "1.1.0" @@ -371,6 +377,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "core-foundation" version = "0.9.2" @@ -428,6 +440,33 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa 0.4.8", + "matches", + "phf", + "proc-macro2", + "quote", + "smallvec", + "syn", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "ctor" version = "0.1.21" @@ -479,6 +518,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn", 
+] + [[package]] name = "diff" version = "0.1.12" @@ -538,6 +590,27 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +[[package]] +name = "dtoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" + +[[package]] +name = "dtoa-short" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" + [[package]] name = "either" version = "1.6.1" @@ -621,6 +694,7 @@ dependencies = [ "regex", "reqwest", "rlimit", + "scraper", "serde", "serde_json", "serde_regex", @@ -678,6 +752,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.19" @@ -788,6 +872,35 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fb6c4351f4f134772edf9bcd17de13b7fbcb2c56928b440d6823bd4dc9ebd80" +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" 
+dependencies = [ + "unicode-width", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.4" @@ -796,7 +909,7 @@ checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.10.2+wasi-snapshot-preview1", ] [[package]] @@ -846,6 +959,20 @@ dependencies = [ "libc", ] +[[package]] +name = "html5ever" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "http" version = "0.2.6" @@ -1170,6 +1297,26 @@ dependencies = [ "value-bag", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matches" version = "0.1.9" @@ -1256,6 +1403,12 @@ dependencies = [ "memoffset", ] +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1401,6 +1554,51 @@ dependencies = [ "indexmap", ] +[[package]] +name = "phf" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_macros", + "phf_shared", + "proc-macro-hack", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_macros" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "phf_shared" version = "0.8.0" @@ -1467,6 +1665,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "ppv-lite86" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" + [[package]] name = "precomputed-hash" version = "0.1.1" @@ -1503,6 +1707,12 @@ dependencies = [ "termtree", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + [[package]] name = "proc-macro2" version = "1.0.36" @@ -1521,6 +1731,57 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + 
"libc", + "rand_chacha", + "rand_core", + "rand_hc", + "rand_pcg", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core", +] + [[package]] name = "redox_syscall" version = "0.2.10" @@ -1536,7 +1797,7 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" dependencies = [ - "getrandom", + "getrandom 0.2.4", "redox_syscall", ] @@ -1618,6 +1879,15 @@ dependencies = [ "libc", ] +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustversion" version = "1.0.6" @@ -1646,6 +1916,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scraper" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "matches", + "selectors", + "smallvec", + "tendril", +] + [[package]] name = "security-framework" version = "2.4.2" @@ -1669,6 +1955,32 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "matches", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", + "thin-slice", +] + +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + [[package]] name = "serde" version = "1.0.133" @@ -1722,6 +2034,16 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + [[package]] name = "signal-hook" version = "0.3.13" @@ -1797,6 +2119,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "string_cache" version = "0.8.2" @@ -1808,6 +2136,19 @@ dependencies = [ "parking_lot", "phf_shared", "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" +dependencies = [ + "phf_generator", + "phf_shared", + 
"proc-macro2", + "quote", ] [[package]] @@ -1841,6 +2182,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "tendril" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9ef557cb397a4f0a5a3a628f06515f78563f2209e64d47055d9dc6052bf5e33" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "term" version = "0.7.0" @@ -1886,6 +2238,12 @@ dependencies = [ "terminal_size", ] +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + [[package]] name = "thiserror" version = "1.0.30" @@ -2111,13 +2469,19 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "uuid" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" dependencies = [ - "getrandom", + "getrandom 0.2.4", ] [[package]] @@ -2167,6 +2531,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.10.2+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index 433f570..7bc9fb5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ lazy_static = "1.4" dirs = "4.0" [dependencies] +scraper = "0.12" futures = { version = "0.3"} tokio = { version = "1.15", features = ["full"] } tokio-util = {version = "0.6", features = ["codec"]} diff --git a/src/extractor/builder.rs b/src/extractor/builder.rs index c8d9b28..d9c0d7d 100644 --- a/src/extractor/builder.rs +++ 
b/src/extractor/builder.rs @@ -21,6 +21,9 @@ pub enum ExtractionTarget { /// Examine robots.txt (specifically) and extract links RobotsTxt, + + // Parse HTML and extract links + ParseHtml, } /// responsible for building an `Extractor` @@ -28,7 +31,7 @@ pub struct ExtractorBuilder<'a> { /// Response from which to extract links response: Option<&'a FeroxResponse>, - /// Response from which to extract links + /// URL of where to extract links url: String, /// Handles object to house the underlying mpsc transmitters diff --git a/src/extractor/container.rs b/src/extractor/container.rs index 129463b..c7bbb08 100644 --- a/src/extractor/container.rs +++ b/src/extractor/container.rs @@ -18,6 +18,7 @@ use crate::{ }; use anyhow::{bail, Context, Result}; use reqwest::{StatusCode, Url}; +use scraper::{Html, Selector}; use std::collections::HashSet; use tokio::sync::oneshot; @@ -43,7 +44,7 @@ pub struct Extractor<'a> { /// Response from which to extract links pub(super) response: Option<&'a FeroxResponse>, - /// Response from which to extract links + /// URL of where to extract links pub(super) url: String, /// Handles object to house the underlying mpsc transmitters @@ -56,11 +57,12 @@ pub struct Extractor<'a> { /// Extractor implementation impl<'a> Extractor<'a> { /// perform extraction from the given target and return any links found - pub async fn extract(&self) -> Result> { + pub async fn extract(&self) -> Result<(HashSet, bool)> { log::trace!("enter: extract (this fn has associated trace exit msg)"); match self.target { ExtractionTarget::ResponseBody => Ok(self.extract_from_body().await?), ExtractionTarget::RobotsTxt => Ok(self.extract_from_robots().await?), + ExtractionTarget::ParseHtml => Ok(self.parse_html().await?), } } @@ -92,11 +94,11 @@ impl<'a> Extractor<'a> { continue; } - if resp.is_file() { - // very likely a file, simply request and report - log::debug!("Extracted file: {}", resp); + // request and report assumed file + if resp.is_file() || 
!resp.is_directory() { + log::debug!("Extracted File: {}", resp); - scanned_urls.add_file_scan(&resp.url().to_string(), ScanOrder::Latest); + scanned_urls.add_file_scan(resp.url().as_str(), ScanOrder::Latest); if let Err(e) = resp.send_report(self.handles.output.tx.clone()) { log::warn!("Could not send FeroxResponse to output handler: {}", e); @@ -143,12 +145,28 @@ impl<'a> Extractor<'a> { /// - homepage/assets/img/ /// - homepage/assets/ /// - homepage/ - pub(super) async fn extract_from_body(&self) -> Result> { - log::trace!("enter: get_links"); + pub(super) async fn extract_from_body(&self) -> Result<(HashSet, bool)> { + log::trace!("enter: extract_from_body"); let mut links = HashSet::::new(); + let dirlist_flag = false; - let body = self.response.unwrap().text(); + // Response + let response = self.response.unwrap(); + let resp_url = response.url(); + let body = response.text(); + let html = Html::parse_document(body); + + // Extract Links + self.extract_links_by_attr(resp_url, &mut links, &html, "a", "href"); + self.extract_links_by_attr(resp_url, &mut links, &html, "img", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "form", "action"); + self.extract_links_by_attr(resp_url, &mut links, &html, "script", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "iframe", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "div", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "frame", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "embed", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "script", "src"); for capture in self.links_regex.captures_iter(body) { // remove single & double quotes from both ends of the capture @@ -188,17 +206,16 @@ impl<'a> Extractor<'a> { self.update_stats(links.len())?; - log::trace!("exit: get_links -> {:?}", links); - - Ok(links) + log::trace!("exit: extract_from_body -> {:?} {}", links, dirlist_flag); + Ok((links, dirlist_flag)) } /// 
take a url fragment like homepage/assets/img/icons/handshake.svg and /// incrementally add - /// - homepage/assets/img/icons/ - /// - homepage/assets/img/ - /// - homepage/assets/ - /// - homepage/ + /// - homepage/assets/img/icons/ + /// - homepage/assets/img/ + /// - homepage/assets/ + /// - homepage/ fn add_all_sub_paths(&self, url_path: &str, links: &mut HashSet) -> Result<()> { log::trace!("enter: add_all_sub_paths({}, {:?})", url_path, links); @@ -267,12 +284,14 @@ impl<'a> Extractor<'a> { let old_url = match self.target { ExtractionTarget::ResponseBody => self.response.unwrap().url().clone(), - ExtractionTarget::RobotsTxt => match Url::parse(&self.url) { - Ok(u) => u, - Err(e) => { - bail!("Could not parse {}: {}", self.url, e); + ExtractionTarget::ParseHtml | ExtractionTarget::RobotsTxt => { + match Url::parse(&self.url) { + Ok(u) => u, + Err(e) => { + bail!("Could not parse {}: {}", self.url, e); + } } - }, + } }; let new_url = old_url @@ -287,11 +306,6 @@ impl<'a> Extractor<'a> { } /// Wrapper around link extraction logic - /// currently used in two places: - /// - links from response bodies - /// - links from robots.txt responses - /// - /// general steps taken: /// - create a new Url object based on cli options/args /// - check if the new Url has already been seen/scanned -> None /// - make a request to the new Url ? 
-> Some(response) : None @@ -350,14 +364,17 @@ impl<'a> Extractor<'a> { /// http://localhost/stuff/things /// this function requests: /// http://localhost/robots.txt - pub(super) async fn extract_from_robots(&self) -> Result> { + pub(super) async fn extract_from_robots(&self) -> Result<(HashSet, bool)> { log::trace!("enter: extract_robots_txt"); let mut links: HashSet = HashSet::new(); + let dirlist_flag = false; - let response = self.request_robots_txt().await?; + // request + let response = self.make_extract_request("/robots.txt").await?; + let body = response.text(); - for capture in self.robots_regex.captures_iter(response.text()) { + for capture in self.robots_regex.captures_iter(body) { if let Some(new_path) = capture.name("url_path") { let mut new_url = Url::parse(&self.url)?; new_url.set_path(new_path.as_str()); @@ -369,19 +386,126 @@ impl<'a> Extractor<'a> { self.update_stats(links.len())?; - log::trace!("exit: extract_robots_txt -> {:?}", links); - Ok(links) + log::trace!("exit: extract_robots_txt -> {:?} {}", links, dirlist_flag); + Ok((links, dirlist_flag)) } - /// helper function that simply requests /robots.txt on the given url's base url + /// Entry point to parse html for links (i.e. 
webscraping, directory listings) + /// this function requests: + /// http://localhost/ + pub(super) async fn parse_html(&self) -> Result<(HashSet, bool)> { + log::trace!("enter: parse_html"); + + let mut links: HashSet = HashSet::new(); + let mut dirlist_flag = false; + + // Response + let url = Url::parse(&self.url)?; + let response = self.make_extract_request(url.path()).await?; + let resp_url = response.url(); + let body = response.text(); + let html = Html::parse_document(body); + + // Directory listing heuristic detection to not continue scanning + // Index of /: apache + // Directory Listing for /: tomcat, + // Directory Listing -- /: ASP.NET + // - /: iis, azure, skipping due to loose heuristic + let title_selector = Selector::parse("title").unwrap(); + for t in html.select(&title_selector) { + let title = t.inner_html().to_lowercase(); + if title.contains("directory listing for /") + || title.contains("index of /") + || title.contains("directory listing -- /") + { + log::debug!("Directory listing heuristic detection from \"{}\"", title); + dirlist_flag = true; + + self.extract_links_by_attr(resp_url, &mut links, &html, "a", "href"); + self.update_stats(links.len())?; + + log::trace!("exit: parse_html -> {:?} {}", links, dirlist_flag); + return Ok((links, dirlist_flag)); + } + } + + // Extract Links + self.extract_links_by_attr(resp_url, &mut links, &html, "a", "href"); + self.extract_links_by_attr(resp_url, &mut links, &html, "img", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "form", "action"); + self.extract_links_by_attr(resp_url, &mut links, &html, "script", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "iframe", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "div", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "frame", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "embed", "src"); + self.extract_links_by_attr(resp_url, &mut links, &html, "script", 
"src"); + + self.update_stats(links.len())?; + + log::trace!("exit: parse_html -> {:?} {}", links, dirlist_flag); + Ok((links, dirlist_flag)) + } + + /// simple helper to get html links by tag/attribute and add it to the `links` HashSet + fn extract_links_by_attr( + &self, + resp_url: &Url, + links: &mut HashSet, + html: &Html, + html_tag: &str, + html_attr: &str, + ) { + log::trace!("enter: extract_links_by_attr"); + + let selector = Selector::parse(html_tag).unwrap(); + let tags = html + .select(&selector) + .filter(|a| a.value().attrs().any(|attr| attr.0 == html_attr)); + for t in tags { + if let Some(link) = t.value().attr(html_attr) { + log::debug!("Parsed link \"{}\" from {}", link, resp_url.as_str()); + + match Url::parse(link) { + Ok(absolute) => { + if absolute.domain() != resp_url.domain() + || absolute.host() != resp_url.host() + { + // domains/ips are not the same, don't scan things that aren't part of the original + // target url + continue; + } + + if self.add_all_sub_paths(absolute.path(), links).is_err() { + log::warn!("could not add sub-paths from {} to {:?}", absolute, links); + } + } + Err(e) => { + // this is the expected error that happens when we try to parse a url fragment + // ex: Url::parse("/login") -> Err("relative URL without a base") + // while this is technically an error, these are good results for us + if e.to_string().contains("relative URL without a base") { + if self.add_all_sub_paths(link, links).is_err() { + log::warn!("could not add sub-paths from {} to {:?}", link, links); + } + } else { + // unexpected error has occurred + log::warn!("Could not parse given url: {}", e); + self.handles.stats.send(AddError(Other)).unwrap_or_default(); + } + } + } + } + } + + log::trace!("exit: extract_links_by_attr"); + } + + /// helper function that simply requests at on the given url's base url /// /// example: - /// http://localhost/api/users -> http://localhost/robots.txt - /// - /// The length of the given path has no effect on what's 
requested; it's always - /// base url + /robots.txt - pub(super) async fn request_robots_txt(&self) -> Result { - log::trace!("enter: get_robots_file"); + /// http://localhost/api/users -> http://localhost/ + pub(super) async fn make_extract_request(&self, location: &str) -> Result { + log::trace!("enter: make_extract_request"); // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something // similar; to account for that, create a client that will follow redirects, regardless of @@ -405,7 +529,7 @@ impl<'a> Extractor<'a> { )?; let mut url = Url::parse(&self.url)?; - url.set_path("/robots.txt"); // overwrite existing path with /robots.txt + url.set_path(location); // overwrite existing path // purposefully not using logged_request here due to using the special client let response = make_request( @@ -428,7 +552,7 @@ impl<'a> Extractor<'a> { ) .await; - log::trace!("exit: get_robots_file -> {}", ferox_response); + log::trace!("exit: make_extract_request -> {}", ferox_response); Ok(ferox_response) } diff --git a/src/extractor/tests.rs b/src/extractor/tests.rs index 6baef11..8d9c972 100644 --- a/src/extractor/tests.rs +++ b/src/extractor/tests.rs @@ -20,6 +20,9 @@ lazy_static! 
{ /// Extractor for testing response bodies static ref BODY_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ResponseBody, Arc::new(FeroxScans::default())); + /// Extractor for testing parsing html + static ref PARSEHTML_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ParseHtml, Arc::new(FeroxScans::default())); + /// FeroxResponse for Extractor static ref RESPONSE: FeroxResponse = get_test_response(); } @@ -42,6 +45,9 @@ fn setup_extractor(target: ExtractionTarget, scanned_urls: Arc) -> E ExtractionTarget::RobotsTxt => builder .url("http://localhost") .target(ExtractionTarget::RobotsTxt), + ExtractionTarget::ParseHtml => builder + .url("http://localhost") + .target(ExtractionTarget::ParseHtml), }; let config = Arc::new(Configuration::new().unwrap()); @@ -252,7 +258,7 @@ async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain() handles: handles.clone(), }; - let links = extractor.extract_from_body().await?; + let links = (extractor.extract_from_body().await?).0; assert!(links.is_empty()); assert_eq!(mock.hits(), 1); @@ -280,7 +286,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> { handles, }; - let resp = extractor.request_robots_txt().await?; + let resp = extractor.make_extract_request("/robots.txt").await?; assert!(matches!(resp.status(), &StatusCode::OK)); println!("{}", resp); @@ -313,7 +319,7 @@ async fn request_robots_txt_with_proxy() -> Result<()> { .handles(handles) .build()?; - let resp = extractor.request_robots_txt().await?; + let resp = extractor.make_extract_request("/robots.txt").await?; assert!(matches!(resp.status(), &StatusCode::OK)); assert_eq!(resp.content_length(), 19); diff --git a/src/progress.rs b/src/progress.rs index 690439e..df808d8 100644 --- a/src/progress.rs +++ b/src/progress.rs @@ -35,10 +35,11 @@ pub fn add_bar(prefix: &str, length: u64, bar_type: BarType) -> ProgressBar { style = match bar_type { BarType::Hidden => style.template(""), - BarType::Default => style - 
.template("[{bar:.cyan/blue}] - {elapsed:<4} {pos:>7}/{len:7} {per_sec:7} {prefix}"), + BarType::Default => style.template( + "[{bar:.cyan/blue}] - {elapsed:<4} {pos:>7}/{len:7} {per_sec:7} {prefix} {msg}", + ), BarType::Message => style.template(&format!( - "[{{bar:.cyan/blue}}] - {{elapsed:<4}} {{pos:>7}}/{{len:7}} {:7} {{prefix}}", + "[{{bar:.cyan/blue}}] - {{elapsed:<4}} {{pos:>7}}/{{len:7}} {:7} {{prefix}} {{msg}}", "-" )), BarType::Total => { diff --git a/src/scanner/ferox_scanner.rs b/src/scanner/ferox_scanner.rs index d48a976..51c392f 100644 --- a/src/scanner/ferox_scanner.rs +++ b/src/scanner/ferox_scanner.rs @@ -1,6 +1,7 @@ use std::{ops::Deref, sync::atomic::Ordering, sync::Arc, time::Instant}; use anyhow::{bail, Result}; +use console::style; use futures::{stream, StreamExt}; use lazy_static::lazy_static; use tokio::sync::Semaphore; @@ -10,7 +11,7 @@ use crate::{ Command::{AddError, AddToF64Field, SubtractFromUsizeField}, Handles, }, - extractor::{ExtractionTarget::RobotsTxt, ExtractorBuilder}, + extractor::{ExtractionTarget, ExtractorBuilder}, heuristics, scan_manager::{FeroxResponses, MenuCmdResult, ScanOrder, ScanStatus, PAUSE_SCAN}, statistics::{ @@ -43,7 +44,7 @@ pub struct FeroxScanner { /// wordlist that's already been read from disk wordlist: Arc>, - /// limiter that restricts the number of active FeroxScanners + /// limiter that restricts the number of active FeroxScanners scan_limiter: Arc, } @@ -74,22 +75,33 @@ impl FeroxScanner { log::info!("Starting scan against: {}", self.target_url); let scan_timer = Instant::now(); + let mut dirlist_flag = false; - if matches!(self.order, ScanOrder::Initial) && self.handles.config.extract_links { - // only grab robots.txt on the initial scan_url calls. all fresh dirs will be passed - // to try_recursion + if self.handles.config.extract_links { + // parse html for links (i.e. 
web scraping) let extractor = ExtractorBuilder::default() + .target(ExtractionTarget::ParseHtml) .url(&self.target_url) .handles(self.handles.clone()) - .target(RobotsTxt) .build()?; - - let links = extractor.extract().await?; + let extract_out = extractor.extract().await?; + let links = extract_out.0; + dirlist_flag = extract_out.1; extractor.request_links(links).await?; + + if matches!(self.order, ScanOrder::Initial) { + // check for robots.txt (cannot be in subdirs) + let extractor = ExtractorBuilder::default() + .target(ExtractionTarget::RobotsTxt) + .url(&self.target_url) + .handles(self.handles.clone()) + .build()?; + let links = (extractor.extract().await?).0; + extractor.request_links(links).await?; + } } let scanned_urls = self.handles.ferox_scans()?; - let ferox_scan = match scanned_urls.get_scan_by_url(&self.target_url) { Some(scan) => { scan.set_status(ScanStatus::Running)?; @@ -106,6 +118,28 @@ impl FeroxScanner { let progress_bar = ferox_scan.progress_bar(); + // Directory listing heuristic detection to not continue scanning + if dirlist_flag { + log::trace!("exit: scan_url -> Directory listing heuristic"); + + self.handles.stats.send(AddToF64Field( + DirScanTimes, + scan_timer.elapsed().as_secs_f64(), + ))?; + + self.handles.stats.send(SubtractFromUsizeField( + TotalExpected, + progress_bar.length() as usize, + ))?; + + progress_bar.reset_eta(); + progress_bar.finish_with_message(&format!("=> {}", style("Directory listing").green())); + + ferox_scan.finish()?; + + return Ok(()); + } + // When acquire is called and the semaphore has remaining permits, the function immediately // returns a permit. 
However, if no remaining permits are available, acquire (asynchronously) // waits until an outstanding permit is dropped, at which point, the freed permit is assigned diff --git a/src/scanner/requester.rs b/src/scanner/requester.rs index 89b0422..ba29fc3 100644 --- a/src/scanner/requester.rs +++ b/src/scanner/requester.rs @@ -17,7 +17,7 @@ use crate::{ Command::{self, AddError, SubtractFromUsizeField}, Handles, }, - extractor::{ExtractionTarget::ResponseBody, ExtractorBuilder}, + extractor::{ExtractionTarget, ExtractorBuilder}, response::FeroxResponse, scan_manager::{FeroxScan, ScanStatus}, statistics::{StatError::Other, StatField::TotalExpected}, @@ -395,13 +395,12 @@ impl Requester { if self.handles.config.extract_links && !ferox_response.status().is_redirection() { let extractor = ExtractorBuilder::default() - .target(ResponseBody) + .target(ExtractionTarget::ResponseBody) .response(&ferox_response) .handles(self.handles.clone()) .build()?; - let new_links: HashSet<_>; - let extracted = extractor.extract().await?; + let extracted = (extractor.extract().await?).0; { // gain and quickly drop the read lock on seen_links, using it while unlocked diff --git a/tests/test_extractor.rs b/tests/test_extractor.rs index ff6d193..bea4685 100644 --- a/tests/test_extractor.rs +++ b/tests/test_extractor.rs @@ -288,7 +288,7 @@ fn extractor_finds_robots_txt_links_and_displays_files_or_scans_directories() { ); assert_eq!(mock.hits(), 1); - assert_eq!(mock_dir.hits(), 1); + assert_eq!(mock_dir.hits(), 2); assert_eq!(mock_two.hits(), 1); assert_eq!(mock_file.hits(), 1); assert_eq!(mock_disallowed.hits(), 1); @@ -370,6 +370,226 @@ fn extractor_finds_robots_txt_links_and_displays_files_non_recursive() { teardown_tmp_directory(tmp_dir); } +#[test] +/// serve a directory listing with a file and and a folder contained within it. ferox should +/// find both links and request each one. 
+fn extractor_finds_directory_listing_links_and_displays_files() { + let srv = MockServer::start(); + let (tmp_dir, file) = setup_tmp_directory(&["invalid".to_string()], "wordlist").unwrap(); + + let mock_root = srv.mock(|when, then| { + when.method(GET).path("/"); + then.status(200).body( + r#" + + + + Directory listing for / + + +

Directory listing for /

+
+ +
+ + + "#, + ); + }); + + let mock_root_file = srv.mock(|when, then| { + when.method(GET).path("/LICENSE"); + then.status(200).body("im a little teapot"); // 18 + }); + + let mock_dir_disallowed = srv.mock(|when, then| { + when.method(GET).path("/disallowed-subdir"); + then.status(404); + }); + + let mock_dir_redir = srv.mock(|when, then| { + when.method(GET).path("/misc"); + then.status(301).header("Location", &srv.url("/misc/")); + }); + let mock_dir = srv.mock(|when, then| { + when.method(GET).path("/misc/"); + then.status(200).body( + r#" + + + + Directory listing for /misc + + +

Directory listing for /misc

+
+ +
+ + + "#, + ); + }); + + let mock_dir_file = srv.mock(|when, then| { + when.method(GET).path("/misc/LICENSE"); + then.status(200).body("i too, am a container for tea"); // 29 + }); + + let mock_dir_file_ext = srv.mock(|when, then| { + when.method(GET).path("/misc/stupidfile.php"); + then.status(200).body("im a little teapot too"); // 22 + }); + + let cmd = Command::cargo_bin("feroxbuster") + .unwrap() + .arg("--url") + .arg(srv.url("/")) + .arg("--wordlist") + .arg(file.as_os_str()) + .arg("--extract-links") + .arg("--redirects") + .unwrap(); + + cmd.assert().success().stdout( + predicate::str::contains("/LICENSE") // 2 directories contain LICENSE + .count(2) + .and(predicate::str::contains("18c")) + .and(predicate::str::contains("/misc/stupidfile.php")) + .and(predicate::str::contains("22c")) + .and(predicate::str::contains("/misc/LICENSE")) + .and(predicate::str::contains("29c")) + .and(predicate::str::contains("200").count(3)), + ); + + assert_eq!(mock_root.hits(), 2); + assert_eq!(mock_root_file.hits(), 1); + assert_eq!(mock_dir_disallowed.hits(), 1); + assert_eq!(mock_dir_redir.hits(), 1); + assert_eq!(mock_dir.hits(), 2); + assert_eq!(mock_dir_file.hits(), 1); + assert_eq!(mock_dir_file_ext.hits(), 1); + teardown_tmp_directory(tmp_dir); +} + +#[test] +/// serve a directory listing with a file and a folder contained within it. ferox should +/// find both links and request each one. This is the non-recursive version of the test above +fn extractor_finds_directory_listing_links_and_displays_files_non_recursive() { + let srv = MockServer::start(); + let (tmp_dir, file) = setup_tmp_directory(&["invalid".to_string()], "wordlist").unwrap(); + + let mock_root = srv.mock(|when, then| { + when.method(GET).path("/"); + then.status(200).body( + r#" + + + + Directory listing for / + + +

Directory listing for /

+
+ +
+ + + "#, + ); + }); + + let mock_root_file = srv.mock(|when, then| { + when.method(GET).path("/LICENSE"); + then.status(200).body("im a little teapot"); // 18 + }); + + let mock_dir_disallowed = srv.mock(|when, then| { + when.method(GET).path("/disallowed-subdir"); + then.status(404); + }); + + let mock_dir_redir = srv.mock(|when, then| { + when.method(GET).path("/misc"); + then.status(301).header("Location", &srv.url("/misc/")); + }); + let mock_dir = srv.mock(|when, then| { + when.method(GET).path("/misc/"); + then.status(200).body( + r#" + + + + Directory listing for /misc + + +

Directory listing for /misc

+
+ +
+ + + "#, + ); + }); + + let mock_dir_file = srv.mock(|when, then| { + when.method(GET).path("/misc/LICENSE"); + then.status(200).body("i too, am a container for tea"); // 29 + }); + + let mock_dir_file_ext = srv.mock(|when, then| { + when.method(GET).path("/misc/stupidfile.php"); + then.status(200).body("im a little teapot too"); // 22 + }); + + let cmd = Command::cargo_bin("feroxbuster") + .unwrap() + .arg("--url") + .arg(srv.url("/")) + .arg("--wordlist") + .arg(file.as_os_str()) + .arg("--extract-links") + .arg("--redirects") + .arg("--no-recursion") + .unwrap(); + + // negate each absence check on its own predicate: a bare `.not()` chained after + // `.and(...)` inverts the whole chain built so far, not just the last `contains` + cmd.assert().success().stdout( + predicate::str::contains("/LICENSE") + .and(predicate::str::contains("18c")) + .and(predicate::str::contains("/misc/stupidfile.php").not()) + .and(predicate::str::contains("22c").not()) + .and(predicate::str::contains("/misc/LICENSE").not()) + .and(predicate::str::contains("29c").not()) + .and(predicate::str::contains("200").count(1)), + ); + + assert_eq!(mock_root.hits(), 2); + assert_eq!(mock_root_file.hits(), 1); + assert_eq!(mock_dir_disallowed.hits(), 1); + assert_eq!(mock_dir_redir.hits(), 1); + assert_eq!(mock_dir.hits(), 1); + assert_eq!(mock_dir_file.hits(), 0); + assert_eq!(mock_dir_file_ext.hits(), 0); + teardown_tmp_directory(tmp_dir); +} + #[test] /// send a request to a page that contains a link that contains a directory that returns a 403 /// --extract-links should find the link and make recurse into the 403 directory, finding LICENSE @@ -416,7 +636,7 @@ fn extractor_recurses_into_403_directories() -> Result<(), Box