diff --git a/src/extractor.rs b/src/extractor.rs deleted file mode 100644 index c78e916..0000000 --- a/src/extractor.rs +++ /dev/null @@ -1,504 +0,0 @@ -use crate::{ - client, - config::{Configuration, CONFIGURATION}, - scanner::SCANNED_URLS, - statistics::{ - StatCommand::{self, UpdateUsizeField}, - StatField::{LinksExtracted, TotalExpected}, - }, - utils::{format_url, make_request}, - FeroxResponse, -}; -use lazy_static::lazy_static; -use regex::Regex; -use reqwest::Url; -use std::collections::HashSet; -use tokio::sync::mpsc::UnboundedSender; - -/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder) -/// -/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files) -const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#; - -/// Regular expression to pull url paths from robots.txt -/// -/// ref: https://developers.google.com/search/reference/robots_txt -const ROBOTS_TXT_REGEX: &str = - r#"(?m)^ *(Allow|Disallow): *(?P[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m) - -lazy_static! 
{ - /// `LINKFINDER_REGEX` as a regex::Regex type - static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap(); - - /// `ROBOTS_TXT_REGEX` as a regex::Regex type - static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap(); -} - -/// Iterate over a given path, return a list of every sub-path found -/// -/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg` -/// the following fragments would be returned: -/// - homepage/assets/img/icons/handshake.svg -/// - homepage/assets/img/icons/ -/// - homepage/assets/img/ -/// - homepage/assets/ -/// - homepage/ -fn get_sub_paths_from_path(path: &str) -> Vec { - log::trace!("enter: get_sub_paths_from_path({})", path); - let mut paths = vec![]; - - // filter out any empty strings caused by .split - let mut parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); - - let length = parts.len(); - - for i in 0..length { - // iterate over all parts of the path - if parts.is_empty() { - // pop left us with an empty vector, we're done - break; - } - - let mut possible_path = parts.join("/"); - - if possible_path.is_empty() { - // .join can result in an empty string, which we don't need, ignore - continue; - } - - if i > 0 { - // this isn't the last index of the parts array - // ex: /buried/misc/stupidfile.php - // this block skips the file but sees all parent folders - possible_path = format!("{}/", possible_path); - } - - paths.push(possible_path); // good sub-path found - parts.pop(); // use .pop() to remove the last part of the path and continue iteration - } - - log::trace!("exit: get_sub_paths_from_path -> {:?}", paths); - paths -} - -/// simple helper to stay DRY, trys to join a url + fragment and add it to the `links` HashSet -fn add_link_to_set_of_links(link: &str, url: &Url, links: &mut HashSet) { - log::trace!( - "enter: add_link_to_set_of_links({}, {}, {:?})", - link, - url.to_string(), - links - ); - match url.join(&link) { - Ok(new_url) => { - 
links.insert(new_url.to_string()); - } - Err(e) => { - log::error!("Could not join given url to the base url: {}", e); - } - } - log::trace!("exit: add_link_to_set_of_links"); -} - -/// Given a `reqwest::Response`, perform the following actions -/// - parse the response's text for links using the linkfinder regex -/// - for every link found take its url path and parse each sub-path -/// - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg` -/// with a base url of http://localhost, the following urls would be returned: -/// - homepage/assets/img/icons/handshake.svg -/// - homepage/assets/img/icons/ -/// - homepage/assets/img/ -/// - homepage/assets/ -/// - homepage/ -pub async fn get_links( - response: &FeroxResponse, - tx_stats: UnboundedSender, -) -> HashSet { - log::trace!( - "enter: get_links({}, {:?})", - response.url().as_str(), - tx_stats - ); - - let mut links = HashSet::::new(); - - let body = response.text(); - - for capture in LINKS_REGEX.captures_iter(&body) { - // remove single & double quotes from both ends of the capture - // capture[0] is the entire match, additional capture groups start at [1] - let link = capture[0].trim_matches(|c| c == '\'' || c == '"'); - - match Url::parse(link) { - Ok(absolute) => { - if absolute.domain() != response.url().domain() - || absolute.host() != response.url().host() - { - // domains/ips are not the same, don't scan things that aren't part of the original - // target url - continue; - } - - add_all_sub_paths(absolute.path(), &response, &mut links); - } - Err(e) => { - // this is the expected error that happens when we try to parse a url fragment - // ex: Url::parse("/login") -> Err("relative URL without a base") - // while this is technically an error, these are good results for us - if e.to_string().contains("relative URL without a base") { - add_all_sub_paths(link, &response, &mut links); - } else { - // unexpected error has occurred - log::error!("Could not parse given url: {}", 
e); - } - } - } - } - - let multiplier = CONFIGURATION.extensions.len().max(1); - - update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len())); - update_stat!( - tx_stats, - UpdateUsizeField(TotalExpected, links.len() * multiplier) - ); - - log::trace!("exit: get_links -> {:?}", links); - - links -} - -/// take a url fragment like homepage/assets/img/icons/handshake.svg and -/// incrementally add -/// - homepage/assets/img/icons/ -/// - homepage/assets/img/ -/// - homepage/assets/ -/// - homepage/ -fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet) { - log::trace!( - "enter: add_all_sub_paths({}, {}, {:?})", - url_path, - response, - links - ); - - for sub_path in get_sub_paths_from_path(url_path) { - log::debug!("Adding {} to {:?}", sub_path, links); - add_link_to_set_of_links(&sub_path, &response.url(), &mut links); - } - - log::trace!("exit: add_all_sub_paths"); -} - -/// Wrapper around link extraction logic -/// currently used in two places: -/// - links from response bodys -/// - links from robots.txt responses -/// -/// general steps taken: -/// - create a new Url object based on cli options/args -/// - check if the new Url has already been seen/scanned -> None -/// - make a request to the new Url ? 
-> Some(response) : None -pub async fn request_feroxresponse_from_new_link( - url: &str, - tx_stats: UnboundedSender, -) -> Option { - log::trace!( - "enter: request_feroxresponse_from_new_link({}, {:?})", - url, - tx_stats - ); - - // create a url based on the given command line options, return None on error - let new_url = match format_url( - &url, - &"", - CONFIGURATION.add_slash, - &CONFIGURATION.queries, - None, - tx_stats.clone(), - ) { - Ok(url) => url, - Err(_) => { - log::trace!("exit: request_feroxresponse_from_new_link -> None"); - return None; - } - }; - - if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() { - //we've seen the url before and don't need to scan again - log::trace!("exit: request_feroxresponse_from_new_link -> None"); - return None; - } - - // make the request and store the response - let new_response = match make_request(&CONFIGURATION.client, &new_url, tx_stats).await { - Ok(resp) => resp, - Err(_) => { - log::trace!("exit: request_feroxresponse_from_new_link -> None"); - return None; - } - }; - - let new_ferox_response = FeroxResponse::from(new_response, true).await; - - log::trace!( - "exit: request_feroxresponse_from_new_link -> {:?}", - new_ferox_response - ); - Some(new_ferox_response) -} - -/// helper function that simply requests /robots.txt on the given url's base url -/// -/// example: -/// http://localhost/api/users -> http://localhost/robots.txt -/// -/// The length of the given path has no effect on what's requested; it's always -/// base url + /robots.txt -pub async fn request_robots_txt( - base_url: &str, - config: &Configuration, - tx_stats: UnboundedSender, -) -> Option { - log::trace!( - "enter: get_robots_file({}, CONFIGURATION, {:?})", - base_url, - tx_stats - ); - - // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something - // similar; to account for that, create a client that will follow redirects, regardless of - // what the user specified for the scanning client. 
Other than redirects, it will respect - // all other user specified settings - let follow_redirects = true; - - let proxy = if config.proxy.is_empty() { - None - } else { - Some(config.proxy.as_str()) - }; - - let client = client::initialize( - config.timeout, - &config.user_agent, - follow_redirects, - config.insecure, - &config.headers, - proxy, - ); - - if let Ok(mut url) = Url::parse(base_url) { - url.set_path("/robots.txt"); // overwrite existing path with /robots.txt - - if let Ok(response) = make_request(&client, &url, tx_stats).await { - let ferox_response = FeroxResponse::from(response, true).await; - - log::trace!("exit: get_robots_file -> {}", ferox_response); - return Some(ferox_response); - } - } - - None -} - -/// Entry point to perform link extraction from robots.txt -/// -/// `base_url` can have paths and subpaths, however robots.txt will be requested from the -/// root of the url -/// given the url: -/// http://localhost/stuff/things -/// this function requests: -/// http://localhost/robots.txt -pub async fn extract_robots_txt( - base_url: &str, - config: &Configuration, - tx_stats: UnboundedSender, -) -> HashSet { - log::trace!( - "enter: extract_robots_txt({}, CONFIGURATION, {:?})", - base_url, - tx_stats - ); - let mut links = HashSet::new(); - - if let Some(response) = request_robots_txt(&base_url, &config, tx_stats.clone()).await { - for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) { - if let Some(new_path) = capture.name("url_path") { - if let Ok(mut new_url) = Url::parse(base_url) { - new_url.set_path(new_path.as_str()); - add_all_sub_paths(new_url.path(), &response, &mut links); - } - } - } - } - - let multiplier = CONFIGURATION.extensions.len().max(1); - - update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len())); - update_stat!( - tx_stats, - UpdateUsizeField(TotalExpected, links.len() * multiplier) - ); - - log::trace!("exit: extract_robots_txt -> {:?}", links); - links -} - -#[cfg(test)] -mod tests { - use 
super::*; - use crate::utils::make_request; - use crate::FeroxChannel; - use httpmock::Method::GET; - use httpmock::MockServer; - use reqwest::Client; - use tokio::sync::mpsc; - - #[test] - /// extract sub paths from the given url fragment; expect 4 sub paths and that all are - /// in the expected array - fn extractor_get_sub_paths_from_path_with_multiple_paths() { - let path = "homepage/assets/img/icons/handshake.svg"; - let paths = get_sub_paths_from_path(&path); - let expected = vec![ - "homepage/", - "homepage/assets/", - "homepage/assets/img/", - "homepage/assets/img/icons/", - "homepage/assets/img/icons/handshake.svg", - ]; - - assert_eq!(paths.len(), expected.len()); - for expected_path in expected { - assert_eq!(paths.contains(&expected_path.to_string()), true); - } - } - - #[test] - /// extract sub paths from the given url fragment; expect 2 sub paths and that all are - /// in the expected array. the fragment is wrapped in slashes to ensure no empty strings are - /// returned - fn extractor_get_sub_paths_from_path_with_enclosing_slashes() { - let path = "/homepage/assets/"; - let paths = get_sub_paths_from_path(&path); - let expected = vec!["homepage/", "homepage/assets"]; - - assert_eq!(paths.len(), expected.len()); - for expected_path in expected { - assert_eq!(paths.contains(&expected_path.to_string()), true); - } - } - - #[test] - /// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are - /// included - fn extractor_get_sub_paths_from_path_with_only_a_word() { - let path = "homepage"; - let paths = get_sub_paths_from_path(&path); - let expected = vec!["homepage"]; - - assert_eq!(paths.len(), expected.len()); - for expected_path in expected { - assert_eq!(paths.contains(&expected_path.to_string()), true); - } - } - - #[test] - /// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed - fn extractor_get_sub_paths_from_path_with_an_absolute_word() { - let path = "/homepage"; - let paths 
= get_sub_paths_from_path(&path); - let expected = vec!["homepage"]; - - assert_eq!(paths.len(), expected.len()); - for expected_path in expected { - assert_eq!(paths.contains(&expected_path.to_string()), true); - } - } - - #[test] - /// test that a full url and fragment are joined correctly, then added to the given list - /// i.e. the happy path - fn extractor_add_link_to_set_of_links_happy_path() { - let url = Url::parse("https://localhost").unwrap(); - let mut links = HashSet::::new(); - let link = "admin"; - - assert_eq!(links.len(), 0); - add_link_to_set_of_links(link, &url, &mut links); - - assert_eq!(links.len(), 1); - assert!(links.contains("https://localhost/admin")); - } - - #[test] - /// test that an invalid path fragment doesn't add anything to the set of links - fn extractor_add_link_to_set_of_links_with_non_base_url() { - let url = Url::parse("https://localhost").unwrap(); - let mut links = HashSet::::new(); - let link = "\\\\\\\\"; - - assert_eq!(links.len(), 0); - add_link_to_set_of_links(link, &url, &mut links); - - assert_eq!(links.len(), 0); - assert!(links.is_empty()); - } - - #[tokio::test(flavor = "multi_thread", worker_threads = 1)] - /// use make_request to generate a Response, and use the Response to test get_links; - /// the response will contain an absolute path to a domain that is not part of the scanned - /// domain; expect an empty set returned - async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain( - ) -> Result<(), Box> { - let srv = MockServer::start(); - - let mock = srv.mock(|when, then|{ - when.method(GET) - .path("/some-path"); - then.status(200) - .body("\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\""); - }); - - let client = Client::new(); - let url = Url::parse(&srv.url("/some-path")).unwrap(); - let (tx, _): FeroxChannel = mpsc::unbounded_channel(); - - let response = make_request(&client, &url, tx.clone()).await.unwrap(); - - let ferox_response = 
FeroxResponse::from(response, true).await; - - let links = get_links(&ferox_response, tx).await; - - assert!(links.is_empty()); - - assert_eq!(mock.hits(), 1); - Ok(()) - } - - #[tokio::test(flavor = "multi_thread", worker_threads = 1)] - /// test that /robots.txt is correctly requested given a base url (happy path) - async fn request_robots_txt_with_and_without_proxy() { - let srv = MockServer::start(); - - let mock = srv.mock(|when, then| { - when.method(GET).path("/robots.txt"); - then.status(200).body("this is a test"); - }); - - let mut config = Configuration::default(); - - let (tx, _): FeroxChannel = mpsc::unbounded_channel(); - - request_robots_txt(&srv.url("/api/users/stuff/things"), &config, tx.clone()).await; - - // note: the proxy doesn't actually do anything other than hit a different code branch - // in this unit test; it would however have an effect on an integration test - config.proxy = srv.url("/ima-proxy"); - - request_robots_txt(&srv.url("/api/different/path"), &config, tx).await; - - assert_eq!(mock.hits(), 2); - } -} diff --git a/src/extractor/builder.rs b/src/extractor/builder.rs new file mode 100644 index 0000000..4670f21 --- /dev/null +++ b/src/extractor/builder.rs @@ -0,0 +1,168 @@ +use super::*; +use anyhow::{bail, Result}; + +/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder) +/// +/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files) +pub(super) const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#; + +/// Regular expression to pull url paths from robots.txt +/// +/// ref: 
https://developers.google.com/search/reference/robots_txt +pub(super) const ROBOTS_TXT_REGEX: &str = + r#"(?m)^ *(Allow|Disallow): *(?P[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m) + +/// Which type of extraction should be performed +#[derive(Debug, Copy, Clone)] +pub enum ExtractionTarget { + /// Examine a response body and extract links + ResponseBody, + + /// Examine robots.txt (specifically) and extract links + RobotsTxt, +} + +/// responsible for building an `Extractor` +pub struct ExtractorBuilder<'a> { + /// Response from which to extract links + response: Option<&'a FeroxResponse>, + + /// Url used as the base when extracting links from robots.txt + url: String, + + /// Configuration consulted during extraction (recursion, url formatting, client settings) + config: Option<&'a Configuration>, + + /// transmitter to the mpsc that handles statistics gathering + tx_stats: Option>, + + /// transmitter to the mpsc that handles recursive scan calls + tx_recursion: Option>, + + /// transmitter to the mpsc that handles reporting information to the user + tx_reporter: Option>, + + /// list of urls that will be added to when new urls are extracted + scanned_urls: Option<&'a FeroxScans>, + + /// depth at which the scan was started + depth: Option, + + /// copy of Stats object + stats: Option>, + + /// type of extraction to be performed + target: ExtractionTarget, +} + +/// ExtractorBuilder implementation +impl<'a> ExtractorBuilder<'a> { + /// Given a FeroxResponse, create new ExtractorBuilder + /// + /// Once built, Extractor::target is ExtractionTarget::ResponseBody + pub fn with_response(response: &'a FeroxResponse) -> Self { + Self { + response: Some(response), + url: "".to_string(), + config: None, + tx_stats: None, + tx_recursion: None, + tx_reporter: None, + scanned_urls: None, + depth: None, + stats: None, + target: ExtractionTarget::ResponseBody, + } + } + + /// Given a url, create new ExtractorBuilder + /// + /// Once built, Extractor::target is ExtractionTarget::RobotsTxt + pub fn with_url(url: &str) 
-> Self { + Self { + response: None, + url: url.to_string(), + config: None, + tx_stats: None, + tx_recursion: None, + tx_reporter: None, + scanned_urls: None, + depth: None, + stats: None, + target: ExtractionTarget::RobotsTxt, + } + } + + /// builder call to set `config` + pub fn config(&mut self, config: &'a Configuration) -> &mut Self { + self.config = Some(config); + self + } + + /// builder call to set `tx_recursion` + pub fn recursion_transmitter(&mut self, tx_recursion: UnboundedSender) -> &mut Self { + self.tx_recursion = Some(tx_recursion); + self + } + + /// builder call to set `tx_stats` + pub fn stats_transmitter(&mut self, tx_stats: UnboundedSender) -> &mut Self { + self.tx_stats = Some(tx_stats); + self + } + + /// builder call to set `tx_reporter` + pub fn reporter_transmitter( + &mut self, + tx_reporter: UnboundedSender, + ) -> &mut Self { + self.tx_reporter = Some(tx_reporter); + self + } + + /// builder call to set `scanned_urls` + pub fn scanned_urls(&mut self, scanned_urls: &'a FeroxScans) -> &mut Self { + self.scanned_urls = Some(scanned_urls); + self + } + + /// builder call to set `stats` + pub fn stats(&mut self, stats: Arc) -> &mut Self { + self.stats = Some(stats); + self + } + + /// builder call to set `depth` + pub fn depth(&mut self, depth: usize) -> &mut Self { + self.depth = Some(depth); + self + } + + /// finalize configuration of ExtractorBuilder and return an Extractor + /// + /// requires either with_url or with_response to have been used in the build process + pub fn build(&self) -> Result> { + if self.url.is_empty() && self.response.is_none() { + bail!("Extractor requires either a URL or a FeroxResponse be specified") + } + + Ok(Extractor { + links_regex: Regex::new(LINKFINDER_REGEX).unwrap(), + robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(), + response: if self.response.is_some() { + Some(self.response.unwrap()) + } else { + None + }, + url: self.url.to_owned(), + config: self.config.unwrap(), + tx_stats: 
self.tx_stats.as_ref().unwrap().clone(), + tx_recursion: self.tx_recursion.as_ref().unwrap().clone(), + tx_reporter: self.tx_reporter.as_ref().unwrap().clone(), + scanned_urls: self.scanned_urls.unwrap(), + depth: self.depth.unwrap(), + stats: self.stats.as_ref().unwrap().clone(), + target: self.target, + }) + } +} diff --git a/src/extractor/container.rs b/src/extractor/container.rs new file mode 100644 index 0000000..ef3e322 --- /dev/null +++ b/src/extractor/container.rs @@ -0,0 +1,408 @@ +use super::*; +use crate::{ + client, + scanner::{send_report, should_filter_response, try_recursion}, + statistics::{ + StatCommand::UpdateUsizeField, + StatField::{LinksExtracted, TotalExpected}, + }, + update_stat, + utils::{format_url, make_request}, +}; +use anyhow::{bail, Context, Result}; +use reqwest::{StatusCode, Url}; +use std::collections::HashSet; + +/// Whether an active scan is recursive or not +#[derive(Debug)] +enum RecursionStatus { + /// Scan is recursive + Recursive, + + /// Scan is not recursive + NotRecursive, +} + +/// Handles all logic related to extracting links from requested source code +#[derive(Debug)] +pub struct Extractor<'a> { + /// `LINKFINDER_REGEX` as a regex::Regex type + pub(super) links_regex: Regex, + + /// `ROBOTS_TXT_REGEX` as a regex::Regex type + pub(super) robots_regex: Regex, + + /// Response from which to extract links + pub(super) response: Option<&'a FeroxResponse>, + + /// Url used as the base when extracting links from robots.txt + pub(super) url: String, + + /// Configuration consulted during extraction (recursion, url formatting, client settings) + pub(super) config: &'a Configuration, + + /// transmitter to the mpsc that handles statistics gathering + pub(super) tx_stats: UnboundedSender, + + /// transmitter to the mpsc that handles recursive scan calls + pub(super) tx_recursion: UnboundedSender, + + /// transmitter to the mpsc that handles reporting information to the user + pub(super) tx_reporter: UnboundedSender, + + /// list of urls that will be added to when new urls are extracted + pub(super) 
scanned_urls: &'a FeroxScans, + + /// depth at which the scan was started + pub(super) depth: usize, + + /// copy of Stats object + pub(super) stats: Arc, + + /// type of extraction to be performed + pub(super) target: ExtractionTarget, +} + +/// Extractor implementation +impl<'a> Extractor<'a> { + /// business logic that handles getting links from a normal http body response + pub async fn extract(&self) -> Result<()> { + let links = match self.target { + ExtractionTarget::ResponseBody => self.extract_from_body().await?, + ExtractionTarget::RobotsTxt => self.extract_from_robots().await?, + }; + + let recursive = if self.config.no_recursion { + RecursionStatus::NotRecursive + } else { + RecursionStatus::Recursive + }; + + for link in links { + // todo rename get_feroxresponse_from_link + let mut resp = match self.request_link(&link).await { + Ok(resp) => resp, + Err(_) => continue, + }; + + // filter if necessary + if should_filter_response(&resp, self.tx_stats.clone()) { + continue; + } + + if resp.is_file() { + // very likely a file, simply request and report + log::debug!("Extracted file: {}", resp); + + self.scanned_urls + .add_file_scan(&resp.url().to_string(), self.stats.clone()); + + send_report(self.tx_reporter.clone(), resp); + + continue; + } + + if matches!(recursive, RecursionStatus::Recursive) { + log::debug!("Extracted Directory: {}", resp); + + if !resp.url().as_str().ends_with('/') + && (resp.status().is_success() + || matches!(resp.status(), &StatusCode::FORBIDDEN)) + { + // if the url doesn't end with a / + // and the response code is either a 2xx or 403 + + // since all of these are 2xx or 403, recursion is only attempted if the + // url ends in a /. I am actually ok with adding the slash and not + // adding it, as both have merit. 
Leaving it in for now to see how + // things turn out (current as of: v1.1.0) + resp.set_url(&format!("{}/", resp.url())); + } + + try_recursion(&resp, self.depth, self.tx_recursion.clone()).await; + } + } + Ok(()) + } + + /// Given a `reqwest::Response`, perform the following actions + /// - parse the response's text for links using the linkfinder regex + /// - for every link found take its url path and parse each sub-path + /// - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg` + /// with a base url of http://localhost, the following urls would be returned: + /// - homepage/assets/img/icons/handshake.svg + /// - homepage/assets/img/icons/ + /// - homepage/assets/img/ + /// - homepage/assets/ + /// - homepage/ + pub(super) async fn extract_from_body(&self) -> Result> { + log::trace!("enter: get_links"); + + let mut links = HashSet::::new(); + + let body = self.response.unwrap().text(); + + for capture in self.links_regex.captures_iter(&body) { + // remove single & double quotes from both ends of the capture + // capture[0] is the entire match, additional capture groups start at [1] + let link = capture[0].trim_matches(|c| c == '\'' || c == '"'); + + match Url::parse(link) { + Ok(absolute) => { + if absolute.domain() != self.response.unwrap().url().domain() + || absolute.host() != self.response.unwrap().url().host() + { + // domains/ips are not the same, don't scan things that aren't part of the original + // target url + continue; + } + + if self.add_all_sub_paths(absolute.path(), &mut links).is_err() { + log::warn!("could not add sub-paths from {} to {:?}", absolute, links); + } + } + Err(e) => { + // this is the expected error that happens when we try to parse a url fragment + // ex: Url::parse("/login") -> Err("relative URL without a base") + // while this is technically an error, these are good results for us + if e.to_string().contains("relative URL without a base") { + if self.add_all_sub_paths(link, &mut links).is_err() { 
+ log::warn!("could not add sub-paths from {} to {:?}", link, links); + } + } else { + // unexpected error has occurred + log::error!("Could not parse given url: {}", e); + } + } + } + } + + self.update_stats(links.len()); + + log::trace!("exit: get_links -> {:?}", links); + + Ok(links) + } + + /// take a url fragment like homepage/assets/img/icons/handshake.svg and + /// incrementally add + /// - homepage/assets/img/icons/ + /// - homepage/assets/img/ + /// - homepage/assets/ + /// - homepage/ + fn add_all_sub_paths(&self, url_path: &str, mut links: &mut HashSet) -> Result<()> { + log::trace!("enter: add_all_sub_paths({}, {:?})", url_path, links); + + for sub_path in self.get_sub_paths_from_path(url_path) { + self.add_link_to_set_of_links(&sub_path, &mut links)?; + } + + log::trace!("exit: add_all_sub_paths"); + Ok(()) + } + + /// Iterate over a given path, return a list of every sub-path found + /// + /// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg` + /// the following fragments would be returned: + /// - homepage/assets/img/icons/handshake.svg + /// - homepage/assets/img/icons/ + /// - homepage/assets/img/ + /// - homepage/assets/ + /// - homepage/ + pub(super) fn get_sub_paths_from_path(&self, path: &str) -> Vec { + log::trace!("enter: get_sub_paths_from_path({})", path); + let mut paths = vec![]; + + // filter out any empty strings caused by .split + let mut parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); + + let length = parts.len(); + + for i in 0..length { + // iterate over all parts of the path + if parts.is_empty() { + // pop left us with an empty vector, we're done + break; + } + + let mut possible_path = parts.join("/"); + + if possible_path.is_empty() { + // .join can result in an empty string, which we don't need, ignore + continue; + } + + if i > 0 { + // this isn't the last index of the parts array + // ex: /buried/misc/stupidfile.php + // this block skips the file but sees all parent 
folders + possible_path = format!("{}/", possible_path); + } + + paths.push(possible_path); // good sub-path found + parts.pop(); // use .pop() to remove the last part of the path and continue iteration + } + + log::trace!("exit: get_sub_paths_from_path -> {:?}", paths); + paths + } + + /// simple helper to stay DRY, tries to join a url + fragment and add it to the `links` HashSet + pub(super) fn add_link_to_set_of_links( + &self, + link: &str, + links: &mut HashSet, + ) -> Result<()> { + log::trace!("enter: add_link_to_set_of_links({}, {:?})", link, links); + + let old_url = match self.target { + ExtractionTarget::ResponseBody => self.response.unwrap().url.clone(), + ExtractionTarget::RobotsTxt => match Url::parse(&self.url) { + Ok(u) => u, + Err(e) => { + bail!("Could not parse {}: {}", self.url, e); + } + }, + }; + + let new_url = old_url + .join(&link) + .with_context(|| format!("Could not join {} with {}", old_url, link))?; + + links.insert(new_url.to_string()); + + log::trace!("exit: add_link_to_set_of_links"); + + Ok(()) + } + + /// Wrapper around link extraction logic + /// currently used in two places: + /// - links from response bodies + /// - links from robots.txt responses + /// + /// general steps taken: + /// - create a new Url object based on cli options/args + /// - check if the new Url has already been seen/scanned -> None + /// - make a request to the new Url ? 
-> Some(response) : None + pub(super) async fn request_link(&self, url: &str) -> Result { + log::trace!("enter: get_feroxresponse_from_link({})", url); + + // create a url based on the given command line options, return None on error + let new_url = format_url( + &url, + &"", + self.config.add_slash, + &self.config.queries, + None, + self.tx_stats.clone(), + )?; + + if self + .scanned_urls + .get_scan_by_url(&new_url.to_string()) + .is_some() + { + //we've seen the url before and don't need to scan again + log::trace!("exit: get_feroxresponse_from_link -> None"); + bail!("previously seen url"); + } + + // make the request and store the response + let new_response = + make_request(&self.config.client, &new_url, self.tx_stats.clone()).await?; + + let new_ferox_response = FeroxResponse::from(new_response, true).await; + + log::trace!( + "exit: get_feroxresponse_from_link -> {:?}", + new_ferox_response + ); + + Ok(new_ferox_response) + } + + /// Entry point to perform link extraction from robots.txt + /// + /// `base_url` can have paths and subpaths, however robots.txt will be requested from the + /// root of the url + /// given the url: + /// http://localhost/stuff/things + /// this function requests: + /// http://localhost/robots.txt + pub(super) async fn extract_from_robots(&self) -> Result> { + log::trace!("enter: extract_robots_txt"); + + let mut links: HashSet = HashSet::new(); + + let response = self.request_robots_txt().await?; + + for capture in self.robots_regex.captures_iter(response.text.as_str()) { + if let Some(new_path) = capture.name("url_path") { + let mut new_url = Url::parse(&self.url)?; + new_url.set_path(new_path.as_str()); + if self.add_all_sub_paths(&new_url.path(), &mut links).is_err() { + log::warn!("could not add sub-paths from {} to {:?}", new_url, links); + } + } + } + + self.update_stats(links.len()); + + log::trace!("exit: extract_robots_txt -> {:?}", links); + Ok(links) + } + + /// helper function that simply requests /robots.txt on the 
given url's base url + /// + /// example: + /// http://localhost/api/users -> http://localhost/robots.txt + /// + /// The length of the given path has no effect on what's requested; it's always + /// base url + /robots.txt + pub(super) async fn request_robots_txt(&self) -> Result { + log::trace!("enter: get_robots_file"); + + // more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something + // similar; to account for that, create a client that will follow redirects, regardless of + // what the user specified for the scanning client. Other than redirects, it will respect + // all other user specified settings + let follow_redirects = true; + + let proxy = if self.config.proxy.is_empty() { + None + } else { + Some(self.config.proxy.as_str()) + }; + + let client = client::initialize( + self.config.timeout, + &self.config.user_agent, + follow_redirects, + self.config.insecure, + &self.config.headers, + proxy, + ); + + let mut url = Url::parse(&self.url)?; + url.set_path("/robots.txt"); // overwrite existing path with /robots.txt + + let response = make_request(&client, &url, self.tx_stats.clone()).await?; + let ferox_response = FeroxResponse::from(response, true).await; + + log::trace!("exit: get_robots_file -> {}", ferox_response); + return Ok(ferox_response); + } + + /// update total number of links extracted and expected responses + fn update_stats(&self, num_links: usize) { + let multiplier = self.config.extensions.len().max(1); + + update_stat!(self.tx_stats, UpdateUsizeField(LinksExtracted, num_links)); + update_stat!( + self.tx_stats, + UpdateUsizeField(TotalExpected, num_links * multiplier) + ); + } +} diff --git a/src/extractor/mod.rs b/src/extractor/mod.rs new file mode 100644 index 0000000..59ffa2a --- /dev/null +++ b/src/extractor/mod.rs @@ -0,0 +1,19 @@ +//! 
extract links from html source and robots.txt +mod builder; +mod container; +#[cfg(test)] +mod tests; + +pub use self::builder::ExtractionTarget; +pub use self::builder::ExtractorBuilder; +pub use self::container::Extractor; + +use crate::{ + config::Configuration, + scan_manager::FeroxScans, + statistics::{StatCommand, Stats}, + FeroxResponse, +}; +use regex::Regex; +use std::sync::Arc; +use tokio::sync::mpsc::UnboundedSender; diff --git a/src/extractor/tests.rs b/src/extractor/tests.rs new file mode 100644 index 0000000..5636dce --- /dev/null +++ b/src/extractor/tests.rs @@ -0,0 +1,404 @@ +use super::builder::{LINKFINDER_REGEX, ROBOTS_TXT_REGEX}; +use super::*; +use crate::utils::make_request; +use crate::FeroxChannel; +use anyhow::Result; +use httpmock::Method::GET; +use httpmock::MockServer; +use lazy_static::lazy_static; +use reqwest::{header::HeaderMap, Client, StatusCode, Url}; +use std::collections::HashSet; +use tokio::sync::mpsc; + +lazy_static! { + /// Extractor for testing robots.txt + static ref ROBOTS_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::RobotsTxt); + + /// Extractor for testing response bodies + static ref BODY_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ResponseBody); + + /// Configuration for Extractor + static ref CONFIG: Configuration = Configuration::new(); + + /// FeroxScans for Extractor + static ref SCANS: FeroxScans = FeroxScans::default(); + + /// FeroxResponse for Extractor + static ref RESPONSE: FeroxResponse = get_test_response(); +} + +/// constructor for the default FeroxResponse used during testing +fn get_test_response() -> FeroxResponse { + FeroxResponse { + text: String::new(), + wildcard: true, + url: Url::parse("https://localhost").unwrap(), + content_length: 125, + word_count: 10, + line_count: 14, + headers: HeaderMap::new(), + status: StatusCode::OK, + } +} + +/// creates a single extractor that can be used to test standalone functions +fn setup_extractor(target: ExtractionTarget) -> 
Extractor<'static> {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+
+    let mut builder = match target {
+        ExtractionTarget::ResponseBody => ExtractorBuilder::with_response(&RESPONSE),
+        ExtractionTarget::RobotsTxt => ExtractorBuilder::with_url("https://localhost"),
+    };
+
+    builder
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(tx_dir)
+        .stats_transmitter(tx_stats)
+        .reporter_transmitter(tx_term)
+        .scanned_urls(&SCANS)
+        .stats(stats)
+        .build()
+        .unwrap()
+}
+
+#[test]
+/// extract sub paths from the given url fragment; expect 5 sub paths and that all are
+/// in the expected array
+fn extractor_get_sub_paths_from_path_with_multiple_paths() {
+    let path = "homepage/assets/img/icons/handshake.svg";
+    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(&path);
+    let b_paths = BODY_EXT.get_sub_paths_from_path(&path);
+    let expected = vec![
+        "homepage/",
+        "homepage/assets/",
+        "homepage/assets/img/",
+        "homepage/assets/img/icons/",
+        "homepage/assets/img/icons/handshake.svg",
+    ];
+
+    assert_eq!(r_paths.len(), expected.len());
+    assert_eq!(b_paths.len(), expected.len());
+    for expected_path in expected {
+        assert!(r_paths.contains(&expected_path.to_string()));
+        assert!(b_paths.contains(&expected_path.to_string()));
+    }
+}
+
+#[test]
+/// extract sub paths from the given url fragment; expect 2 sub paths and that all are
+/// in the expected array.
the fragment is wrapped in slashes to ensure no empty strings are
+/// returned
+fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
+    let path = "/homepage/assets/";
+    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(&path);
+    let b_paths = BODY_EXT.get_sub_paths_from_path(&path);
+    let expected = vec!["homepage/", "homepage/assets"];
+
+    assert_eq!(r_paths.len(), expected.len());
+    assert_eq!(b_paths.len(), expected.len());
+    for expected_path in expected {
+        assert!(r_paths.contains(&expected_path.to_string()));
+        assert!(b_paths.contains(&expected_path.to_string()));
+    }
+}
+
+#[test]
+/// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are
+/// included
+fn extractor_get_sub_paths_from_path_with_only_a_word() {
+    let path = "homepage";
+    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(&path);
+    let b_paths = BODY_EXT.get_sub_paths_from_path(&path);
+    let expected = vec!["homepage"];
+
+    assert_eq!(r_paths.len(), expected.len());
+    assert_eq!(b_paths.len(), expected.len());
+    for expected_path in expected {
+        assert!(r_paths.contains(&expected_path.to_string()));
+        assert!(b_paths.contains(&expected_path.to_string()));
+    }
+}
+
+#[test]
+/// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed
+fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
+    let path = "/homepage";
+    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(&path);
+    let b_paths = BODY_EXT.get_sub_paths_from_path(&path);
+    let expected = vec!["homepage"];
+
+    assert_eq!(r_paths.len(), expected.len());
+    assert_eq!(b_paths.len(), expected.len());
+    for expected_path in expected {
+        assert!(r_paths.contains(&expected_path.to_string()));
+        assert!(b_paths.contains(&expected_path.to_string()));
+    }
+}
+
+#[test]
+/// test that an ExtractorBuilder without a FeroxResponse and without a URL bails
+fn
extractor_builder_bails_when_neither_required_field_is_set() {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+
+    let extractor = ExtractorBuilder::with_url("")
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(tx_dir)
+        .stats_transmitter(tx_stats)
+        .reporter_transmitter(tx_term)
+        .scanned_urls(&SCANS)
+        .stats(stats)
+        .build();
+
+    assert!(extractor.is_err());
+}
+
+#[test]
+/// Extractor with a non-base url bails
+fn extractor_with_non_base_url_bails() -> Result<()> {
+    let mut links = HashSet::<String>::new();
+    let link = "admin";
+
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+
+    let extractor = ExtractorBuilder::with_url("\\\\\\")
+        .depth(4)
+        .config(&CONFIG)
+        .recursion_transmitter(tx_dir)
+        .stats_transmitter(tx_stats)
+        .reporter_transmitter(tx_term)
+        .scanned_urls(&SCANS)
+        .stats(stats)
+        .build()?;
+
+    let result = extractor.add_link_to_set_of_links(link, &mut links);
+
+    assert!(result.is_err());
+    Ok(())
+}
+
+#[test]
+/// test that a full url and fragment are joined correctly, then added to the given list
+/// i.e.
the happy path
+fn extractor_add_link_to_set_of_links_happy_path() {
+    let mut r_links = HashSet::<String>::new();
+    let r_link = "admin";
+    let mut b_links = HashSet::<String>::new();
+    let b_link = "shmadmin";
+
+    assert_eq!(r_links.len(), 0);
+    ROBOTS_EXT
+        .add_link_to_set_of_links(r_link, &mut r_links)
+        .unwrap();
+
+    assert_eq!(r_links.len(), 1);
+    assert!(r_links.contains("https://localhost/admin"));
+
+    assert_eq!(b_links.len(), 0);
+
+    BODY_EXT
+        .add_link_to_set_of_links(b_link, &mut b_links)
+        .unwrap();
+
+    assert_eq!(b_links.len(), 1);
+    assert!(b_links.contains("https://localhost/shmadmin"));
+}
+
+#[test]
+/// test that an invalid path fragment doesn't add anything to the set of links
+fn extractor_add_link_to_set_of_links_with_non_base_url() {
+    let mut links = HashSet::<String>::new();
+    let link = "\\\\\\\\";
+
+    assert_eq!(links.len(), 0);
+    assert!(ROBOTS_EXT
+        .add_link_to_set_of_links(link, &mut links)
+        .is_err());
+    assert!(BODY_EXT.add_link_to_set_of_links(link, &mut links).is_err());
+
+    assert_eq!(links.len(), 0);
+    assert!(links.is_empty());
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// use make_request to generate a Response, and use the Response to test get_links;
+/// the response will contain an absolute path to a domain that is not part of the scanned
+/// domain; expect an empty set returned
+async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain() -> Result<()> {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+
+    let srv = MockServer::start();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/some-path");
+        then.status(200).body(
+            "\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"",
+        );
+    });
+
+    let client = Client::new();
+    let url = Url::parse(&srv.url("/some-path")).unwrap();
+
+    let response = make_request(&client, &url, tx_stats.clone()).await.unwrap();
+
+    let ferox_response = FeroxResponse::from(response, true).await;
+
+    let extractor = Extractor {
+        links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
+        robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+        response: Some(&ferox_response),
+        url: String::new(),
+        config: &CONFIG,
+        tx_stats,
+        tx_recursion: tx_dir,
+        tx_reporter: tx_term,
+        scanned_urls: &SCANS,
+        depth: 4,
+        stats,
+        target: ExtractionTarget::ResponseBody,
+    };
+
+    let links = extractor.extract_from_body().await?;
+
+    assert!(links.is_empty());
+
+    assert_eq!(mock.hits(), 1);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// test that /robots.txt is correctly requested given a base url (happy path)
+async fn request_robots_txt_without_proxy() -> Result<()> {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+    let config = Configuration::new();
+
+    let srv = MockServer::start();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/robots.txt");
+        then.status(200).body("this is a test");
+    });
+
+    let extractor = Extractor {
+        links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
+        robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
+        response: None,
+        url: srv.url("/api/users/stuff/things"),
+        config: &config,
+        tx_stats,
+        tx_recursion: tx_dir,
+        tx_reporter: tx_term,
+        scanned_urls: &SCANS,
+        depth: 4,
+        stats,
+        target: ExtractionTarget::RobotsTxt,
+    };
+
+    let resp = extractor.request_robots_txt().await?;
+
+    assert!(matches!(resp.status(), &StatusCode::OK));
+    println!("{}", resp);
+    assert_eq!(resp.content_length(), 14);
+    assert_eq!(mock.hits(), 1);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// test that /robots.txt is correctly requested given a base url (happy
path) when a proxy is used
+async fn request_robots_txt_with_proxy() -> Result<()> {
+    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
+    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
+    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
+    let stats = Arc::new(Stats::new());
+    let mut config = Configuration::new();
+
+    let srv = MockServer::start();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/robots.txt");
+        then.status(200).body("this is also a test");
+    });
+
+    // note: the proxy doesn't actually do anything other than hit a different code branch
+    // in this unit test; it would however have an effect on an integration test
+    config.proxy = srv.url("/ima-proxy");
+    config.no_recursion = true;
+
+    let extractor = ExtractorBuilder::with_url(&srv.url("/api/different/path"))
+        .depth(4)
+        .config(&config)
+        .recursion_transmitter(tx_dir)
+        .stats_transmitter(tx_stats)
+        .reporter_transmitter(tx_term)
+        .scanned_urls(&SCANS)
+        .stats(stats)
+        .build()?;
+
+    let resp = extractor.request_robots_txt().await?;
+
+    assert!(matches!(resp.status(), &StatusCode::OK));
+    assert_eq!(resp.content_length(), 19);
+    assert_eq!(mock.hits(), 1);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
+/// get_feroxresponse_from_link's happy path, expect back a FeroxResponse
+async fn get_feroxresponse_from_link_happy_path() -> Result<()> {
+    let srv = MockServer::start();
+
+    let mock = srv.mock(|when, then| {
+        when.method(GET).path("/login.php");
+        then.status(200).body("this is a test");
+    });
+
+    let r_resp = ROBOTS_EXT.request_link(&srv.url("/login.php")).await?;
+    let b_resp = BODY_EXT.request_link(&srv.url("/login.php")).await?;
+
+    assert!(matches!(r_resp.status(), &StatusCode::OK));
+    assert!(matches!(b_resp.status(), &StatusCode::OK));
+    assert_eq!(r_resp.content_length(), 14);
+    assert_eq!(b_resp.content_length(), 14);
+    assert_eq!(mock.hits(), 2);
+    Ok(())
+}
+
+#[tokio::test(flavor = "multi_thread",
worker_threads = 1)] +/// get_feroxresponse_from_link should bail in the event that the url is already in scanned_urls +async fn get_feroxresponse_from_link_bails_on_seen_url() -> Result<()> { + let url = "/unique-for-this-test.php"; + let srv = MockServer::start(); + let served = srv.url(url); + + let mock = srv.mock(|when, then| { + when.method(GET).path(url); + then.status(200) + .body("this is a unique test, don't reuse the endpoint"); + }); + + SCANS.add_file_scan(&served, ROBOTS_EXT.stats.clone()); + + let r_resp = ROBOTS_EXT.request_link(&served).await; + let b_resp = BODY_EXT.request_link(&served).await; + + assert!(r_resp.is_err()); + assert!(b_resp.is_err()); + assert_eq!(mock.hits(), 0); // function exits before requests can happen + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index 475e934..1921c8c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,19 @@ -pub mod utils; -pub mod client; +pub mod banner; pub mod config; -pub mod extractor; -pub mod filters; +mod client; +mod event_handlers; +mod filters; pub mod heuristics; pub mod logger; -pub mod parser; +mod parser; pub mod progress; pub mod reporter; pub mod scan_manager; pub mod scanner; pub mod statistics; -mod event_handlers; -pub mod banner; mod traits; +pub mod utils; +mod extractor; use crate::{ traits::FeroxSerialize, diff --git a/src/reporter.rs b/src/reporter.rs index 85193cd..ebf1dc7 100644 --- a/src/reporter.rs +++ b/src/reporter.rs @@ -5,9 +5,11 @@ use crate::{ StatCommand::{self, UpdateUsizeField}, StatField::ResourcesDiscovered, }, + update_stat, utils::{ferox_print, make_request, open_file}, FeroxChannel, FeroxResponse, FeroxSerialize, }; + use console::strip_ansi_codes; use std::{ fs, io, diff --git a/src/scanner.rs b/src/scanner.rs index 1dae6ac..efd8911 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -1,6 +1,6 @@ use crate::{ config::{Configuration, CONFIGURATION}, - extractor::{extract_robots_txt, get_links, request_feroxresponse_from_new_link}, + 
extractor::ExtractorBuilder, filters::{ LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, WildcardFilter, WordsFilter, @@ -13,6 +13,7 @@ use crate::{ Stats, }, traits::FeroxFilter, + update_stat, utils::{format_url, get_current_depth, make_request}, FeroxChannel, FeroxResponse, SIMILARITY_THRESHOLD, }; @@ -307,11 +308,12 @@ fn reached_max_depth(url: &Url, base_depth: usize, max_depth: usize) -> bool { /// Helper function that wraps logic to check for recursion opportunities /// /// When a recursion opportunity is found, the new url is sent across the recursion channel -async fn try_recursion( +pub async fn try_recursion( response: &FeroxResponse, base_depth: usize, transmitter: UnboundedSender, ) { + // todo this should be part of the recursion handler log::trace!( "enter: try_recursion({}, {}, {:?})", response, @@ -433,56 +435,18 @@ async fn make_requests( } if CONFIGURATION.extract_links && !ferox_response.status().is_redirection() { - let new_links = get_links(&ferox_response, tx_stats.clone()).await; + let extractor = ExtractorBuilder::with_response(&ferox_response) + .depth(base_depth) + .config(&CONFIGURATION) + .recursion_transmitter(dir_chan.clone()) + .stats_transmitter(tx_stats.clone()) + .reporter_transmitter(report_chan.clone()) + .scanned_urls(&SCANNED_URLS) + .stats(stats.clone()) + .build() + .unwrap(); // todo change once this function returns Result - for new_link in new_links { - let mut new_ferox_response = match request_feroxresponse_from_new_link( - &new_link, - tx_stats.clone(), - ) - .await - { - Some(resp) => resp, - None => continue, - }; - - // filter if necessary - if should_filter_response(&new_ferox_response, tx_stats.clone()) { - continue; - } - - if new_ferox_response.is_file() { - // very likely a file, simply request and report - log::debug!("Singular extraction: {}", new_ferox_response); - - SCANNED_URLS - .add_file_scan(&new_ferox_response.url().to_string(), stats.clone()); - - 
send_report(report_chan.clone(), new_ferox_response); - - continue; - } - - if !CONFIGURATION.no_recursion { - log::debug!("Recursive extraction: {}", new_ferox_response); - - if !new_ferox_response.url().as_str().ends_with('/') - && (new_ferox_response.status().is_success() - || matches!(new_ferox_response.status(), &StatusCode::FORBIDDEN)) - { - // if the url doesn't end with a / - // and the response code is either a 2xx or 403 - - // since all of these are 2xx or 403, recursion is only attempted if the - // url ends in a /. I am actually ok with adding the slash and not - // adding it, as both have merit. Leaving it in for now to see how - // things turn out (current as of: v1.1.0) - new_ferox_response.set_url(&format!("{}/", new_ferox_response.url())); - } - - try_recursion(&new_ferox_response, base_depth, dir_chan.clone()).await; - } - } + let _ = extractor.extract().await; } // everything else should be reported @@ -506,61 +470,6 @@ pub fn send_report(report_sender: UnboundedSender, response: Fero log::trace!("exit: send_report"); } -/// Request /robots.txt from given url -async fn scan_robots_txt( - target_url: &str, - base_depth: usize, - stats: Arc, - tx_term: UnboundedSender, - tx_dir: UnboundedSender, - tx_stats: UnboundedSender, -) { - log::trace!( - "enter: scan_robots_txt({}, {}, {:?}, {:?}, {:?}, {:?})", - target_url, - base_depth, - stats, - tx_term, - tx_dir, - tx_stats - ); - - let robots_links = extract_robots_txt(&target_url, &CONFIGURATION, tx_stats.clone()).await; - - for robot_link in robots_links { - // create a url based on the given command line options, continue on error - let mut ferox_response = - match request_feroxresponse_from_new_link(&robot_link, tx_stats.clone()).await { - Some(resp) => resp, - None => continue, - }; - - if should_filter_response(&ferox_response, tx_stats.clone()) { - continue; - } - - if ferox_response.is_file() { - log::debug!("File extracted from robots.txt: {}", ferox_response); - 
SCANNED_URLS.add_file_scan(&robot_link, stats.clone()); - send_report(tx_term.clone(), ferox_response); - } else if !CONFIGURATION.no_recursion { - log::debug!("Directory extracted from robots.txt: {}", ferox_response); - // todo this code is essentially the same as another piece around ~467 of this file - if !ferox_response.url().as_str().ends_with('/') - && (ferox_response.status().is_success() - || matches!(ferox_response.status(), &StatusCode::FORBIDDEN)) - { - // if the url doesn't end with a / - // and the response code is either a 2xx or 403 - ferox_response.set_url(&format!("{}/", ferox_response.url())); - } - - try_recursion(&ferox_response, base_depth, tx_dir.clone()).await; - } - } - log::trace!("exit: scan_robots_txt"); -} - /// Scan a given url using a given wordlist /// /// This is the primary entrypoint for the scanner @@ -596,15 +505,19 @@ pub async fn scan_url( if CONFIGURATION.extract_links { // only grab robots.txt on the initial scan_url calls. all fresh dirs will be passed // to try_recursion - scan_robots_txt( - target_url, - base_depth, - stats.clone(), - tx_term.clone(), - tx_dir.clone(), - tx_stats.clone(), - ) - .await; + + let extractor = ExtractorBuilder::with_url(target_url) + .depth(base_depth) + .config(&CONFIGURATION) + .recursion_transmitter(tx_dir.clone()) + .stats_transmitter(tx_stats.clone()) + .reporter_transmitter(tx_term.clone()) + .scanned_urls(&SCANNED_URLS) + .stats(stats.clone()) + .build() + .unwrap(); // todo change once this function returns Result + + let _ = extractor.extract().await; } update_stat!(tx_stats, UpdateUsizeField(TotalScans, 1)); diff --git a/src/utils.rs b/src/utils.rs index b8f8976..c021ec1 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,7 +5,7 @@ use crate::{ StatCommand::{self, AddError, AddStatus}, StatError::{Connection, Other, Redirection, Request, Timeout, UrlFormat}, }, - FeroxError, FeroxResult, + FeroxError, }; use anyhow::{bail, Context, Result}; use console::{strip_ansi_codes, style, 
user_attended}; @@ -184,7 +184,7 @@ pub fn format_url( queries: &[(String, String)], extension: Option<&str>, tx_stats: UnboundedSender, -) -> FeroxResult { +) -> Result { log::trace!( "enter: format_url({}, {}, {}, {:?} {:?}, {:?})", url, @@ -214,7 +214,7 @@ pub fn format_url( update_stat!(tx_stats, AddError(UrlFormat)); log::trace!("exit: format_url -> {}", err); - return Err(Box::new(err)); + bail!("{}", err); } // from reqwest::Url::join @@ -242,6 +242,15 @@ pub fn format_url( } else if add_slash && !word.ends_with('/') { // -f used, and word doesn't already end with a / format!("{}/", word) + } else if word.starts_with("//") { + // bug ID'd by @Sicks3c, when a wordlist contains words that begin with 2 forward slashes + // i.e. //1_40_0/static/js, it gets joined onto the base url in a surprising way + // ex: https://localhost/ + //1_40_0/static/js -> https://1_40_0/static/js + // this is due to the fact that //... is a valid url. The fix is introduced here in 1.12.2 + // and simply removes prefixed forward slashes if there are two of them. 
Additionally, + // trim_start_matches will trim the pattern until it's gone, so even if there are more than + // 2 /'s, they'll still be trimmed + word.trim_start_matches('/').to_string() } else { String::from(word) }; @@ -275,7 +284,7 @@ pub fn format_url( update_stat!(tx_stats, AddError(UrlFormat)); log::trace!("exit: format_url -> {}", e); log::error!("Could not join {} with {}", word, base_url); - Err(Box::new(e)) + bail!("{}", e) } } } @@ -585,6 +594,27 @@ mod tests { ); } + #[test] + /// word with two prepended slashes doesn't discard the entire domain + fn format_url_word_with_two_prepended_slashes() { + let (tx, _): FeroxChannel = mpsc::unbounded_channel(); + + let result = format_url( + "http://localhost", + "//upload/img", + false, + &Vec::new(), + None, + tx, + ) + .unwrap(); + + assert_eq!( + result, + reqwest::Url::parse("http://localhost/upload/img").unwrap() + ); + } + #[test] /// word that is a fully formed url, should return an error fn format_url_word_that_is_a_url() { diff --git a/tests/test_extractor.rs b/tests/test_extractor.rs index 4d872d6..ff6d193 100644 --- a/tests/test_extractor.rs +++ b/tests/test_extractor.rs @@ -263,7 +263,7 @@ fn extractor_finds_robots_txt_links_and_displays_files_or_scans_directories() { let mock_disallowed = srv.mock(|when, then| { when.method(GET).path("/disallowed-subdir"); - then.status(404); + then.status(403); }); let cmd = Command::cargo_bin("feroxbuster") @@ -296,6 +296,80 @@ fn extractor_finds_robots_txt_links_and_displays_files_or_scans_directories() { teardown_tmp_directory(tmp_dir); } +#[test] +/// serve a robots.txt with a file and a folder link contained within it. ferox should +/// find both links and request each one.
This is the non-recursive version of the test above +fn extractor_finds_robots_txt_links_and_displays_files_non_recursive() { + let srv = MockServer::start(); + let (tmp_dir, file) = setup_tmp_directory(&["LICENSE".to_string()], "wordlist").unwrap(); + + let mock = srv.mock(|when, then| { + when.method(GET).path("/LICENSE"); + then.status(200).body("im a little teapot"); // 18 + }); + + let mock_two = srv.mock(|when, then| { + when.method(GET).path("/robots.txt"); + then.status(200).body( + r#" + User-agent: * + Crawl-delay: 10 + # CSS, JS, Images + Allow: /misc/*.css$ + Disallow: /misc/stupidfile.php + Disallow: /disallowed-subdir/ + "#, + ); + }); + + let mock_file = srv.mock(|when, then| { + when.method(GET).path("/misc/stupidfile.php"); + then.status(200).body("im a little teapot too"); // 22 + }); + + let mock_scanned_file = srv.mock(|when, then| { + when.method(GET).path("/misc/LICENSE"); + then.status(200).body("i too, am a container for tea"); // 29 + }); + + let mock_dir = srv.mock(|when, _| { + when.method(GET).path("/misc/"); + }); + + let mock_disallowed = srv.mock(|when, then| { + when.method(GET).path("/disallowed-subdir"); + then.status(404); + }); + + let cmd = Command::cargo_bin("feroxbuster") + .unwrap() + .arg("--url") + .arg(srv.url("/")) + .arg("--wordlist") + .arg(file.as_os_str()) + .arg("--extract-links") + .arg("--no-recursion") + .unwrap(); + + cmd.assert().success().stdout( + predicate::str::contains("/LICENSE") + .and(predicate::str::contains("18c")) + .and(predicate::str::contains("/misc/stupidfile.php")) + .and(predicate::str::contains("22c")) + .and(predicate::str::contains("/misc/LICENSE").not()) + .and(predicate::str::contains("29c").not()) + .and(predicate::str::contains("200").count(2)), + ); + + assert_eq!(mock.hits(), 1); + assert_eq!(mock_dir.hits(), 1); + assert_eq!(mock_two.hits(), 1); + assert_eq!(mock_file.hits(), 1); + assert_eq!(mock_disallowed.hits(), 1); + assert_eq!(mock_scanned_file.hits(), 0); + 
teardown_tmp_directory(tmp_dir); +} + #[test] /// send a request to a page that contains a link that contains a directory that returns a 403 /// --extract-links should find the link and make recurse into the 403 directory, finding LICENSE