diff --git a/src/extractor/container.rs b/src/extractor/container.rs index a1b123c..c5a03d2 100644 --- a/src/extractor/container.rs +++ b/src/extractor/container.rs @@ -15,12 +15,53 @@ use crate::{ ExtractionResult, DEFAULT_METHOD, }; use anyhow::{bail, Context, Result}; -use reqwest::{Client, StatusCode, Url}; +use futures::StreamExt; +use reqwest::{Client, Response, StatusCode, Url}; use scraper::{Html, Selector}; use std::{borrow::Cow, collections::HashSet}; +/// Wrapper around link extraction logic +/// - create a new Url object based on cli options/args +/// - check if the new Url has already been seen/scanned -> None +/// - make a request to the new Url ? -> Some(response) : None +pub(super) async fn request_link(url: &str, handles: Arc) -> Result { + log::trace!("enter: request_link({})", url); + + let ferox_url = FeroxUrl::from_string(url, handles.clone()); + + // create a url based on the given command line options + let new_url = ferox_url.format("", None)?; + + let scanned_urls = handles.ferox_scans()?; + + if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() { + //we've seen the url before and don't need to scan again + log::trace!("exit: request_link -> None"); + bail!("previously seen url"); + } + + if (!handles.config.url_denylist.is_empty() || !handles.config.regex_denylist.is_empty()) + && should_deny_url(&new_url, handles.clone())? + { + // can't allow a denied url to be requested + bail!( + "prevented request to {} due to {:?} || {:?}", + url, + handles.config.url_denylist, + handles.config.regex_denylist, + ); + } + + // make the request and store the response + let new_response = logged_request(&new_url, DEFAULT_METHOD, None, handles.clone()).await?; + + log::trace!("exit: request_link -> {:?}", new_response); + + Ok(new_response) +} + /// Whether an active scan is recursive or not -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] enum RecursionStatus { /// Scan is recursive Recursive, @@ -121,91 +162,140 @@ impl<'a> Extractor<'a> { /// given a set of links from a normal http body response, task the request handler to make /// the requests - pub async fn request_links(&mut self, links: HashSet) -> Result<()> { + pub async fn request_links( + &mut self, + links: HashSet, + ) -> Result>> { log::trace!("enter: request_links({:?})", links); if links.is_empty() { - return Ok(()); + return Ok(None); } + self.update_stats(links.len())?; + + // create clones/remove use of self of/from everything the async move block will need to function + let cloned_scanned_urls = self.handles.ferox_scans()?; + let cloned_handles = self.handles.clone(); + let cloned_url = self.url.clone(); + let threads = self.handles.config.threads; let recursive = if self.handles.config.no_recursion { RecursionStatus::NotRecursive } else { RecursionStatus::Recursive }; - let scanned_urls = self.handles.ferox_scans()?; - self.update_stats(links.len())?; + let link_request_task = tokio::spawn(async move { + let producers = futures::stream::iter(links.into_iter()) + .map(|link| { + // another clone to satisfy the async move block + let inner_clone = cloned_handles.clone(); - for link in links { - let mut resp = match self.request_link(&link).await { - Ok(resp) => resp, - Err(_) => continue, - }; + ( + tokio::spawn(async move { request_link(&link, inner_clone).await }), + cloned_handles.clone(), + cloned_scanned_urls.clone(), + recursive, + cloned_url.clone(), + ) + }) + .for_each_concurrent( + threads, + |(join_handle, c_handles, c_scanned_urls, c_recursive, og_url)| async move { + match join_handle.await { + Ok(Ok(reqwest_response)) => { + let mut resp = FeroxResponse::from( + reqwest_response, + &og_url, + DEFAULT_METHOD, + c_handles.config.output_level, + ) + .await; - // filter if necessary - if self - .handles - .filters - .data - .should_filter_response(&resp, self.handles.stats.tx.clone()) - { - continue; - } + // filter if necessary + if c_handles + .filters + .data + .should_filter_response(&resp, c_handles.stats.tx.clone()) + { + return; + } - // request and report assumed file - if resp.is_file() || !resp.is_directory() { - log::debug!("Extracted File: {}", resp); + // request and report assumed file + if resp.is_file() || !resp.is_directory() { + log::debug!("Extracted File: {}", resp); - scanned_urls.add_file_scan(resp.url().as_str(), ScanOrder::Latest); + c_scanned_urls + .add_file_scan(resp.url().as_str(), ScanOrder::Latest); - if self.handles.config.collect_extensions { - resp.parse_extension(self.handles.clone())?; - } + if c_handles.config.collect_extensions { + // no real reason this should fail + resp.parse_extension(c_handles.clone()).unwrap(); + } - if let Err(e) = resp.send_report(self.handles.output.tx.clone()) { - log::warn!("Could not send FeroxResponse to output handler: {}", e); - } + if let Err(e) = resp.send_report(c_handles.output.tx.clone()) { + log::warn!( + "Could not send FeroxResponse to output handler: {}", + e + ); + } - continue; - } + return; + } - if matches!(recursive, RecursionStatus::Recursive) { - log::debug!("Extracted Directory: {}", resp); + if matches!(c_recursive, RecursionStatus::Recursive) { + log::debug!("Extracted Directory: {}", resp); - if !resp.url().as_str().ends_with('/') - && (resp.status().is_success() - || matches!(resp.status(), &StatusCode::FORBIDDEN)) - { - // if the url doesn't end with a / - // and the response code is either a 2xx or 403 + if !resp.url().as_str().ends_with('/') + && (resp.status().is_success() + || matches!(resp.status(), &StatusCode::FORBIDDEN)) + { + // if the url doesn't end with a / + // and the response code is either a 2xx or 403 - // since all of these are 2xx or 403, recursion is only attempted if the - // url ends in a /. I am actually ok with adding the slash and not - // adding it, as both have merit. Leaving it in for now to see how - // things turn out (current as of: v1.1.0) - resp.set_url(&format!("{}/", resp.url())); - } + // since all of these are 2xx or 403, recursion is only attempted if the + // url ends in a /. I am actually ok with adding the slash and not + // adding it, as both have merit. Leaving it in for now to see how + // things turn out (current as of: v1.1.0) + resp.set_url(&format!("{}/", resp.url())); + } + + if c_handles.config.filter_status.is_empty() { + // -C wasn't used, so -s is the only 'filter' left to account for + if c_handles + .config + .status_codes + .contains(&resp.status().as_u16()) + { + send_try_recursion_command(c_handles.clone(), resp) + .await + .unwrap_or_default(); + } + } else { + // -C was used, that means the filters above would have removed + // those responses, and anything else should be let through + send_try_recursion_command(c_handles.clone(), resp) + .await + .unwrap_or_default(); + } + } + } + Ok(Err(err)) => { + log::warn!("Error during link extraction: {}", err); + } + Err(err) => { + log::warn!("JoinError during link extraction: {}", err); + } + } + }, + ); + + // wait for the requests to finish + producers.await; + }); - if self.handles.config.filter_status.is_empty() { - // -C wasn't used, so -s is the only 'filter' left to account for - if self - .handles - .config - .status_codes - .contains(&resp.status().as_u16()) - { - send_try_recursion_command(self.handles.clone(), resp).await?; - } - } else { - // -C was used, that means the filters above would have removed - // those responses, and anything else should be let through - send_try_recursion_command(self.handles.clone(), resp).await?; - } - } - } log::trace!("exit: request_links"); - Ok(()) + Ok(Some(link_request_task)) } /// wrapper around link extraction via html attributes @@ -415,56 +505,6 @@ impl<'a> Extractor<'a> { Ok(()) } - /// Wrapper around link extraction logic - /// - create a new Url object based on cli options/args - /// - check if the new Url has already been seen/scanned -> None - /// - make a request to the new Url ? -> Some(response) : None - pub(super) async fn request_link(&self, url: &str) -> Result { - log::trace!("enter: request_link({})", url); - - let ferox_url = FeroxUrl::from_string(url, self.handles.clone()); - - // create a url based on the given command line options - let new_url = ferox_url.format("", None)?; - - let scanned_urls = self.handles.ferox_scans()?; - - if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() { - //we've seen the url before and don't need to scan again - log::trace!("exit: request_link -> None"); - bail!("previously seen url"); - } - - if (!self.handles.config.url_denylist.is_empty() - || !self.handles.config.regex_denylist.is_empty()) - && should_deny_url(&new_url, self.handles.clone())? - { - // can't allow a denied url to be requested - bail!( - "prevented request to {} due to {:?} || {:?}", - url, - self.handles.config.url_denylist, - self.handles.config.regex_denylist, - ); - } - - // make the request and store the response - let new_response = - logged_request(&new_url, DEFAULT_METHOD, None, self.handles.clone()).await?; - - let new_ferox_response = FeroxResponse::from( - new_response, - url, - DEFAULT_METHOD, - self.handles.config.output_level, - ) - .await; - - log::trace!("exit: request_link -> {:?}", new_ferox_response); - - Ok(new_ferox_response) - } - /// Entry point to perform link extraction from robots.txt /// /// `base_url` can have paths and subpaths, however robots.txt will be requested from the