made link extraction req/resp async

This commit is contained in:
epi
2023-04-19 06:56:52 -05:00
parent 8cd9918b76
commit 233cf99907

View File

@@ -15,12 +15,53 @@ use crate::{
ExtractionResult, DEFAULT_METHOD,
};
use anyhow::{bail, Context, Result};
use reqwest::{Client, StatusCode, Url};
use futures::StreamExt;
use reqwest::{Client, Response, StatusCode, Url};
use scraper::{Html, Selector};
use std::{borrow::Cow, collections::HashSet};
/// Wrapper around link extraction logic
/// - create a new Url object based on cli options/args
/// - check if the new Url has already been seen/scanned -> None
/// - make a request to the new Url ? -> Some(response) : None
pub(super) async fn request_link(url: &str, handles: Arc<Handles>) -> Result<Response> {
log::trace!("enter: request_link({})", url);
let ferox_url = FeroxUrl::from_string(url, handles.clone());
// create a url based on the given command line options
let new_url = ferox_url.format("", None)?;
let scanned_urls = handles.ferox_scans()?;
if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() {
//we've seen the url before and don't need to scan again
log::trace!("exit: request_link -> None");
bail!("previously seen url");
}
if (!handles.config.url_denylist.is_empty() || !handles.config.regex_denylist.is_empty())
&& should_deny_url(&new_url, handles.clone())?
{
// can't allow a denied url to be requested
bail!(
"prevented request to {} due to {:?} || {:?}",
url,
handles.config.url_denylist,
handles.config.regex_denylist,
);
}
// make the request and store the response
let new_response = logged_request(&new_url, DEFAULT_METHOD, None, handles.clone()).await?;
log::trace!("exit: request_link -> {:?}", new_response);
Ok(new_response)
}
/// Whether an active scan is recursive or not
#[derive(Debug)]
#[derive(Debug, Copy, Clone)]
enum RecursionStatus {
/// Scan is recursive
Recursive,
@@ -121,91 +162,140 @@ impl<'a> Extractor<'a> {
/// given a set of links from a normal http body response, task the request handler to make
/// the requests
pub async fn request_links(&mut self, links: HashSet<String>) -> Result<()> {
pub async fn request_links(
&mut self,
links: HashSet<String>,
) -> Result<Option<tokio::task::JoinHandle<()>>> {
log::trace!("enter: request_links({:?})", links);
if links.is_empty() {
return Ok(());
return Ok(None);
}
self.update_stats(links.len())?;
// create clones/remove use of self of/from everything the async move block will need to function
let cloned_scanned_urls = self.handles.ferox_scans()?;
let cloned_handles = self.handles.clone();
let cloned_url = self.url.clone();
let threads = self.handles.config.threads;
let recursive = if self.handles.config.no_recursion {
RecursionStatus::NotRecursive
} else {
RecursionStatus::Recursive
};
let scanned_urls = self.handles.ferox_scans()?;
self.update_stats(links.len())?;
let link_request_task = tokio::spawn(async move {
let producers = futures::stream::iter(links.into_iter())
.map(|link| {
// another clone to satisfy the async move block
let inner_clone = cloned_handles.clone();
for link in links {
let mut resp = match self.request_link(&link).await {
Ok(resp) => resp,
Err(_) => continue,
};
(
tokio::spawn(async move { request_link(&link, inner_clone).await }),
cloned_handles.clone(),
cloned_scanned_urls.clone(),
recursive,
cloned_url.clone(),
)
})
.for_each_concurrent(
threads,
|(join_handle, c_handles, c_scanned_urls, c_recursive, og_url)| async move {
match join_handle.await {
Ok(Ok(reqwest_response)) => {
let mut resp = FeroxResponse::from(
reqwest_response,
&og_url,
DEFAULT_METHOD,
c_handles.config.output_level,
)
.await;
// filter if necessary
if self
.handles
.filters
.data
.should_filter_response(&resp, self.handles.stats.tx.clone())
{
continue;
}
// filter if necessary
if c_handles
.filters
.data
.should_filter_response(&resp, c_handles.stats.tx.clone())
{
return;
}
// request and report assumed file
if resp.is_file() || !resp.is_directory() {
log::debug!("Extracted File: {}", resp);
// request and report assumed file
if resp.is_file() || !resp.is_directory() {
log::debug!("Extracted File: {}", resp);
scanned_urls.add_file_scan(resp.url().as_str(), ScanOrder::Latest);
c_scanned_urls
.add_file_scan(resp.url().as_str(), ScanOrder::Latest);
if self.handles.config.collect_extensions {
resp.parse_extension(self.handles.clone())?;
}
if c_handles.config.collect_extensions {
// no real reason this should fail
resp.parse_extension(c_handles.clone()).unwrap();
}
if let Err(e) = resp.send_report(self.handles.output.tx.clone()) {
log::warn!("Could not send FeroxResponse to output handler: {}", e);
}
if let Err(e) = resp.send_report(c_handles.output.tx.clone()) {
log::warn!(
"Could not send FeroxResponse to output handler: {}",
e
);
}
continue;
}
return;
}
if matches!(recursive, RecursionStatus::Recursive) {
log::debug!("Extracted Directory: {}", resp);
if matches!(c_recursive, RecursionStatus::Recursive) {
log::debug!("Extracted Directory: {}", resp);
if !resp.url().as_str().ends_with('/')
&& (resp.status().is_success()
|| matches!(resp.status(), &StatusCode::FORBIDDEN))
{
// if the url doesn't end with a /
// and the response code is either a 2xx or 403
if !resp.url().as_str().ends_with('/')
&& (resp.status().is_success()
|| matches!(resp.status(), &StatusCode::FORBIDDEN))
{
// if the url doesn't end with a /
// and the response code is either a 2xx or 403
// since all of these are 2xx or 403, recursion is only attempted if the
// url ends in a /. I am actually ok with adding the slash and not
// adding it, as both have merit. Leaving it in for now to see how
// things turn out (current as of: v1.1.0)
resp.set_url(&format!("{}/", resp.url()));
}
// since all of these are 2xx or 403, recursion is only attempted if the
// url ends in a /. I am actually ok with adding the slash and not
// adding it, as both have merit. Leaving it in for now to see how
// things turn out (current as of: v1.1.0)
resp.set_url(&format!("{}/", resp.url()));
}
if c_handles.config.filter_status.is_empty() {
// -C wasn't used, so -s is the only 'filter' left to account for
if c_handles
.config
.status_codes
.contains(&resp.status().as_u16())
{
send_try_recursion_command(c_handles.clone(), resp)
.await
.unwrap_or_default();
}
} else {
// -C was used, that means the filters above would have removed
// those responses, and anything else should be let through
send_try_recursion_command(c_handles.clone(), resp)
.await
.unwrap_or_default();
}
}
}
Ok(Err(err)) => {
log::warn!("Error during link extraction: {}", err);
}
Err(err) => {
log::warn!("JoinError during link extraction: {}", err);
}
}
},
);
// wait for the requests to finish
producers.await;
});
if self.handles.config.filter_status.is_empty() {
// -C wasn't used, so -s is the only 'filter' left to account for
if self
.handles
.config
.status_codes
.contains(&resp.status().as_u16())
{
send_try_recursion_command(self.handles.clone(), resp).await?;
}
} else {
// -C was used, that means the filters above would have removed
// those responses, and anything else should be let through
send_try_recursion_command(self.handles.clone(), resp).await?;
}
}
}
log::trace!("exit: request_links");
Ok(())
Ok(Some(link_request_task))
}
/// wrapper around link extraction via html attributes
@@ -415,56 +505,6 @@ impl<'a> Extractor<'a> {
Ok(())
}
/// Wrapper around link extraction logic
/// - create a new Url object based on cli options/args
/// - check if the new Url has already been seen/scanned -> None
/// - make a request to the new Url ? -> Some(response) : None
pub(super) async fn request_link(&self, url: &str) -> Result<FeroxResponse> {
log::trace!("enter: request_link({})", url);
let ferox_url = FeroxUrl::from_string(url, self.handles.clone());
// create a url based on the given command line options
let new_url = ferox_url.format("", None)?;
let scanned_urls = self.handles.ferox_scans()?;
if scanned_urls.get_scan_by_url(new_url.as_ref()).is_some() {
//we've seen the url before and don't need to scan again
log::trace!("exit: request_link -> None");
bail!("previously seen url");
}
if (!self.handles.config.url_denylist.is_empty()
|| !self.handles.config.regex_denylist.is_empty())
&& should_deny_url(&new_url, self.handles.clone())?
{
// can't allow a denied url to be requested
bail!(
"prevented request to {} due to {:?} || {:?}",
url,
self.handles.config.url_denylist,
self.handles.config.regex_denylist,
);
}
// make the request and store the response
let new_response =
logged_request(&new_url, DEFAULT_METHOD, None, self.handles.clone()).await?;
let new_ferox_response = FeroxResponse::from(
new_response,
url,
DEFAULT_METHOD,
self.handles.config.output_level,
)
.await;
log::trace!("exit: request_link -> {:?}", new_ferox_response);
Ok(new_ferox_response)
}
/// Entry point to perform link extraction from robots.txt
///
/// `base_url` can have paths and subpaths, however robots.txt will be requested from the