mirror of
https://github.com/epi052/feroxbuster.git
synced 2026-05-27 16:51:13 -03:00
extractor restructure mostly done
This commit is contained in:
504
src/extractor.rs
504
src/extractor.rs
@@ -1,504 +0,0 @@
|
||||
use crate::{
|
||||
client,
|
||||
config::{Configuration, CONFIGURATION},
|
||||
scanner::SCANNED_URLS,
|
||||
statistics::{
|
||||
StatCommand::{self, UpdateUsizeField},
|
||||
StatField::{LinksExtracted, TotalExpected},
|
||||
},
|
||||
utils::{format_url, make_request},
|
||||
FeroxResponse,
|
||||
};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use reqwest::Url;
|
||||
use std::collections::HashSet;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
|
||||
/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
|
||||
///
|
||||
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
|
||||
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;
|
||||
|
||||
/// Regular expression to pull url paths from robots.txt
|
||||
///
|
||||
/// ref: https://developers.google.com/search/reference/robots_txt
|
||||
const ROBOTS_TXT_REGEX: &str =
|
||||
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // multi-line (?m)
|
||||
|
||||
lazy_static! {
|
||||
/// `LINKFINDER_REGEX` as a regex::Regex type
|
||||
static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
|
||||
|
||||
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
|
||||
static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
|
||||
}
|
||||
|
||||
/// Iterate over a given path, return a list of every sub-path found
|
||||
///
|
||||
/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
|
||||
/// the following fragments would be returned:
|
||||
/// - homepage/assets/img/icons/handshake.svg
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
fn get_sub_paths_from_path(path: &str) -> Vec<String> {
|
||||
log::trace!("enter: get_sub_paths_from_path({})", path);
|
||||
let mut paths = vec![];
|
||||
|
||||
// filter out any empty strings caused by .split
|
||||
let mut parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
|
||||
|
||||
let length = parts.len();
|
||||
|
||||
for i in 0..length {
|
||||
// iterate over all parts of the path
|
||||
if parts.is_empty() {
|
||||
// pop left us with an empty vector, we're done
|
||||
break;
|
||||
}
|
||||
|
||||
let mut possible_path = parts.join("/");
|
||||
|
||||
if possible_path.is_empty() {
|
||||
// .join can result in an empty string, which we don't need, ignore
|
||||
continue;
|
||||
}
|
||||
|
||||
if i > 0 {
|
||||
// this isn't the last index of the parts array
|
||||
// ex: /buried/misc/stupidfile.php
|
||||
// this block skips the file but sees all parent folders
|
||||
possible_path = format!("{}/", possible_path);
|
||||
}
|
||||
|
||||
paths.push(possible_path); // good sub-path found
|
||||
parts.pop(); // use .pop() to remove the last part of the path and continue iteration
|
||||
}
|
||||
|
||||
log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
|
||||
paths
|
||||
}
|
||||
|
||||
/// simple helper to stay DRY, trys to join a url + fragment and add it to the `links` HashSet
|
||||
fn add_link_to_set_of_links(link: &str, url: &Url, links: &mut HashSet<String>) {
|
||||
log::trace!(
|
||||
"enter: add_link_to_set_of_links({}, {}, {:?})",
|
||||
link,
|
||||
url.to_string(),
|
||||
links
|
||||
);
|
||||
match url.join(&link) {
|
||||
Ok(new_url) => {
|
||||
links.insert(new_url.to_string());
|
||||
}
|
||||
Err(e) => {
|
||||
log::error!("Could not join given url to the base url: {}", e);
|
||||
}
|
||||
}
|
||||
log::trace!("exit: add_link_to_set_of_links");
|
||||
}
|
||||
|
||||
/// Given a `reqwest::Response`, perform the following actions
|
||||
/// - parse the response's text for links using the linkfinder regex
|
||||
/// - for every link found take its url path and parse each sub-path
|
||||
/// - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
|
||||
/// with a base url of http://localhost, the following urls would be returned:
|
||||
/// - homepage/assets/img/icons/handshake.svg
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
pub async fn get_links(
|
||||
response: &FeroxResponse,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) -> HashSet<String> {
|
||||
log::trace!(
|
||||
"enter: get_links({}, {:?})",
|
||||
response.url().as_str(),
|
||||
tx_stats
|
||||
);
|
||||
|
||||
let mut links = HashSet::<String>::new();
|
||||
|
||||
let body = response.text();
|
||||
|
||||
for capture in LINKS_REGEX.captures_iter(&body) {
|
||||
// remove single & double quotes from both ends of the capture
|
||||
// capture[0] is the entire match, additional capture groups start at [1]
|
||||
let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
|
||||
|
||||
match Url::parse(link) {
|
||||
Ok(absolute) => {
|
||||
if absolute.domain() != response.url().domain()
|
||||
|| absolute.host() != response.url().host()
|
||||
{
|
||||
// domains/ips are not the same, don't scan things that aren't part of the original
|
||||
// target url
|
||||
continue;
|
||||
}
|
||||
|
||||
add_all_sub_paths(absolute.path(), &response, &mut links);
|
||||
}
|
||||
Err(e) => {
|
||||
// this is the expected error that happens when we try to parse a url fragment
|
||||
// ex: Url::parse("/login") -> Err("relative URL without a base")
|
||||
// while this is technically an error, these are good results for us
|
||||
if e.to_string().contains("relative URL without a base") {
|
||||
add_all_sub_paths(link, &response, &mut links);
|
||||
} else {
|
||||
// unexpected error has occurred
|
||||
log::error!("Could not parse given url: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let multiplier = CONFIGURATION.extensions.len().max(1);
|
||||
|
||||
update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
|
||||
update_stat!(
|
||||
tx_stats,
|
||||
UpdateUsizeField(TotalExpected, links.len() * multiplier)
|
||||
);
|
||||
|
||||
log::trace!("exit: get_links -> {:?}", links);
|
||||
|
||||
links
|
||||
}
|
||||
|
||||
/// take a url fragment like homepage/assets/img/icons/handshake.svg and
|
||||
/// incrementally add
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, mut links: &mut HashSet<String>) {
|
||||
log::trace!(
|
||||
"enter: add_all_sub_paths({}, {}, {:?})",
|
||||
url_path,
|
||||
response,
|
||||
links
|
||||
);
|
||||
|
||||
for sub_path in get_sub_paths_from_path(url_path) {
|
||||
log::debug!("Adding {} to {:?}", sub_path, links);
|
||||
add_link_to_set_of_links(&sub_path, &response.url(), &mut links);
|
||||
}
|
||||
|
||||
log::trace!("exit: add_all_sub_paths");
|
||||
}
|
||||
|
||||
/// Wrapper around link extraction logic
|
||||
/// currently used in two places:
|
||||
/// - links from response bodys
|
||||
/// - links from robots.txt responses
|
||||
///
|
||||
/// general steps taken:
|
||||
/// - create a new Url object based on cli options/args
|
||||
/// - check if the new Url has already been seen/scanned -> None
|
||||
/// - make a request to the new Url ? -> Some(response) : None
|
||||
pub async fn request_feroxresponse_from_new_link(
|
||||
url: &str,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) -> Option<FeroxResponse> {
|
||||
log::trace!(
|
||||
"enter: request_feroxresponse_from_new_link({}, {:?})",
|
||||
url,
|
||||
tx_stats
|
||||
);
|
||||
|
||||
// create a url based on the given command line options, return None on error
|
||||
let new_url = match format_url(
|
||||
&url,
|
||||
&"",
|
||||
CONFIGURATION.add_slash,
|
||||
&CONFIGURATION.queries,
|
||||
None,
|
||||
tx_stats.clone(),
|
||||
) {
|
||||
Ok(url) => url,
|
||||
Err(_) => {
|
||||
log::trace!("exit: request_feroxresponse_from_new_link -> None");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
if SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some() {
|
||||
//we've seen the url before and don't need to scan again
|
||||
log::trace!("exit: request_feroxresponse_from_new_link -> None");
|
||||
return None;
|
||||
}
|
||||
|
||||
// make the request and store the response
|
||||
let new_response = match make_request(&CONFIGURATION.client, &new_url, tx_stats).await {
|
||||
Ok(resp) => resp,
|
||||
Err(_) => {
|
||||
log::trace!("exit: request_feroxresponse_from_new_link -> None");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let new_ferox_response = FeroxResponse::from(new_response, true).await;
|
||||
|
||||
log::trace!(
|
||||
"exit: request_feroxresponse_from_new_link -> {:?}",
|
||||
new_ferox_response
|
||||
);
|
||||
Some(new_ferox_response)
|
||||
}
|
||||
|
||||
/// helper function that simply requests /robots.txt on the given url's base url
|
||||
///
|
||||
/// example:
|
||||
/// http://localhost/api/users -> http://localhost/robots.txt
|
||||
///
|
||||
/// The length of the given path has no effect on what's requested; it's always
|
||||
/// base url + /robots.txt
|
||||
pub async fn request_robots_txt(
|
||||
base_url: &str,
|
||||
config: &Configuration,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) -> Option<FeroxResponse> {
|
||||
log::trace!(
|
||||
"enter: get_robots_file({}, CONFIGURATION, {:?})",
|
||||
base_url,
|
||||
tx_stats
|
||||
);
|
||||
|
||||
// more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
|
||||
// similar; to account for that, create a client that will follow redirects, regardless of
|
||||
// what the user specified for the scanning client. Other than redirects, it will respect
|
||||
// all other user specified settings
|
||||
let follow_redirects = true;
|
||||
|
||||
let proxy = if config.proxy.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(config.proxy.as_str())
|
||||
};
|
||||
|
||||
let client = client::initialize(
|
||||
config.timeout,
|
||||
&config.user_agent,
|
||||
follow_redirects,
|
||||
config.insecure,
|
||||
&config.headers,
|
||||
proxy,
|
||||
);
|
||||
|
||||
if let Ok(mut url) = Url::parse(base_url) {
|
||||
url.set_path("/robots.txt"); // overwrite existing path with /robots.txt
|
||||
|
||||
if let Ok(response) = make_request(&client, &url, tx_stats).await {
|
||||
let ferox_response = FeroxResponse::from(response, true).await;
|
||||
|
||||
log::trace!("exit: get_robots_file -> {}", ferox_response);
|
||||
return Some(ferox_response);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Entry point to perform link extraction from robots.txt
|
||||
///
|
||||
/// `base_url` can have paths and subpaths, however robots.txt will be requested from the
|
||||
/// root of the url
|
||||
/// given the url:
|
||||
/// http://localhost/stuff/things
|
||||
/// this function requests:
|
||||
/// http://localhost/robots.txt
|
||||
pub async fn extract_robots_txt(
|
||||
base_url: &str,
|
||||
config: &Configuration,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) -> HashSet<String> {
|
||||
log::trace!(
|
||||
"enter: extract_robots_txt({}, CONFIGURATION, {:?})",
|
||||
base_url,
|
||||
tx_stats
|
||||
);
|
||||
let mut links = HashSet::new();
|
||||
|
||||
if let Some(response) = request_robots_txt(&base_url, &config, tx_stats.clone()).await {
|
||||
for capture in ROBOTS_REGEX.captures_iter(response.text.as_str()) {
|
||||
if let Some(new_path) = capture.name("url_path") {
|
||||
if let Ok(mut new_url) = Url::parse(base_url) {
|
||||
new_url.set_path(new_path.as_str());
|
||||
add_all_sub_paths(new_url.path(), &response, &mut links);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let multiplier = CONFIGURATION.extensions.len().max(1);
|
||||
|
||||
update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
|
||||
update_stat!(
|
||||
tx_stats,
|
||||
UpdateUsizeField(TotalExpected, links.len() * multiplier)
|
||||
);
|
||||
|
||||
log::trace!("exit: extract_robots_txt -> {:?}", links);
|
||||
links
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::make_request;
    use crate::FeroxChannel;
    use httpmock::Method::GET;
    use httpmock::MockServer;
    use reqwest::Client;
    use tokio::sync::mpsc;

    #[test]
    /// extract sub paths from the given url fragment; expect 5 sub paths and that all are
    /// in the expected array
    fn extractor_get_sub_paths_from_path_with_multiple_paths() {
        let path = "homepage/assets/img/icons/handshake.svg";
        let paths = get_sub_paths_from_path(path);
        let expected = vec![
            "homepage/",
            "homepage/assets/",
            "homepage/assets/img/",
            "homepage/assets/img/icons/",
            "homepage/assets/img/icons/handshake.svg",
        ];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert!(paths.contains(&expected_path.to_string()));
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 2 sub paths and that all are
    /// in the expected array. the fragment is wrapped in slashes to ensure no empty strings are
    /// returned
    fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
        let path = "/homepage/assets/";
        let paths = get_sub_paths_from_path(path);
        let expected = vec!["homepage/", "homepage/assets"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert!(paths.contains(&expected_path.to_string()));
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are
    /// included
    fn extractor_get_sub_paths_from_path_with_only_a_word() {
        let path = "homepage";
        let paths = get_sub_paths_from_path(path);
        let expected = vec!["homepage"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert!(paths.contains(&expected_path.to_string()));
        }
    }

    #[test]
    /// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed
    fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
        let path = "/homepage";
        let paths = get_sub_paths_from_path(path);
        let expected = vec!["homepage"];

        assert_eq!(paths.len(), expected.len());
        for expected_path in expected {
            assert!(paths.contains(&expected_path.to_string()));
        }
    }

    #[test]
    /// test that a full url and fragment are joined correctly, then added to the given list
    /// i.e. the happy path
    fn extractor_add_link_to_set_of_links_happy_path() {
        let url = Url::parse("https://localhost").unwrap();
        let mut links = HashSet::<String>::new();
        let link = "admin";

        assert_eq!(links.len(), 0);
        add_link_to_set_of_links(link, &url, &mut links);

        assert_eq!(links.len(), 1);
        assert!(links.contains("https://localhost/admin"));
    }

    #[test]
    /// test that an invalid path fragment doesn't add anything to the set of links
    fn extractor_add_link_to_set_of_links_with_non_base_url() {
        let url = Url::parse("https://localhost").unwrap();
        let mut links = HashSet::<String>::new();
        let link = "\\\\\\\\";

        assert_eq!(links.len(), 0);
        add_link_to_set_of_links(link, &url, &mut links);

        assert_eq!(links.len(), 0);
        assert!(links.is_empty());
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
    /// use make_request to generate a Response, and use the Response to test get_links;
    /// the response will contain an absolute path to a domain that is not part of the scanned
    /// domain; expect an empty set returned
    async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain(
    ) -> Result<(), Box<dyn std::error::Error>> {
        let srv = MockServer::start();

        let mock = srv.mock(|when, then| {
            when.method(GET).path("/some-path");
            then.status(200).body(
                "\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"",
            );
        });

        let client = Client::new();
        let url = Url::parse(&srv.url("/some-path")).unwrap();
        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();

        let response = make_request(&client, &url, tx.clone()).await.unwrap();

        let ferox_response = FeroxResponse::from(response, true).await;

        let links = get_links(&ferox_response, tx).await;

        assert!(links.is_empty());

        assert_eq!(mock.hits(), 1);
        Ok(())
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 1)]
    /// test that /robots.txt is correctly requested given a base url (happy path)
    async fn request_robots_txt_with_and_without_proxy() {
        let srv = MockServer::start();

        let mock = srv.mock(|when, then| {
            when.method(GET).path("/robots.txt");
            then.status(200).body("this is a test");
        });

        let mut config = Configuration::default();

        let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();

        request_robots_txt(&srv.url("/api/users/stuff/things"), &config, tx.clone()).await;

        // note: the proxy doesn't actually do anything other than hit a different code branch
        // in this unit test; it would however have an effect on an integration test
        config.proxy = srv.url("/ima-proxy");

        request_robots_txt(&srv.url("/api/different/path"), &config, tx).await;

        assert_eq!(mock.hits(), 2);
    }
}
|
||||
171
src/extractor/builder.rs
Normal file
171
src/extractor/builder.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
use super::*;
|
||||
use anyhow::{bail, Result};
|
||||
|
||||
/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
///
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;

/// Regular expression to pull url paths from robots.txt
///
/// ref: https://developers.google.com/search/reference/robots_txt
///
/// - `(?m)` turns on multi-line mode so `^`/`$` anchor to each line
/// - the trailing `[ \t]*\r?` tolerates trailing blanks and CRLF line endings; without it, `$`
///   (which only matches before `\n`) never matches a line that ends in `\r\n`
const ROBOTS_TXT_REGEX: &str =
    r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)[ \t]*\r?$"#;
|
||||
|
||||
/// Which type of extraction should be performed
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ExtractionTarget {
    /// Examine a response body and extract links
    ResponseBody,

    /// Examine robots.txt (specifically) and extract links
    RobotsTxt,
}
|
||||
|
||||
/// responsible for building an `Extractor`
|
||||
pub struct ExtractorBuilder<'a> {
|
||||
/// Response from which to extract links
|
||||
response: Option<&'a FeroxResponse>,
|
||||
|
||||
/// Response from which to extract links
|
||||
url: String,
|
||||
|
||||
/// Whether or not to try recursion
|
||||
config: Option<&'a Configuration>,
|
||||
|
||||
/// transmitter to the mpsc that handles statistics gathering
|
||||
tx_stats: Option<UnboundedSender<StatCommand>>,
|
||||
|
||||
/// transmitter to the mpsc that handles recursive scan calls
|
||||
tx_recursion: Option<UnboundedSender<String>>,
|
||||
|
||||
/// transmitter to the mpsc that handles reporting information to the user
|
||||
tx_reporter: Option<UnboundedSender<FeroxResponse>>,
|
||||
|
||||
/// list of urls that will be added to when new urls are extracted
|
||||
scanned_urls: Option<&'a FeroxScans>,
|
||||
|
||||
/// depth at which the scan was started
|
||||
depth: Option<usize>,
|
||||
|
||||
/// copy of Stats object
|
||||
stats: Option<Arc<Stats>>,
|
||||
|
||||
/// type of extraction to be performed
|
||||
target: Option<ExtractionTarget>,
|
||||
}
|
||||
|
||||
/// ExtractorBuilder implementation
|
||||
impl<'a> ExtractorBuilder<'a> {
|
||||
/// Given a FeroxResponse, create new ExtractorBuilder
|
||||
///
|
||||
/// Once built, Extractor::target is ExtractionTarget::ResponseBody
|
||||
pub fn with_response(response: &'a FeroxResponse) -> Self {
|
||||
Self {
|
||||
response: Some(response),
|
||||
url: "".to_string(),
|
||||
config: None,
|
||||
tx_stats: None,
|
||||
tx_recursion: None,
|
||||
tx_reporter: None,
|
||||
scanned_urls: None,
|
||||
depth: None,
|
||||
stats: None,
|
||||
target: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a url and Stats transmitter, create new ExtractorBuilder
|
||||
///
|
||||
/// Once built, Extractor::target is ExtractionTarget::ResponseBody
|
||||
pub fn with_url(url: &str) -> Self {
|
||||
Self {
|
||||
response: None,
|
||||
url: url.to_string(),
|
||||
config: None,
|
||||
tx_stats: None,
|
||||
tx_recursion: None,
|
||||
tx_reporter: None,
|
||||
scanned_urls: None,
|
||||
depth: None,
|
||||
stats: None,
|
||||
target: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// builder call to set `config`
|
||||
pub fn config(&mut self, config: &'a Configuration) -> &mut Self {
|
||||
self.config = Some(config);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `tx_recursion`
|
||||
pub fn recursion_transmitter(&mut self, tx_recursion: UnboundedSender<String>) -> &mut Self {
|
||||
self.tx_recursion = Some(tx_recursion);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `tx_stats`
|
||||
pub fn stats_transmitter(&mut self, tx_stats: UnboundedSender<StatCommand>) -> &mut Self {
|
||||
self.tx_stats = Some(tx_stats);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `tx_reporter`
|
||||
pub fn reporter_transmitter(
|
||||
&mut self,
|
||||
tx_reporter: UnboundedSender<FeroxResponse>,
|
||||
) -> &mut Self {
|
||||
self.tx_reporter = Some(tx_reporter);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `scanned_urls`
|
||||
pub fn scanned_urls(&mut self, scanned_urls: &'a FeroxScans) -> &mut Self {
|
||||
self.scanned_urls = Some(scanned_urls);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `stats`
|
||||
pub fn stats(&mut self, stats: Arc<Stats>) -> &mut Self {
|
||||
self.stats = Some(stats);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `depth`
|
||||
pub fn depth(&mut self, depth: usize) -> &mut Self {
|
||||
self.depth = Some(depth);
|
||||
self
|
||||
}
|
||||
|
||||
/// builder call to set `target`
|
||||
pub fn target(&mut self, target: ExtractionTarget) -> &mut Self {
|
||||
self.target = Some(target);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(&self) -> Result<Extractor<'a>> {
|
||||
if self.url.is_empty() && self.response.is_none() {
|
||||
bail!("Extractor requires either a URL or a FeroxResponse be specified")
|
||||
}
|
||||
|
||||
Ok(Extractor {
|
||||
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
|
||||
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
|
||||
response: if self.response.is_some() {
|
||||
Some(self.response.unwrap())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
url: self.url.to_owned(),
|
||||
config: self.config.unwrap(),
|
||||
tx_stats: self.tx_stats.as_ref().unwrap().clone(),
|
||||
tx_recursion: self.tx_recursion.as_ref().unwrap().clone(),
|
||||
tx_reporter: self.tx_reporter.as_ref().unwrap().clone(),
|
||||
scanned_urls: self.scanned_urls.unwrap(),
|
||||
depth: self.depth.unwrap(),
|
||||
stats: self.stats.as_ref().unwrap().clone(),
|
||||
target: self.target.unwrap(),
|
||||
})
|
||||
}
|
||||
}
|
||||
408
src/extractor/container.rs
Normal file
408
src/extractor/container.rs
Normal file
@@ -0,0 +1,408 @@
|
||||
use super::*;
|
||||
use crate::{
|
||||
client,
|
||||
scanner::{send_report, should_filter_response, try_recursion},
|
||||
statistics::{
|
||||
StatCommand::UpdateUsizeField,
|
||||
StatField::{LinksExtracted, TotalExpected},
|
||||
},
|
||||
update_stat,
|
||||
utils::{format_url, make_request},
|
||||
};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use reqwest::{StatusCode, Url};
|
||||
use std::collections::HashSet;
|
||||
|
||||
/// Whether an active scan is recursive or not
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum RecursionStatus {
    /// Scan is recursive
    Recursive,

    /// Scan is not recursive
    NotRecursive,
}
|
||||
|
||||
/// Handles all logic related to extracting links from requested source code
|
||||
#[derive(Debug)]
|
||||
pub struct Extractor<'a> {
|
||||
/// `LINKFINDER_REGEX` as a regex::Regex type
|
||||
pub(super) links_regex: Regex,
|
||||
|
||||
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
|
||||
pub(super) robots_regex: Regex,
|
||||
|
||||
/// Response from which to extract links
|
||||
pub(super) response: Option<&'a FeroxResponse>,
|
||||
|
||||
/// Response from which to extract links
|
||||
pub(super) url: String,
|
||||
|
||||
/// Whether or not to try recursion
|
||||
pub(super) config: &'a Configuration,
|
||||
|
||||
/// transmitter to the mpsc that handles statistics gathering
|
||||
pub(super) tx_stats: UnboundedSender<StatCommand>,
|
||||
|
||||
/// transmitter to the mpsc that handles recursive scan calls
|
||||
pub(super) tx_recursion: UnboundedSender<String>,
|
||||
|
||||
/// transmitter to the mpsc that handles reporting information to the user
|
||||
pub(super) tx_reporter: UnboundedSender<FeroxResponse>,
|
||||
|
||||
/// list of urls that will be added to when new urls are extracted
|
||||
pub(super) scanned_urls: &'a FeroxScans,
|
||||
|
||||
/// depth at which the scan was started
|
||||
pub(super) depth: usize,
|
||||
|
||||
/// copy of Stats object
|
||||
pub(super) stats: Arc<Stats>,
|
||||
|
||||
/// type of extraction to be performed
|
||||
pub(super) target: ExtractionTarget,
|
||||
}
|
||||
|
||||
/// Extractor implementation
|
||||
impl<'a> Extractor<'a> {
|
||||
/// business logic that handles getting links from a normal http body response
|
||||
pub async fn extract(&self) -> Result<()> {
|
||||
let links = match self.target {
|
||||
ExtractionTarget::ResponseBody => self.extract_from_body().await?,
|
||||
ExtractionTarget::RobotsTxt => self.extract_from_robots().await?,
|
||||
};
|
||||
|
||||
let recursive = if self.config.no_recursion {
|
||||
RecursionStatus::NotRecursive
|
||||
} else {
|
||||
RecursionStatus::Recursive
|
||||
};
|
||||
|
||||
for link in links {
|
||||
// todo rename get_feroxresponse_from_link
|
||||
let mut resp = match self.get_feroxresponse_from_link(&link).await {
|
||||
Ok(resp) => resp,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
// filter if necessary
|
||||
if should_filter_response(&resp, self.tx_stats.clone()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if resp.is_file() {
|
||||
// very likely a file, simply request and report
|
||||
log::debug!("Extracted file: {}", resp);
|
||||
|
||||
self.scanned_urls
|
||||
.add_file_scan(&resp.url().to_string(), self.stats.clone());
|
||||
|
||||
send_report(self.tx_reporter.clone(), resp);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if matches!(recursive, RecursionStatus::Recursive) {
|
||||
log::debug!("Extracted Directory: {}", resp);
|
||||
|
||||
if !resp.url().as_str().ends_with('/')
|
||||
&& (resp.status().is_success()
|
||||
|| matches!(resp.status(), &StatusCode::FORBIDDEN))
|
||||
{
|
||||
// if the url doesn't end with a /
|
||||
// and the response code is either a 2xx or 403
|
||||
|
||||
// since all of these are 2xx or 403, recursion is only attempted if the
|
||||
// url ends in a /. I am actually ok with adding the slash and not
|
||||
// adding it, as both have merit. Leaving it in for now to see how
|
||||
// things turn out (current as of: v1.1.0)
|
||||
resp.set_url(&format!("{}/", resp.url()));
|
||||
}
|
||||
|
||||
try_recursion(&resp, self.depth, self.tx_recursion.clone()).await;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Given a `reqwest::Response`, perform the following actions
|
||||
/// - parse the response's text for links using the linkfinder regex
|
||||
/// - for every link found take its url path and parse each sub-path
|
||||
/// - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
|
||||
/// with a base url of http://localhost, the following urls would be returned:
|
||||
/// - homepage/assets/img/icons/handshake.svg
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
pub(super) async fn extract_from_body(&self) -> Result<HashSet<String>> {
|
||||
log::trace!("enter: get_links");
|
||||
|
||||
let mut links = HashSet::<String>::new();
|
||||
|
||||
let body = self.response.unwrap().text();
|
||||
|
||||
for capture in self.links_regex.captures_iter(&body) {
|
||||
// remove single & double quotes from both ends of the capture
|
||||
// capture[0] is the entire match, additional capture groups start at [1]
|
||||
let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
|
||||
|
||||
match Url::parse(link) {
|
||||
Ok(absolute) => {
|
||||
if absolute.domain() != self.response.unwrap().url().domain()
|
||||
|| absolute.host() != self.response.unwrap().url().host()
|
||||
{
|
||||
// domains/ips are not the same, don't scan things that aren't part of the original
|
||||
// target url
|
||||
continue;
|
||||
}
|
||||
|
||||
if self.add_all_sub_paths(absolute.path(), &mut links).is_err() {
|
||||
log::warn!("could not add sub-paths from {} to {:?}", absolute, links);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// this is the expected error that happens when we try to parse a url fragment
|
||||
// ex: Url::parse("/login") -> Err("relative URL without a base")
|
||||
// while this is technically an error, these are good results for us
|
||||
if e.to_string().contains("relative URL without a base") {
|
||||
if self.add_all_sub_paths(link, &mut links).is_err() {
|
||||
log::warn!("could not add sub-paths from {} to {:?}", link, links);
|
||||
}
|
||||
} else {
|
||||
// unexpected error has occurred
|
||||
log::error!("Could not parse given url: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.update_stats(links.len());
|
||||
|
||||
log::trace!("exit: get_links -> {:?}", links);
|
||||
|
||||
Ok(links)
|
||||
}
|
||||
|
||||
/// take a url fragment like homepage/assets/img/icons/handshake.svg and
|
||||
/// incrementally add
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
fn add_all_sub_paths(&self, url_path: &str, mut links: &mut HashSet<String>) -> Result<()> {
|
||||
log::trace!("enter: add_all_sub_paths({}, {:?})", url_path, links);
|
||||
|
||||
for sub_path in self.get_sub_paths_from_path(url_path) {
|
||||
self.add_link_to_set_of_links(&sub_path, &mut links)?;
|
||||
}
|
||||
|
||||
log::trace!("exit: add_all_sub_paths");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Iterate over a given path, return a list of every sub-path found
|
||||
///
|
||||
/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
|
||||
/// the following fragments would be returned:
|
||||
/// - homepage/assets/img/icons/handshake.svg
|
||||
/// - homepage/assets/img/icons/
|
||||
/// - homepage/assets/img/
|
||||
/// - homepage/assets/
|
||||
/// - homepage/
|
||||
pub(super) fn get_sub_paths_from_path(&self, path: &str) -> Vec<String> {
|
||||
log::trace!("enter: get_sub_paths_from_path({})", path);
|
||||
let mut paths = vec![];
|
||||
|
||||
// filter out any empty strings caused by .split
|
||||
let mut parts: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
|
||||
|
||||
let length = parts.len();
|
||||
|
||||
for i in 0..length {
|
||||
// iterate over all parts of the path
|
||||
if parts.is_empty() {
|
||||
// pop left us with an empty vector, we're done
|
||||
break;
|
||||
}
|
||||
|
||||
let mut possible_path = parts.join("/");
|
||||
|
||||
if possible_path.is_empty() {
|
||||
// .join can result in an empty string, which we don't need, ignore
|
||||
continue;
|
||||
}
|
||||
|
||||
if i > 0 {
|
||||
// this isn't the last index of the parts array
|
||||
// ex: /buried/misc/stupidfile.php
|
||||
// this block skips the file but sees all parent folders
|
||||
possible_path = format!("{}/", possible_path);
|
||||
}
|
||||
|
||||
paths.push(possible_path); // good sub-path found
|
||||
parts.pop(); // use .pop() to remove the last part of the path and continue iteration
|
||||
}
|
||||
|
||||
log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
|
||||
paths
|
||||
}
|
||||
|
||||
/// simple helper to stay DRY, trys to join a url + fragment and add it to the `links` HashSet
|
||||
pub(super) fn add_link_to_set_of_links(
|
||||
&self,
|
||||
link: &str,
|
||||
links: &mut HashSet<String>,
|
||||
) -> Result<()> {
|
||||
log::trace!("enter: add_link_to_set_of_links({}, {:?})", link, links);
|
||||
|
||||
let old_url = match self.target {
|
||||
ExtractionTarget::ResponseBody => self.response.unwrap().url.clone(),
|
||||
ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
|
||||
Ok(u) => u,
|
||||
Err(e) => {
|
||||
bail!("Could not parse {}: {}", self.url, e);
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
let new_url = old_url
|
||||
.join(&link)
|
||||
.with_context(|| format!("Could not join {} with {}", old_url, link))?;
|
||||
|
||||
links.insert(new_url.to_string());
|
||||
|
||||
log::trace!("exit: add_link_to_set_of_links");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wrapper around link extraction logic
|
||||
/// currently used in two places:
|
||||
/// - links from response bodies
|
||||
/// - links from robots.txt responses
|
||||
///
|
||||
/// general steps taken:
|
||||
/// - create a new Url object based on cli options/args
|
||||
/// - check if the new Url has already been seen/scanned -> None
|
||||
/// - make a request to the new Url ? -> Some(response) : None
|
||||
pub(super) async fn get_feroxresponse_from_link(&self, url: &str) -> Result<FeroxResponse> {
|
||||
log::trace!("enter: get_feroxresponse_from_link({})", url);
|
||||
|
||||
// create a url based on the given command line options, return None on error
|
||||
let new_url = format_url(
|
||||
&url,
|
||||
&"",
|
||||
self.config.add_slash,
|
||||
&self.config.queries,
|
||||
None,
|
||||
self.tx_stats.clone(),
|
||||
)?;
|
||||
|
||||
if self
|
||||
.scanned_urls
|
||||
.get_scan_by_url(&new_url.to_string())
|
||||
.is_some()
|
||||
{
|
||||
//we've seen the url before and don't need to scan again
|
||||
log::trace!("exit: get_feroxresponse_from_link -> None");
|
||||
bail!("previously seen url");
|
||||
}
|
||||
|
||||
// make the request and store the response
|
||||
let new_response =
|
||||
make_request(&self.config.client, &new_url, self.tx_stats.clone()).await?;
|
||||
|
||||
let new_ferox_response = FeroxResponse::from(new_response, true).await;
|
||||
|
||||
log::trace!(
|
||||
"exit: get_feroxresponse_from_link -> {:?}",
|
||||
new_ferox_response
|
||||
);
|
||||
|
||||
Ok(new_ferox_response)
|
||||
}
|
||||
|
||||
/// Entry point to perform link extraction from robots.txt
|
||||
///
|
||||
/// `base_url` can have paths and subpaths, however robots.txt will be requested from the
|
||||
/// root of the url
|
||||
/// given the url:
|
||||
/// http://localhost/stuff/things
|
||||
/// this function requests:
|
||||
/// http://localhost/robots.txt
|
||||
pub(super) async fn extract_from_robots(&self) -> Result<HashSet<String>> {
|
||||
log::trace!("enter: extract_robots_txt");
|
||||
|
||||
let mut links: HashSet<String> = HashSet::new();
|
||||
|
||||
let response = self.request_robots_txt().await?;
|
||||
|
||||
for capture in self.robots_regex.captures_iter(response.text.as_str()) {
|
||||
if let Some(new_path) = capture.name("url_path") {
|
||||
let mut new_url = Url::parse(&self.url)?;
|
||||
new_url.set_path(new_path.as_str());
|
||||
if self.add_all_sub_paths(&new_url.path(), &mut links).is_err() {
|
||||
log::warn!("could not add sub-paths from {} to {:?}", new_url, links);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.update_stats(links.len());
|
||||
|
||||
log::trace!("exit: extract_robots_txt -> {:?}", links);
|
||||
Ok(links)
|
||||
}
|
||||
|
||||
/// helper function that simply requests /robots.txt on the given url's base url
|
||||
///
|
||||
/// example:
|
||||
/// http://localhost/api/users -> http://localhost/robots.txt
|
||||
///
|
||||
/// The length of the given path has no effect on what's requested; it's always
|
||||
/// base url + /robots.txt
|
||||
pub(super) async fn request_robots_txt(&self) -> Result<FeroxResponse> {
|
||||
log::trace!("enter: get_robots_file");
|
||||
|
||||
// more often than not, domain/robots.txt will redirect to www.domain/robots.txt or something
|
||||
// similar; to account for that, create a client that will follow redirects, regardless of
|
||||
// what the user specified for the scanning client. Other than redirects, it will respect
|
||||
// all other user specified settings
|
||||
let follow_redirects = true;
|
||||
|
||||
let proxy = if self.config.proxy.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(self.config.proxy.as_str())
|
||||
};
|
||||
|
||||
let client = client::initialize(
|
||||
self.config.timeout,
|
||||
&self.config.user_agent,
|
||||
follow_redirects,
|
||||
self.config.insecure,
|
||||
&self.config.headers,
|
||||
proxy,
|
||||
);
|
||||
|
||||
let mut url = Url::parse(&self.url)?;
|
||||
url.set_path("/robots.txt"); // overwrite existing path with /robots.txt
|
||||
|
||||
let response = make_request(&client, &url, self.tx_stats.clone()).await?;
|
||||
let ferox_response = FeroxResponse::from(response, true).await;
|
||||
|
||||
log::trace!("exit: get_robots_file -> {}", ferox_response);
|
||||
return Ok(ferox_response);
|
||||
}
|
||||
|
||||
/// update total number of links extracted and expected responses
|
||||
fn update_stats(&self, num_links: usize) {
|
||||
let multiplier = self.config.extensions.len().max(1);
|
||||
|
||||
update_stat!(self.tx_stats, UpdateUsizeField(LinksExtracted, num_links));
|
||||
update_stat!(
|
||||
self.tx_stats,
|
||||
UpdateUsizeField(TotalExpected, num_links * multiplier)
|
||||
);
|
||||
}
|
||||
}
|
||||
19
src/extractor/mod.rs
Normal file
19
src/extractor/mod.rs
Normal file
@@ -0,0 +1,19 @@
|
||||
//! extract links from html source and robots.txt
|
||||
mod builder;
|
||||
mod container;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub use self::builder::ExtractionTarget;
|
||||
pub use self::builder::ExtractorBuilder;
|
||||
pub use self::container::Extractor;
|
||||
|
||||
use crate::{
|
||||
config::Configuration,
|
||||
scan_manager::FeroxScans,
|
||||
statistics::{StatCommand, Stats},
|
||||
FeroxResponse,
|
||||
};
|
||||
use regex::Regex;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::mpsc::UnboundedSender;
|
||||
372
src/extractor/tests.rs
Normal file
372
src/extractor/tests.rs
Normal file
@@ -0,0 +1,372 @@
|
||||
use super::*;
|
||||
use crate::utils::make_request;
|
||||
use crate::FeroxChannel;
|
||||
use anyhow::Result;
|
||||
use httpmock::Method::GET;
|
||||
use httpmock::MockServer;
|
||||
use lazy_static::lazy_static;
|
||||
use reqwest::{header::HeaderMap, Client, StatusCode, Url};
|
||||
use std::collections::HashSet;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
lazy_static! {
    /// Configuration shared by the static test Extractors
    static ref CONFIG: Configuration = Configuration::new();

    /// FeroxScans shared by the static test Extractors
    static ref SCANS: FeroxScans = FeroxScans::default();

    /// canned FeroxResponse handed to the response body Extractor
    static ref RESPONSE: FeroxResponse = get_test_response();

    /// Extractor used to test response body extraction
    static ref BODY_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ResponseBody);

    /// Extractor used to test robots.txt extraction
    static ref ROBOTS_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::RobotsTxt);
}
|
||||
|
||||
fn get_test_response() -> FeroxResponse {
|
||||
FeroxResponse {
|
||||
text: String::new(),
|
||||
wildcard: true,
|
||||
url: Url::parse("https://localhost").unwrap(),
|
||||
content_length: 125,
|
||||
word_count: 10,
|
||||
line_count: 14,
|
||||
headers: HeaderMap::new(),
|
||||
status: StatusCode::OK,
|
||||
}
|
||||
}
|
||||
|
||||
/// creates a single extractor that can be used to test standalone functions
|
||||
fn setup_extractor(target: ExtractionTarget) -> Extractor<'static> {
|
||||
let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
|
||||
let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
|
||||
let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
|
||||
let stats = Arc::new(Stats::new());
|
||||
|
||||
let mut builder = match target {
|
||||
ExtractionTarget::ResponseBody => ExtractorBuilder::with_response(&RESPONSE),
|
||||
ExtractionTarget::RobotsTxt => ExtractorBuilder::with_url("https://localhost"),
|
||||
};
|
||||
|
||||
builder
|
||||
.target(target)
|
||||
.depth(4)
|
||||
.config(&CONFIG)
|
||||
.recursion_transmitter(tx_dir.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(tx_term.clone())
|
||||
.scanned_urls(&SCANS)
|
||||
.stats(stats.clone())
|
||||
.build()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// extract sub paths from a multi-segment url fragment; expect 5 sub paths (the fragment
/// itself plus its 4 parent directories) and that all are in the expected array
#[test]
fn extractor_get_sub_paths_from_path_with_multiple_paths() {
    let fragment = "homepage/assets/img/icons/handshake.svg";
    let expected = [
        "homepage/",
        "homepage/assets/",
        "homepage/assets/img/",
        "homepage/assets/img/icons/",
        "homepage/assets/img/icons/handshake.svg",
    ];

    let results = [
        ROBOTS_EXT.get_sub_paths_from_path(fragment),
        BODY_EXT.get_sub_paths_from_path(fragment),
    ];

    for found in results.iter() {
        assert_eq!(found.len(), expected.len());

        for wanted in expected.iter() {
            assert!(found.contains(&wanted.to_string()));
        }
    }
}
|
||||
|
||||
/// extract sub paths from a url fragment that is wrapped in slashes; expect 2 sub paths, all
/// in the expected array, and no empty strings caused by the enclosing slashes
#[test]
fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
    let fragment = "/homepage/assets/";
    let expected = ["homepage/", "homepage/assets"];

    let results = [
        ROBOTS_EXT.get_sub_paths_from_path(fragment),
        BODY_EXT.get_sub_paths_from_path(fragment),
    ];

    for found in results.iter() {
        assert_eq!(found.len(), expected.len());

        for wanted in expected.iter() {
            assert!(found.contains(&wanted.to_string()));
        }
    }
}
|
||||
|
||||
/// extract sub paths from a url fragment made of a single word; expect 1 sub path with no
/// forward slashes added
#[test]
fn extractor_get_sub_paths_from_path_with_only_a_word() {
    let fragment = "homepage";
    let expected = ["homepage"];

    let results = [
        ROBOTS_EXT.get_sub_paths_from_path(fragment),
        BODY_EXT.get_sub_paths_from_path(fragment),
    ];

    for found in results.iter() {
        assert_eq!(found.len(), expected.len());

        for wanted in expected.iter() {
            assert!(found.contains(&wanted.to_string()));
        }
    }
}
|
||||
|
||||
/// extract sub paths from a single word with a leading slash; expect 1 sub path with the
/// forward slash removed
#[test]
fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
    let fragment = "/homepage";
    let expected = ["homepage"];

    let results = [
        ROBOTS_EXT.get_sub_paths_from_path(fragment),
        BODY_EXT.get_sub_paths_from_path(fragment),
    ];

    for found in results.iter() {
        assert_eq!(found.len(), expected.len());

        for wanted in expected.iter() {
            assert!(found.contains(&wanted.to_string()));
        }
    }
}
|
||||
/// an ExtractorBuilder that has neither a FeroxResponse nor a (non-empty) URL must fail to
/// build
#[test]
fn extractor_builder_bails_when_neither_required_field_is_set() {
    let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let build_result = ExtractorBuilder::with_url("")
        .target(ExtractionTarget::ResponseBody)
        .depth(4)
        .config(&CONFIG)
        .recursion_transmitter(tx_dir)
        .stats_transmitter(tx_stats)
        .reporter_transmitter(tx_term)
        .scanned_urls(&SCANS)
        .stats(Arc::new(Stats::new()))
        .build();

    assert!(build_result.is_err());
}
|
||||
|
||||
/// happy path: a url fragment is joined with the extractor's base url and the joined url is
/// added to the given set, for both extraction targets
#[test]
fn extractor_add_link_to_set_of_links_happy_path() {
    // robots.txt extractor
    let mut robots_links = HashSet::<String>::new();
    assert!(robots_links.is_empty());

    ROBOTS_EXT
        .add_link_to_set_of_links("admin", &mut robots_links)
        .unwrap();

    assert_eq!(robots_links.len(), 1);
    assert!(robots_links.contains("https://localhost/admin"));

    // response body extractor
    let mut body_links = HashSet::<String>::new();
    assert!(body_links.is_empty());

    BODY_EXT
        .add_link_to_set_of_links("shmadmin", &mut body_links)
        .unwrap();

    assert_eq!(body_links.len(), 1);
    assert!(body_links.contains("https://localhost/shmadmin"));
}
|
||||
|
||||
/// a path fragment that cannot be joined with the base url produces an error and leaves the
/// set of links empty
#[test]
fn extractor_add_link_to_set_of_links_with_non_base_url() {
    let bad_fragment = "\\\\\\\\";
    let mut found = HashSet::<String>::new();

    assert!(found.is_empty());

    let robots_result = ROBOTS_EXT.add_link_to_set_of_links(bad_fragment, &mut found);
    assert!(robots_result.is_err());

    let body_result = BODY_EXT.add_link_to_set_of_links(bad_fragment, &mut found);
    assert!(body_result.is_err());

    assert_eq!(found.len(), 0);
    assert!(found.is_empty());
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
/// use make_request to generate a Response, and use the Response to test get_links;
|
||||
/// the response will contain an absolute path to a domain that is not part of the scanned
|
||||
/// domain; expect an empty set returned
|
||||
async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain() -> Result<()> {
|
||||
let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
|
||||
let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
|
||||
let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
|
||||
let stats = Arc::new(Stats::new());
|
||||
|
||||
let srv = MockServer::start();
|
||||
|
||||
let mock = srv.mock(|when, then| {
|
||||
when.method(GET).path("/some-path");
|
||||
then.status(200).body(
|
||||
"\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"",
|
||||
);
|
||||
});
|
||||
|
||||
let client = Client::new();
|
||||
let url = Url::parse(&srv.url("/some-path")).unwrap();
|
||||
|
||||
let response = make_request(&client, &url, tx_stats.clone()).await.unwrap();
|
||||
|
||||
let ferox_response = FeroxResponse::from(response, true).await;
|
||||
|
||||
let extractor = ExtractorBuilder::with_response(&ferox_response)
|
||||
.target(ExtractionTarget::ResponseBody)
|
||||
.depth(4)
|
||||
.config(&CONFIG)
|
||||
.recursion_transmitter(tx_dir.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(tx_term.clone())
|
||||
.scanned_urls(&SCANS)
|
||||
.stats(stats.clone())
|
||||
.build()?;
|
||||
|
||||
let links = extractor.extract_from_body().await?;
|
||||
|
||||
assert!(links.is_empty());
|
||||
|
||||
assert_eq!(mock.hits(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
/// test that /robots.txt is correctly requested given a base url (happy path)
|
||||
async fn request_robots_txt_without_proxy() -> Result<()> {
|
||||
let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
|
||||
let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
|
||||
let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
|
||||
let stats = Arc::new(Stats::new());
|
||||
let config = Configuration::new();
|
||||
|
||||
let srv = MockServer::start();
|
||||
|
||||
let mock = srv.mock(|when, then| {
|
||||
when.method(GET).path("/robots.txt");
|
||||
then.status(200).body("this is a test");
|
||||
});
|
||||
|
||||
let extractor = ExtractorBuilder::with_url(&srv.url("/api/users/stuff/things"))
|
||||
.target(ExtractionTarget::RobotsTxt)
|
||||
.depth(4)
|
||||
.config(&config)
|
||||
.recursion_transmitter(tx_dir.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(tx_term.clone())
|
||||
.scanned_urls(&SCANS)
|
||||
.stats(stats.clone())
|
||||
.build()?;
|
||||
|
||||
let resp = extractor.request_robots_txt().await?;
|
||||
|
||||
assert!(matches!(resp.status(), &StatusCode::OK));
|
||||
println!("{}", resp);
|
||||
assert_eq!(resp.content_length(), 14);
|
||||
assert_eq!(mock.hits(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
/// test that /robots.txt is correctly requested given a base url (happy path) when a proxy is used
|
||||
async fn request_robots_txt_with_proxy() -> Result<()> {
|
||||
let (tx_dir, _): FeroxChannel<String> = mpsc::unbounded_channel();
|
||||
let (tx_stats, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
|
||||
let (tx_term, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();
|
||||
let stats = Arc::new(Stats::new());
|
||||
let mut config = Configuration::new();
|
||||
|
||||
let srv = MockServer::start();
|
||||
|
||||
let mock = srv.mock(|when, then| {
|
||||
when.method(GET).path("/robots.txt");
|
||||
then.status(200).body("this is also a test");
|
||||
});
|
||||
|
||||
// note: the proxy doesn't actually do anything other than hit a different code branch
|
||||
// in this unit test; it would however have an effect on an integration test
|
||||
config.proxy = srv.url("/ima-proxy");
|
||||
|
||||
let extractor = ExtractorBuilder::with_url(&srv.url("/api/different/path"))
|
||||
.target(ExtractionTarget::RobotsTxt)
|
||||
.depth(4)
|
||||
.config(&config)
|
||||
.recursion_transmitter(tx_dir.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(tx_term.clone())
|
||||
.scanned_urls(&SCANS)
|
||||
.stats(stats.clone())
|
||||
.build()?;
|
||||
|
||||
let resp = extractor.request_robots_txt().await?;
|
||||
|
||||
assert!(matches!(resp.status(), &StatusCode::OK));
|
||||
assert_eq!(resp.content_length(), 19);
|
||||
assert_eq!(mock.hits(), 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
/// get_feroxresponse_from_link's happy path, expect back a FeroxResponse
|
||||
async fn get_feroxresponse_from_link_happy_path() -> Result<()> {
|
||||
let srv = MockServer::start();
|
||||
|
||||
let mock = srv.mock(|when, then| {
|
||||
when.method(GET).path("/login.php");
|
||||
then.status(200).body("this is a test");
|
||||
});
|
||||
|
||||
let r_resp = ROBOTS_EXT
|
||||
.get_feroxresponse_from_link(&srv.url("/login.php"))
|
||||
.await?;
|
||||
let b_resp = BODY_EXT
|
||||
.get_feroxresponse_from_link(&srv.url("/login.php"))
|
||||
.await?;
|
||||
|
||||
assert!(matches!(r_resp.status(), &StatusCode::OK));
|
||||
assert!(matches!(b_resp.status(), &StatusCode::OK));
|
||||
assert_eq!(r_resp.content_length(), 14);
|
||||
assert_eq!(b_resp.content_length(), 14);
|
||||
assert_eq!(mock.hits(), 2);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
|
||||
/// get_feroxresponse_from_link should bail in the event that the url is already in scanned_urls
|
||||
async fn get_feroxresponse_from_link_bails_on_seen_url() -> Result<()> {
|
||||
let url = "/unique-for-this-test.php";
|
||||
let srv = MockServer::start();
|
||||
let served = srv.url(url);
|
||||
|
||||
let mock = srv.mock(|when, then| {
|
||||
when.method(GET).path(url);
|
||||
then.status(200)
|
||||
.body("this is a unique test, don't reuse the endpoint");
|
||||
});
|
||||
|
||||
SCANS.add_file_scan(&served, ROBOTS_EXT.stats.clone());
|
||||
|
||||
let r_resp = ROBOTS_EXT.get_feroxresponse_from_link(&served).await;
|
||||
let b_resp = BODY_EXT.get_feroxresponse_from_link(&served).await;
|
||||
|
||||
assert!(r_resp.is_err());
|
||||
assert!(b_resp.is_err());
|
||||
assert_eq!(mock.hits(), 0); // function exits before requests can happen
|
||||
Ok(())
|
||||
}
|
||||
36
src/filters/helpers.rs
Normal file
36
src/filters/helpers.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
// use super::WildcardFilter;
|
||||
// use crate::{
|
||||
// statistics::{
|
||||
// StatCommand::{self, UpdateUsizeField},
|
||||
// StatField::WildcardsFiltered,
|
||||
// },
|
||||
// FeroxResponse,
|
||||
// };
|
||||
// use anyhow::Result;
|
||||
// use tokio::sync::mpsc::UnboundedSender;
|
||||
//
|
||||
// /// Simple helper to stay DRY; determines whether or not a given `FeroxResponse` should be reported
|
||||
// /// to the user or not.
|
||||
// pub fn should_filter_response(
|
||||
// response: &FeroxResponse,
|
||||
// tx_stats: UnboundedSender<StatCommand>,
|
||||
// ) -> Result<bool> {
|
||||
// let filters = FILTERS
|
||||
// match FILTERS.read() {
|
||||
// Ok(filters) => {
|
||||
// for filter in filters.iter() {
|
||||
// // wildcard.should_filter goes here
|
||||
// if filter.should_filter_response(&response) {
|
||||
// if filter.as_any().downcast_ref::<WildcardFilter>().is_some() {
|
||||
// update_stat!(tx_stats, UpdateUsizeField(WildcardsFiltered, 1))
|
||||
// }
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// Err(e) => {
|
||||
// log::error!("{}", e);
|
||||
// }
|
||||
// }
|
||||
// false
|
||||
// }
|
||||
@@ -8,6 +8,7 @@ mod regex;
|
||||
mod similarity;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
mod helpers;
|
||||
|
||||
pub use self::lines::LinesFilter;
|
||||
pub use self::regex::RegexFilter;
|
||||
|
||||
14
src/lib.rs
14
src/lib.rs
@@ -1,19 +1,19 @@
|
||||
pub mod utils;
|
||||
pub mod client;
|
||||
pub mod banner;
|
||||
pub mod config;
|
||||
pub mod extractor;
|
||||
pub mod filters;
|
||||
mod client;
|
||||
mod event_handlers;
|
||||
mod filters;
|
||||
pub mod heuristics;
|
||||
pub mod logger;
|
||||
pub mod parser;
|
||||
mod parser;
|
||||
pub mod progress;
|
||||
pub mod reporter;
|
||||
pub mod scan_manager;
|
||||
pub mod scanner;
|
||||
pub mod statistics;
|
||||
mod event_handlers;
|
||||
pub mod banner;
|
||||
mod traits;
|
||||
pub mod utils;
|
||||
mod extractor;
|
||||
|
||||
use crate::{
|
||||
traits::FeroxSerialize,
|
||||
|
||||
@@ -5,9 +5,11 @@ use crate::{
|
||||
StatCommand::{self, UpdateUsizeField},
|
||||
StatField::ResourcesDiscovered,
|
||||
},
|
||||
update_stat,
|
||||
utils::{ferox_print, make_request, open_file},
|
||||
FeroxChannel, FeroxResponse, FeroxSerialize,
|
||||
};
|
||||
|
||||
use console::strip_ansi_codes;
|
||||
use std::{
|
||||
fs, io,
|
||||
|
||||
145
src/scanner.rs
145
src/scanner.rs
@@ -1,6 +1,6 @@
|
||||
use crate::{
|
||||
config::{Configuration, CONFIGURATION},
|
||||
extractor::{extract_robots_txt, get_links, request_feroxresponse_from_new_link},
|
||||
extractor::{ExtractionTarget, ExtractorBuilder},
|
||||
filters::{
|
||||
LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, WildcardFilter,
|
||||
WordsFilter,
|
||||
@@ -13,6 +13,7 @@ use crate::{
|
||||
Stats,
|
||||
},
|
||||
traits::FeroxFilter,
|
||||
update_stat,
|
||||
utils::{format_url, get_current_depth, make_request},
|
||||
FeroxChannel, FeroxResponse, SIMILARITY_THRESHOLD,
|
||||
};
|
||||
@@ -307,11 +308,12 @@ fn reached_max_depth(url: &Url, base_depth: usize, max_depth: usize) -> bool {
|
||||
/// Helper function that wraps logic to check for recursion opportunities
|
||||
///
|
||||
/// When a recursion opportunity is found, the new url is sent across the recursion channel
|
||||
async fn try_recursion(
|
||||
pub async fn try_recursion(
|
||||
response: &FeroxResponse,
|
||||
base_depth: usize,
|
||||
transmitter: UnboundedSender<String>,
|
||||
) {
|
||||
// todo this should be part of the recursion handler
|
||||
log::trace!(
|
||||
"enter: try_recursion({}, {}, {:?})",
|
||||
response,
|
||||
@@ -433,56 +435,19 @@ async fn make_requests(
|
||||
}
|
||||
|
||||
if CONFIGURATION.extract_links && !ferox_response.status().is_redirection() {
|
||||
let new_links = get_links(&ferox_response, tx_stats.clone()).await;
|
||||
let extractor = ExtractorBuilder::with_response(&ferox_response)
|
||||
.target(ExtractionTarget::ResponseBody)
|
||||
.depth(base_depth)
|
||||
.config(&CONFIGURATION)
|
||||
.recursion_transmitter(dir_chan.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(report_chan.clone())
|
||||
.scanned_urls(&SCANNED_URLS)
|
||||
.stats(stats.clone())
|
||||
.build()
|
||||
.unwrap(); // todo change once this function returns Result
|
||||
|
||||
for new_link in new_links {
|
||||
let mut new_ferox_response = match request_feroxresponse_from_new_link(
|
||||
&new_link,
|
||||
tx_stats.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(resp) => resp,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// filter if necessary
|
||||
if should_filter_response(&new_ferox_response, tx_stats.clone()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if new_ferox_response.is_file() {
|
||||
// very likely a file, simply request and report
|
||||
log::debug!("Singular extraction: {}", new_ferox_response);
|
||||
|
||||
SCANNED_URLS
|
||||
.add_file_scan(&new_ferox_response.url().to_string(), stats.clone());
|
||||
|
||||
send_report(report_chan.clone(), new_ferox_response);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if !CONFIGURATION.no_recursion {
|
||||
log::debug!("Recursive extraction: {}", new_ferox_response);
|
||||
|
||||
if !new_ferox_response.url().as_str().ends_with('/')
|
||||
&& (new_ferox_response.status().is_success()
|
||||
|| matches!(new_ferox_response.status(), &StatusCode::FORBIDDEN))
|
||||
{
|
||||
// if the url doesn't end with a /
|
||||
// and the response code is either a 2xx or 403
|
||||
|
||||
// since all of these are 2xx or 403, recursion is only attempted if the
|
||||
// url ends in a /. I am actually ok with adding the slash and not
|
||||
// adding it, as both have merit. Leaving it in for now to see how
|
||||
// things turn out (current as of: v1.1.0)
|
||||
new_ferox_response.set_url(&format!("{}/", new_ferox_response.url()));
|
||||
}
|
||||
|
||||
try_recursion(&new_ferox_response, base_depth, dir_chan.clone()).await;
|
||||
}
|
||||
}
|
||||
let _ = extractor.extract().await;
|
||||
}
|
||||
|
||||
// everything else should be reported
|
||||
@@ -506,61 +471,6 @@ pub fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: Fero
|
||||
log::trace!("exit: send_report");
|
||||
}
|
||||
|
||||
/// Request /robots.txt from given url
|
||||
async fn scan_robots_txt(
|
||||
target_url: &str,
|
||||
base_depth: usize,
|
||||
stats: Arc<Stats>,
|
||||
tx_term: UnboundedSender<FeroxResponse>,
|
||||
tx_dir: UnboundedSender<String>,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) {
|
||||
log::trace!(
|
||||
"enter: scan_robots_txt({}, {}, {:?}, {:?}, {:?}, {:?})",
|
||||
target_url,
|
||||
base_depth,
|
||||
stats,
|
||||
tx_term,
|
||||
tx_dir,
|
||||
tx_stats
|
||||
);
|
||||
|
||||
let robots_links = extract_robots_txt(&target_url, &CONFIGURATION, tx_stats.clone()).await;
|
||||
|
||||
for robot_link in robots_links {
|
||||
// create a url based on the given command line options, continue on error
|
||||
let mut ferox_response =
|
||||
match request_feroxresponse_from_new_link(&robot_link, tx_stats.clone()).await {
|
||||
Some(resp) => resp,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
if should_filter_response(&ferox_response, tx_stats.clone()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ferox_response.is_file() {
|
||||
log::debug!("File extracted from robots.txt: {}", ferox_response);
|
||||
SCANNED_URLS.add_file_scan(&robot_link, stats.clone());
|
||||
send_report(tx_term.clone(), ferox_response);
|
||||
} else if !CONFIGURATION.no_recursion {
|
||||
log::debug!("Directory extracted from robots.txt: {}", ferox_response);
|
||||
// todo this code is essentially the same as another piece around ~467 of this file
|
||||
if !ferox_response.url().as_str().ends_with('/')
|
||||
&& (ferox_response.status().is_success()
|
||||
|| matches!(ferox_response.status(), &StatusCode::FORBIDDEN))
|
||||
{
|
||||
// if the url doesn't end with a /
|
||||
// and the response code is either a 2xx or 403
|
||||
ferox_response.set_url(&format!("{}/", ferox_response.url()));
|
||||
}
|
||||
|
||||
try_recursion(&ferox_response, base_depth, tx_dir.clone()).await;
|
||||
}
|
||||
}
|
||||
log::trace!("exit: scan_robots_txt");
|
||||
}
|
||||
|
||||
/// Scan a given url using a given wordlist
|
||||
///
|
||||
/// This is the primary entrypoint for the scanner
|
||||
@@ -596,15 +506,20 @@ pub async fn scan_url(
|
||||
if CONFIGURATION.extract_links {
|
||||
// only grab robots.txt on the initial scan_url calls. all fresh dirs will be passed
|
||||
// to try_recursion
|
||||
scan_robots_txt(
|
||||
target_url,
|
||||
base_depth,
|
||||
stats.clone(),
|
||||
tx_term.clone(),
|
||||
tx_dir.clone(),
|
||||
tx_stats.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
let extractor = ExtractorBuilder::with_url(target_url)
|
||||
.target(ExtractionTarget::RobotsTxt)
|
||||
.depth(base_depth)
|
||||
.config(&CONFIGURATION)
|
||||
.recursion_transmitter(tx_dir.clone())
|
||||
.stats_transmitter(tx_stats.clone())
|
||||
.reporter_transmitter(tx_term.clone())
|
||||
.scanned_urls(&SCANNED_URLS)
|
||||
.stats(stats.clone())
|
||||
.build()
|
||||
.unwrap(); // todo change once this function returns Result
|
||||
|
||||
let _ = extractor.extract().await;
|
||||
}
|
||||
|
||||
update_stat!(tx_stats, UpdateUsizeField(TotalScans, 1));
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::{
|
||||
StatCommand::{self, AddError, AddStatus},
|
||||
StatError::{Connection, Other, Redirection, Request, Timeout, UrlFormat},
|
||||
},
|
||||
FeroxError, FeroxResult,
|
||||
FeroxError,
|
||||
};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use console::{strip_ansi_codes, style, user_attended};
|
||||
@@ -184,7 +184,7 @@ pub fn format_url(
|
||||
queries: &[(String, String)],
|
||||
extension: Option<&str>,
|
||||
tx_stats: UnboundedSender<StatCommand>,
|
||||
) -> FeroxResult<Url> {
|
||||
) -> Result<Url> {
|
||||
log::trace!(
|
||||
"enter: format_url({}, {}, {}, {:?} {:?}, {:?})",
|
||||
url,
|
||||
@@ -214,7 +214,7 @@ pub fn format_url(
|
||||
update_stat!(tx_stats, AddError(UrlFormat));
|
||||
|
||||
log::trace!("exit: format_url -> {}", err);
|
||||
return Err(Box::new(err));
|
||||
bail!("{}", err);
|
||||
}
|
||||
|
||||
// from reqwest::Url::join
|
||||
@@ -284,7 +284,7 @@ pub fn format_url(
|
||||
update_stat!(tx_stats, AddError(UrlFormat));
|
||||
log::trace!("exit: format_url -> {}", e);
|
||||
log::error!("Could not join {} with {}", word, base_url);
|
||||
Err(Box::new(e))
|
||||
bail!("{}", e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user