extractor restructure mostly done

This commit is contained in:
epi
2021-01-16 08:07:38 -06:00
parent 4b2af18ae2
commit 269ae86201
11 changed files with 1050 additions and 630 deletions

View File

@@ -1,504 +0,0 @@
use crate::{
client,
config::{Configuration, CONFIGURATION},
scanner::SCANNED_URLS,
statistics::{
StatCommand::{self, UpdateUsizeField},
StatField::{LinksExtracted, TotalExpected},
},
utils::{format_url, make_request},
FeroxResponse,
};
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::Url;
use std::collections::HashSet;
use tokio::sync::mpsc::UnboundedSender;
/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
///
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
///
/// The pattern begins and ends with a single or double quote, so a full match (capture
/// group 0) still carries the surrounding quote characters.
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;
/// Regular expression to pull url paths from robots.txt
///
/// Matches `Allow:` and `Disallow:` lines; the path that follows the directive is exposed
/// through the named capture group `url_path`.
///
/// ref: https://developers.google.com/search/reference/robots_txt
const ROBOTS_TXT_REGEX: &str =
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // (?m) is multi-line mode: ^ and $ anchor at each line
// Compiled forms of the two patterns above. `lazy_static!` builds each one on first use;
// the `unwrap` calls panic if a pattern fails to compile.
lazy_static! {
/// `LINKFINDER_REGEX` as a regex::Regex type
static ref LINKS_REGEX: Regex = Regex::new(LINKFINDER_REGEX).unwrap();
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
static ref ROBOTS_REGEX: Regex = Regex::new(ROBOTS_TXT_REGEX).unwrap();
}
/// Expand a url path into itself plus every one of its parent directories
///
/// Empty segments (leading, trailing, or doubled `/`) are discarded first. The first entry
/// returned is the full path with no trailing slash; each entry after that is a parent
/// directory and always ends in `/`.
///
/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
/// the following fragments would be returned:
///   - homepage/assets/img/icons/handshake.svg
///   - homepage/assets/img/icons/
///   - homepage/assets/img/
///   - homepage/assets/
///   - homepage/
fn get_sub_paths_from_path(path: &str) -> Vec<String> {
    log::trace!("enter: get_sub_paths_from_path({})", path);

    let segments: Vec<&str> = path.split('/').filter(|seg| !seg.is_empty()).collect();
    let total = segments.len();

    // walk the prefixes from longest to shortest
    let paths: Vec<String> = (1..=total)
        .rev()
        .map(|end| {
            let joined = segments[..end].join("/");
            if end == total {
                // the complete path is kept as-is (it may be a file)
                joined
            } else {
                // anything shorter is a parent folder, mark it with a trailing slash
                format!("{}/", joined)
            }
        })
        .collect();

    log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
    paths
}
/// Resolve `link` against `url` and store the resulting absolute url in `links`
///
/// When the join fails, the error is logged and `links` is left untouched.
fn add_link_to_set_of_links(link: &str, url: &Url, links: &mut HashSet<String>) {
    log::trace!(
        "enter: add_link_to_set_of_links({}, {}, {:?})",
        link,
        url.to_string(),
        links
    );

    let joined = url.join(link).map(|new_url| new_url.to_string());

    if let Err(e) = joined.map(|absolute| links.insert(absolute)) {
        log::error!("Could not join given url to the base url: {}", e);
    }

    log::trace!("exit: add_link_to_set_of_links");
}
/// Given a `reqwest::Response`, perform the following actions
/// - parse the response's text for links using the linkfinder regex
/// - for every link found take its url path and parse each sub-path
/// - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
/// with a base url of http://localhost, the following urls would be returned:
/// - homepage/assets/img/icons/handshake.svg
/// - homepage/assets/img/icons/
/// - homepage/assets/img/
/// - homepage/assets/
/// - homepage/
pub async fn get_links(
response: &FeroxResponse,
tx_stats: UnboundedSender<StatCommand>,
) -> HashSet<String> {
log::trace!(
"enter: get_links({}, {:?})",
response.url().as_str(),
tx_stats
);
let mut links = HashSet::<String>::new();
let body = response.text();
for capture in LINKS_REGEX.captures_iter(&body) {
// remove single & double quotes from both ends of the capture
// capture[0] is the entire match, additional capture groups start at [1]
let link = capture[0].trim_matches(|c| c == '\'' || c == '"');
match Url::parse(link) {
Ok(absolute) => {
if absolute.domain() != response.url().domain()
|| absolute.host() != response.url().host()
{
// domains/ips are not the same, don't scan things that aren't part of the original
// target url
continue;
}
add_all_sub_paths(absolute.path(), &response, &mut links);
}
Err(e) => {
// this is the expected error that happens when we try to parse a url fragment
// ex: Url::parse("/login") -> Err("relative URL without a base")
// while this is technically an error, these are good results for us
if e.to_string().contains("relative URL without a base") {
add_all_sub_paths(link, &response, &mut links);
} else {
// unexpected error has occurred
log::error!("Could not parse given url: {}", e);
}
}
}
}
let multiplier = CONFIGURATION.extensions.len().max(1);
update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, links.len()));
update_stat!(
tx_stats,
UpdateUsizeField(TotalExpected, links.len() * multiplier)
);
log::trace!("exit: get_links -> {:?}", links);
links
}
/// Expand `url_path` into all of its sub-paths and add each one, joined to the url of
/// `response`, to `links`
///
/// for a fragment like homepage/assets/img/icons/handshake.svg that means the fragment
/// itself plus
///   - homepage/assets/img/icons/
///   - homepage/assets/img/
///   - homepage/assets/
///   - homepage/
fn add_all_sub_paths(url_path: &str, response: &FeroxResponse, links: &mut HashSet<String>) {
    log::trace!(
        "enter: add_all_sub_paths({}, {}, {:?})",
        url_path,
        response,
        links
    );

    get_sub_paths_from_path(url_path)
        .iter()
        .for_each(|sub_path| {
            log::debug!("Adding {} to {:?}", sub_path, links);
            add_link_to_set_of_links(sub_path, &response.url(), links);
        });

    log::trace!("exit: add_all_sub_paths");
}
/// Request an extracted link and wrap the answer in a `FeroxResponse`
///
/// currently used in two places:
///   - links from response bodies
///   - links from robots.txt responses
///
/// Returns `None` when
///   - a url can't be built from `url` and the cli options/args
///   - the resulting url has already been seen/scanned
///   - the request itself fails
pub async fn request_feroxresponse_from_new_link(
    url: &str,
    tx_stats: UnboundedSender<StatCommand>,
) -> Option<FeroxResponse> {
    log::trace!(
        "enter: request_feroxresponse_from_new_link({}, {:?})",
        url,
        tx_stats
    );

    // apply the command line options (slash, queries) to the extracted link
    let new_url = format_url(
        &url,
        &"",
        CONFIGURATION.add_slash,
        &CONFIGURATION.queries,
        None,
        tx_stats.clone(),
    )
    .map_err(|_| log::trace!("exit: request_feroxresponse_from_new_link -> None"))
    .ok()?;

    let already_scanned = SCANNED_URLS.get_scan_by_url(&new_url.to_string()).is_some();

    if already_scanned {
        // nothing new to learn from a url that's been seen before
        log::trace!("exit: request_feroxresponse_from_new_link -> None");
        return None;
    }

    let new_response = make_request(&CONFIGURATION.client, &new_url, tx_stats)
        .await
        .map_err(|_| log::trace!("exit: request_feroxresponse_from_new_link -> None"))
        .ok()?;

    let new_ferox_response = FeroxResponse::from(new_response, true).await;

    log::trace!(
        "exit: request_feroxresponse_from_new_link -> {:?}",
        new_ferox_response
    );
    Some(new_ferox_response)
}
/// Request /robots.txt from the root of `base_url`
///
/// example:
///     http://localhost/api/users -> http://localhost/robots.txt
///
/// Whatever path `base_url` carries is overwritten; the request always goes to
/// base url + /robots.txt. Returns `None` if `base_url` can't be parsed or the request fails.
pub async fn request_robots_txt(
    base_url: &str,
    config: &Configuration,
    tx_stats: UnboundedSender<StatCommand>,
) -> Option<FeroxResponse> {
    log::trace!(
        "enter: get_robots_file({}, CONFIGURATION, {:?})",
        base_url,
        tx_stats
    );

    // an empty proxy string means no proxy was configured
    let proxy = Some(config.proxy.as_str()).filter(|proxy| !proxy.is_empty());

    // domain/robots.txt very often redirects to www.domain/robots.txt (or similar), so this
    // dedicated client always follows redirects, no matter what the scanning client does.
    // Every other user specified setting is respected
    let client = client::initialize(
        config.timeout,
        &config.user_agent,
        true,
        config.insecure,
        &config.headers,
        proxy,
    );

    let mut url = Url::parse(base_url).ok()?;
    url.set_path("/robots.txt"); // replaces any existing path

    let response = make_request(&client, &url, tx_stats).await.ok()?;
    let ferox_response = FeroxResponse::from(response, true).await;

    log::trace!("exit: get_robots_file -> {}", ferox_response);
    Some(ferox_response)
}
/// Entry point to perform link extraction from robots.txt
///
/// `base_url` may contain paths and subpaths, but robots.txt is always requested from the
/// root of the url
///     given the url:
///         http://localhost/stuff/things
///     this function requests:
///         http://localhost/robots.txt
///
/// Every Allow/Disallow path found is expanded into its sub-paths. The number of links
/// found (and the matching expected request count) is sent over `tx_stats`.
pub async fn extract_robots_txt(
    base_url: &str,
    config: &Configuration,
    tx_stats: UnboundedSender<StatCommand>,
) -> HashSet<String> {
    log::trace!(
        "enter: extract_robots_txt({}, CONFIGURATION, {:?})",
        base_url,
        tx_stats
    );

    let mut links = HashSet::new();

    if let Some(response) = request_robots_txt(&base_url, &config, tx_stats.clone()).await {
        let robots_paths = ROBOTS_REGEX
            .captures_iter(response.text.as_str())
            .filter_map(|capture| capture.name("url_path"));

        for robots_path in robots_paths {
            let mut new_url = match Url::parse(base_url) {
                Ok(parsed) => parsed,
                Err(_) => continue,
            };

            new_url.set_path(robots_path.as_str());
            add_all_sub_paths(new_url.path(), &response, &mut links);
        }
    }

    let multiplier = CONFIGURATION.extensions.len().max(1);
    let found = links.len();

    update_stat!(tx_stats, UpdateUsizeField(LinksExtracted, found));
    update_stat!(tx_stats, UpdateUsizeField(TotalExpected, found * multiplier));

    log::trace!("exit: extract_robots_txt -> {:?}", links);
    links
}
#[cfg(test)]
mod tests {
use super::*;
use crate::utils::make_request;
use crate::FeroxChannel;
use httpmock::Method::GET;
use httpmock::MockServer;
use reqwest::Client;
use tokio::sync::mpsc;
#[test]
/// extract sub paths from the given url fragment; expect 5 sub paths (the full path plus
/// its 4 parent directories) and that all are in the expected array
fn extractor_get_sub_paths_from_path_with_multiple_paths() {
    let expected = [
        "homepage/",
        "homepage/assets/",
        "homepage/assets/img/",
        "homepage/assets/img/icons/",
        "homepage/assets/img/icons/handshake.svg",
    ];

    let paths = get_sub_paths_from_path("homepage/assets/img/icons/handshake.svg");

    assert_eq!(paths.len(), expected.len());
    for wanted in expected.iter() {
        assert!(paths.contains(&wanted.to_string()));
    }
}
#[test]
/// extract sub paths from a fragment wrapped in slashes; expect 2 sub paths, all of them
/// in the expected array, and no empty strings produced by the surrounding slashes
fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
    let expected = ["homepage/", "homepage/assets"];

    let paths = get_sub_paths_from_path("/homepage/assets/");

    assert_eq!(paths.len(), expected.len());
    for wanted in expected.iter() {
        assert!(paths.contains(&wanted.to_string()));
    }
}
#[test]
/// extract sub paths from a single word; expect 1 sub path that contains no forward
/// slashes
fn extractor_get_sub_paths_from_path_with_only_a_word() {
    let expected = ["homepage"];

    let paths = get_sub_paths_from_path("homepage");

    assert_eq!(paths.len(), expected.len());
    for wanted in expected.iter() {
        assert!(paths.contains(&wanted.to_string()));
    }
}
#[test]
/// extract sub paths from a single word with a leading slash; expect 1 sub path with the
/// forward slash removed
fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
    let expected = ["homepage"];

    let paths = get_sub_paths_from_path("/homepage");

    assert_eq!(paths.len(), expected.len());
    for wanted in expected.iter() {
        assert!(paths.contains(&wanted.to_string()));
    }
}
#[test]
/// happy path: a full url and a fragment are joined correctly and the result ends up in
/// the given set
fn extractor_add_link_to_set_of_links_happy_path() {
    let base = Url::parse("https://localhost").unwrap();
    let mut links: HashSet<String> = HashSet::new();

    assert_eq!(links.len(), 0);

    add_link_to_set_of_links("admin", &base, &mut links);

    assert_eq!(links.len(), 1);
    assert!(links.contains("https://localhost/admin"));
}
#[test]
/// a path fragment that can't be joined to the base url must not add anything to the set
fn extractor_add_link_to_set_of_links_with_non_base_url() {
    let base = Url::parse("https://localhost").unwrap();
    let mut links: HashSet<String> = HashSet::new();

    assert_eq!(links.len(), 0);

    add_link_to_set_of_links("\\\\\\\\", &base, &mut links);

    assert_eq!(links.len(), 0);
    assert!(links.is_empty());
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// use make_request to generate a Response, and use the Response to test get_links;
/// the response will contain an absolute path to a domain that is not part of the scanned
/// domain; expect an empty set returned
async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain(
) -> Result<(), Box<dyn std::error::Error>> {
let srv = MockServer::start();
// the mocked body is a single quoted absolute url that points at a different host
let mock = srv.mock(|when, then|{
when.method(GET)
.path("/some-path");
then.status(200)
.body("\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"");
});
let client = Client::new();
let url = Url::parse(&srv.url("/some-path")).unwrap();
// only the sending half of the stats channel is kept; the receiver is discarded
let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
let response = make_request(&client, &url, tx.clone()).await.unwrap();
let ferox_response = FeroxResponse::from(response, true).await;
let links = get_links(&ferox_response, tx).await;
// the foreign url must have been skipped, and the mock hit exactly once (by make_request)
assert!(links.is_empty());
assert_eq!(mock.hits(), 1);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// test that /robots.txt is correctly requested given a base url (happy path); the request
/// is made twice, once with an empty proxy setting and once with a proxy configured
async fn request_robots_txt_with_and_without_proxy() {
let srv = MockServer::start();
let mock = srv.mock(|when, then| {
when.method(GET).path("/robots.txt");
then.status(200).body("this is a test");
});
let mut config = Configuration::default();
// only the sending half of the stats channel is kept; the receiver is discarded
let (tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
// first request: `config.proxy` is still whatever `Configuration::default()` provides
request_robots_txt(&srv.url("/api/users/stuff/things"), &config, tx.clone()).await;
// note: the proxy doesn't actually do anything other than hit a different code branch
// in this unit test; it would however have an effect on an integration test
config.proxy = srv.url("/ima-proxy");
request_robots_txt(&srv.url("/api/different/path"), &config, tx).await;
// both calls used different paths, yet each must have landed on /robots.txt
assert_eq!(mock.hits(), 2);
}
}

171
src/extractor/builder.rs Normal file
View File

@@ -0,0 +1,171 @@
use super::*;
use anyhow::{bail, Result};
/// Regular expression used in [LinkFinder](https://github.com/GerbenJavado/LinkFinder)
///
/// Incorporates change from this [Pull Request](https://github.com/GerbenJavado/LinkFinder/pull/66/files)
///
/// The pattern begins and ends with a single or double quote, so a full match (capture
/// group 0) still carries the surrounding quote characters.
const LINKFINDER_REGEX: &str = r#"(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-.]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')"#;
/// Regular expression to pull url paths from robots.txt
///
/// Matches `Allow:` and `Disallow:` lines; the path that follows the directive is exposed
/// through the named capture group `url_path`.
///
/// ref: https://developers.google.com/search/reference/robots_txt
const ROBOTS_TXT_REGEX: &str =
r#"(?m)^ *(Allow|Disallow): *(?P<url_path>[a-zA-Z0-9._/?#@!&'()+,;%=-]+?)$"#; // (?m) is multi-line mode: ^ and $ anchor at each line
/// Which type of extraction should be performed
///
/// Passed to `ExtractorBuilder::target` and stored on the built `Extractor`.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionTarget {
/// Examine a response body and extract links
ResponseBody,
/// Examine robots.txt (specifically) and extract links
RobotsTxt,
}
/// responsible for building an `Extractor`
///
/// Start with `with_response` or `with_url`, chain the builder calls to fill in the
/// remaining fields, then call `build`.
pub struct ExtractorBuilder<'a> {
/// Response from which to extract links; `None` when created through `with_url`
response: Option<&'a FeroxResponse>,
/// Url from which to extract links; empty when created through `with_response`
url: String,
/// Configuration that the built `Extractor` will use
config: Option<&'a Configuration>,
/// transmitter to the mpsc that handles statistics gathering
tx_stats: Option<UnboundedSender<StatCommand>>,
/// transmitter to the mpsc that handles recursive scan calls
tx_recursion: Option<UnboundedSender<String>>,
/// transmitter to the mpsc that handles reporting information to the user
tx_reporter: Option<UnboundedSender<FeroxResponse>>,
/// list of urls that will be added to when new urls are extracted
scanned_urls: Option<&'a FeroxScans>,
/// depth at which the scan was started
depth: Option<usize>,
/// copy of Stats object
stats: Option<Arc<Stats>>,
/// type of extraction to be performed
target: Option<ExtractionTarget>,
}
/// ExtractorBuilder implementation
impl<'a> ExtractorBuilder<'a> {
/// Given a FeroxResponse, create new ExtractorBuilder
///
/// Only `response` is populated. Every other field, `target` included, starts out unset
/// and has to be supplied through the builder calls before `build` is used.
pub fn with_response(response: &'a FeroxResponse) -> Self {
    Self {
        url: String::new(),
        response: Some(response),
        target: None,
        config: None,
        scanned_urls: None,
        depth: None,
        stats: None,
        tx_stats: None,
        tx_recursion: None,
        tx_reporter: None,
    }
}
/// Given a url, create new ExtractorBuilder
///
/// Only `url` is populated. Every other field, `target` included, starts out unset and
/// has to be supplied through the builder calls before `build` is used.
pub fn with_url(url: &str) -> Self {
    Self {
        url: url.to_owned(),
        response: None,
        target: None,
        config: None,
        scanned_urls: None,
        depth: None,
        stats: None,
        tx_stats: None,
        tx_recursion: None,
        tx_reporter: None,
    }
}
/// builder call to set `config`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn config(&mut self, config: &'a Configuration) -> &mut Self {
self.config = Some(config);
self
}
/// builder call to set `tx_recursion`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn recursion_transmitter(&mut self, tx_recursion: UnboundedSender<String>) -> &mut Self {
self.tx_recursion = Some(tx_recursion);
self
}
/// builder call to set `tx_stats`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn stats_transmitter(&mut self, tx_stats: UnboundedSender<StatCommand>) -> &mut Self {
self.tx_stats = Some(tx_stats);
self
}
/// builder call to set `tx_reporter`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn reporter_transmitter(
&mut self,
tx_reporter: UnboundedSender<FeroxResponse>,
) -> &mut Self {
self.tx_reporter = Some(tx_reporter);
self
}
/// builder call to set `scanned_urls`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn scanned_urls(&mut self, scanned_urls: &'a FeroxScans) -> &mut Self {
self.scanned_urls = Some(scanned_urls);
self
}
/// builder call to set `stats`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn stats(&mut self, stats: Arc<Stats>) -> &mut Self {
self.stats = Some(stats);
self
}
/// builder call to set `depth`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn depth(&mut self, depth: usize) -> &mut Self {
self.depth = Some(depth);
self
}
/// builder call to set `target`
///
/// Required: `build` cannot produce an `Extractor` without it. Returns `&mut Self` so
/// that calls can be chained.
pub fn target(&mut self, target: ExtractionTarget) -> &mut Self {
self.target = Some(target);
self
}
pub fn build(&self) -> Result<Extractor<'a>> {
if self.url.is_empty() && self.response.is_none() {
bail!("Extractor requires either a URL or a FeroxResponse be specified")
}
Ok(Extractor {
links_regex: Regex::new(LINKFINDER_REGEX).unwrap(),
robots_regex: Regex::new(ROBOTS_TXT_REGEX).unwrap(),
response: if self.response.is_some() {
Some(self.response.unwrap())
} else {
None
},
url: self.url.to_owned(),
config: self.config.unwrap(),
tx_stats: self.tx_stats.as_ref().unwrap().clone(),
tx_recursion: self.tx_recursion.as_ref().unwrap().clone(),
tx_reporter: self.tx_reporter.as_ref().unwrap().clone(),
scanned_urls: self.scanned_urls.unwrap(),
depth: self.depth.unwrap(),
stats: self.stats.as_ref().unwrap().clone(),
target: self.target.unwrap(),
})
}
}

408
src/extractor/container.rs Normal file
View File

@@ -0,0 +1,408 @@
use super::*;
use crate::{
client,
scanner::{send_report, should_filter_response, try_recursion},
statistics::{
StatCommand::UpdateUsizeField,
StatField::{LinksExtracted, TotalExpected},
},
update_stat,
utils::{format_url, make_request},
};
use anyhow::{bail, Context, Result};
use reqwest::{StatusCode, Url};
use std::collections::HashSet;
/// Whether an active scan is recursive or not
///
/// `Extractor::extract` derives this from `config.no_recursion`.
#[derive(Debug)]
enum RecursionStatus {
/// Scan is recursive
Recursive,
/// Scan is not recursive
NotRecursive,
}
/// Handles all logic related to extracting links from requested source code
#[derive(Debug)]
pub struct Extractor<'a> {
/// `LINKFINDER_REGEX` as a regex::Regex type
pub(super) links_regex: Regex,
/// `ROBOTS_TXT_REGEX` as a regex::Regex type
pub(super) robots_regex: Regex,
/// Response from which to extract links; read when `target` is `ResponseBody`
pub(super) response: Option<&'a FeroxResponse>,
/// Url used as the base when `target` is `RobotsTxt`
pub(super) url: String,
/// Configuration consulted for recursion, url formatting and client settings
pub(super) config: &'a Configuration,
/// transmitter to the mpsc that handles statistics gathering
pub(super) tx_stats: UnboundedSender<StatCommand>,
/// transmitter to the mpsc that handles recursive scan calls
pub(super) tx_recursion: UnboundedSender<String>,
/// transmitter to the mpsc that handles reporting information to the user
pub(super) tx_reporter: UnboundedSender<FeroxResponse>,
/// list of urls that will be added to when new urls are extracted
pub(super) scanned_urls: &'a FeroxScans,
/// depth at which the scan was started
pub(super) depth: usize,
/// copy of Stats object
pub(super) stats: Arc<Stats>,
/// type of extraction to be performed
pub(super) target: ExtractionTarget,
}
/// Extractor implementation
impl<'a> Extractor<'a> {
/// Run the extraction selected by `target`, then request every extracted link
///
/// For each link that can be requested and isn't filtered out:
///   - files are added to the scanned urls and reported
///   - anything else is handed to `try_recursion`, unless recursion is disabled
pub async fn extract(&self) -> Result<()> {
    let links = match self.target {
        ExtractionTarget::ResponseBody => self.extract_from_body().await?,
        ExtractionTarget::RobotsTxt => self.extract_from_robots().await?,
    };

    let recursive = match self.config.no_recursion {
        true => RecursionStatus::NotRecursive,
        false => RecursionStatus::Recursive,
    };

    for link in links {
        // todo rename get_feroxresponse_from_link
        let mut resp = if let Ok(resp) = self.get_feroxresponse_from_link(&link).await {
            resp
        } else {
            continue;
        };

        if should_filter_response(&resp, self.tx_stats.clone()) {
            // a filter matched, nothing to report or recurse into
            continue;
        }

        if resp.is_file() {
            // very likely a file: record it, report it, move on
            log::debug!("Extracted file: {}", resp);

            let file_url = resp.url().to_string();
            self.scanned_urls
                .add_file_scan(&file_url, self.stats.clone());

            send_report(self.tx_reporter.clone(), resp);
            continue;
        }

        if matches!(recursive, RecursionStatus::NotRecursive) {
            continue;
        }

        log::debug!("Extracted Directory: {}", resp);

        let has_trailing_slash = resp.url().as_str().ends_with('/');
        let success_or_forbidden =
            resp.status().is_success() || matches!(resp.status(), &StatusCode::FORBIDDEN);

        if !has_trailing_slash && success_or_forbidden {
            // a 2xx/403 url without a trailing slash gets one appended before recursion is
            // attempted. Both adding and not adding the slash have merit; it's left in
            // for now to see how things turn out (current as of: v1.1.0)
            resp.set_url(&format!("{}/", resp.url()));
        }

        try_recursion(&resp, self.depth, self.tx_recursion.clone()).await;
    }

    Ok(())
}
/// Extract every link found in the body of `self.response`
///
/// - the body is searched with the linkfinder regex
/// - absolute urls are only kept when their domain and host match the response's url
/// - relative fragments are kept as-is
/// - each kept link is expanded into its sub-paths
///     - example: Response contains a link fragment `homepage/assets/img/icons/handshake.svg`
///       with a base url of http://localhost, the following urls would be returned:
///         - homepage/assets/img/icons/handshake.svg
///         - homepage/assets/img/icons/
///         - homepage/assets/img/
///         - homepage/assets/
///         - homepage/
///
/// # Errors
///
/// Returns an error (rather than panicking) when the `Extractor` holds no response, e.g.
/// when it was built from a url only.
pub(super) async fn extract_from_body(&self) -> Result<HashSet<String>> {
    log::trace!("enter: get_links");

    let response = self
        .response
        .context("cannot extract links from a response body: no FeroxResponse was given")?;

    let mut links = HashSet::<String>::new();
    let body = response.text();

    for capture in self.links_regex.captures_iter(&body) {
        // remove single & double quotes from both ends of the capture
        // capture[0] is the entire match, additional capture groups start at [1]
        let link = capture[0].trim_matches(|c| c == '\'' || c == '"');

        match Url::parse(link) {
            Ok(absolute) => {
                if absolute.domain() != response.url().domain()
                    || absolute.host() != response.url().host()
                {
                    // domains/ips are not the same, don't scan things that aren't part of
                    // the original target url
                    continue;
                }

                if self.add_all_sub_paths(absolute.path(), &mut links).is_err() {
                    log::warn!("could not add sub-paths from {} to {:?}", absolute, links);
                }
            }
            Err(e) => {
                // this is the expected error that happens when we try to parse a url fragment
                //     ex: Url::parse("/login") -> Err("relative URL without a base")
                // while this is technically an error, these are good results for us
                if e.to_string().contains("relative URL without a base") {
                    if self.add_all_sub_paths(link, &mut links).is_err() {
                        log::warn!("could not add sub-paths from {} to {:?}", link, links);
                    }
                } else {
                    // unexpected error has occurred
                    log::error!("Could not parse given url: {}", e);
                }
            }
        }
    }

    self.update_stats(links.len());

    log::trace!("exit: get_links -> {:?}", links);
    Ok(links)
}
/// Expand `url_path` into all of its sub-paths and add each of them to `links`
///
/// for a fragment like homepage/assets/img/icons/handshake.svg that means the fragment
/// itself plus
///   - homepage/assets/img/icons/
///   - homepage/assets/img/
///   - homepage/assets/
///   - homepage/
///
/// Stops at, and returns, the first error reported while adding a sub-path.
fn add_all_sub_paths(&self, url_path: &str, links: &mut HashSet<String>) -> Result<()> {
    log::trace!("enter: add_all_sub_paths({}, {:?})", url_path, links);

    self.get_sub_paths_from_path(url_path)
        .iter()
        .try_for_each(|sub_path| self.add_link_to_set_of_links(sub_path, links))?;

    log::trace!("exit: add_all_sub_paths");
    Ok(())
}
/// Expand a url path into itself plus every one of its parent directories
///
/// Empty segments (leading, trailing, or doubled `/`) are discarded first. The first entry
/// returned is the full path with no trailing slash; each entry after that is a parent
/// directory and always ends in `/`.
///
/// example: `path` contains a link fragment `homepage/assets/img/icons/handshake.svg`
/// the following fragments would be returned:
///   - homepage/assets/img/icons/handshake.svg
///   - homepage/assets/img/icons/
///   - homepage/assets/img/
///   - homepage/assets/
///   - homepage/
pub(super) fn get_sub_paths_from_path(&self, path: &str) -> Vec<String> {
    log::trace!("enter: get_sub_paths_from_path({})", path);

    let segments: Vec<&str> = path.split('/').filter(|seg| !seg.is_empty()).collect();
    let total = segments.len();

    // walk the prefixes from longest to shortest
    let paths: Vec<String> = (1..=total)
        .rev()
        .map(|end| {
            let joined = segments[..end].join("/");
            if end == total {
                // the complete path is kept as-is (it may be a file)
                joined
            } else {
                // anything shorter is a parent folder, mark it with a trailing slash
                format!("{}/", joined)
            }
        })
        .collect();

    log::trace!("exit: get_sub_paths_from_path -> {:?}", paths);
    paths
}
/// simple helper to stay DRY, tries to join a url + fragment and add it to the `links` HashSet
///
/// The base url depends on `target`: the response's url for `ResponseBody`, the parsed
/// `self.url` for `RobotsTxt`.
///
/// # Errors
///
/// Returns an error when
///   - `target` is `ResponseBody` but no response is held (previously a panic)
///   - `self.url` can't be parsed (`RobotsTxt` only)
///   - `link` can't be joined to the base url
pub(super) fn add_link_to_set_of_links(
    &self,
    link: &str,
    links: &mut HashSet<String>,
) -> Result<()> {
    log::trace!("enter: add_link_to_set_of_links({}, {:?})", link, links);

    let old_url = match self.target {
        ExtractionTarget::ResponseBody => self
            .response
            .context("cannot join link to a response url: no FeroxResponse was given")?
            .url
            .clone(),
        ExtractionTarget::RobotsTxt => match Url::parse(&self.url) {
            Ok(u) => u,
            Err(e) => {
                bail!("Could not parse {}: {}", self.url, e);
            }
        },
    };

    let new_url = old_url
        .join(link)
        .with_context(|| format!("Could not join {} with {}", old_url, link))?;

    links.insert(new_url.to_string());

    log::trace!("exit: add_link_to_set_of_links");
    Ok(())
}
/// Request an extracted link and wrap the answer in a `FeroxResponse`
///
/// currently used in two places:
///   - links from response bodies
///   - links from robots.txt responses
///
/// Returns an error when
///   - a url can't be built from `url` and the cli options/args
///   - the resulting url has already been seen/scanned
///   - the request itself fails
pub(super) async fn get_feroxresponse_from_link(&self, url: &str) -> Result<FeroxResponse> {
    log::trace!("enter: get_feroxresponse_from_link({})", url);

    // apply the command line options (slash, queries) to the extracted link
    let new_url = format_url(
        &url,
        &"",
        self.config.add_slash,
        &self.config.queries,
        None,
        self.tx_stats.clone(),
    )?;

    let already_seen = self
        .scanned_urls
        .get_scan_by_url(&new_url.to_string())
        .is_some();

    if already_seen {
        // nothing new to learn from a url that's been seen before
        log::trace!("exit: get_feroxresponse_from_link -> None");
        bail!("previously seen url");
    }

    let tx_stats = self.tx_stats.clone();
    let new_response = make_request(&self.config.client, &new_url, tx_stats).await?;

    let new_ferox_response = FeroxResponse::from(new_response, true).await;

    log::trace!(
        "exit: get_feroxresponse_from_link -> {:?}",
        new_ferox_response
    );
    Ok(new_ferox_response)
}
/// Entry point to perform link extraction from robots.txt
///
/// `self.url` may contain paths and subpaths, but robots.txt is always requested from the
/// root of the url
///     given the url:
///         http://localhost/stuff/things
///     this function requests:
///         http://localhost/robots.txt
///
/// Every Allow/Disallow path found is expanded into its sub-paths; failures to add a
/// path are logged as warnings and do not abort the extraction.
pub(super) async fn extract_from_robots(&self) -> Result<HashSet<String>> {
    log::trace!("enter: extract_robots_txt");

    let mut links: HashSet<String> = HashSet::new();

    let response = self.request_robots_txt().await?;

    let robots_paths = self
        .robots_regex
        .captures_iter(response.text.as_str())
        .filter_map(|capture| capture.name("url_path"));

    for robots_path in robots_paths {
        let mut new_url = Url::parse(&self.url)?;
        new_url.set_path(robots_path.as_str());

        let added = self.add_all_sub_paths(new_url.path(), &mut links);
        if added.is_err() {
            log::warn!("could not add sub-paths from {} to {:?}", new_url, links);
        }
    }

    self.update_stats(links.len());

    log::trace!("exit: extract_robots_txt -> {:?}", links);
    Ok(links)
}
/// Request /robots.txt from the root of `self.url`
///
/// example:
///     http://localhost/api/users -> http://localhost/robots.txt
///
/// Whatever path `self.url` carries is overwritten; the request always goes to
/// base url + /robots.txt. Fails if `self.url` can't be parsed or the request fails.
pub(super) async fn request_robots_txt(&self) -> Result<FeroxResponse> {
    log::trace!("enter: get_robots_file");

    let config = self.config;

    // an empty proxy string means no proxy was configured
    let proxy = Some(config.proxy.as_str()).filter(|proxy| !proxy.is_empty());

    // domain/robots.txt very often redirects to www.domain/robots.txt (or similar), so this
    // dedicated client always follows redirects, no matter what the scanning client does.
    // Every other user specified setting is respected
    let client = client::initialize(
        config.timeout,
        &config.user_agent,
        true,
        config.insecure,
        &config.headers,
        proxy,
    );

    let mut url = Url::parse(&self.url)?;
    url.set_path("/robots.txt"); // replaces any existing path

    let response = make_request(&client, &url, self.tx_stats.clone()).await?;
    let ferox_response = FeroxResponse::from(response, true).await;

    log::trace!("exit: get_robots_file -> {}", ferox_response);
    Ok(ferox_response)
}
/// Report `num_links` as extracted links, and `num_links` times the number of configured
/// extensions (at least 1) as additional expected responses
fn update_stats(&self, num_links: usize) {
    let per_link = self.config.extensions.len().max(1);

    update_stat!(self.tx_stats, UpdateUsizeField(LinksExtracted, num_links));
    update_stat!(
        self.tx_stats,
        UpdateUsizeField(TotalExpected, num_links * per_link)
    );
}
}

19
src/extractor/mod.rs Normal file
View File

@@ -0,0 +1,19 @@
//! extract links from html source and robots.txt
mod builder;
mod container;
#[cfg(test)]
mod tests;
pub use self::builder::ExtractionTarget;
pub use self::builder::ExtractorBuilder;
pub use self::container::Extractor;
use crate::{
config::Configuration,
scan_manager::FeroxScans,
statistics::{StatCommand, Stats},
FeroxResponse,
};
use regex::Regex;
use std::sync::Arc;
use tokio::sync::mpsc::UnboundedSender;

372
src/extractor/tests.rs Normal file
View File

@@ -0,0 +1,372 @@
use super::*;
use crate::utils::make_request;
use crate::FeroxChannel;
use anyhow::Result;
use httpmock::Method::GET;
use httpmock::MockServer;
use lazy_static::lazy_static;
use reqwest::{header::HeaderMap, Client, StatusCode, Url};
use std::collections::HashSet;
use tokio::sync::mpsc;
lazy_static! {
    /// Configuration (built via `Configuration::new`) handed to the shared test Extractors
    static ref CONFIG: Configuration = Configuration::new();

    /// FeroxScans (built via `FeroxScans::default`) handed to the shared test Extractors
    static ref SCANS: FeroxScans = FeroxScans::default();

    /// canned FeroxResponse that backs the response-body Extractor
    static ref RESPONSE: FeroxResponse = get_test_response();

    /// shared Extractor whose target is robots.txt
    static ref ROBOTS_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::RobotsTxt);

    /// shared Extractor whose target is a response body
    static ref BODY_EXT: Extractor<'static> = setup_extractor(ExtractionTarget::ResponseBody);
}
/// build the canned `FeroxResponse` used by the shared test fixtures: a 200 OK for
/// `https://localhost` with an empty body, no headers, and fixed size/word/line counts
fn get_test_response() -> FeroxResponse {
    let url = Url::parse("https://localhost").unwrap();

    FeroxResponse {
        url,
        status: StatusCode::OK,
        headers: HeaderMap::new(),
        text: String::new(),
        content_length: 125,
        line_count: 14,
        word_count: 10,
        wildcard: true,
    }
}
/// build one standalone `Extractor` for the requested `target`, suitable for exercising
/// individual methods in isolation
///
/// a `ResponseBody` extractor is backed by the shared `RESPONSE`; a `RobotsTxt` extractor is
/// pointed at `https://localhost`
fn setup_extractor(target: ExtractionTarget) -> Extractor<'static> {
    let stats = Arc::new(Stats::new());

    // the receiving halves are discarded immediately; only the senders are needed here
    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let mut builder = match target {
        ExtractionTarget::RobotsTxt => ExtractorBuilder::with_url("https://localhost"),
        ExtractionTarget::ResponseBody => ExtractorBuilder::with_response(&RESPONSE),
    };

    builder
        .target(target)
        .depth(4)
        .config(&CONFIG)
        .recursion_transmitter(recursion_tx.clone())
        .stats_transmitter(stats_tx.clone())
        .reporter_transmitter(reporter_tx.clone())
        .scanned_urls(&SCANS)
        .stats(stats.clone())
        .build()
        .unwrap()
}
#[test]
/// extract sub paths from the given url fragment; expect 5 sub paths (one per parent
/// directory plus the full fragment) and that all are in the expected array
fn extractor_get_sub_paths_from_path_with_multiple_paths() {
    let path = "homepage/assets/img/icons/handshake.svg";

    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(path);
    let b_paths = BODY_EXT.get_sub_paths_from_path(path);

    let expected = vec![
        "homepage/",
        "homepage/assets/",
        "homepage/assets/img/",
        "homepage/assets/img/icons/",
        "homepage/assets/img/icons/handshake.svg",
    ];

    // equal lengths + every expected entry present => nothing extra was returned
    assert_eq!(r_paths.len(), expected.len());
    assert_eq!(b_paths.len(), expected.len());

    for expected_path in expected {
        let expected_path = expected_path.to_string();
        assert!(r_paths.contains(&expected_path));
        assert!(b_paths.contains(&expected_path));
    }
}
#[test]
/// extract sub paths from the given url fragment; expect 2 sub paths and that all are
/// in the expected array. the fragment is wrapped in slashes to ensure no empty strings are
/// returned
fn extractor_get_sub_paths_from_path_with_enclosing_slashes() {
    let path = "/homepage/assets/";

    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(path);
    let b_paths = BODY_EXT.get_sub_paths_from_path(path);

    let expected = vec!["homepage/", "homepage/assets"];

    // equal lengths + every expected entry present => no empty strings slipped in
    assert_eq!(r_paths.len(), expected.len());
    assert_eq!(b_paths.len(), expected.len());

    for expected_path in expected {
        let expected_path = expected_path.to_string();
        assert!(r_paths.contains(&expected_path));
        assert!(b_paths.contains(&expected_path));
    }
}
#[test]
/// extract sub paths from the given url fragment; expect 1 sub path, no forward slashes are
/// included
fn extractor_get_sub_paths_from_path_with_only_a_word() {
    let path = "homepage";

    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(path);
    let b_paths = BODY_EXT.get_sub_paths_from_path(path);

    let expected = vec!["homepage"];

    assert_eq!(r_paths.len(), expected.len());
    assert_eq!(b_paths.len(), expected.len());

    for expected_path in expected {
        let expected_path = expected_path.to_string();
        assert!(r_paths.contains(&expected_path));
        assert!(b_paths.contains(&expected_path));
    }
}
#[test]
/// extract sub paths from the given url fragment; expect 1 sub path, forward slash removed
fn extractor_get_sub_paths_from_path_with_an_absolute_word() {
    let path = "/homepage";

    let r_paths = ROBOTS_EXT.get_sub_paths_from_path(path);
    let b_paths = BODY_EXT.get_sub_paths_from_path(path);

    let expected = vec!["homepage"];

    assert_eq!(r_paths.len(), expected.len());
    assert_eq!(b_paths.len(), expected.len());

    for expected_path in expected {
        let expected_path = expected_path.to_string();
        assert!(r_paths.contains(&expected_path));
        assert!(b_paths.contains(&expected_path));
    }
}
#[test]
/// an ExtractorBuilder that was given an empty url and no FeroxResponse must fail to build
fn extractor_builder_bails_when_neither_required_field_is_set() {
    let stats = Arc::new(Stats::new());

    // only the sending halves are needed; receivers are dropped on the spot
    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let build_result = ExtractorBuilder::with_url("")
        .target(ExtractionTarget::ResponseBody)
        .depth(4)
        .config(&CONFIG)
        .recursion_transmitter(recursion_tx.clone())
        .stats_transmitter(stats_tx.clone())
        .reporter_transmitter(reporter_tx.clone())
        .scanned_urls(&SCANS)
        .stats(stats.clone())
        .build();

    assert!(build_result.is_err());
}
#[test]
/// happy path: a relative fragment is joined onto the extractor's url and the resulting
/// full url is inserted into the given set
fn extractor_add_link_to_set_of_links_happy_path() {
    let r_link = "admin";
    let b_link = "shmadmin";
    let mut r_links: HashSet<String> = HashSet::new();
    let mut b_links: HashSet<String> = HashSet::new();

    // robots.txt extractor
    assert!(r_links.is_empty());

    ROBOTS_EXT
        .add_link_to_set_of_links(r_link, &mut r_links)
        .unwrap();

    assert_eq!(r_links.len(), 1);
    assert!(r_links.contains("https://localhost/admin"));

    // response body extractor
    assert!(b_links.is_empty());

    BODY_EXT
        .add_link_to_set_of_links(b_link, &mut b_links)
        .unwrap();

    assert_eq!(b_links.len(), 1);
    assert!(b_links.contains("https://localhost/shmadmin"));
}
#[test]
/// a fragment that can't be turned into a valid link must produce an error from both
/// extractors and leave the set of links untouched
fn extractor_add_link_to_set_of_links_with_non_base_url() {
    let link = "\\\\\\\\";
    let mut links: HashSet<String> = HashSet::new();

    assert!(links.is_empty());

    let robots_result = ROBOTS_EXT.add_link_to_set_of_links(link, &mut links);
    assert!(robots_result.is_err());

    let body_result = BODY_EXT.add_link_to_set_of_links(link, &mut links);
    assert!(body_result.is_err());

    assert_eq!(links.len(), 0);
    assert!(links.is_empty());
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// serve a body whose only link is an absolute url on a domain other than the one being
/// scanned, turn the real response into a FeroxResponse via make_request, and run body
/// extraction on it; expect an empty set of links and exactly one hit on the mock
async fn extractor_get_links_with_absolute_url_that_differs_from_target_domain() -> Result<()> {
    let stats = Arc::new(Stats::new());
    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let server = MockServer::start();

    let off_domain_mock = server.mock(|when, then| {
        when.method(GET).path("/some-path");
        then.status(200).body(
            "\"http://defintely.not.a.thing.probably.com/homepage/assets/img/icons/handshake.svg\"",
        );
    });

    let http_client = Client::new();
    let target = Url::parse(&server.url("/some-path")).unwrap();

    let raw_response = make_request(&http_client, &target, stats_tx.clone())
        .await
        .unwrap();
    let ferox_response = FeroxResponse::from(raw_response, true).await;

    let extractor = ExtractorBuilder::with_response(&ferox_response)
        .target(ExtractionTarget::ResponseBody)
        .depth(4)
        .config(&CONFIG)
        .recursion_transmitter(recursion_tx.clone())
        .stats_transmitter(stats_tx.clone())
        .reporter_transmitter(reporter_tx.clone())
        .scanned_urls(&SCANS)
        .stats(stats.clone())
        .build()?;

    let extracted = extractor.extract_from_body().await?;

    assert!(extracted.is_empty());
    assert_eq!(off_domain_mock.hits(), 1);

    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// happy path: an extractor built from a url with a deep path still requests /robots.txt
/// from the server's root when no proxy is configured
async fn request_robots_txt_without_proxy() -> Result<()> {
    let config = Configuration::new();
    let stats = Arc::new(Stats::new());
    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let server = MockServer::start();

    let robots_mock = server.mock(|when, then| {
        when.method(GET).path("/robots.txt");
        then.status(200).body("this is a test");
    });

    let extractor = ExtractorBuilder::with_url(&server.url("/api/users/stuff/things"))
        .target(ExtractionTarget::RobotsTxt)
        .depth(4)
        .config(&config)
        .recursion_transmitter(recursion_tx.clone())
        .stats_transmitter(stats_tx.clone())
        .reporter_transmitter(reporter_tx.clone())
        .scanned_urls(&SCANS)
        .stats(stats.clone())
        .build()?;

    let robots_response = extractor.request_robots_txt().await?;

    assert!(matches!(robots_response.status(), &StatusCode::OK));

    // NOTE(review): presumably kept to exercise FeroxResponse's Display impl - confirm
    // before removing
    println!("{}", robots_response);

    // 14 == length of the mocked body
    assert_eq!(robots_response.content_length(), 14);
    assert_eq!(robots_mock.hits(), 1);

    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// happy path with a proxy configured: /robots.txt is still requested from the server's root
async fn request_robots_txt_with_proxy() -> Result<()> {
    let stats = Arc::new(Stats::new());
    let (recursion_tx, _): FeroxChannel<String> = mpsc::unbounded_channel();
    let (stats_tx, _): FeroxChannel<StatCommand> = mpsc::unbounded_channel();
    let (reporter_tx, _): FeroxChannel<FeroxResponse> = mpsc::unbounded_channel();

    let server = MockServer::start();

    let robots_mock = server.mock(|when, then| {
        when.method(GET).path("/robots.txt");
        then.status(200).body("this is also a test");
    });

    // note: the proxy doesn't actually do anything other than hit a different code branch
    // in this unit test; it would however have an effect on an integration test
    let mut config = Configuration::new();
    config.proxy = server.url("/ima-proxy");

    let extractor = ExtractorBuilder::with_url(&server.url("/api/different/path"))
        .target(ExtractionTarget::RobotsTxt)
        .depth(4)
        .config(&config)
        .recursion_transmitter(recursion_tx.clone())
        .stats_transmitter(stats_tx.clone())
        .reporter_transmitter(reporter_tx.clone())
        .scanned_urls(&SCANS)
        .stats(stats.clone())
        .build()?;

    let robots_response = extractor.request_robots_txt().await?;

    assert!(matches!(robots_response.status(), &StatusCode::OK));
    // 19 == length of the mocked body
    assert_eq!(robots_response.content_length(), 19);
    assert_eq!(robots_mock.hits(), 1);

    Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// get_feroxresponse_from_link's happy path, expect back a FeroxResponse
async fn get_feroxresponse_from_link_happy_path() -> Result<()> {
let srv = MockServer::start();
let mock = srv.mock(|when, then| {
when.method(GET).path("/login.php");
then.status(200).body("this is a test");
});
let r_resp = ROBOTS_EXT
.get_feroxresponse_from_link(&srv.url("/login.php"))
.await?;
let b_resp = BODY_EXT
.get_feroxresponse_from_link(&srv.url("/login.php"))
.await?;
assert!(matches!(r_resp.status(), &StatusCode::OK));
assert!(matches!(b_resp.status(), &StatusCode::OK));
assert_eq!(r_resp.content_length(), 14);
assert_eq!(b_resp.content_length(), 14);
assert_eq!(mock.hits(), 2);
Ok(())
}
#[tokio::test(flavor = "multi_thread", worker_threads = 1)]
/// once a url has been added to the shared scans, get_feroxresponse_from_link must return
/// an error for it without ever sending a request
async fn get_feroxresponse_from_link_bails_on_seen_url() -> Result<()> {
    let endpoint = "/unique-for-this-test.php";

    let server = MockServer::start();
    let full_url = server.url(endpoint);

    let endpoint_mock = server.mock(|when, then| {
        when.method(GET).path(endpoint);
        then.status(200)
            .body("this is a unique test, don't reuse the endpoint");
    });

    // mark the url as already seen before either extractor gets a chance to request it
    SCANS.add_file_scan(&full_url, ROBOTS_EXT.stats.clone());

    let from_robots = ROBOTS_EXT.get_feroxresponse_from_link(&full_url).await;
    let from_body = BODY_EXT.get_feroxresponse_from_link(&full_url).await;

    assert!(from_robots.is_err());
    assert!(from_body.is_err());
    assert_eq!(endpoint_mock.hits(), 0); // function exits before requests can happen

    Ok(())
}

36
src/filters/helpers.rs Normal file
View File

@@ -0,0 +1,36 @@
// use super::WildcardFilter;
// use crate::{
// statistics::{
// StatCommand::{self, UpdateUsizeField},
// StatField::WildcardsFiltered,
// },
// FeroxResponse,
// };
// use anyhow::Result;
// use tokio::sync::mpsc::UnboundedSender;
//
// /// Simple helper to stay DRY; determines whether or not a given `FeroxResponse` should be reported
// /// to the user or not.
// pub fn should_filter_response(
// response: &FeroxResponse,
// tx_stats: UnboundedSender<StatCommand>,
// ) -> Result<bool> {
// let filters = FILTERS
// match FILTERS.read() {
// Ok(filters) => {
// for filter in filters.iter() {
// // wildcard.should_filter goes here
// if filter.should_filter_response(&response) {
// if filter.as_any().downcast_ref::<WildcardFilter>().is_some() {
// update_stat!(tx_stats, UpdateUsizeField(WildcardsFiltered, 1))
// }
// return true;
// }
// }
// }
// Err(e) => {
// log::error!("{}", e);
// }
// }
// false
// }

View File

@@ -8,6 +8,7 @@ mod regex;
mod similarity;
#[cfg(test)]
mod tests;
mod helpers;
pub use self::lines::LinesFilter;
pub use self::regex::RegexFilter;

View File

@@ -1,19 +1,19 @@
pub mod utils;
pub mod client;
pub mod banner;
pub mod config;
pub mod extractor;
pub mod filters;
mod client;
mod event_handlers;
mod filters;
pub mod heuristics;
pub mod logger;
pub mod parser;
mod parser;
pub mod progress;
pub mod reporter;
pub mod scan_manager;
pub mod scanner;
pub mod statistics;
mod event_handlers;
pub mod banner;
mod traits;
pub mod utils;
mod extractor;
use crate::{
traits::FeroxSerialize,

View File

@@ -5,9 +5,11 @@ use crate::{
StatCommand::{self, UpdateUsizeField},
StatField::ResourcesDiscovered,
},
update_stat,
utils::{ferox_print, make_request, open_file},
FeroxChannel, FeroxResponse, FeroxSerialize,
};
use console::strip_ansi_codes;
use std::{
fs, io,

View File

@@ -1,6 +1,6 @@
use crate::{
config::{Configuration, CONFIGURATION},
extractor::{extract_robots_txt, get_links, request_feroxresponse_from_new_link},
extractor::{ExtractionTarget, ExtractorBuilder},
filters::{
LinesFilter, RegexFilter, SimilarityFilter, SizeFilter, StatusCodeFilter, WildcardFilter,
WordsFilter,
@@ -13,6 +13,7 @@ use crate::{
Stats,
},
traits::FeroxFilter,
update_stat,
utils::{format_url, get_current_depth, make_request},
FeroxChannel, FeroxResponse, SIMILARITY_THRESHOLD,
};
@@ -307,11 +308,12 @@ fn reached_max_depth(url: &Url, base_depth: usize, max_depth: usize) -> bool {
/// Helper function that wraps logic to check for recursion opportunities
///
/// When a recursion opportunity is found, the new url is sent across the recursion channel
async fn try_recursion(
pub async fn try_recursion(
response: &FeroxResponse,
base_depth: usize,
transmitter: UnboundedSender<String>,
) {
// todo this should be part of the recursion handler
log::trace!(
"enter: try_recursion({}, {}, {:?})",
response,
@@ -433,56 +435,19 @@ async fn make_requests(
}
if CONFIGURATION.extract_links && !ferox_response.status().is_redirection() {
let new_links = get_links(&ferox_response, tx_stats.clone()).await;
let extractor = ExtractorBuilder::with_response(&ferox_response)
.target(ExtractionTarget::ResponseBody)
.depth(base_depth)
.config(&CONFIGURATION)
.recursion_transmitter(dir_chan.clone())
.stats_transmitter(tx_stats.clone())
.reporter_transmitter(report_chan.clone())
.scanned_urls(&SCANNED_URLS)
.stats(stats.clone())
.build()
.unwrap(); // todo change once this function returns Result
for new_link in new_links {
let mut new_ferox_response = match request_feroxresponse_from_new_link(
&new_link,
tx_stats.clone(),
)
.await
{
Some(resp) => resp,
None => continue,
};
// filter if necessary
if should_filter_response(&new_ferox_response, tx_stats.clone()) {
continue;
}
if new_ferox_response.is_file() {
// very likely a file, simply request and report
log::debug!("Singular extraction: {}", new_ferox_response);
SCANNED_URLS
.add_file_scan(&new_ferox_response.url().to_string(), stats.clone());
send_report(report_chan.clone(), new_ferox_response);
continue;
}
if !CONFIGURATION.no_recursion {
log::debug!("Recursive extraction: {}", new_ferox_response);
if !new_ferox_response.url().as_str().ends_with('/')
&& (new_ferox_response.status().is_success()
|| matches!(new_ferox_response.status(), &StatusCode::FORBIDDEN))
{
// if the url doesn't end with a /
// and the response code is either a 2xx or 403
// since all of these are 2xx or 403, recursion is only attempted if the
// url ends in a /. I am actually ok with adding the slash and not
// adding it, as both have merit. Leaving it in for now to see how
// things turn out (current as of: v1.1.0)
new_ferox_response.set_url(&format!("{}/", new_ferox_response.url()));
}
try_recursion(&new_ferox_response, base_depth, dir_chan.clone()).await;
}
}
let _ = extractor.extract().await;
}
// everything else should be reported
@@ -506,61 +471,6 @@ pub fn send_report(report_sender: UnboundedSender<FeroxResponse>, response: Fero
log::trace!("exit: send_report");
}
/// Pull links out of the target's /robots.txt and act on each one
///
/// Every extracted link is requested; responses that survive filtering are either reported
/// as files or, unless recursion is disabled, offered to `try_recursion`
async fn scan_robots_txt(
    target_url: &str,
    base_depth: usize,
    stats: Arc<Stats>,
    tx_term: UnboundedSender<FeroxResponse>,
    tx_dir: UnboundedSender<String>,
    tx_stats: UnboundedSender<StatCommand>,
) {
    log::trace!(
        "enter: scan_robots_txt({}, {}, {:?}, {:?}, {:?}, {:?})",
        target_url,
        base_depth,
        stats,
        tx_term,
        tx_dir,
        tx_stats
    );

    let robots_links = extract_robots_txt(&target_url, &CONFIGURATION, tx_stats.clone()).await;

    for robot_link in robots_links {
        // request the extracted link; nothing to do when no response comes back
        let requested = request_feroxresponse_from_new_link(&robot_link, tx_stats.clone()).await;

        let mut ferox_response = if let Some(resp) = requested {
            resp
        } else {
            continue;
        };

        if should_filter_response(&ferox_response, tx_stats.clone()) {
            continue;
        }

        if ferox_response.is_file() {
            // files are recorded and reported, never recursed into
            log::debug!("File extracted from robots.txt: {}", ferox_response);
            SCANNED_URLS.add_file_scan(&robot_link, stats.clone());
            send_report(tx_term.clone(), ferox_response);
            continue;
        }

        if CONFIGURATION.no_recursion {
            continue;
        }

        log::debug!("Directory extracted from robots.txt: {}", ferox_response);

        // todo near-duplicate of the slash-appending logic used for links extracted in
        // make_requests
        let has_trailing_slash = ferox_response.url().as_str().ends_with('/');
        let success_or_forbidden = ferox_response.status().is_success()
            || matches!(ferox_response.status(), &StatusCode::FORBIDDEN);

        if !has_trailing_slash && success_or_forbidden {
            // the url doesn't end with a / and the response code is either a 2xx or 403,
            // so append the slash before attempting recursion
            ferox_response.set_url(&format!("{}/", ferox_response.url()));
        }

        try_recursion(&ferox_response, base_depth, tx_dir.clone()).await;
    }

    log::trace!("exit: scan_robots_txt");
}
/// Scan a given url using a given wordlist
///
/// This is the primary entrypoint for the scanner
@@ -596,15 +506,20 @@ pub async fn scan_url(
if CONFIGURATION.extract_links {
// only grab robots.txt on the initial scan_url calls. all fresh dirs will be passed
// to try_recursion
scan_robots_txt(
target_url,
base_depth,
stats.clone(),
tx_term.clone(),
tx_dir.clone(),
tx_stats.clone(),
)
.await;
let extractor = ExtractorBuilder::with_url(target_url)
.target(ExtractionTarget::RobotsTxt)
.depth(base_depth)
.config(&CONFIGURATION)
.recursion_transmitter(tx_dir.clone())
.stats_transmitter(tx_stats.clone())
.reporter_transmitter(tx_term.clone())
.scanned_urls(&SCANNED_URLS)
.stats(stats.clone())
.build()
.unwrap(); // todo change once this function returns Result
let _ = extractor.extract().await;
}
update_stat!(tx_stats, UpdateUsizeField(TotalScans, 1));

View File

@@ -5,7 +5,7 @@ use crate::{
StatCommand::{self, AddError, AddStatus},
StatError::{Connection, Other, Redirection, Request, Timeout, UrlFormat},
},
FeroxError, FeroxResult,
FeroxError,
};
use anyhow::{bail, Context, Result};
use console::{strip_ansi_codes, style, user_attended};
@@ -184,7 +184,7 @@ pub fn format_url(
queries: &[(String, String)],
extension: Option<&str>,
tx_stats: UnboundedSender<StatCommand>,
) -> FeroxResult<Url> {
) -> Result<Url> {
log::trace!(
"enter: format_url({}, {}, {}, {:?} {:?}, {:?})",
url,
@@ -214,7 +214,7 @@ pub fn format_url(
update_stat!(tx_stats, AddError(UrlFormat));
log::trace!("exit: format_url -> {}", err);
return Err(Box::new(err));
bail!("{}", err);
}
// from reqwest::Url::join
@@ -284,7 +284,7 @@ pub fn format_url(
update_stat!(tx_stats, AddError(UrlFormat));
log::trace!("exit: format_url -> {}", e);
log::error!("Could not join {} with {}", word, base_url);
Err(Box::new(e))
bail!("{}", e)
}
}
}