From 54144dba898ab1ad0bd6aac8e013a6be5abd6bd6 Mon Sep 17 00:00:00 2001 From: godylockz <81207744+godylockz@users.noreply.github.com> Date: Fri, 14 Jan 2022 23:46:40 -0500 Subject: [PATCH] Second Cut - All Directory Listing Items Obtained --- src/extractor/builder.rs | 2 +- src/extractor/container.rs | 38 ++++++++++++++++++++++++++++++------ src/extractor/tests.rs | 4 ++-- src/scanner/ferox_scanner.rs | 20 +++++++++---------- src/scanner/requester.rs | 4 ++-- 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/src/extractor/builder.rs b/src/extractor/builder.rs index d72d416..0a5bcaa 100644 --- a/src/extractor/builder.rs +++ b/src/extractor/builder.rs @@ -23,7 +23,7 @@ pub enum ExtractionTarget { RobotsTxt, // Parse HTML and extract links - ParseHTML, + ParseHtml, } /// responsible for building an `Extractor` diff --git a/src/extractor/container.rs b/src/extractor/container.rs index 037e701..cea5982 100644 --- a/src/extractor/container.rs +++ b/src/extractor/container.rs @@ -63,7 +63,7 @@ impl<'a> Extractor<'a> { match self.target { ExtractionTarget::ResponseBody => Ok(self.extract_from_body().await?), ExtractionTarget::RobotsTxt => Ok(self.extract_from_robots().await?), - ExtractionTarget::ParseHTML => Ok(self.parse_html().await?), + ExtractionTarget::ParseHtml => Ok(self.parse_html().await?), } } @@ -153,6 +153,29 @@ impl<'a> Extractor<'a> { let body = self.response.unwrap().text(); + // Check for directory listing + if body.contains("Directory listing") { + log::debug!(" >> directory listing detected"); + } + // Parse links [Note: Update both functions] + let document = Document::from(body); + let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href"))) + .chain(document.find(Name("img")).filter_map(|n| n.attr("src"))) + .chain(document.find(Name("form")).filter_map(|n| n.attr("action"))) + .chain(document.find(Name("script")).filter_map(|n| n.attr("src"))) + .chain(document.find(Name("iframe")).filter_map(|n| n.attr("src"))) + .chain(document.find(Name("div")).filter_map(|n| n.attr("src"))) + .chain(document.find(Name("frame")).filter_map(|n| n.attr("src"))) + .chain(document.find(Name("embed")).filter_map(|n| n.attr("src"))); + for link in html_links { + log::info!(" >> found link \"{}\"", link); + let mut new_url = Url::parse(&self.url)?; + new_url.set_path(link); + if self.add_all_sub_paths(new_url.path(), &mut links).is_err() { + log::warn!("could not add sub-paths from {} to {:?}", new_url, links); + } + } + for capture in self.links_regex.captures_iter(body) { // remove single & double quotes from both ends of the capture // capture[0] is the entire match, additional capture groups start at [1] @@ -276,7 +299,7 @@ impl<'a> Extractor<'a> { bail!("Could not parse {}: {}", self.url, e); } }, - ExtractionTarget::ParseHTML => match Url::parse(&self.url) { + ExtractionTarget::ParseHtml => match Url::parse(&self.url) { Ok(u) => u, Err(e) => { bail!("Could not parse {}: {}", self.url, e); @@ -365,9 +388,11 @@ impl<'a> Extractor<'a> { let mut links: HashSet = HashSet::new(); + // request let response = self.make_extract_request("/robots.txt").await?; + let body = response.text(); - for capture in self.robots_regex.captures_iter(response.text()) { + for capture in self.robots_regex.captures_iter(body) { if let Some(new_path) = capture.name("url_path") { let mut new_url = Url::parse(&self.url)?; new_url.set_path(new_path.as_str()); @@ -391,16 +416,17 @@ impl<'a> Extractor<'a> { let mut links: HashSet = HashSet::new(); - let response = self.make_extract_request("/").await?; + // Request + let url = Url::parse(&self.url)?; + let response = self.make_extract_request(url.path()).await?; let body = response.text(); // Check for directory listing if body.contains("Directory listing") { log::debug!(" >> directory listing detected"); } + // Parse links [Note: Update both functions] let document = Document::from(body); - - // Parse links let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href"))) .chain(document.find(Name("img")).filter_map(|n| n.attr("src"))) .chain(document.find(Name("form")).filter_map(|n| n.attr("action"))) diff --git a/src/extractor/tests.rs b/src/extractor/tests.rs index 6baef11..c8d8d7e 100644 --- a/src/extractor/tests.rs +++ b/src/extractor/tests.rs @@ -280,7 +280,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> { handles, }; - let resp = extractor.request_robots_txt().await?; + let resp = extractor.make_extract_request("/robots.txt").await?; assert!(matches!(resp.status(), &StatusCode::OK)); println!("{}", resp); @@ -313,7 +313,7 @@ async fn request_robots_txt_with_proxy() -> Result<()> { .handles(handles) .build()?; - let resp = extractor.request_robots_txt().await?; + let resp = extractor.make_extract_request("/robots.txt").await?; assert!(matches!(resp.status(), &StatusCode::OK)); assert_eq!(resp.content_length(), 19); diff --git a/src/scanner/ferox_scanner.rs b/src/scanner/ferox_scanner.rs index da1ef1f..68a4bbd 100644 --- a/src/scanner/ferox_scanner.rs +++ b/src/scanner/ferox_scanner.rs @@ -75,25 +75,25 @@ impl FeroxScanner { let scan_timer = Instant::now(); - if matches!(self.order, ScanOrder::Initial) { // all fresh dirs will be passed to try_recursion + if self.handles.config.extract_links { // parse html for links (i.e. web scraping) let extractor = ExtractorBuilder::default() + .target(ExtractionTarget::ParseHtml) .url(&self.target_url) .handles(self.handles.clone()) - .target(ExtractionTarget::ParseHTML) .build()?; let links = extractor.extract().await?; extractor.request_links(links).await?; - if self.handles.config.extract_links { - // test robots.txt + if matches!(self.order, ScanOrder::Initial) { + // check for robots.txt (cannot be in subdirs) let extractor = ExtractorBuilder::default() - .url(&self.target_url) - .handles(self.handles.clone()) - .target(ExtractionTarget::RobotsTxt) - .build()?; - let links = extractor.extract().await?; - extractor.request_links(links).await?; + .target(ExtractionTarget::RobotsTxt) + .url(&self.target_url) + .handles(self.handles.clone()) + .build()?; + let links = extractor.extract().await?; + extractor.request_links(links).await?; } } diff --git a/src/scanner/requester.rs b/src/scanner/requester.rs index 6ce7326..bf502ac 100644 --- a/src/scanner/requester.rs +++ b/src/scanner/requester.rs @@ -17,7 +17,7 @@ use crate::{ Command::{self, AddError, SubtractFromUsizeField}, Handles, }, - extractor::{ExtractionTarget::ResponseBody, ExtractorBuilder}, + extractor::{ExtractionTarget, ExtractorBuilder}, response::FeroxResponse, scan_manager::{FeroxScan, ScanStatus}, statistics::{StatError::Other, StatField::TotalExpected}, @@ -394,7 +394,7 @@ impl Requester { if self.handles.config.extract_links && !ferox_response.status().is_redirection() { let extractor = ExtractorBuilder::default() - .target(ResponseBody) + .target(ExtractionTarget::ResponseBody) .response(&ferox_response) .handles(self.handles.clone()) .build()?;