Second Cut - All Directory Listing Items Obtained

This commit is contained in:
godylockz
2022-01-14 23:46:40 -05:00
parent 18ad9ca733
commit 54144dba89
5 changed files with 47 additions and 21 deletions

View File

@@ -23,7 +23,7 @@ pub enum ExtractionTarget {
RobotsTxt,
// Parse HTML and extract links
ParseHTML,
ParseHtml,
}
/// responsible for building an `Extractor`

View File

@@ -63,7 +63,7 @@ impl<'a> Extractor<'a> {
match self.target {
ExtractionTarget::ResponseBody => Ok(self.extract_from_body().await?),
ExtractionTarget::RobotsTxt => Ok(self.extract_from_robots().await?),
ExtractionTarget::ParseHTML => Ok(self.parse_html().await?),
ExtractionTarget::ParseHtml => Ok(self.parse_html().await?),
}
}
@@ -153,6 +153,29 @@ impl<'a> Extractor<'a> {
let body = self.response.unwrap().text();
// Check for directory listing
if body.contains("Directory listing") {
log::debug!(" >> directory listing detected");
}
// Parse links [Note: this tag/attribute chain is duplicated in the other HTML link-parsing function of this impl; update both together]
let document = Document::from(body);
let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href")))
.chain(document.find(Name("img")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("form")).filter_map(|n| n.attr("action")))
.chain(document.find(Name("script")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("iframe")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("div")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("frame")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("embed")).filter_map(|n| n.attr("src")));
for link in html_links {
log::info!(" >> found link \"{}\"", link);
let mut new_url = Url::parse(&self.url)?;
new_url.set_path(link);
if self.add_all_sub_paths(new_url.path(), &mut links).is_err() {
log::warn!("could not add sub-paths from {} to {:?}", new_url, links);
}
}
for capture in self.links_regex.captures_iter(body) {
// remove single & double quotes from both ends of the capture
// capture[0] is the entire match, additional capture groups start at [1]
@@ -276,7 +299,7 @@ impl<'a> Extractor<'a> {
bail!("Could not parse {}: {}", self.url, e);
}
},
ExtractionTarget::ParseHTML => match Url::parse(&self.url) {
ExtractionTarget::ParseHtml => match Url::parse(&self.url) {
Ok(u) => u,
Err(e) => {
bail!("Could not parse {}: {}", self.url, e);
@@ -365,9 +388,11 @@ impl<'a> Extractor<'a> {
let mut links: HashSet<String> = HashSet::new();
// request
let response = self.make_extract_request("/robots.txt").await?;
let body = response.text();
for capture in self.robots_regex.captures_iter(response.text()) {
for capture in self.robots_regex.captures_iter(body) {
if let Some(new_path) = capture.name("url_path") {
let mut new_url = Url::parse(&self.url)?;
new_url.set_path(new_path.as_str());
@@ -391,16 +416,17 @@ impl<'a> Extractor<'a> {
let mut links: HashSet<String> = HashSet::new();
let response = self.make_extract_request("/").await?;
// Request
let url = Url::parse(&self.url)?;
let response = self.make_extract_request(url.path()).await?;
let body = response.text();
// Check for directory listing
if body.contains("Directory listing") {
log::debug!(" >> directory listing detected");
}
// Parse links [Note: this tag/attribute chain is duplicated in the response-body link-extraction function of this impl; update both together]
let document = Document::from(body);
// Parse links
let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href")))
.chain(document.find(Name("img")).filter_map(|n| n.attr("src")))
.chain(document.find(Name("form")).filter_map(|n| n.attr("action")))

View File

@@ -280,7 +280,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> {
handles,
};
let resp = extractor.request_robots_txt().await?;
let resp = extractor.make_extract_request("/robots.txt").await?;
assert!(matches!(resp.status(), &StatusCode::OK));
println!("{}", resp);
@@ -313,7 +313,7 @@ async fn request_robots_txt_with_proxy() -> Result<()> {
.handles(handles)
.build()?;
let resp = extractor.request_robots_txt().await?;
let resp = extractor.make_extract_request("/robots.txt").await?;
assert!(matches!(resp.status(), &StatusCode::OK));
assert_eq!(resp.content_length(), 19);

View File

@@ -75,25 +75,25 @@ impl FeroxScanner {
let scan_timer = Instant::now();
if matches!(self.order, ScanOrder::Initial) { // all fresh dirs will be passed to try_recursion
if self.handles.config.extract_links {
// parse html for links (i.e. web scraping)
let extractor = ExtractorBuilder::default()
.target(ExtractionTarget::ParseHtml)
.url(&self.target_url)
.handles(self.handles.clone())
.target(ExtractionTarget::ParseHTML)
.build()?;
let links = extractor.extract().await?;
extractor.request_links(links).await?;
if self.handles.config.extract_links {
// test robots.txt
if matches!(self.order, ScanOrder::Initial) {
// check for robots.txt (cannot be in subdirs)
let extractor = ExtractorBuilder::default()
.url(&self.target_url)
.handles(self.handles.clone())
.target(ExtractionTarget::RobotsTxt)
.build()?;
let links = extractor.extract().await?;
extractor.request_links(links).await?;
.target(ExtractionTarget::RobotsTxt)
.url(&self.target_url)
.handles(self.handles.clone())
.build()?;
let links = extractor.extract().await?;
extractor.request_links(links).await?;
}
}

View File

@@ -17,7 +17,7 @@ use crate::{
Command::{self, AddError, SubtractFromUsizeField},
Handles,
},
extractor::{ExtractionTarget::ResponseBody, ExtractorBuilder},
extractor::{ExtractionTarget, ExtractorBuilder},
response::FeroxResponse,
scan_manager::{FeroxScan, ScanStatus},
statistics::{StatError::Other, StatField::TotalExpected},
@@ -394,7 +394,7 @@ impl Requester {
if self.handles.config.extract_links && !ferox_response.status().is_redirection() {
let extractor = ExtractorBuilder::default()
.target(ResponseBody)
.target(ExtractionTarget::ResponseBody)
.response(&ferox_response)
.handles(self.handles.clone())
.build()?;