From 54144dba898ab1ad0bd6aac8e013a6be5abd6bd6 Mon Sep 17 00:00:00 2001
From: godylockz <81207744+godylockz@users.noreply.github.com>
Date: Fri, 14 Jan 2022 23:46:40 -0500
Subject: [PATCH] Second Cut - All Directory Listing Items Obtained

---
 src/extractor/builder.rs     |  2 +-
 src/extractor/container.rs   | 38 ++++++++++++++++++++++++++++++------
 src/extractor/tests.rs       |  4 ++--
 src/scanner/ferox_scanner.rs | 20 +++++++++----------
 src/scanner/requester.rs     |  4 ++--
 5 files changed, 47 insertions(+), 21 deletions(-)
diff --git a/src/extractor/builder.rs b/src/extractor/builder.rs
index d72d416..0a5bcaa 100644
--- a/src/extractor/builder.rs
+++ b/src/extractor/builder.rs
@@ -23,7 +23,7 @@ pub enum ExtractionTarget {
     RobotsTxt,
 
     // Parse HTML and extract links
-    ParseHTML,
+    ParseHtml,
 }
 
 /// responsible for building an `Extractor`
diff --git a/src/extractor/container.rs b/src/extractor/container.rs
index 037e701..cea5982 100644
--- a/src/extractor/container.rs
+++ b/src/extractor/container.rs
@@ -63,7 +63,7 @@ impl<'a> Extractor<'a> {
         match self.target {
             ExtractionTarget::ResponseBody => Ok(self.extract_from_body().await?),
             ExtractionTarget::RobotsTxt => Ok(self.extract_from_robots().await?),
-            ExtractionTarget::ParseHTML => Ok(self.parse_html().await?),
+            ExtractionTarget::ParseHtml => Ok(self.parse_html().await?),
         }
     }
 
@@ -153,6 +153,29 @@ impl<'a> Extractor<'a> {
 
         let body = self.response.unwrap().text();
 
+        // Check for directory listing
+        if body.contains("Directory listing") {
+            log::debug!(" >> directory listing detected");
+        }
+        // Parse links [Note: Update both functions]
+        let document = Document::from(body);
+        let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href")))
+            .chain(document.find(Name("img")).filter_map(|n| n.attr("src")))
+            .chain(document.find(Name("form")).filter_map(|n| n.attr("action")))
+            .chain(document.find(Name("script")).filter_map(|n| n.attr("src")))
+            .chain(document.find(Name("iframe")).filter_map(|n| n.attr("src")))
+            .chain(document.find(Name("div")).filter_map(|n| n.attr("src")))
+            .chain(document.find(Name("frame")).filter_map(|n| n.attr("src")))
+            .chain(document.find(Name("embed")).filter_map(|n| n.attr("src")));
+        for link in html_links {
+            log::info!(" >> found link \"{}\"", link);
+            let mut new_url = Url::parse(&self.url)?;
+            new_url.set_path(link);
+            if self.add_all_sub_paths(new_url.path(), &mut links).is_err() {
+                log::warn!("could not add sub-paths from {} to {:?}", new_url, links);
+            }
+        }
+
         for capture in self.links_regex.captures_iter(body) {
             // remove single & double quotes from both ends of the capture
             // capture[0] is the entire match, additional capture groups start at [1]
@@ -276,7 +299,7 @@ impl<'a> Extractor<'a> {
                     bail!("Could not parse {}: {}", self.url, e);
                 }
             },
-            ExtractionTarget::ParseHTML => match Url::parse(&self.url) {
+            ExtractionTarget::ParseHtml => match Url::parse(&self.url) {
                 Ok(u) => u,
                 Err(e) => {
                     bail!("Could not parse {}: {}", self.url, e);
@@ -365,9 +388,11 @@ impl<'a> Extractor<'a> {
 
         let mut links: HashSet<String> = HashSet::new();
 
+        // request
         let response = self.make_extract_request("/robots.txt").await?;
+        let body = response.text();
 
-        for capture in self.robots_regex.captures_iter(response.text()) {
+        for capture in self.robots_regex.captures_iter(body) {
             if let Some(new_path) = capture.name("url_path") {
                 let mut new_url = Url::parse(&self.url)?;
                 new_url.set_path(new_path.as_str());
@@ -391,16 +416,17 @@ impl<'a> Extractor<'a> {
 
         let mut links: HashSet<String> = HashSet::new();
 
-        let response = self.make_extract_request("/").await?;
+        // Request
+        let url = Url::parse(&self.url)?;
+        let response = self.make_extract_request(url.path()).await?;
         let body = response.text();
 
         // Check for directory listing
         if body.contains("Directory listing") {
             log::debug!(" >> directory listing detected");
         }
+        // Parse links [Note: Update both functions]
         let document = Document::from(body);
-
-        // Parse links
         let html_links = (document.find(Name("a")).filter_map(|n| n.attr("href")))
             .chain(document.find(Name("img")).filter_map(|n| n.attr("src")))
             .chain(document.find(Name("form")).filter_map(|n| n.attr("action")))
diff --git a/src/extractor/tests.rs b/src/extractor/tests.rs
index 6baef11..c8d8d7e 100644
--- a/src/extractor/tests.rs
+++ b/src/extractor/tests.rs
@@ -280,7 +280,7 @@ async fn request_robots_txt_without_proxy() -> Result<()> {
         handles,
     };
 
-    let resp = extractor.request_robots_txt().await?;
+    let resp = extractor.make_extract_request("/robots.txt").await?;
 
     assert!(matches!(resp.status(), &StatusCode::OK));
     println!("{}", resp);
@@ -313,7 +313,7 @@ async fn request_robots_txt_with_proxy() -> Result<()> {
         .handles(handles)
         .build()?;
 
-    let resp = extractor.request_robots_txt().await?;
+    let resp = extractor.make_extract_request("/robots.txt").await?;
 
     assert!(matches!(resp.status(), &StatusCode::OK));
     assert_eq!(resp.content_length(), 19);
diff --git a/src/scanner/ferox_scanner.rs b/src/scanner/ferox_scanner.rs
index da1ef1f..68a4bbd 100644
--- a/src/scanner/ferox_scanner.rs
+++ b/src/scanner/ferox_scanner.rs
@@ -75,25 +75,25 @@ impl FeroxScanner {
 
         let scan_timer = Instant::now();
 
-        if matches!(self.order, ScanOrder::Initial) { // all fresh dirs will be passed to try_recursion
+        if self.handles.config.extract_links {
             // parse html for links (i.e. web scraping)
             let extractor = ExtractorBuilder::default()
+                .target(ExtractionTarget::ParseHtml)
                 .url(&self.target_url)
                 .handles(self.handles.clone())
-                .target(ExtractionTarget::ParseHTML)
                 .build()?;
             let links = extractor.extract().await?;
             extractor.request_links(links).await?;
 
-            if self.handles.config.extract_links {
-                // test robots.txt
+            if matches!(self.order, ScanOrder::Initial) {
+                // check for robots.txt (cannot be in subdirs)
                 let extractor = ExtractorBuilder::default()
-                    .url(&self.target_url)
-                    .handles(self.handles.clone())
-                    .target(ExtractionTarget::RobotsTxt)
-                    .build()?;
-                let links = extractor.extract().await?;
-                extractor.request_links(links).await?;
+                    .target(ExtractionTarget::RobotsTxt)
+                    .url(&self.target_url)
+                    .handles(self.handles.clone())
+                    .build()?;
+                let links = extractor.extract().await?;
+                extractor.request_links(links).await?;
             }
         }
 
diff --git a/src/scanner/requester.rs b/src/scanner/requester.rs
index 6ce7326..bf502ac 100644
--- a/src/scanner/requester.rs
+++ b/src/scanner/requester.rs
@@ -17,7 +17,7 @@ use crate::{
         Command::{self, AddError, SubtractFromUsizeField},
         Handles,
     },
-    extractor::{ExtractionTarget::ResponseBody, ExtractorBuilder},
+    extractor::{ExtractionTarget, ExtractorBuilder},
     response::FeroxResponse,
     scan_manager::{FeroxScan, ScanStatus},
     statistics::{StatError::Other, StatField::TotalExpected},
@@ -394,7 +394,7 @@ impl Requester {
 
                 if self.handles.config.extract_links && !ferox_response.status().is_redirection() {
                     let extractor = ExtractorBuilder::default()
-                        .target(ResponseBody)
+                        .target(ExtractionTarget::ResponseBody)
                         .response(&ferox_response)
                         .handles(self.handles.clone())
                         .build()?;