From bcfd8b6eefae0b9556de1c05c34d6275b7cede08 Mon Sep 17 00:00:00 2001
From: epi <epibar052@gmail.com>
Date: Tue, 11 Jul 2023 06:23:18 -0400
Subject: [PATCH] fixed unwrap in nlp::document

---
 src/nlp/document.rs      | 24 +++++++++++++-----------
 src/scanner/requester.rs | 13 +++++++------
 2 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/src/nlp/document.rs b/src/nlp/document.rs
index 6a0244d..daeb785 100644
--- a/src/nlp/document.rs
+++ b/src/nlp/document.rs
@@ -40,15 +40,16 @@ impl Document {
     }
 
     /// create a new `Document` from the given HTML string
-    pub(crate) fn from_html(raw_html: &str) -> Self {
+    pub(crate) fn from_html(raw_html: &str) -> Option<Self> {
         let selector = Selector::parse("body").unwrap();
 
         let html = Html::parse_document(raw_html);
 
-        let text = html
-            .select(&selector)
-            .next()
-            .unwrap()
+        let Some(element) = html.select(&selector).next() else {
+            return None;
+        };
+
+        let text = element
             .descendants()
             .filter_map(|node| {
                 if !node.value().is_text() && !node.value().is_comment() {
@@ -95,7 +96,7 @@ impl Document {
 
         // call `new` to push the parsed html through the pre-processing pipeline and process all
         // the words
-        Self::new(&text)
+        Some(Self::new(&text))
     }
 
     /// Log normalized weighting scheme for term frequency
@@ -146,19 +147,20 @@ mod tests {
     #[test]
     /// `Document::new` should preprocess html and generate a hashmap of `Term, TermMetadata`
     fn nlp_document_creation_from_html() {
-        let empty = Document::from_html("<html></html>");
+        let empty = Document::from_html("<html></html>").unwrap();
         assert_eq!(empty.number_of_terms, 0);
 
-        let other_empty = Document::from_html("<html><body><p></p></body></html>");
+        let other_empty = Document::from_html("<html><body><p></p></body></html>").unwrap();
         assert_eq!(other_empty.number_of_terms, 0);
 
-        let third_empty = Document::from_html("<!DOCTYPE html><html><!DOCTYPE html><p></p></html>");
+        let third_empty =
+            Document::from_html("<!DOCTYPE html><html><!DOCTYPE html><p></p></html>").unwrap();
         assert_eq!(third_empty.number_of_terms, 0);
 
         // p tag for is_text check and comment for is_comment
         let doc = Document::from_html(
             "<html><body><p>The air quality in Singapore.</p><!--got worse on Wednesday--></body></html>",
-        );
+        ).unwrap();
 
         let expected_terms = ["air", "quality", "singapore", "worse", "wednesday"];
 
@@ -209,7 +211,7 @@ mod tests {
     /// ensure words in script/style tags aren't processed
     fn document_creation_skips_script_and_style_tags() {
         let html = "<body><script>The air quality</script><style>in Singapore</style><p>got worse on Wednesday.</p></body>";
-        let doc = Document::from_html(html);
+        let doc = Document::from_html(html).unwrap();
         let keys = doc.terms().keys().map(|key| key.raw()).collect::<Vec<_>>();
 
         let expected = ["worse", "wednesday"];
diff --git a/src/scanner/requester.rs b/src/scanner/requester.rs
index acdd03d..0517eca 100644
--- a/src/scanner/requester.rs
+++ b/src/scanner/requester.rs
@@ -475,12 +475,13 @@ impl Requester {
 
                 if self.handles.config.collect_words {
                     if let Ok(mut guard) = TF_IDF.write() {
-                        let doc = Document::from_html(ferox_response.text());
-                        guard.add_document(doc);
-                        if guard.num_documents() % 12 == 0
-                            || (guard.num_documents() < 5 && guard.num_documents() % 2 == 0)
-                        {
-                            guard.calculate_tf_idf_scores();
+                        if let Some(doc) = Document::from_html(ferox_response.text()) {
+                            guard.add_document(doc);
+                            if guard.num_documents() % 12 == 0
+                                || (guard.num_documents() < 5 && guard.num_documents() % 2 == 0)
+                            {
+                                guard.calculate_tf_idf_scores();
+                            }
                         }
                     }
                 }