Commit 558c24b

Merge pull request #9 from Daethyra/main
Update branch for the sake of BuilderIO's PR #89
Daethyra authored Dec 8, 2023
2 parents b63c9eb + b36324f commit 558c24b
Showing 3 changed files with 67 additions and 59 deletions.
config.ts (4 changes: 2 additions & 2 deletions)
@@ -1,8 +1,8 @@
 import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
+  url: "https://docs.pinecone.io/docs/langchain",
+  match: "https://docs.pinecone.io/docs/langchain/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
 };
src/conv_html_to_markdown.py (29 changes: 15 additions & 14 deletions)
@@ -5,6 +5,7 @@
 remove redundant content.
 """
 
+import glob
 import json
 import logging
 from concurrent.futures import ThreadPoolExecutor
@@ -34,8 +35,7 @@ def __init__(self, strip_tags=None, convert_links=True):
         self.strip_tags = strip_tags or ["script", "style", "meta"]
         self.convert_links = convert_links
         self.tokenizer = AutoTokenizer.from_pretrained(
-            "jinaai/jina-embeddings-v2-small-en",
-            trust_remote_code=True
+            "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True
         )
         self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en")
@@ -179,18 +179,21 @@ def format_dataset(self, data):
         )  # Ensure proper newline separation between entries
 
 
-def load_json(file_path):
+def load_json_files(pattern):
     """
-    Load data from a JSON file.
+    Load data from multiple JSON files matching a pattern.
     Args:
-        file_path (str): Path to the JSON file.
+        pattern (str): Glob pattern to match files.
     Returns:
-        dict: The data loaded from the JSON file.
+        list: Aggregated data from all matched files.
     """
-    with open(file_path, "r", encoding="utf-8") as file:
-        return json.load(file)
+    aggregated_data = []
+    for file_path in glob.glob(pattern):
+        with open(file_path, "r", encoding="utf-8") as file:
+            aggregated_data.extend(json.load(file))
+    return aggregated_data
 
 
 def save_output_in_chunks(file_path, contents, chunk_size=1024):
@@ -252,23 +255,21 @@ def main():
     """
     logging.basicConfig(level=logging.INFO)
     try:
-        original_data = load_json("output.json")
+        pattern = "output*.json"  # Pattern to match JSON files
+        original_data = load_json_files(pattern)
         chunk_size = 512  # Adjust chunk size as needed
         max_threads = 10  # Adjust the maximum number of threads as needed
 
         chunks = list(chunk_dataset(original_data, chunk_size))
 
         formatted_contents = []
         logging.info("Processing and saving dataset in chunks.")
         with ThreadPoolExecutor(max_workers=max_threads) as executor:
             results = executor.map(process_chunk, chunks)
             for result in results:
                 formatted_contents.append(result)
 
-        save_output_in_chunks(
-            "gpt-crawler-curated_markdown.md",
-            formatted_contents,
-        )
+        output_file_name = "gpt-crawler-curated_markdown.md"
+        save_output_in_chunks(output_file_name, formatted_contents)
         logging.info("Content formatted and saved in chunks successfully.")
         logging.info("\nConversion process successful. Exiting program.")
     except Exception as e:
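For context on the load_json to load_json_files change above: main() can now aggregate every file matching output*.json instead of reading a single output.json. A minimal, self-contained sketch of that behavior; the sample file names and page entries below are hypothetical stand-ins for real crawler output.

import glob
import json
import os
import tempfile

# Hypothetical sample outputs; real files are produced by the crawler as
# output.json, output-2.json, and so on.
samples = {
    "output.json": [{"title": "Page A", "url": "https://example.com/a", "html": "<p>A</p>"}],
    "output-2.json": [{"title": "Page B", "url": "https://example.com/b", "html": "<p>B</p>"}],
}

with tempfile.TemporaryDirectory() as tmp:
    for name, entries in samples.items():
        with open(os.path.join(tmp, name), "w", encoding="utf-8") as file:
            json.dump(entries, file)

    # Mirrors the new load_json_files(): every file matching the glob pattern
    # contributes its list of page entries to one aggregated dataset.
    aggregated = []
    for path in glob.glob(os.path.join(tmp, "output*.json")):
        with open(path, "r", encoding="utf-8") as file:
            aggregated.extend(json.load(file))

    print(len(aggregated))  # 2: entries from both sample files land in one list

Because each crawler output file holds a JSON array of page entries, extend() flattens them into the single dataset that the rest of main() chunks and formats.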
tests/test_conv_html_to_markdown.py (93 changes: 50 additions & 43 deletions)
@@ -1,7 +1,7 @@
-import unittest
-import json
 import sys
 import os
+import unittest
+import json
 
 # Add the parent directory to the Python path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
@@ -11,59 +11,66 @@
 class TestHTMLToMarkdownConverter(unittest.TestCase):
     def setUp(self):
         self.converter = HTMLToMarkdownConverter()
+        self.formatter = DatasetFormatter(self.converter)
+        self.html_content = "<h1>This is a test</h1><p>This is a paragraph.</p>"
+        self.markdown_content = "# This is a test\n\nThis is a paragraph."
 
     def test_convert(self):
-        html_content = "<p>This is a test.</p>"
-        expected_markdown = "This is a test."
-        markdown_content = self.converter.convert(html_content)
-        self.assertEqual(markdown_content, expected_markdown)
+        self.assertEqual(
+            self.converter.convert(self.html_content), self.markdown_content
+        )
 
     def test_curate_content(self):
-        html_content = "<p>This is a test.</p><script>alert('test');</script>"
-        expected_html = "<p>This is a test.</p>"
-        curated_html = self.converter.curate_content(html_content)
-        self.assertEqual(curated_html, expected_html)
-
-
-class TestDatasetFormatter(unittest.TestCase):
-    def setUp(self):
-        self.formatter = DatasetFormatter(HTMLToMarkdownConverter())
+        self.assertEqual(
+            self.converter.curate_content(self.html_content), self.html_content
+        )
 
     def test_format_entry(self):
-        entry = {
-            "title": "Test Entry",
-            "url": "https://example.com/test-entry",
-            "html": "<p>This is a test.</p>",
-        }
-        expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test."
-        markdown_content = self.formatter.format_entry(entry)
-        self.assertEqual(markdown_content, expected_markdown)
+        entry = {"title": "Test", "url": "www.test.com", "html": self.html_content}
+        self.assertEqual(
+            self.formatter.format_entry(entry),
+            f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}",
+        )
 
     def test_structure_markdown(self):
-        title = "Test Entry"
-        url = "https://example.com/test-entry"
-        content = "This is a test."
-        expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test."
-        structured_markdown = self.formatter.structure_markdown(title, url, content)
-        self.assertEqual(structured_markdown, expected_markdown)
+        self.assertEqual(
+            self.formatter.structure_markdown(
+                "Test", "www.test.com", self.markdown_content
+            ),
+            f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}",
+        )
 
     def test_format_dataset(self):
         data = [
-            {
-                "title": "Test Entry 1",
-                "url": "https://example.com/test-entry-1",
-                "html": "<p>This is a test.</p>",
-            },
-            {
-                "title": "Test Entry 2",
-                "url": "https://example.com/test-entry-2",
-                "html": "<p>This is another test.</p>",
-            },
+            {"title": "Test 1", "url": "www.test1.com", "html": self.html_content},
+            {"title": "Test 2", "url": "www.test2.com", "html": self.html_content},
         ]
-        expected_markdown = "## Test Entry 1\n\n[Read More](https://example.com/test-entry-1)\n\nThis is a test.\n\n## Test Entry 2\n\n[Read More](https://example.com/test-entry-2)\n\nThis is another test."
-        markdown_content = self.formatter.format_dataset(data)
-        self.assertEqual(markdown_content, expected_markdown)
+        self.assertEqual(
+            self.formatter.format_dataset(data),
+            f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}",
+        )
 
-    def test_load_json(self):
-        with open("output.json", "r", encoding="utf-8") as file:
-            expected_data = json.load(file)
-        self.assertEqual(load_json("output.json"), expected_data)
+    def test_chunk_dataset(self):
+        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        chunk_size = 3
+        expected_chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
+        self.assertEqual(list(chunk_dataset(data, chunk_size)), expected_chunks)
+
+    def test_process_chunk(self):
+        chunk = [
+            {"title": "Test 1", "url": "www.test1.com", "html": self.html_content},
+            {"title": "Test 2", "url": "www.test2.com", "html": self.html_content},
+        ]
+        self.assertEqual(
+            process_chunk(chunk),
+            f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}",
+        )
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main
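The added test_chunk_dataset and test_process_chunk tests pin down the pipeline that main() relies on: split the aggregated entries into fixed-size chunks, hand each chunk to a worker thread, and collect the formatted results in order. A minimal sketch of that flow; chunk_dataset is reimplemented here only to match the tested behavior, and a stand-in worker replaces the real process_chunk (which does the actual HTML-to-Markdown formatting).

from concurrent.futures import ThreadPoolExecutor


def chunk_dataset(data, chunk_size):
    """Yield successive chunk_size slices; matches the tested behavior:
    [1..10] with chunk_size=3 -> [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]."""
    for i in range(0, len(data), chunk_size):
        yield data[i : i + chunk_size]


def fake_process_chunk(chunk):
    # Stand-in for process_chunk: the real worker converts each entry's HTML to Markdown.
    return "\n\n".join(f"## {entry['title']}" for entry in chunk)


entries = [{"title": f"Page {n}"} for n in range(1, 11)]
chunks = list(chunk_dataset(entries, 3))  # 4 chunks: 3 + 3 + 3 + 1 entries

# executor.map preserves input order, so the collected output follows the
# original entry order even though chunks are processed concurrently, as in main().
with ThreadPoolExecutor(max_workers=4) as executor:
    formatted = list(executor.map(fake_process_chunk, chunks))

print(len(formatted))                # 4
print(formatted[0].splitlines()[0])  # ## Page 1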
