diff --git a/config.ts b/config.ts
index bc2d22e0..82cd967d 100644
--- a/config.ts
+++ b/config.ts
@@ -1,8 +1,8 @@
 import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
+  url: "https://docs.pinecone.io/docs/langchain",
+  match: "https://docs.pinecone.io/docs/langchain/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
 };
diff --git a/src/conv_html_to_markdown.py b/src/conv_html_to_markdown.py
index d570bb0f..ec0580b1 100644
--- a/src/conv_html_to_markdown.py
+++ b/src/conv_html_to_markdown.py
@@ -5,6 +5,7 @@
 remove redundant content.
 """
 
+import glob
 import json
 import logging
 from concurrent.futures import ThreadPoolExecutor
@@ -34,8 +35,7 @@ def __init__(self, strip_tags=None, convert_links=True):
         self.strip_tags = strip_tags or ["script", "style", "meta"]
         self.convert_links = convert_links
         self.tokenizer = AutoTokenizer.from_pretrained(
-            "jinaai/jina-embeddings-v2-small-en",
-            trust_remote_code=True
+            "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True
         )
         self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en")
 
@@ -179,18 +179,21 @@ def format_dataset(self, data):
         )  # Ensure proper newline separation between entries
 
 
-def load_json(file_path):
+def load_json_files(pattern):
     """
-    Load data from a JSON file.
+    Load data from multiple JSON files matching a pattern.
 
     Args:
-        file_path (str): Path to the JSON file.
+        pattern (str): Glob pattern to match files.
 
     Returns:
-        dict: The data loaded from the JSON file.
+        list: Aggregated data from all matched files.
     """
-    with open(file_path, "r", encoding="utf-8") as file:
-        return json.load(file)
+    aggregated_data = []
+    for file_path in glob.glob(pattern):
+        with open(file_path, "r", encoding="utf-8") as file:
+            aggregated_data.extend(json.load(file))
+    return aggregated_data
 
 
 def save_output_in_chunks(file_path, contents, chunk_size=1024):
@@ -252,12 +255,12 @@ def main():
     """
     logging.basicConfig(level=logging.INFO)
     try:
-        original_data = load_json("output.json")
+        pattern = "output*.json"  # Pattern to match JSON files
+        original_data = load_json_files(pattern)
         chunk_size = 512  # Adjust chunk size as needed
         max_threads = 10  # Adjust the maximum number of threads as needed
         chunks = list(chunk_dataset(original_data, chunk_size))
-        formatted_contents = []
 
         logging.info("Processing and saving dataset in chunks.")
         with ThreadPoolExecutor(max_workers=max_threads) as executor:
@@ -265,10 +268,8 @@ def main():
         for result in results:
             formatted_contents.append(result)
 
-        save_output_in_chunks(
-            "gpt-crawler-curated_markdown.md",
-            formatted_contents,
-        )
+        output_file_name = "gpt-crawler-curated_markdown.md"
+        save_output_in_chunks(output_file_name, formatted_contents)
         logging.info("Content formatted and saved in chunks successfully.")
         logging.info("\nConversion process successful. Exiting program.")
Exiting program.") except Exception as e: diff --git a/tests/test_conv_html_to_markdown.py b/tests/test_conv_html_to_markdown.py index 0ca15c55..700dcce5 100644 --- a/tests/test_conv_html_to_markdown.py +++ b/tests/test_conv_html_to_markdown.py @@ -1,7 +1,7 @@ -import unittest -import json import sys import os +import unittest +import json # Add the parent directory to the Python path sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) @@ -11,59 +11,66 @@ class TestHTMLToMarkdownConverter(unittest.TestCase): def setUp(self): self.converter = HTMLToMarkdownConverter() + self.formatter = DatasetFormatter(self.converter) + self.html_content = "

This is a test

This is a paragraph.

" + self.markdown_content = "# This is a test\n\nThis is a paragraph." def test_convert(self): - html_content = "

This is a test.

" - expected_markdown = "This is a test." - markdown_content = self.converter.convert(html_content) - self.assertEqual(markdown_content, expected_markdown) + self.assertEqual( + self.converter.convert(self.html_content), self.markdown_content + ) def test_curate_content(self): - html_content = "

This is a test.

" - expected_html = "

This is a test.

" - curated_html = self.converter.curate_content(html_content) - self.assertEqual(curated_html, expected_html) - - -class TestDatasetFormatter(unittest.TestCase): - def setUp(self): - self.formatter = DatasetFormatter(HTMLToMarkdownConverter()) + self.assertEqual( + self.converter.curate_content(self.html_content), self.html_content + ) def test_format_entry(self): - entry = { - "title": "Test Entry", - "url": "https://example.com/test-entry", - "html": "

This is a test.

", - } - expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test." - markdown_content = self.formatter.format_entry(entry) - self.assertEqual(markdown_content, expected_markdown) + entry = {"title": "Test", "url": "www.test.com", "html": self.html_content} + self.assertEqual( + self.formatter.format_entry(entry), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) def test_structure_markdown(self): - title = "Test Entry" - url = "https://example.com/test-entry" - content = "This is a test." - expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test." - structured_markdown = self.formatter.structure_markdown(title, url, content) - self.assertEqual(structured_markdown, expected_markdown) + self.assertEqual( + self.formatter.structure_markdown( + "Test", "www.test.com", self.markdown_content + ), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) def test_format_dataset(self): data = [ - { - "title": "Test Entry 1", - "url": "https://example.com/test-entry-1", - "html": "

This is a test.

", - }, - { - "title": "Test Entry 2", - "url": "https://example.com/test-entry-2", - "html": "

This is another test.

", - }, + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, + ] + self.assertEqual( + self.formatter.format_dataset(data), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) + + def test_load_json(self): + with open("output.json", "r", encoding="utf-8") as file: + expected_data = json.load(file) + self.assertEqual(load_json("output.json"), expected_data) + + def test_chunk_dataset(self): + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + chunk_size = 3 + expected_chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]] + self.assertEqual(list(chunk_dataset(data, chunk_size)), expected_chunks) + + def test_process_chunk(self): + chunk = [ + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, ] - expected_markdown = "## Test Entry 1\n\n[Read More](https://example.com/test-entry-1)\n\nThis is a test.\n\n## Test Entry 2\n\n[Read More](https://example.com/test-entry-2)\n\nThis is another test." - markdown_content = self.formatter.format_dataset(data) - self.assertEqual(markdown_content, expected_markdown) + self.assertEqual( + process_chunk(chunk), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) if __name__ == "__main__": - unittest.main() + unittest.main \ No newline at end of file