From 5fd6126ae66ee97bc96edf7ff4340261af58e113 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:15:02 -0800 Subject: [PATCH 1/3] Updating test_conv_html_to_markdown --- tests/test_conv_html_to_markdown.py | 93 ++++++++++++++++------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/tests/test_conv_html_to_markdown.py b/tests/test_conv_html_to_markdown.py index 0ca15c55..42a042b6 100644 --- a/tests/test_conv_html_to_markdown.py +++ b/tests/test_conv_html_to_markdown.py @@ -1,7 +1,7 @@ -import unittest -import json import sys import os +import unittest +import json # Add the parent directory to the Python path sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) @@ -11,59 +11,66 @@ class TestHTMLToMarkdownConverter(unittest.TestCase): def setUp(self): self.converter = HTMLToMarkdownConverter() + self.formatter = DatasetFormatter(self.converter) + self.html_content = "

This is a test

This is a paragraph.

" + self.markdown_content = "# This is a test\n\nThis is a paragraph." def test_convert(self): - html_content = "

This is a test.

" - expected_markdown = "This is a test." - markdown_content = self.converter.convert(html_content) - self.assertEqual(markdown_content, expected_markdown) + self.assertEqual( + self.converter.convert(self.html_content), self.markdown_content + ) def test_curate_content(self): - html_content = "

This is a test.

" - expected_html = "

This is a test.

" - curated_html = self.converter.curate_content(html_content) - self.assertEqual(curated_html, expected_html) - - -class TestDatasetFormatter(unittest.TestCase): - def setUp(self): - self.formatter = DatasetFormatter(HTMLToMarkdownConverter()) + self.assertEqual( + self.converter.curate_content(self.html_content), self.html_content + ) def test_format_entry(self): - entry = { - "title": "Test Entry", - "url": "https://example.com/test-entry", - "html": "

This is a test.

", - } - expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test." - markdown_content = self.formatter.format_entry(entry) - self.assertEqual(markdown_content, expected_markdown) + entry = {"title": "Test", "url": "www.test.com", "html": self.html_content} + self.assertEqual( + self.formatter.format_entry(entry), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) def test_structure_markdown(self): - title = "Test Entry" - url = "https://example.com/test-entry" - content = "This is a test." - expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test." - structured_markdown = self.formatter.structure_markdown(title, url, content) - self.assertEqual(structured_markdown, expected_markdown) + self.assertEqual( + self.formatter.structure_markdown( + "Test", "www.test.com", self.markdown_content + ), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) def test_format_dataset(self): data = [ - { - "title": "Test Entry 1", - "url": "https://example.com/test-entry-1", - "html": "

This is a test.

", - }, - { - "title": "Test Entry 2", - "url": "https://example.com/test-entry-2", - "html": "

This is another test.

", - }, + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, + ] + self.assertEqual( + self.formatter.format_dataset(data), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) + + def test_load_json(self): + with open("output.json", "r", encoding="utf-8") as file: + expected_data = json.load(file) + self.assertEqual(load_json("output.json"), expected_data) + + def test_chunk_dataset(self): + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + chunk_size = 3 + expected_chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]] + self.assertEqual(list(chunk_dataset(data, chunk_size)), expected_chunks) + + def test_process_chunk(self): + chunk = [ + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, ] - expected_markdown = "## Test Entry 1\n\n[Read More](https://example.com/test-entry-1)\n\nThis is a test.\n\n## Test Entry 2\n\n[Read More](https://example.com/test-entry-2)\n\nThis is another test." - markdown_content = self.formatter.format_dataset(data) - self.assertEqual(markdown_content, expected_markdown) + self.assertEqual( + process_chunk(chunk), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) if __name__ == "__main__": - unittest.main() + unittest.main From bf266eff3d8bd3995ab5ff46a969af2cab470796 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Sat, 2 Dec 2023 13:19:03 -0800 Subject: [PATCH 2/3] refactor(data): improve JSON file loading and processing - Refactored load_json function to load_json_files, allowing it to handle multiple JSON files matching a pattern using glob. This change enables the aggregation of data from all matched files. Also, updated main function to reflect the new file loading process and added explanatory comments for clarity. --- src/conv_html_to_markdown.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/conv_html_to_markdown.py b/src/conv_html_to_markdown.py index d570bb0f..5f42fba2 100644 --- a/src/conv_html_to_markdown.py +++ b/src/conv_html_to_markdown.py @@ -5,6 +5,7 @@ remove redundant content. """ +import glob import json import logging from concurrent.futures import ThreadPoolExecutor @@ -34,8 +35,7 @@ def __init__(self, strip_tags=None, convert_links=True): self.strip_tags = strip_tags or ["script", "style", "meta"] self.convert_links = convert_links self.tokenizer = AutoTokenizer.from_pretrained( - "jinaai/jina-embeddings-v2-small-en", - trust_remote_code=True + "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True ) self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en") @@ -179,18 +179,21 @@ def format_dataset(self, data): ) # Ensure proper newline separation between entries -def load_json(file_path): +def load_json_files(pattern): """ - Load data from a JSON file. + Load data from multiple JSON files matching a pattern. Args: - file_path (str): Path to the JSON file. + pattern (str): Glob pattern to match files. Returns: - dict: The data loaded from the JSON file. + list: Aggregated data from all matched files. """ - with open(file_path, "r", encoding="utf-8") as file: - return json.load(file) + aggregated_data = [] + for file_path in glob.glob(pattern): + with open(file_path, "r", encoding="utf-8") as file: + aggregated_data.extend(json.load(file)) + return aggregated_data def save_output_in_chunks(file_path, contents, chunk_size=1024): @@ -252,24 +255,24 @@ def main(): """ logging.basicConfig(level=logging.INFO) try: - original_data = load_json("output.json") + pattern = "output*.json" # Pattern to match JSON files + original_data = load_json_files(pattern) chunk_size = 512 # Adjust chunk size as needed max_threads = 10 # Adjust the maximum number of threads as needed chunks = list(chunk_dataset(original_data, chunk_size)) - formatted_contents = [] + logging.info("Processing and saving dataset in chunks.") with ThreadPoolExecutor(max_workers=max_threads) as executor: results = executor.map(process_chunk, chunks) for result in results: formatted_contents.append(result) - save_output_in_chunks( - "gpt-crawler-curated_markdown.md", - formatted_contents, - ) + output_file_name = "gpt-crawler-curated_markdown.md" + save_output_in_chunks(output_file_name, formatted_contents) logging.info("Content formatted and saved in chunks successfully.") + logging.info("\nConversion process successful. Exiting program.") except Exception as e: logging.error("An error occurred in the main function: %s", e) From 42d3dafb9081b86f9b0f63ea6463630aba104544 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Sat, 2 Dec 2023 14:09:17 -0800 Subject: [PATCH 3/3] Initialized daethyra/gpt-crawler as a submodule. modified: config.ts --- config.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.ts b/config.ts index bc2d22e0..82cd967d 100644 --- a/config.ts +++ b/config.ts @@ -1,8 +1,8 @@ import { Config } from "./src/config"; export const defaultConfig: Config = { - url: "https://www.builder.io/c/docs/developers", - match: "https://www.builder.io/c/docs/**", + url: "https://docs.pinecone.io/docs/langchain", + match: "https://docs.pinecone.io/docs/langchain/**", maxPagesToCrawl: 50, outputFileName: "output.json", };