Commit 558c24b

Merge pull request #9 from Daethyra/main
Update branch for the sake of BuilderIO's PR #89
Daethyra authored Dec 8, 2023
2 parents b63c9eb + b36324f commit 558c24b
Showing 3 changed files with 67 additions and 59 deletions.
config.ts (4 changes: 2 additions & 2 deletions)
@@ -1,8 +1,8 @@
 import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
+  url: "https://docs.pinecone.io/docs/langchain",
+  match: "https://docs.pinecone.io/docs/langchain/**",
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
 };
src/conv_html_to_markdown.py (29 changes: 15 additions & 14 deletions)
@@ -5,6 +5,7 @@
 remove redundant content.
 """
 
+import glob
 import json
 import logging
 from concurrent.futures import ThreadPoolExecutor
@@ -34,8 +35,7 @@ def __init__(self, strip_tags=None, convert_links=True):
         self.strip_tags = strip_tags or ["script", "style", "meta"]
         self.convert_links = convert_links
         self.tokenizer = AutoTokenizer.from_pretrained(
-            "jinaai/jina-embeddings-v2-small-en",
-            trust_remote_code=True
+            "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True
         )
         self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en")
@@ -179,18 +179,21 @@ def format_dataset(self, data):
         )  # Ensure proper newline separation between entries
 
 
-def load_json(file_path):
+def load_json_files(pattern):
     """
-    Load data from a JSON file.
+    Load data from multiple JSON files matching a pattern.
     Args:
-        file_path (str): Path to the JSON file.
+        pattern (str): Glob pattern to match files.
     Returns:
-        dict: The data loaded from the JSON file.
+        list: Aggregated data from all matched files.
     """
-    with open(file_path, "r", encoding="utf-8") as file:
-        return json.load(file)
+    aggregated_data = []
+    for file_path in glob.glob(pattern):
+        with open(file_path, "r", encoding="utf-8") as file:
+            aggregated_data.extend(json.load(file))
+    return aggregated_data
 
 
 def save_output_in_chunks(file_path, contents, chunk_size=1024):
@@ -252,23 +255,21 @@ def main():
     """
     logging.basicConfig(level=logging.INFO)
     try:
-        original_data = load_json("output.json")
+        pattern = "output*.json"  # Pattern to match JSON files
+        original_data = load_json_files(pattern)
         chunk_size = 512  # Adjust chunk size as needed
         max_threads = 10  # Adjust the maximum number of threads as needed
 
         chunks = list(chunk_dataset(original_data, chunk_size))
 
         formatted_contents = []
         logging.info("Processing and saving dataset in chunks.")
         with ThreadPoolExecutor(max_workers=max_threads) as executor:
             results = executor.map(process_chunk, chunks)
             for result in results:
                 formatted_contents.append(result)
 
-        save_output_in_chunks(
-            "gpt-crawler-curated_markdown.md",
-            formatted_contents,
-        )
+        output_file_name = "gpt-crawler-curated_markdown.md"
+        save_output_in_chunks(output_file_name, formatted_contents)
         logging.info("Content formatted and saved in chunks successfully.")
         logging.info("\nConversion process successful. Exiting program.")
     except Exception as e:
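For context on the load_json to load_json_files change above: main() can now aggregate every file matching output*.json instead of reading a single output.json. A minimal, self-contained sketch of that behavior; the sample file names and page entries below are hypothetical stand-ins for real crawler output.

import glob
import json
import os
import tempfile

# Hypothetical sample outputs; real files are produced by the crawler as
# output.json, output-2.json, and so on.
samples = {
    "output.json": [{"title": "Page A", "url": "https://example.com/a", "html": "<p>A</p>"}],
    "output-2.json": [{"title": "Page B", "url": "https://example.com/b", "html": "<p>B</p>"}],
}

with tempfile.TemporaryDirectory() as tmp:
    for name, entries in samples.items():
        with open(os.path.join(tmp, name), "w", encoding="utf-8") as file:
            json.dump(entries, file)

    # Mirrors the new load_json_files(): every file matching the glob pattern
    # contributes its list of page entries to one aggregated dataset.
    aggregated = []
    for path in glob.glob(os.path.join(tmp, "output*.json")):
        with open(path, "r", encoding="utf-8") as file:
            aggregated.extend(json.load(file))

    print(len(aggregated))  # 2: entries from both sample files land in one list

Because each crawler output file holds a JSON array of page entries, extend() flattens them into the single dataset that the rest of main() chunks and formats.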
tests/test_conv_html_to_markdown.py (93 changes: 50 additions & 43 deletions)
@@ -1,7 +1,7 @@
-import unittest
-import json
 import sys
 import os
+import unittest
+import json
 
 # Add the parent directory to the Python path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
@@ -11,59 +11,66 @@
 class TestHTMLToMarkdownConverter(unittest.TestCase):
     def setUp(self):
         self.converter = HTMLToMarkdownConverter()
+        self.formatter = DatasetFormatter(self.converter)
+        self.html_content = "<h1>This is a test</h1><p>This is a paragraph.</p>"
+        self.markdown_content = "# This is a test\n\nThis is a paragraph."
 
     def test_convert(self):
-        html_content = "<p>This is a test.</p>"
-        expected_markdown = "This is a test."
-        markdown_content = self.converter.convert(html_content)
-        self.assertEqual(markdown_content, expected_markdown)
+        self.assertEqual(
+            self.converter.convert(self.html_content), self.markdown_content
+        )
 
     def test_curate_content(self):
-        html_content = "<p>This is a test.</p><script>alert('test');</script>"
-        expected_html = "<p>This is a test.</p>"
-        curated_html = self.converter.curate_content(html_content)
-        self.assertEqual(curated_html, expected_html)
-
-
-class TestDatasetFormatter(unittest.TestCase):
-    def setUp(self):
-        self.formatter = DatasetFormatter(HTMLToMarkdownConverter())
+        self.assertEqual(
+            self.converter.curate_content(self.html_content), self.html_content
+        )
 
     def test_format_entry(self):
-        entry = {
-            "title": "Test Entry",
-            "url": "https://example.com/test-entry",
-            "html": "<p>This is a test.</p>",
-        }
-        expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test."
-        markdown_content = self.formatter.format_entry(entry)
-        self.assertEqual(markdown_content, expected_markdown)
+        entry = {"title": "Test", "url": "www.test.com", "html": self.html_content}
+        self.assertEqual(
+            self.formatter.format_entry(entry),
+            f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}",
+        )
 
     def test_structure_markdown(self):
-        title = "Test Entry"
-        url = "https://example.com/test-entry"
-        content = "This is a test."
-        expected_markdown = "## Test Entry\n\n[Read More](https://example.com/test-entry)\n\nThis is a test."
-        structured_markdown = self.formatter.structure_markdown(title, url, content)
-        self.assertEqual(structured_markdown, expected_markdown)
+        self.assertEqual(
+            self.formatter.structure_markdown(
+                "Test", "www.test.com", self.markdown_content
+            ),
+            f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}",
+        )
 
     def test_format_dataset(self):
         data = [
-            {
-                "title": "Test Entry 1",
-                "url": "https://example.com/test-entry-1",
-                "html": "<p>This is a test.</p>",
-            },
-            {
-                "title": "Test Entry 2",
-                "url": "https://example.com/test-entry-2",
-                "html": "<p>This is another test.</p>",
-            },
+            {"title": "Test 1", "url": "www.test1.com", "html": self.html_content},
+            {"title": "Test 2", "url": "www.test2.com", "html": self.html_content},
         ]
-        expected_markdown = "## Test Entry 1\n\n[Read More](https://example.com/test-entry-1)\n\nThis is a test.\n\n## Test Entry 2\n\n[Read More](https://example.com/test-entry-2)\n\nThis is another test."
-        markdown_content = self.formatter.format_dataset(data)
-        self.assertEqual(markdown_content, expected_markdown)
+        self.assertEqual(
+            self.formatter.format_dataset(data),
+            f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}",
+        )
 
-    def test_load_json(self):
-        with open("output.json", "r", encoding="utf-8") as file:
-            expected_data = json.load(file)
-        self.assertEqual(load_json("output.json"), expected_data)
+    def test_chunk_dataset(self):
+        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        chunk_size = 3
+        expected_chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
+        self.assertEqual(list(chunk_dataset(data, chunk_size)), expected_chunks)
+
+    def test_process_chunk(self):
+        chunk = [
+            {"title": "Test 1", "url": "www.test1.com", "html": self.html_content},
+            {"title": "Test 2", "url": "www.test2.com", "html": self.html_content},
+        ]
+        self.assertEqual(
+            process_chunk(chunk),
+            f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}",
+        )
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main
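The added test_chunk_dataset and test_process_chunk tests pin down the pipeline that main() relies on: split the aggregated entries into fixed-size chunks, hand each chunk to a worker thread, and collect the formatted results in order. A minimal sketch of that flow; chunk_dataset is reimplemented here only to match the tested behavior, and a stand-in worker replaces the real process_chunk (which does the actual HTML-to-Markdown formatting).

from concurrent.futures import ThreadPoolExecutor


def chunk_dataset(data, chunk_size):
    """Yield successive chunk_size slices; matches the tested behavior:
    [1..10] with chunk_size=3 -> [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]."""
    for i in range(0, len(data), chunk_size):
        yield data[i : i + chunk_size]


def fake_process_chunk(chunk):
    # Stand-in for process_chunk: the real worker converts each entry's HTML to Markdown.
    return "\n\n".join(f"## {entry['title']}" for entry in chunk)


entries = [{"title": f"Page {n}"} for n in range(1, 11)]
chunks = list(chunk_dataset(entries, 3))  # 4 chunks: 3 + 3 + 3 + 1 entries

# executor.map preserves input order, so the collected output follows the
# original entry order even though chunks are processed concurrently, as in main().
with ThreadPoolExecutor(max_workers=4) as executor:
    formatted = list(executor.map(fake_process_chunk, chunks))

print(len(formatted))                # 4
print(formatted[0].splitlines()[0])  # ## Page 1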
