Skip to content

Commit

Permalink
Initial stab at test cases using add with --parser, --depth
Browse files Browse the repository at this point in the history
Current status of tests:

FAILED tests/parser/test_auto.py::test_add_stdin_json
FAILED tests/parser/test_auto.py::test_add_file_json
FAILED tests/parser/test_auto.py::test_add_stdin_file_json
FAILED tests/parser/test_specific.py::test_json_url
FAILED tests/parser/test_specific.py::test_json_urls
FAILED tests/parser/test_specific.py::test_json_filenames
FAILED tests/parser/test_specific.py::test_json_stdin_urls
FAILED tests/parser/test_specific.py::test_json_stdin_filenames
FAILED tests/parser/test_specific.py::test_json_depth1_url
FAILED tests/parser/test_specific.py::test_json_depth1_urls
FAILED tests/parser/test_specific.py::test_json_depth1_filename
FAILED tests/parser/test_specific.py::test_json_depth1_filenames
FAILED tests/parser/test_specific.py::test_json_depth1_stdin_urls
FAILED tests/parser/test_specific.py::test_json_depth1_stdin_filenames
FAILED tests/parser/test_specific.py::test_json_depth1_stdin_contents

(15 failed, 4 passed.)

Working on issue ArchiveBox#1363
  • Loading branch information
jimwins committed Mar 4, 2024
1 parent 99b92b6 commit 861f78f
Show file tree
Hide file tree
Showing 8 changed files with 413 additions and 0 deletions.
16 changes: 16 additions & 0 deletions tests/mock_server/templates/1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!doctype html>
<html>
<head>
<title>Example 1</title>
<meta charset="utf-8"/>
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
</head>

<body>
<ul>
<a href="http://127.0.0.1:8080/static/3.html">Example 3</a>
</ul>
<!-- http://www.example.com/should-not-exist -->
</body>
</html>
1 change: 1 addition & 0 deletions tests/mock_server/templates/1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"href":"http://127.0.0.1:8080/static/1.html","title":"Example 1","time":"2014-06-14T15:51:42Z"}]
16 changes: 16 additions & 0 deletions tests/mock_server/templates/2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!doctype html>
<html>
<head>
<title>Example 2</title>
<meta charset="utf-8"/>
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
</head>

<body>
<ul>
<a href="http://127.0.0.1:8080/static/4.html">Example 4</a>
</ul>
</body>
</html>

1 change: 1 addition & 0 deletions tests/mock_server/templates/2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"href":"http://127.0.0.1:8080/static/2.html","title":"Example 2","time":"2014-06-14T15:51:44Z"}]
Empty file added tests/parser/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions tests/parser/fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pytest

import os
import subprocess
import sqlite3

from typing import Optional, Union
from pathlib import Path

# This seems kind of heavy, running archivebox init before every test :shrug:
@pytest.fixture(autouse=True)
def archivebox_init(tmp_path):
    """Run ``archivebox init --quick`` inside ``tmp_path`` before every test.

    The working directory is switched to ``tmp_path`` for the duration of the
    test and restored afterwards, so the directory change does not leak into
    tests that run later in the same session.

    Fails the test immediately (with the command output) if the init command
    exits with a non-zero status, rather than letting later assertions fail
    for a confusing reason.
    """
    original_cwd = os.getcwd()
    os.chdir(tmp_path)
    try:
        process = subprocess.run(['archivebox', 'init', '--quick'], capture_output=True)
        if process.returncode != 0:
            output = (process.stdout + process.stderr).decode('utf-8', errors='replace')
            pytest.fail(f"archivebox init exited with {process.returncode}:\n{output}")
        yield
    finally:
        # Always go back, even when init failed or the test raised.
        os.chdir(original_cwd)

# We poke around in the database to verify what happened, but maybe we should be
# working with the Django Models instead?
@pytest.fixture
def db():
    """Yield a cursor on ``index.sqlite3`` in the current working directory.

    Rows come back as ``sqlite3.Row`` objects, so columns can be read by name
    or converted with ``dict(row)``. The underlying connection is closed once
    the test finishes so no database handle is leaked between tests.
    """
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    try:
        yield conn.cursor()
    finally:
        conn.close()

@pytest.fixture
def base_url():
    """Return the URL prefix used for the static test pages (trailing slash included)."""
    static_root = 'http://127.0.0.1:8080/static/'
    return static_root

@pytest.fixture
def base_path():
    """Return the absolute path of ``tests/mock_server/templates/`` as a string.

    The path is derived from this file's location instead of being relative,
    because the autouse ``archivebox_init`` fixture changes the working
    directory to a temporary directory, which makes any cwd-relative path
    point at the wrong place.

    The result ends with a path separator so callers can keep building file
    names with ``base_path + 'name.json'``.
    """
    # tests/parser/fixtures.py -> tests/ -> tests/mock_server/templates
    templates_dir = Path(__file__).resolve().parent.parent / 'mock_server' / 'templates'
    return f"{templates_dir}{os.sep}"

def add_files_or_urls(urls: Union[str, list[str]], options: Optional[list] = None):
    """Run ``archivebox add <urls...> --index-only [options...]``.

    Args:
        urls: A single URL/filename, or a list of them, passed as positional
            arguments to ``archivebox add``.
        options: Extra command-line arguments appended after ``--index-only``.

    Returns:
        The combined stdout and stderr of the command, decoded as UTF-8.
    """
    # Normalise the single-string form to a one-element list.
    targets = [urls] if isinstance(urls, str) else urls

    argv = ["archivebox", "add"] + targets + ["--index-only"] + list(options or [])

    # stderr is folded into stdout so the caller gets one interleaved transcript.
    completed = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    return str(completed.stdout, 'utf-8')

def add_stdin(data: str, options: Optional[list] = None):
    """Run ``archivebox add --index-only [options...]`` with ``data`` on stdin.

    Args:
        data: Text fed to the command's standard input (encoded as UTF-8).
        options: Extra command-line arguments appended after ``--index-only``.

    Returns:
        The combined stdout and stderr of the command, decoded as UTF-8.
    """
    # Build the command
    command = ["archivebox", "add", "--index-only"]
    if options:
        command += options

    # Let subprocess feed stdin and drain stdout together. Writing to
    # process.stdin by hand before communicate() can deadlock once the input
    # or the output outgrows the OS pipe buffer.
    process = subprocess.run(
        command,
        input=data.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )

    # should we check process.returncode here?

    return process.stdout.decode('utf-8')

# useful for debugging test cases
def dump_snapshot(db):
    """Print every row of the ``core_snapshot`` table as a dict, one per line.

    ``db`` must produce rows that ``dict()`` accepts, e.g. a cursor whose
    connection uses ``sqlite3.Row`` as its row factory.
    """
    snapshot_rows = db.execute("SELECT * FROM core_snapshot").fetchall()
    for record in map(dict, snapshot_rows):
        print(record)
97 changes: 97 additions & 0 deletions tests/parser/test_auto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from .fixtures import *

# Next five tests are the first ones from:
# https://github.com/ArchiveBox/ArchiveBox/issues/1363#issuecomment-1966177173
# (The defaults are --parser=auto and --depth=0.)
#
# The rest of the tests that test specifying the parser are in ./test_specific.py
#
def test_add_urls(db):
    """URLs given as command-line arguments each end up as exactly one snapshot."""
    # Equivalent CLI: archivebox add 'https://example.com' 'https://example.org'
    urls = ['https://example.com', 'https://example.org']
    add_files_or_urls(urls)

    def count(query, *params):
        return db.execute(query, params).fetchone()[0]

    assert count("SELECT COUNT(*) FROM core_snapshot") == len(urls)
    for url in urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 1, f"{url} not found, even though it was expected"

def test_add_stdin_urls(db):
    """Newline-separated URLs piped on stdin each end up as exactly one snapshot."""
    # Equivalent CLI:
    #   echo -e "https://example.com\nhttps://example.org" | archivebox add --depth=0
    urls = ['https://example.com', 'https://example.org']
    add_stdin("\n".join(urls))

    def count(query, *params):
        return db.execute(query, params).fetchone()[0]

    assert count("SELECT COUNT(*) FROM core_snapshot") == len(urls)
    for url in urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 1, f"{url} not found, even though it was expected"

def test_add_stdin_json(db, base_path):
    """The contents of example.json piped on stdin are parsed into snapshots and tags.

    Checks that exactly the expected URLs were indexed, that the
    should-not-exist URL was not, and that six tags were created.
    """
    # cat example.json | archivebox add --depth=0
    # Read through a context manager so the file handle is closed promptly.
    with open(base_path + 'example.json', 'r', encoding='utf-8') as json_file:
        add_stdin(json_file.read())

    expected = [
        'http://127.0.0.1:8080/static/title_og_with_html',
        'http://127.0.0.1:8080/static/shift_jis.html',
        'http://127.0.0.1:8080/static/iana.org.html',
        'http://127.0.0.1:8080/static/example.com.html',
    ]
    unexpected = [
        'http://www.example.com/should-not-exist',
    ]

    assert db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] == len(expected)
    for url in expected:
        assert db.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", (url,)).fetchone()[0] == 1, f"{url} not found, even though it was expected"
    for url in unexpected:
        assert db.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", (url,)).fetchone()[0] == 0, f"{url} was not expected to be found"
    assert db.execute("SELECT COUNT(*) FROM core_tag").fetchone()[0] == 6

def test_add_file_json(db, base_path):
    """Passing the path of example.json as an argument indexes its URLs and tags."""
    # Equivalent CLI: archivebox add --depth=0 example.json
    add_files_or_urls(base_path + 'example.json')

    wanted_urls = (
        'http://127.0.0.1:8080/static/title_og_with_html',
        'http://127.0.0.1:8080/static/shift_jis.html',
        'http://127.0.0.1:8080/static/iana.org.html',
        'http://127.0.0.1:8080/static/example.com.html',
    )
    unwanted_urls = ('http://www.example.com/should-not-exist',)

    def count(query, *params):
        return db.execute(query, params).fetchone()[0]

    assert count("SELECT COUNT(*) FROM core_snapshot") == len(wanted_urls)
    for url in wanted_urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 1, f"{url} not found, even though it was expected"
    for url in unwanted_urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 0, f"{url} was not expected to be found"
    assert count("SELECT COUNT(*) FROM core_tag") == 6

def test_add_stdin_file_json(db, base_path):
    """Piping the *path* of example.json on stdin indexes the file's URLs and tags."""
    # Equivalent CLI: echo 'example.json' | archivebox add --depth=0
    json_path = base_path + 'example.json'
    add_stdin(json_path)

    wanted_urls = (
        'http://127.0.0.1:8080/static/title_og_with_html',
        'http://127.0.0.1:8080/static/shift_jis.html',
        'http://127.0.0.1:8080/static/iana.org.html',
        'http://127.0.0.1:8080/static/example.com.html',
    )
    unwanted_urls = ('http://www.example.com/should-not-exist',)

    def count(query, *params):
        return db.execute(query, params).fetchone()[0]

    assert count("SELECT COUNT(*) FROM core_snapshot") == len(wanted_urls)
    for url in wanted_urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 1, f"{url} not found, even though it was expected"
    for url in unwanted_urls:
        assert count("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", url) == 0, f"{url} was not expected to be found"
    assert count("SELECT COUNT(*) FROM core_tag") == 6

0 comments on commit 861f78f

Please sign in to comment.