Skip to content

Commit

Permalink
Add test cases, plus a fix and test for issue ArchiveBox#1363
Browse files Browse the repository at this point in the history
  • Loading branch information
jimwins committed Feb 27, 2024
1 parent 22f9a28 commit c921966
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 0 deletions.
7 changes: 7 additions & 0 deletions archivebox/cli/archivebox_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
stdin_urls = ''
if not urls:
stdin_urls = accept_stdin(stdin)
else:
if command.parser != "auto":
stderr(
'[X] --parser can\'t be used with URLs or file paths, only stdin\n',
color='red',
)
raise SystemExit(2)

if (stdin_urls and urls) or (not stdin and not urls):
stderr(
Expand Down
24 changes: 24 additions & 0 deletions tests/mock_server/templates/example.atom
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<feed
xml:lang="en"
xmlns="http://www.w3.org/2005/Atom"
>
<id>http://www.example.com/</id>
<title>Example of an Atom feed</title>
<link rel="self" type="application/atom+xml" href="http://www.example.com/index.atom" />
<link rel="alternate" type="text/html" href="http://www.example.com/" />
<author>
<name>Jim Winstead</name>
</author>
<updated>2024-02-26T03:18:26Z</updated>
<entry>
<title>Example</title>
<link rel="alternate" type="text/html" href="http://127.0.0.1:8080/static/example.com.html" />
<id>tag:example.com,2024-02-25:3319</id>
<updated>2024-02-26T03:18:26Z</updated>
<published>2024-02-25T19:18:25-08:00</published>
<category term="Tag1" scheme="http://example.com/archive" />
<category term="Tag2" scheme="http://example.com/archive" />
<content type="html">This is some &lt;b&gt;content&lt;/b&gt;</content>
</entry>
</feed>
32 changes: 32 additions & 0 deletions tests/mock_server/templates/example.rss
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:admin="http://webns.net/mvcb/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<channel>
<title>Sample Feed</title>
<link>http://example.org/</link>
<description>For documentation only</description>
<dc:language>en-us</dc:language>
<dc:creator>Nobody (nobody@example.org)</dc:creator>
<dc:rights>Public domain</dc:rights>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<admin:generatorAgent rdf:resource="http://www.example.org/"/>
<admin:errorReportsTo rdf:resource="mailto:nobody@example.org"/>

<item>
<title>First!</title>
<link>http://127.0.0.1:8080/static/example.com.html</link>
<guid isPermaLink="false">[email protected]</guid>
<description>
This has a description.
</description>
<dc:subject>Tag1 Tag2</dc:subject>
<dc:date>2024-02-26T17:28:12-08:00</dc:date>
<content:encoded><![CDATA[
This has a <b>description</b>.]]>
</content:encoded>
</item>
</channel>
</rss>
76 changes: 76 additions & 0 deletions tests/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,79 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):

assert (archived_item_path / "warc").exists()
assert not (archived_item_path / "singlefile.html").exists()

def test_explicit_parser_fails_with_url(tmp_path, process, disable_extractors_dict):
    """Check that `archivebox add` rejects `--parser` when a URL is passed.

    An explicit parser is only accepted for stdin input, so combining
    `--parser=rss` with a URL argument must print an error on stderr and
    exit with status 2.
    """
    arg_process = subprocess.run(
        ["archivebox", "add", "--parser=rss", "http://127.0.0.1:8080/static/example.com.html"],
        capture_output=True,
        env=disable_extractors_dict,
    )
    # The message alone is not enough: the command must also signal failure
    # through its exit status.
    assert arg_process.returncode == 2
    assert "--parser can't be" in arg_process.stderr.decode("utf-8")

def test_generic_rss(tmp_path, process, disable_extractors_dict):
    """Pipe the RSS fixture into `archivebox add --parser=rss` and check the index.

    The item's <link> must be stored as a snapshot URL, an XML namespace URL
    must not be (that would mean a different parser handled the input), and
    the item's <dc:subject> value is expected to be stored as one tag.
    """
    from pathlib import Path

    # Locate the fixture relative to this test file so the test does not
    # depend on the directory it happens to be run from.
    feed_path = Path(__file__).parent / "mock_server" / "templates" / "example.rss"
    with open(feed_path, "r", encoding="utf-8") as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    # Read-only queries: nothing to commit, but always release the connection,
    # even if a query raises.
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://purl.org/dc/elements/1.1/" not in urls

    assert "Tag1 Tag2" in tags

def test_pinboard_rss(tmp_path, process, disable_extractors_dict):
    """Pipe the RSS fixture into `archivebox add --parser=pinboard_rss`.

    With this parser the test expects "Tag1" and "Tag2" to be stored as two
    separate tags in the index.
    """
    from pathlib import Path

    # Locate the fixture relative to this test file so the test does not
    # depend on the directory it happens to be run from.
    feed_path = Path(__file__).parent / "mock_server" / "templates" / "example.rss"
    with open(feed_path, "r", encoding="utf-8") as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=pinboard_rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    # Read-only query: nothing to commit, but always release the connection,
    # even if the query raises.
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "Tag1" in tags
    assert "Tag2" in tags

def test_atom(tmp_path, process, disable_extractors_dict):
    """Pipe the Atom fixture into `archivebox add --parser=rss` and check the index.

    The entry's link must be stored as a snapshot URL, the Atom namespace URL
    must not be (that would mean a different parser handled the input), and
    the test expects "Tag1" and "Tag2" to be stored as separate tags.
    """
    from pathlib import Path

    # Locate the fixture relative to this test file so the test does not
    # depend on the directory it happens to be run from.
    feed_path = Path(__file__).parent / "mock_server" / "templates" / "example.atom"
    with open(feed_path, "r", encoding="utf-8") as f:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--parser=rss"],
            stdin=f,
            capture_output=True,
            env=disable_extractors_dict,
        )

    # Read-only queries: nothing to commit, but always release the connection,
    # even if a query raises.
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        urls = [row[0] for row in c.execute("SELECT url from core_snapshot").fetchall()]
        tags = [row[0] for row in c.execute("SELECT name from core_tag").fetchall()]
    finally:
        conn.close()

    assert "http://127.0.0.1:8080/static/example.com.html" in urls
    # if the following URL appears, we must have fallen back to another parser
    assert "http://www.w3.org/2005/Atom" not in urls

    assert "Tag1" in tags
    assert "Tag2" in tags

0 comments on commit c921966

Please sign in to comment.