From 2c8a3f3a83cf2f067048835114bdba81b7f092e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 25 Jul 2024 10:03:36 +0200 Subject: [PATCH 1/2] Add a minimal action I/O implementation to improve SearchAction support --- extruct/w3cmicrodata.py | 12 ++++++++++++ tests/samples/schema.org/SearchAction.001.html | 8 ++++++++ tests/samples/schema.org/SearchAction.001.json | 1 + tests/test_microdata.py | 15 +++++++++++++++ 4 files changed, 36 insertions(+) create mode 100644 tests/samples/schema.org/SearchAction.001.html create mode 100644 tests/samples/schema.org/SearchAction.001.json diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 89a79b3c..c2696eae 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -235,6 +235,18 @@ def _extract_property_value(self, node, items_seen, base_url, itemids, force=Fal elif node.get("content"): return node.get("content") + # https://schema.org/docs/actions.html#part-4 + elif ( + (itemprop := node.get("itemprop")) + and (itemprop.endswith("-input") or itemprop.endswith("-output")) + ): + result = {} + if "required" in node.attrib: + result["valueRequired"] = True + if name := node.get("name"): + result["valueName"] = name + return result + else: return self._extract_textContent(node) diff --git a/tests/samples/schema.org/SearchAction.001.html b/tests/samples/schema.org/SearchAction.001.html new file mode 100644 index 00000000..ef493125 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.html @@ -0,0 +1,8 @@ +
+ +
+ + + +
+
diff --git a/tests/samples/schema.org/SearchAction.001.json b/tests/samples/schema.org/SearchAction.001.json new file mode 100644 index 00000000..bd967365 --- /dev/null +++ b/tests/samples/schema.org/SearchAction.001.json @@ -0,0 +1 @@ +[{"type": "https://schema.org/WebSite", "properties": {"url": "https://www.example.com/", "potentialAction": {"type": "https://schema.org/SearchAction", "properties": {"target": "https://query.example.com/search?q={search_term_string}", "query-input": {"valueRequired": true, "valueName": "search_term_string"}}}}}] \ No newline at end of file diff --git a/tests/test_microdata.py b/tests/test_microdata.py index c1168a02..69e305cb 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -10,6 +10,18 @@ class TestMicrodata(unittest.TestCase): maxDiff = None + def _test_schemaorg(self, schema, indexes=None): + indexes = indexes or [1] + for i in indexes: + body = get_testdata("schema.org", f"{schema}.{i:03d}.html") + expected = json.loads( + get_testdata("schema.org", f"{schema}.{i:03d}.json").decode() + ) + mde = MicrodataExtractor() + data = mde.extract(body) + self.assertEqual(data, expected) + + def test_schemaorg_CreativeWork(self): for i in [1]: body = get_testdata("schema.org", "CreativeWork.{:03d}.html".format(i)) @@ -63,6 +75,9 @@ def test_schemaorg_Event(self): self.assertEqual(data, expected) + def test_schemaorg_SearchAction(self): + self._test_schemaorg("SearchAction") + def test_w3c_textContent_values(self): body = get_testdata("w3c", "microdata.4.2.strings.html") expected = json.loads( From 1000c838a921372d77733bbddd6189f0296eee14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 25 Jul 2024 10:22:32 +0200 Subject: [PATCH 2/2] Run pre-commit --- extruct/w3cmicrodata.py | 5 ++--- tests/test_microdata.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index c2696eae..0a7dbe6a 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -236,9 +236,8 @@ def _extract_property_value(self, node, items_seen, base_url, itemids, force=Fal return node.get("content") # https://schema.org/docs/actions.html#part-4 - elif ( - (itemprop := node.get("itemprop")) - and (itemprop.endswith("-input") or itemprop.endswith("-output")) + elif (itemprop := node.get("itemprop")) and ( + itemprop.endswith("-input") or itemprop.endswith("-output") ): result = {} if "required" in node.attrib: diff --git a/tests/test_microdata.py b/tests/test_microdata.py index 69e305cb..eda2d26c 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -21,7 +21,6 @@ def _test_schemaorg(self, schema, indexes=None): data = mde.extract(body) self.assertEqual(data, expected) - def test_schemaorg_CreativeWork(self): for i in [1]: body = get_testdata("schema.org", "CreativeWork.{:03d}.html".format(i))