Skip to content

Commit

Permalink
chore CORE-4775: remove html page number metadata field (#2942)
Browse files Browse the repository at this point in the history
### Summary

Remove the `page_number` metadata field from HTML partition output until we
have page counting for all kinds of HTML files (not just news articles with
multiple `<article>` tags).

### Test
Unit tests
`test_add_chunking_strategy_on_partition_html_respects_multipage` and
`test_add_chunking_strategy_title_on_partition_auto_respects_multipage`
were removed since they rely on the `page_number` fields from the SEC HTML
file. The multipage check has moved to a mock test for `chunk_by_title`;
revisit those tests when we find a suitable test file for this.

Also changed the element ids in the partition outputs for HTML files: the
ids change because the page number is part of the element id hashing.
TODO ticket: update the other deterministic element id tests per crag's
comment.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: yuming-long <[email protected]>
  • Loading branch information
3 people committed Apr 30, 2024
1 parent 0d80886 commit 542d442
Show file tree
Hide file tree
Showing 31 changed files with 626 additions and 964 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.13.7-dev0

### Enhancements
* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.

### Features

### Fixes

## 0.13.6

### Enhancements
Expand Down
78 changes: 32 additions & 46 deletions test_unstructured/chunking/test_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,38 @@ def test_chunk_by_title_separates_by_page_number():
]


def test_chuck_by_title_respects_multipage():
    """`chunk_by_title(multipage_sections=True)` keeps a section together across page numbers.

    The first three input elements carry page numbers 1, 2 and 2 and are expected to be combined
    into a single `CompositeElement`. The trailing `CheckBox` has no counterpart in the expected
    chunks.
    """
    great_day = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
    ]
    okay_day = [
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
    ]
    bad_day = [
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(
                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
            ),
        ),
        Text("It is storming outside."),
    ]
    # Same element sequence as one flat literal: the table sits between the first two sections
    # and the checkbox comes last.
    elements: list[Element] = [
        *great_day,
        Table("Heading\nCell text"),
        *okay_day,
        *bad_day,
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=True, combine_text_under_n_chars=0)

    expected_chunks = [
        CompositeElement(
            "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        ),
        Table("Heading\nCell text"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
    ]
    assert chunks == expected_chunks


def test_chunk_by_title_does_not_break_on_regex_metadata_change():
"""PreChunker is insensitive to regex-metadata changes.
Expand Down Expand Up @@ -328,52 +360,6 @@ def test_add_chunking_strategy_respects_max_characters():
assert chunk_elements == chunks


def test_add_chunking_strategy_on_partition_html_respects_multipage():
    """Chunking requested through `partition_html` matches calling `chunk_by_title` directly.

    Compared for both `multipage_sections=False` and `multipage_sections=True`. The two settings
    must also produce a different number of chunks for this document.
    """
    html_path = "example-docs/example-10k-1p.html"
    # Size options shared by every chunking call below.
    size_opts = {
        "combine_text_under_n_chars": 0,
        "new_after_n_chars": 300,
        "max_characters": 400,
    }

    # Chunk as part of partitioning.
    via_partition_no_multipage = partition_html(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=False,
        **size_opts,
    )
    via_partition_multipage = partition_html(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=True,
        **size_opts,
    )

    # Partition without chunking, then chunk the result explicitly.
    unchunked = partition_html(html_path)
    direct_no_multipage = chunk_by_title(unchunked, multipage_sections=False, **size_opts)
    direct_multipage = chunk_by_title(unchunked, multipage_sections=True, **size_opts)

    assert via_partition_no_multipage == direct_no_multipage
    assert via_partition_multipage == direct_multipage
    # The flag must actually change how this document is chunked.
    assert len(via_partition_multipage) != len(via_partition_no_multipage)


def test_chunk_by_title_drops_detection_class_prob():
elements: list[Element] = [
Title(
Expand Down
46 changes: 0 additions & 46 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1097,52 +1097,6 @@ def test_add_chunking_strategy_on_partition_auto():
assert chunk_elements == chunks


def test_add_chunking_strategy_title_on_partition_auto_respects_multipage():
    """Chunking requested through `partition` matches calling `chunk_by_title` directly.

    Compared for both `multipage_sections=False` and `multipage_sections=True`. The two settings
    must also produce a different number of chunks for this document.
    """
    html_path = "example-docs/example-10k-1p.html"
    # Size options shared by every chunking call below.
    size_opts = {
        "combine_text_under_n_chars": 0,
        "new_after_n_chars": 300,
        "max_characters": 400,
    }

    # Chunk as part of partitioning.
    via_partition_no_multipage = partition(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=False,
        **size_opts,
    )
    via_partition_multipage = partition(
        html_path,
        chunking_strategy="by_title",
        multipage_sections=True,
        **size_opts,
    )

    # Partition without chunking, then chunk the result explicitly.
    unchunked = partition(html_path)
    direct_no_multipage = chunk_by_title(unchunked, multipage_sections=False, **size_opts)
    direct_multipage = chunk_by_title(unchunked, multipage_sections=True, **size_opts)

    assert via_partition_no_multipage == direct_no_multipage
    assert via_partition_multipage == direct_multipage
    # The flag must actually change how this document is chunked.
    assert len(via_partition_multipage) != len(via_partition_no_multipage)


def test_add_chunking_strategy_on_partition_auto_respects_max_chars():
filename = "example-docs/example-10k-1p.html"

Expand Down
14 changes: 6 additions & 8 deletions test_unstructured/partition/test_html_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,15 +731,13 @@ def test_all_element_ids_are_unique():


def test_element_ids_are_deterministic():
ids = [e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")]
assert ids == [
"cba9e551ed975e0f8a1956095894e92a",
"f540ea3b6569aafeb433df6616e79971",
"f4a34ee0fac26589fffdb53d0dfedbaf",
"15168aeddbd19da60791109a5a45af65",
"0c027f66120dd96271489dd0bb69bff5",
"abe89090c2e46dda8fff81053cc79f17",
ids_first_partition = [
e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")
]
ids_second_partition = [
e.id for e in partition_html("example-docs/fake-html-with-duplicate-elements.html")
]
assert ids_first_partition == ids_second_partition


def test_partition_html_b_tag_parsing():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"element_id": "b7b1c359c06495bd6fe8e174b2a9908f",
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:04:47+00:00",
Expand All @@ -17,7 +17,6 @@
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"element_id": "8d15e7cb1bbb2bf4bab95dcd20a79f29",
"element_id": "f346c0d677012f9d4265678f9626c829",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -16,14 +16,13 @@
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Documents",
"type": "Title"
},
{
"element_id": "2ef8cded92afdc398b5757e488f5d53d",
"element_id": "fea3bac751e7273dfe57b271fe9dd22b",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -39,8 +38,7 @@
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Events",
"type": "Title"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"element_id": "a227bc5e1e168472aa02c7ddeac6023b",
"element_id": "fe9e95d69e2fe6e0fcf74f630e24f11f",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -17,14 +17,13 @@
"languages": [
"cat",
"fra"
],
"page_number": 1
]
},
"text": "This is a plain text site page for testing purposes",
"type": "ListItem"
},
{
"element_id": "110e27269e69e01c41db4faf9a31d770",
"element_id": "bf2f616265a06fa30e74df2cf6291c40",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -41,14 +40,13 @@
"languages": [
"cat",
"fra"
],
"page_number": 1
]
},
"text": "These are bullet points meant for testing",
"type": "ListItem"
},
{
"element_id": "c398848281e72db6061cf211b7c211d9",
"element_id": "f59e42aff8f1b1ad83f8280a0686eabe",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -65,14 +63,13 @@
"languages": [
"cat",
"fra"
],
"page_number": 1
]
},
"text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam ex tellus, sodales non nulla et, sodales consequat turpis. Etiam vestibulum nisl placerat risus elementum, a sodales purus rhoncus. Sed eget velit pharetra, pretium nisi nec, laoreet ligula. Duis luctus mi in ligula cursus, vel lacinia tortor ultricies. Aenean sit amet sodales odio, a maximus elit. Pellentesque vehicula diam sit amet leo placerat placerat. Integer varius elementum accumsan. Donec posuere elit mauris, eget efficitur nisl viverra vitae.",
"type": "NarrativeText"
},
{
"element_id": "8a67276048c91e45cae58a087eba44cc",
"element_id": "9fa12141ac0e9ad3d09fe51dc393ad59",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -89,8 +86,7 @@
"languages": [
"cat",
"fra"
],
"page_number": 1
]
},
"text": "Integer at dictum nisi. Cras venenatis non velit in posuere. Curabitur tristique, eros eget tristique pellentesque, neque metus ullamcorper ligula, nec posuere neque lacus nec felis. Nulla a libero eget eros consectetur hendrerit. Pellentesque interdum, diam eget tristique pretium, quam lorem pulvinar lorem, a eleifend nisl lectus at ex. Praesent pulvinar ex ut consequat condimentum. Sed rutrum, erat a hendrerit blandit, urna mauris posuere est, at porttitor risus diam non leo. Nullam rutrum vehicula dolor, quis venenatis ligula rutrum sit amet. Nam massa justo, fermentum in dui lacinia, tincidunt imperdiet nunc. Nam posuere tortor ac lectus elementum, non mollis urna consequat. In interdum non tellus sed pellentesque.",
"type": "NarrativeText"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"element_id": "b7b1c359c06495bd6fe8e174b2a9908f",
"element_id": "32bc8af17151389d3e80f65036f8e65b",
"metadata": {
"data_source": {
"date_created": "2023-06-16T05:04:47+00:00",
Expand All @@ -17,7 +17,6 @@
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><tr><td></td><td></td><td>January 2023 ( Someone fed my essays into GPT to make something that could answer<br/>questions based on them, then asked it where good ideas come from. The<br/>answer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,<br/>or missing, or broken? You can see anomalies in everyday life (much<br/>of standup comedy is based on this), but the best place to look for<br/>them is at the frontiers of knowledge. Knowledge grows fractally.<br/>From a distance its edges look smooth, but when you learn enough<br/>to get close to one, you&#x27;ll notice it&#x27;s full of gaps. These gaps<br/>will seem obvious; it will seem inexplicable that no one has tried<br/>x or wondered about y. In the best case, exploring such gaps yields<br/>whole new fractal buds.</td></tr></table>"
},
"text": "January 2023 ( Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.) The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge. Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"element_id": "8d15e7cb1bbb2bf4bab95dcd20a79f29",
"element_id": "f346c0d677012f9d4265678f9626c829",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -16,14 +16,13 @@
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Documents",
"type": "Title"
},
{
"element_id": "2ef8cded92afdc398b5757e488f5d53d",
"element_id": "fea3bac751e7273dfe57b271fe9dd22b",
"metadata": {
"data_source": {
"date_created": "0001-01-01T08:00:00Z",
Expand All @@ -39,8 +38,7 @@
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Events",
"type": "Title"
Expand Down

0 comments on commit 542d442

Please sign in to comment.