Skip to content

Commit

Permalink
Improve quotation generation and confirmation (#505)
Browse files Browse the repository at this point in the history
* Add polyfill-based text fragment quotation extraction
* Return to dom-anchor-text-quote-based quotation extraction
* Replace dom-anchor-text-quote with approx-string-match for quotation confirmation
* Disable polyfill-based quotation extraction test because it adds ~3m to our GH action test run

---------

Signed-off-by: Carl Gieringer <[email protected]>
  • Loading branch information
carlgieringer authored Aug 10, 2023
1 parent 445883b commit fca5fc6
Show file tree
Hide file tree
Showing 45 changed files with 8,560 additions and 212 deletions.
2 changes: 1 addition & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
**/testData/** -linguist-detectable
howdju-text-fragment-generation/dist/** -linguist-detectable
howdju-text-fragments/dist/** -linguist-detectable
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@
# macOS file system metadata. These were showing up in act's Github Workflow
# runner.
.DS_Store

*.cpuprofile
2 changes: 1 addition & 1 deletion babel.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module.exports = {
ignore: [
new RegExp(
// What we add here should probably go into Jest's transformIgnorePatterns too.
"/node_modules/(?!(@grrr/cookie-consent|@grrr/utils|nanoid|jsdom|strip-indent|normalize-url|text-fragments-polyfill))"
"/node_modules/(?!(@grrr/cookie-consent|@grrr/utils|nanoid|jsdom|strip-indent|normalize-url|text-fragments-polyfill|approx-string-match))"
),
],
plugins: [
Expand Down
2 changes: 1 addition & 1 deletion bin/check-todo-format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ target=${2:-}
# Equivalent for the whole repo:
# egrep --exclude=check-todo-format\.sh --exclude-dir=node_modules --exclude-dir=dist --exclude-dir=coverage --exclude-dir=Pods --exclude-dir=\.git -RI '\bTODO\b' . | egrep -v '\bTODO\((\d+,?)+\)'
git diff $base $target ':(exclude)bin/check-todo-format.sh' ':(exclude).github/workflows/ci.yml'\
':(exclude)howdju-text-fragment-generation/dist/global-fragment-generation.js'\
':(exclude)howdju-text-fragments/dist/global-fragment-generation.js'\
| grep '^+' | egrep '\bTODO\b' | egrep -v '\bTODO\((\d+,?)+\)'

# grep returns 0 if it found matches. It is an error if we found matches.
Expand Down
62 changes: 62 additions & 0 deletions howdju-common/lib/approximateStringMatch.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { readFileSync } from "fs";
import { JSDOM } from "jsdom";
import stripIndent from "strip-indent";
import * as textPosition from "dom-anchor-text-position";

import { approximateMatch } from "./approximateStringMatch";
import { toPlainTextContent } from "./domCommon";

// Exercises approximateMatch against a saved copy of a Lex Fridman transcript page
// (lib/testData/urlTextFragments/lexfridman.html). The expected offsets below are
// tied to that fixture's exact contents.
describe("approximateMatch", () => {
// A long quotation containing a line break should yield exactly one match, and the
// text recovered from that match should be the quotation with the break flattened.
test("matches", () => {
const html = readFileSync(
"lib/testData/urlTextFragments/lexfridman.html",
"utf8"
);
const dom = new JSDOM(html);
const doc = dom.window.document;
const quotation = stripIndent(`
Robert F. Kennedy Jr
(00:09:49) I suppose the way that Camus viewed the world and the way that the Stoics did and a lot of the existentialists, it was that it was so absurd and that the problems and the tasks that were given just to live a life are so insurmountable that the only way that we can get back the gods for giving us this impossible task of living life was to embrace it and to enjoy it and to do our best at it. To me, I read Camus, and particularly in The Myth of Sisyphus as a parable that… And it’s the same lesson that I think he writes about in The Plague, where we’re all given these insurmountable tasks in our lives, but that by doing our duty, by being of service to others, we can bring meaning to a meaningless chaos and we can bring order to the universe.
`).trim();

// Search the body's raw textContent; fall back to "" when textContent is null.
const matches = approximateMatch(doc.body.textContent ?? "", quotation);

// start/end are offsets into the searched string (doc.body.textContent).
expect(matches).toEqual([{ end: 10800, errors: 25, start: 9995 }]);
const [{ start, end }] = matches;
// Convert the text offsets back into a DOM Range rooted at the body so the matched
// content can be rendered as plain text.
const range = textPosition.toRange(doc.body, { start, end });
const foundQuotation = toPlainTextContent(range);
// TODO(507) it should be possible to match the quotation exactly.
// The recovered text has a space where the quotation has a line break after the
// speaker's name, so it is compared against this flattened form.
const expectedFoundQuotation = `Robert F. Kennedy Jr (00:09:49) I suppose the way that Camus viewed the world and the way that the Stoics did and a lot of the existentialists, it was that it was so absurd and that the problems and the tasks that were given just to live a life are so insurmountable that the only way that we can get back the gods for giving us this impossible task of living life was to embrace it and to enjoy it and to do our best at it. To me, I read Camus, and particularly in The Myth of Sisyphus as a parable that… And it’s the same lesson that I think he writes about in The Plague, where we’re all given these insurmountable tasks in our lives, but that by doing our duty, by being of service to others, we can bring meaning to a meaningless chaos and we can bring order to the universe.`;
expect(foundQuotation).toEqual(expectedFoundQuotation);
});

// Documents a known shortcoming: for this short quotation the first returned match
// does not cover the whole quotation.
test("matches non-optimally", () => {
const html = readFileSync(
"lib/testData/urlTextFragments/lexfridman.html",
"utf8"
);
const dom = new JSDOM(html);
const doc = dom.window.document;
const quotation = stripIndent(`
Lex Fridman
(00:21:33) And you think that kind of empathy that you referred to, that requires moral courage?
`).trim();

// Only the first returned match is inspected.
const [{ start, end, errors }] = approximateMatch(
doc.body.textContent ?? "",
quotation
);

expect({ start, end, errors }).toEqual({
start: 19933,
end: 20035,
errors: 21,
});
const range = textPosition.toRange(doc.body, { start, end });
const foundQuotation = toPlainTextContent(range);
// TODO(507) it should be possible to match the quotation exactly.
// The text recovered from the match is the quotation minus its first 20 characters.
expect(foundQuotation).toEqual(quotation.substring(20));
});
});
7 changes: 7 additions & 0 deletions howdju-common/lib/approximateStringMatch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import search from "approx-string-match";

/**
 * Default error budget handed to `approx-string-match`'s `search` when a caller
 * does not supply its own.
 *
 * NOTE(review): this is an absolute count, not scaled to the query's length. For
 * queries shorter than the budget this presumably accepts almost any text; confirm
 * and pass a smaller `maxErrors` for short queries if so.
 */
export const MAX_ACCEPTABLE_ERRORS = 50;

/**
 * Searches `document` for approximate occurrences of `query`.
 *
 * This is a thin wrapper around `approx-string-match`'s `search`; the result is
 * returned unchanged. Callers in this package read `start`, `end`, and `errors`
 * from each returned match.
 *
 * @param document The text to search within.
 * @param query The text to look for.
 * @param maxErrors The largest error count passed to `search`. Defaults to
 *   {@link MAX_ACCEPTABLE_ERRORS}, which preserves the previous behavior.
 * @returns The matches reported by `search`; empty when nothing matched.
 */
export function approximateMatch(
  document: string,
  query: string,
  maxErrors: number = MAX_ACCEPTABLE_ERRORS
) {
  return search(document, query, maxErrors);
}
46 changes: 45 additions & 1 deletion howdju-common/lib/domCommon.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
import { readFileSync } from "fs";
import { JSDOM } from "jsdom";

import { getElementById, getFirstChild } from "howdju-test-common";
import { nodePositionCompare, walkRangeNodes } from "./domCommon";

import {
nodePositionCompare,
walkRangeNodes,
findTextInDoc,
} from "./domCommon";
import stripIndent from "strip-indent";

describe("nodePositionCompare", () => {
test("works for siblings", () => {
Expand Down Expand Up @@ -173,3 +181,39 @@ describe("walkRangeNodes", () => {
]);
});
});

// Exercises findTextInDoc against saved copies of real pages. Each test parses a
// fixture with JSDOM and checks the text that findTextInDoc returns for a quotation.
describe("findTextInDoc", () => {
// A two-paragraph quotation should be returned exactly as given.
test("finds multiline text in the Seattle Times", () => {
const url =
"https://www.seattletimes.com/seattle-news/homeless/heres-why-people-think-seattle-will-reverse-course-on-homelessness/";
const html = readFileSync(
"lib/testData/domBibliographicInfoTestData/seattletimes.html",
"utf8"
);
const dom = new JSDOM(html, { url });
const doc = dom.window.document;
const quotation = stripIndent(`
Many poll respondents said the reason they believe the homelessness crisis is worse now than it was three years ago is because they see it more.
“I see a lot more encampments around or RVs parked on the side of the road where they didn’t used to be,” said Drew Scoggins, a Northgate resident who responded to the poll.`).trim();

expect(findTextInDoc(doc, quotation)).toEqual(quotation);
});
// Here the returned text is not identical to the input: the quotation has a line
// break after the speaker's name, while the expected result has a space there.
test("finds multiline text in a Lex Fridman podcast transcript", () => {
const url = "https://lexfridman.com/robert-f-kennedy-jr-transcript/";
const html = readFileSync(
"lib/testData/urlTextFragments/lexfridman.html",
"utf8"
);
const dom = new JSDOM(html, { url });
const doc = dom.window.document;
const quotation = stripIndent(`
Robert F. Kennedy Jr
(00:09:49) I suppose the way that Camus viewed the world and the way that the Stoics did and a lot of the existentialists, it was that it was so absurd and that the problems and the tasks that were given just to live a life are so insurmountable that the only way that we can get back the gods for giving us this impossible task of living life was to embrace it and to enjoy it and to do our best at it. To me, I read Camus, and particularly in The Myth of Sisyphus as a parable that… And it’s the same lesson that I think he writes about in The Plague, where we’re all given these insurmountable tasks in our lives, but that by doing our duty, by being of service to others, we can bring meaning to a meaningless chaos and we can bring order to the universe.
`).trim();
// The text expected back from the document (single line, space after the name).
const foundQuotation = `Robert F. Kennedy Jr (00:09:49) I suppose the way that Camus viewed the world and the way that the Stoics did and a lot of the existentialists, it was that it was so absurd and that the problems and the tasks that were given just to live a life are so insurmountable that the only way that we can get back the gods for giving us this impossible task of living life was to embrace it and to enjoy it and to do our best at it. To me, I read Camus, and particularly in The Myth of Sisyphus as a parable that… And it’s the same lesson that I think he writes about in The Plague, where we’re all given these insurmountable tasks in our lives, but that by doing our duty, by being of service to others, we can bring meaning to a meaningless chaos and we can bring order to the universe.`;

expect(findTextInDoc(doc, quotation)).toEqual(foundQuotation);
});
});
71 changes: 64 additions & 7 deletions howdju-common/lib/domCommon.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as textPosition from "dom-anchor-text-position";
import * as textQuote from "dom-anchor-text-quote";
import { indexOf } from "lodash";
import { approximateMatch } from "./approximateStringMatch";

import { logger } from "./logger";

Expand Down Expand Up @@ -65,13 +66,21 @@ export function nodePositionCompare(node1: Node, node2: Node) {
export function getTextWithin(
doc: Document,
startText: string,
endText: string
endText: string,
{ prefix, suffix }: { prefix?: string; suffix?: string } = {
prefix: undefined,
suffix: undefined,
}
) {
// Some sites include the content of the page in a script tag. E.g. substack's `body_html`. So
// use a hint at the beginning to try and find content in the body. (If we find this doesn't work,
// we might need to use a binary search style approach until we either have exhausted ranges in
// the document or have found a range that isn't in a script tag.)
const { range } = getRangeOfText(doc, startText, endText, 0);
const { range } = getRangeOfText(doc, startText, endText, {
hint: 0,
prefix,
suffix,
});
if (!range) {
return undefined;
}
Expand Down Expand Up @@ -111,23 +120,44 @@ function isScriptNode(node: Node) {
);
}

/**
 * Looks for `text` in `doc` and returns the document's plain-text rendering of
 * the located range.
 *
 * The result is produced from the document, not echoed from the input, so it can
 * differ from `text` (for example in whitespace).
 *
 * @param doc The document to search.
 * @param text The text to look for.
 * @returns The plain text of the located range, or `undefined` when no range
 *   was found.
 */
export function findTextInDoc(doc: Document, text: string): string | undefined {
  const located = getRangeOfTextInDoc(doc, text);
  return located ? toPlainTextContent(located) : undefined;
}

/**
 * Locates `quotation` in the text content of `doc.body` and returns the
 * corresponding DOM range.
 *
 * The body's `textContent` (or `""` when it is empty or null) is searched with
 * `approximateMatch`. Only the first reported match is used; its `start`/`end`
 * offsets are converted to a range rooted at `doc.body`.
 *
 * @param doc The document whose body is searched.
 * @param quotation The text to locate.
 * @returns The range for the first match, or `undefined` when there is no match
 *   or no range could be produced for it.
 */
export function getRangeOfTextInDoc(
  doc: Document,
  quotation: string
): Range | undefined {
  const bodyText = doc.body.textContent || "";
  const candidates = approximateMatch(bodyText, quotation);
  if (candidates.length === 0) {
    return undefined;
  }

  const firstMatch = candidates[0];
  const located = textPosition.toRange(doc.body, {
    start: firstMatch.start,
    end: firstMatch.end,
  });
  return located ? located : undefined;
}

function getRangeOfText(
doc: Document,
startText: string,
endText: string,
hint?: number
{ prefix, suffix, hint }: { prefix?: string; suffix?: string; hint?: number }
) {
let startPosition = textQuote.toTextPosition(
doc.body,
{ exact: startText },
{ exact: startText, prefix },
hint !== undefined ? { hint } : undefined
);
if (!startPosition) {
return { range: undefined, end: undefined };
}
let endPosition = textQuote.toTextPosition(
doc.body,
{ exact: endText },
{ exact: endText, suffix },
{ hint: startPosition.end }
);
if (!endPosition) {
Expand Down Expand Up @@ -173,6 +203,26 @@ function getRangeOfText(
}
}

// Sometimes dom-anchor-text-quote finds a bad start position. E.g. for
// https://lexfridman.com/robert-f-kennedy-jr-transcript/#:~:text=Camus-,Lex%20Fridman,act%20of%20rebellion.%E2%80%9D%20What%20do%20you%20think%20he%20means%20by%20that%3F,-Robert%20F.%20Kennedy
// it finds a start position at the beginning of the document.
// So after finding the end position, see if there is a closer match for the start position.
const maybeCloserStartPosition = textQuote.toTextPosition(
doc.body,
{ exact: startText, prefix },
{ hint: endPosition.start }
);
if (maybeCloserStartPosition) {
const maybeCloserStartPositionDistance =
endPosition.start - maybeCloserStartPosition.end;
if (
maybeCloserStartPositionDistance > 0 &&
maybeCloserStartPositionDistance < endPosition.start - startPosition.end
) {
startPosition = maybeCloserStartPosition;
}
}

const range = textPosition.toRange(doc.body, {
start: startPosition.start,
end: endPosition.end,
Expand Down Expand Up @@ -225,6 +275,7 @@ export function toPlainTextContent(range: Range) {
} else {
text = node.textContent;
}
text = text?.trim();
if (text) {
textParts.push(text);
}
Expand All @@ -233,13 +284,19 @@ export function toPlainTextContent(range: Range) {
leave: (node) => {
if (
node.nodeType === Node.ELEMENT_NODE &&
node.nodeName.toLowerCase() === "p"
["p", "div", "h1", "h2", "h3", "h4", "h5", "h6"].includes(
node.nodeName.toLowerCase()
)
) {
textParts.push("\n\n");
}
},
});
return textParts.join("").replace(/\s+$/gm, "\n").trim();
return textParts
.join(" ")
.replace(/^\s+/gm, "")
.replace(/\s+$/gm, "\n")
.trim();
}

function isTextNode(node: Node): node is Text {
Expand Down
1 change: 1 addition & 0 deletions howdju-common/lib/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/// <reference path="../../howdju-test-common/lib/globals.d.ts" />

export * from "./anchors";
export * from "./approximateStringMatch";
export * from "./arguments";
export * from "./apiModels";
export * from "./codes";
Expand Down
Loading

0 comments on commit fca5fc6

Please sign in to comment.