Skip to content

Commit

Permalink
Be more efficient in determining the publication date
Browse files Browse the repository at this point in the history
We can store the publication date of each chapter in the download manifest, and then consult that when scaffolding the EPUB, instead of re-parsing the HTML we just generated.
  • Loading branch information
domenic committed Nov 6, 2024
1 parent fba981f commit bbd404d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
# cache key.
- uses: actions/cache@v4
with:
key: worm-ward-cache-2021-01-17
key: worm-ward-cache-2024-11-06
path: ./cache

- run: node ./lib/worm-scraper.js --book=worm
Expand Down
3 changes: 1 addition & 2 deletions lib/convert-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ function convertChapter(chapter, book, inputPath, outputPath) {
}

function getChapterString(chapter, book, rawChapterDoc) {
const datePublished = rawChapterDoc.querySelector(".entry-date").dateTime;
const { xml, warnings } =
getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content"));

Expand All @@ -32,7 +31,7 @@ function getChapterString(chapter, book, rawChapterDoc) {
<head>
<meta charset="utf-8"/>
<title>${chapter.title}</title>
<meta itemprop="datePublished" content="${datePublished}"/>
<meta itemprop="datePublished" content="${chapter.datePublished}"/>
</head>
${xml}
</html>`;
Expand Down
7 changes: 6 additions & 1 deletion lib/download.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ async function downloadAllChapters(manifest, startChapterURL, cachePath, manifes

const { contents, dom, url } = await downloadChapter(currentChapter);
const title = getChapterTitle(dom.window.document);
const datePublished = getChapterDatePublished(dom.window.document);
currentChapter = getNextChapterURL(dom.window.document);

dom.window.close();

manifest.push({ url, title, filename });
manifest.push({ url, title, datePublished, filename });
await fs.writeFile(path.resolve(cachePath, filename), contents);

// Incrementally update the manifest after every successful download, instead of waiting until the end.
Expand Down Expand Up @@ -89,6 +90,10 @@ function getChapterTitle(rawChapterDoc) {
return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(//u, " ");
}

/**
 * Reads a chapter's publication timestamp from its downloaded page.
 *
 * @param {Document} rawChapterDoc - Document of the downloaded chapter; it is expected to contain an element
 *   matching `.entry-date`.
 * @returns {string} The `dateTime` property of the first element matching `.entry-date`.
 * @throws {TypeError} If no element matches `.entry-date`, since the lookup result is then `null`.
 */
function getChapterDatePublished(rawChapterDoc) {
  const { dateTime } = rawChapterDoc.querySelector(".entry-date");
  return dateTime;
}

function retry(times, fn) {
if (times === 0) {
return fn();
Expand Down
26 changes: 9 additions & 17 deletions lib/scaffold.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"use strict";
const fs = require("fs").promises;
const path = require("path");
const { JSDOM } = require("jsdom");

const BOOK_PUBLISHER = "Domenic Denicola";
const BOOK_AUTHOR = "Wildbow";
Expand All @@ -23,12 +22,9 @@ module.exports = async (
await Promise.all([
fs.cp(scaffoldingPath, bookPath, { recursive: true, filter: noThumbs }),
fs.cp(coverImagePath, path.resolve(bookPath, "OEBPS", COVER_IMAGE_FILENAME)),
Promise.all([
getChapters(contentPath, chaptersPath, manifestPath),
getPublicationDate(chaptersPath)
]).then(([chapters, publicationDate]) => {
getChaptersAndDatePublished(contentPath, chaptersPath, manifestPath).then(([chapters, datePublished]) => {
return Promise.all([
writeOPF(chapters, contentPath, bookInfo, publicationDate),
writeOPF(chapters, contentPath, bookInfo, datePublished),
writeNav(chapters, contentPath)
]);
})
Expand All @@ -41,7 +37,7 @@ function noThumbs(filePath) {
return path.basename(filePath) !== "Thumbs.db";
}

function writeOPF(chapters, contentPath, bookInfo, publicationDate) {
function writeOPF(chapters, contentPath, bookInfo, datePublished) {
const manifestChapters = chapters.map(c => {
return ` <item id="${c.id}" href="${c.href}" media-type="application/xhtml+xml"/>`;
}).join("\n");
Expand All @@ -67,7 +63,7 @@ function writeOPF(chapters, contentPath, bookInfo, publicationDate) {
<meta refines="#creator" property="role" scheme="marc:relators">aut</meta>
<dc:publisher>${BOOK_PUBLISHER}</dc:publisher>
<dc:date>${publicationDate}</dc:date>
<dc:date>${datePublished}</dc:date>
<meta property="dcterms:modified">${dateWithoutMilliseconds}</meta>
<dc:description>${bookInfo.description}</dc:description>
Expand Down Expand Up @@ -123,15 +119,15 @@ ${navPoints}
return fs.writeFile(path.resolve(contentPath, NAV_FILENAME), contents);
}

async function getChapters(contentPath, chaptersPath, manifestPath) {
async function getChaptersAndDatePublished(contentPath, chaptersPath, manifestPath) {
const hrefPrefix = `${path.relative(contentPath, chaptersPath)}/`;

const manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" });
const manifestChapters = JSON.parse(manifestContents);

const filenames = await fs.readdir(chaptersPath);

return filenames
const chapters = filenames
.filter(f => path.extname(f) === ".xhtml")
.sort()
.map((f, i) => {
Expand All @@ -141,13 +137,9 @@ async function getChapters(contentPath, chaptersPath, manifestPath) {
href: `${hrefPrefix}${f}`
};
});
}

// We say that the publication date of the book is equal to the publication date of the last chapter.
async function getPublicationDate(chaptersPath) {
const filenames = await fs.readdir(chaptersPath);
// We say that the publication date of the book is equal to the publication date of the last chapter.
const { datePublished } = manifestChapters.at(-1);

const lastFile = filenames.at(-1);
const dom = await JSDOM.fromFile(path.resolve(chaptersPath, lastFile));
return dom.window.document.querySelector(`meta[itemprop="datePublished"]`).getAttribute("content");
return [chapters, datePublished];
}

0 comments on commit bbd404d

Please sign in to comment.