Skip to content

Commit

Permalink
Update to EPUB 3 output
Browse files Browse the repository at this point in the history
As part of this, a few invisible structural improvements:

* Use the same cover HTML everywhere, instead of one per book.
* Include each chapter's publication date as microdata in each output chapter.
* Move each chapter's original URL from an HTML comment to microdata.

And a few possibly-visible improvements:

* Include the publication date for each book in the EPUB's metadata. (It's set to the last chapter's publication date.)
* Add a last-modified date to the EPUB metadata, equal to the date the EPUB was generated. This might suffice for #46, but you could also imagine something better...
* Add a landmark for the beginning of the content, which should allow some readers to skip past the cover when desired.
* Stop marking the cover as "auxiliary", so that the cover appears in certain viewers (such as Calibre).

Fixes #45.
  • Loading branch information
domenic authored Nov 6, 2024
1 parent c2bdbcd commit fba981f
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 123 deletions.
File renamed without changes
24 changes: 0 additions & 24 deletions covers/ward/cover.xhtml

This file was deleted.

File renamed without changes
24 changes: 0 additions & 24 deletions covers/worm/cover.xhtml

This file was deleted.

15 changes: 9 additions & 6 deletions lib/convert-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,19 @@ function convertChapter(chapter, book, inputPath, outputPath) {
}

function getChapterString(chapter, book, rawChapterDoc) {
const datePublished = rawChapterDoc.querySelector(".entry-date").dateTime;
const { xml, warnings } =
getBodyXML(chapter, book, rawChapterDoc.querySelector(".entry-content"));

const output = `<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
const output = `<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en"
itemscope="itemscope" itemtype="https://schema.org/Chapter"
itemid="${chapter.url}">
<head>
<meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
<meta charset="utf-8"/>
<title>${chapter.title}</title>
<meta itemprop="datePublished" content="${datePublished}"/>
</head>
${xml}
</html>`;
Expand Down Expand Up @@ -298,10 +302,9 @@ function getBodyXML(chapter, book, contentEl) {
}

// Serializer inserts extra xmlns for us since it doesn't know we're going to put this into a <html>.
// Use this opportunity to insert a comment pointing to the original URL, for reference.
xml = xml.replace(
/<body xmlns="http:\/\/www.w3.org\/1999\/xhtml">/u,
`<body>\n<!-- ${chapter.url} -->\n`
`<body>\n`
);

return { xml, warnings };
Expand Down
146 changes: 80 additions & 66 deletions lib/scaffold.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,35 @@
"use strict";
const fs = require("fs").promises;
const path = require("path");
const { JSDOM } = require("jsdom");

const BOOK_PUBLISHER = "Domenic Denicola";
const BOOK_AUTHOR = "Wildbow";

const NCX_FILENAME = "toc.ncx";

module.exports = async (scaffoldingPath, coverPath, bookPath, contentPath, chaptersPath, manifestPath, bookInfo) => {
const COVER_DOCUMENT_FILENAME = "cover.xhtml";
const COVER_IMAGE_FILENAME = "cover.jpg";
const COVER_IMAGE_MIMETYPE = "image/jpeg";
const NAV_FILENAME = "nav.xhtml";

module.exports = async (
scaffoldingPath,
coverImagePath,
bookPath,
contentPath,
chaptersPath,
manifestPath,
bookInfo
) => {
await Promise.all([
fs.cp(scaffoldingPath, bookPath, { recursive: true, filter: noThumbs }),
fs.cp(coverPath, path.resolve(bookPath, "OEBPS"), { recursive: true, filter: noThumbs }),
fs.cp(coverImagePath, path.resolve(bookPath, "OEBPS", COVER_IMAGE_FILENAME)),
Promise.all([
getChapters(contentPath, chaptersPath, manifestPath),
getCoverFiles(coverPath)
]).then(([chapters, coverFiles]) => {
getPublicationDate(chaptersPath)
]).then(([chapters, publicationDate]) => {
return Promise.all([
writeOPF(chapters, contentPath, coverFiles, bookInfo),
writeNcx(chapters, contentPath, bookInfo)
writeOPF(chapters, contentPath, bookInfo, publicationDate),
writeNav(chapters, contentPath)
]);
})
]);
Expand All @@ -26,84 +38,89 @@ module.exports = async (scaffoldingPath, coverPath, bookPath, contentPath, chapt
};

/**
 * Copy filter that rejects Windows thumbnail caches.
 *
 * Passed as the `filter` option to `fs.cp` when copying the scaffolding, so it
 * is called with a path and must answer whether that entry should be copied.
 *
 * @param {string} filePath - Path of the entry being considered for copying.
 * @returns {boolean} `false` when the entry's base name is exactly
 *   `Thumbs.db`, `true` for everything else.
 */
function noThumbs(filePath) {
  // Windows keeps Thumbs.db locked much of the time, and trying to copy it
  // produces very strange errors, so it is simply left out.
  const fileName = path.basename(filePath);
  const isThumbsDb = fileName === "Thumbs.db";
  return !isThumbsDb;
}

function writeOPF(chapters, contentPath, coverFiles, bookInfo) {
function writeOPF(chapters, contentPath, bookInfo, publicationDate) {
const manifestChapters = chapters.map(c => {
return `<item id="${c.id}" href="${c.href}" media-type="application/xhtml+xml"/>`;
return ` <item id="${c.id}" href="${c.href}" media-type="application/xhtml+xml"/>`;
}).join("\n");

const spineChapters = chapters.map(c => {
return `<itemref idref="${c.id}"/>`;
return ` <itemref idref="${c.id}"/>`;
}).join("\n");

const contents = `<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
const dateWithoutMilliseconds = `${(new Date()).toISOString().split(".")[0]}Z`;

/* eslint-disable max-len */
const contents = `<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="BookId" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>${bookInfo.title}</dc:title>
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="BookId">urn:uuid:${bookInfo.id}</dc:identifier>
<dc:language>en</dc:language>
<dc:identifier id="BookId" opf:scheme="UUID">urn:uuid:${bookInfo.id}</dc:identifier>
<dc:creator opf:file-as="${BOOK_AUTHOR}" opf:role="aut">${BOOK_AUTHOR}</dc:creator>
<dc:title id="title">${bookInfo.title}</dc:title>
<meta refines="#title" property="title-type">main</meta>
<dc:creator id="creator">${BOOK_AUTHOR}</dc:creator>
<meta refines="#creator" property="role" scheme="marc:relators">aut</meta>
<dc:publisher>${BOOK_PUBLISHER}</dc:publisher>
<dc:date>${publicationDate}</dc:date>
<meta property="dcterms:modified">${dateWithoutMilliseconds}</meta>
<dc:description>${bookInfo.description}</dc:description>
<meta name="cover" content="cover-image"/>
</metadata>
<manifest>
<item id="ncx" href="${NCX_FILENAME}" media-type="application/x-dtbncx+xml"/>
<item id="cover" href="${coverFiles.xhtml}" media-type="application/xhtml+xml"/>
<item id="cover-image" href="${coverFiles.image}" media-type="${coverFiles.imageMimeType}"/>
<item id="nav" href="${NAV_FILENAME}" media-type="application/xhtml+xml" properties="nav"/>
<item id="cover" href="${COVER_DOCUMENT_FILENAME}" media-type="application/xhtml+xml"/>
<item id="cover-image" href="${COVER_IMAGE_FILENAME}" media-type="${COVER_IMAGE_MIMETYPE}" properties="cover-image"/>
${manifestChapters}
</manifest>
<spine toc="ncx">
<itemref idref="cover" linear="no"/>
<spine>
<itemref idref="cover"/>
${spineChapters}
</spine>
<guide>
<reference type="cover" title="Cover" href="${coverFiles.xhtml}"/>
</guide>
</package>`;
/* eslint-enable max-len */

return fs.writeFile(path.resolve(contentPath, "content.opf"), contents);
}

function writeNcx(chapters, contentPath, bookInfo) {
const navPoints = chapters.map((c, i) => {
return `<navPoint class="chapter" id="${c.id}" playOrder="${i + 1}">
<navLabel><text>${c.title}</text></navLabel>
<content src="${c.href}"/>
</navPoint>`;

function writeNav(chapters, contentPath) {
const navPoints = chapters.map(c => {
return ` <li><a href="${c.href}">${c.title}</a></li>`;
}).join("\n");

const contents = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx version="2005-1" xml:lang="en" xmlns="http://www.daisy.org/z3986/2005/ncx/">
<head>
<meta name="dtb:uid" content="urn:uuid:${bookInfo.id}"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>${bookInfo.title}</text>
</docTitle>
<docAuthor>
<text>${BOOK_AUTHOR}</text>
</docAuthor>
<navMap>
const contents = `<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en">
<head>
<meta charset="utf-8"/>
<title>Table of Contents</title>
</head>
<body>
<nav epub:type="toc" id="toc">
<h1>Table of Contents</h1>
<ol>
${navPoints}
</navMap>
</ncx>`;

return fs.writeFile(path.resolve(contentPath, NCX_FILENAME), contents);
</ol>
</nav>
<nav epub:type="landmarks">
<h2>Guide</h2>
<ol>
<li><a epub:type="cover" href="${COVER_DOCUMENT_FILENAME}">Cover</a></li>
<li><a epub:type="bodymatter" href="${chapters[0].href}">Begin Reading</a></li>
</ol>
</nav>
</body>
</html>`;

return fs.writeFile(path.resolve(contentPath, NAV_FILENAME), contents);
}

async function getChapters(contentPath, chaptersPath, manifestPath) {
Expand All @@ -126,14 +143,11 @@ async function getChapters(contentPath, chaptersPath, manifestPath) {
});
}

async function getCoverFiles(coverPath) {
const filenames = await fs.readdir(coverPath);

const images = filenames.filter(f => [".png", ".jpg"].includes(path.extname(f)));
if (images.length !== 1) {
throw new Error(`Expected one cover image in ${coverPath}; found ${images.length}`);
}
const imageMimeType = path.extname(images[0]) === ".png" ? "image/png" : "image/jpeg";
// We say that the publication date of the book is equal to the publication date of the last chapter.
async function getPublicationDate(chaptersPath) {
const filenames = await fs.readdir(chaptersPath);

return { xhtml: "cover.xhtml", imageMimeType, image: images[0] };
const lastFile = filenames.at(-1);
const dom = await JSDOM.fromFile(path.resolve(chaptersPath, lastFile));
return dom.window.document.querySelector(`meta[itemprop="datePublished"]`).getAttribute("content");
}
4 changes: 2 additions & 2 deletions lib/worm-scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ const cachePath = path.resolve(argv.cache, argv.book);
const manifestPath = path.resolve(cachePath, "manifest.json");

const scaffoldingPath = path.resolve(__dirname, "../scaffolding");
const coverPath = path.resolve(__dirname, "../covers", argv.book);
const coverImagePath = path.resolve(__dirname, "../covers", `${argv.book}.jpg`);
const stagingPath = path.resolve(argv.staging, argv.book);
const contentPath = path.resolve(stagingPath, "OEBPS");
const chaptersPath = path.resolve(contentPath, "chapters");
Expand Down Expand Up @@ -96,7 +96,7 @@ if (argv._.includes("scaffold")) {
const bookInfo = books[argv.book];
commands.push(() => scaffold(
scaffoldingPath,
coverPath,
coverImagePath,
stagingPath,
contentPath,
chaptersPath,
Expand Down
2 changes: 1 addition & 1 deletion scaffolding/META-INF/container.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
Expand Down
23 changes: 23 additions & 0 deletions scaffolding/OEBPS/cover.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en">
<head>
<meta charset="utf-8"/>
<title>Cover</title>
<style>
body {
text-align: center;
margin: 0;
padding: 0;
}
img {
max-width: 100%;
height: 100%;
margin: 0 auto;
}
</style>
</head>
<body epub:type="cover">
<img src="cover.jpg" alt="" role="doc-cover"/>
</body>
</html>

0 comments on commit fba981f

Please sign in to comment.