summaryrefslogtreecommitdiff
path: root/epub.py
diff options
context:
space:
mode:
authorBen Connors <benconnors@outlook.com>2019-09-01 16:45:43 -0400
committerBen Connors <benconnors@outlook.com>2019-09-01 16:45:43 -0400
commit4eb082506ba9943e63c7a4ac565e98586bc180e6 (patch)
treebbff71d87077102702e5c3e0e0901f33ec4c1f17 /epub.py
Initial commit
Diffstat (limited to 'epub.py')
-rw-r--r--epub.py279
1 files changed, 279 insertions, 0 deletions
diff --git a/epub.py b/epub.py
new file mode 100644
index 0000000..971018d
--- /dev/null
+++ b/epub.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+
+"""Module for generating EPUB v3.0 content."""
+
+import datetime as dt
+import hashlib
+import os
+import uuid
+import zipfile
+import mimetypes
+
+from lxml import etree
+
## Non-breaking space character (U+00A0).
NBSP = "\u00A0"

## Namespace prefix -> URI map used for the OPF package document.
OPF_NS = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "opf": "http://www.idpf.org/2007/opf",
}

## Clark-notation ("{uri}") prefixes for building namespaced tag/attribute names.
OPF = "{" + OPF_NS["opf"] + "}"
DC = "{" + OPF_NS["dc"] + "}"
EPUB = "{http://www.idpf.org/2007/ops}"
+
def hash_file(fname):
    """Compute the SHA-256 digest of a file's contents.

    The file at `fname` is opened in binary mode and consumed in 1 MiB chunks,
    so the whole file is never held in memory at once.

    Returns the digest as a lowercase hexadecimal string.
    """
    chunk_size = 1048576
    digest = hashlib.sha256()
    with open(fname, "rb") as stream:
        ## iter() with a sentinel stops at the first empty read (end of file)
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()
+
class Page:
    """A single page of an EPUB.

    A "page" here is a unit of division of the book (most often a whole chapter),
    not a printed page: what is actually displayed per screen varies by device
    and screen size.

    `content` describes the body of the page in an S-expression-like form. It is
    a list whose entries are either plain strings or 2-tuples:

        (tag, data)

    `tag` is either a tag name (a string) or a 2-tuple:

        (tag_name, attrib)

    with `tag_name` a string and `attrib` a dictionary of attributes.

    `data` is one of:

     - a string, which becomes the text of the tag;
     - another (tag, data) pair, which is nested inside the tag;
     - ("", children), where the empty tag marks `children` as a list of
       entries (in this same format) to be added under the current tag.

    An entry that is a bare string is wrapped in a `p` element.

    For example, to create the following:

        <h1>A List</h1>
        <div class="container">
            <ol>
                <li>Element 1</li>
                <li>Element 2</li>
            </ol>
        </div>

    the `content` would be:

        [
            ("h1", "A List"),
            (
                ("div", {"class": "container"}),
                (
                    "ol",
                    (
                        "",
                        (
                            ("li", "Element 1"),
                            ("li", "Element 2"),
                        )
                    )
                ),
            )
        ]
    """

    ## Values for the `type_` constructor argument / `type` attribute.
    TITLE = "title"
    TOC = "toc"
    CONTENT = "content"

    def __init__(self, title, content, in_toc=True, type_="content", fname=None):
        """Store the page's title, content and bookkeeping attributes.

        `in_toc`, `type_` and `fname` are stored as `in_toc`, `type` and
        `fname`; none of them is interpreted by this class itself.
        """
        self.title = title
        self.content = content
        self.type = type_
        self.in_toc = in_toc
        self.fname = fname

    @staticmethod
    def _generate_inner(root, content):
        """Append the elements described by `content` under `root`.

        Returns a mapping from zip path to filesystem path for every image
        file referenced through an `img` tag's `src` attribute.

        Raises ValueError when a tag other than `img` has a `src` or `href`
        attribute that names an existing file.
        """
        assets = {}

        for entry in content:
            ## Peel off the chain of nested tags until we hit either the text
            ## (a string) or a child list (announced by an empty tag).
            chain = []
            payload = entry
            while not isinstance(payload, str):
                head, payload = payload
                if not head:
                    break
                chain.append(head)
            ## Untagged entries become paragraphs
            chain = chain or ["p"]

            node = root
            for spec in chain:
                if isinstance(spec, str):
                    name, attrs = spec, {}
                else:
                    name, attrs = spec

                if name == "img" and "src" in attrs:
                    ## Store the image under a name derived from its hash and
                    ## point the tag at the stored copy instead
                    source = attrs["src"]
                    suffix = os.path.splitext(source)[1]
                    stored = hash_file(source)
                    if suffix:
                        stored += suffix
                    ## Work on a copy so the caller's dictionary stays untouched
                    attrs = attrs.copy()
                    assets["OEBPS/Static/"+stored] = source
                    attrs["src"] = "../Static/"+stored
                else:
                    local_src = "src" in attrs and os.path.isfile(attrs["src"])
                    if local_src or ("href" in attrs and os.path.isfile(attrs["href"])):
                        raise ValueError("Unknown tag %s for href/src" % name)

                node = etree.SubElement(node, name, attrs)

            if isinstance(payload, str):
                node.text = payload
            else:
                assets.update(Page._generate_inner(node, payload))

        return assets

    def generate_xhtml(self):
        """Build the XHTML tree for this page.

        Returns a 3-tuple:

            (root, kwargs, static)

        `root` is the root `html` Element, `kwargs` holds the extra keyword
        arguments to use when serializing the tree to a string, and `static`
        maps zip file paths to filesystem paths of the static files the page
        needs. Zip paths are derived from the hash of each file, so collisions
        need not be handled.
        """
        namespaces = {None: "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops"}
        html = etree.Element("html", {}, namespaces)

        head = etree.SubElement(html, "head")
        title = etree.SubElement(head, "title")
        title.text = self.title
        stylesheets = (
            ("../Styles/stylesheet.css", "text/css"),
            ("../Styles/page-template.xpgt", "application/vnd.adobe-page-template+xml"),
        )
        for href, mime in stylesheets:
            etree.SubElement(head, "link", {"href": href, "rel": "stylesheet", "type": mime})

        body = etree.SubElement(html, "body")
        wrapper = etree.SubElement(body, "div")
        static = self._generate_inner(wrapper, self.content)

        tostring_kwargs = {"doctype": '<!DOCTYPE html>', "standalone": False}
        return (html, tostring_kwargs, static)
+
class BasicTOCPage(Page):
    """Page that renders a table of contents.

    The page consists of an `h1` heading followed by a `nav` element (marked
    with the EPUB `type="toc"` attribute) that wraps an ordered list with one
    linked entry per page added through `add_page`.
    """

    def __init__(self, in_toc=True, fname="toc.xhtml"):
        """Create an empty table of contents stored under `fname`."""
        super().__init__("Table of Contents", [("h1", "Table of Contents")], in_toc, type_=Page.TOC, fname=fname)
        ## Number given to the next automatically numbered chapter
        self.chapter_count = 1
        ## The content tree holds a reference to this very list, so it must
        ## only ever be mutated in place, never rebound.
        self.toc_ol = []
        self.content.append((("nav", {EPUB+"type": "toc"}), ("ol", ("", self.toc_ol))))

    def add_page(self, p: Page, fname):
        """Add a page to the TOC.

        `fname` is used as the link target. Pages whose `in_toc` is false are
        ignored. Content pages without an explicit `fname` are listed as
        numbered chapters ("<n>. <title>"); every other page is listed by its
        title alone.
        """
        if not p.in_toc:
            return

        if p.type != Page.CONTENT or p.fname is not None:
            label = p.title
        else:
            label = "{num}. {header}".format(num=self.chapter_count, header=p.title)
            self.chapter_count += 1
        self.toc_ol.append(("li", (("a", {"href": fname}), label)))

    def clear(self):
        """Clear the current table of contents.

        Also restarts chapter numbering, so rebuilding the TOC from scratch
        numbers the chapters from 1 again instead of continuing from the
        previous build.
        """
        ## Empty in place: self.content refers to this same list object
        del self.toc_ol[:]
        self.chapter_count = 1
+
class BasicTitlePage(Page):
    """Minimal title page: the book title as a heading with an author byline."""

    def __init__(self, book_title, author, in_toc=True, fname="title.xhtml"):
        """Build the title page content for `book_title` and `author`."""
        byline = "By "+author
        ## The bare non-breaking-space entries sit before each heading
        layout = [
            NBSP,
            ("h1", book_title),
            NBSP,
            ("h2", byline),
        ]
        super().__init__("Title Page", layout, in_toc=in_toc, type_=Page.TITLE, fname=fname)
+
class Book:
    """Class representing an EPUB v3.0 container.

    A book holds a title, an author, an optional cover image path and an
    ordered list of pages (`self.pages`), which initially contains the title
    page followed by the table of contents.
    """

    ## Directory searched for page-template.xpgt and stylesheet.css when
    ## generate_epub() is not given an explicit `static_dir`.
    DEFAULT_STATIC_DIR = "/home/ben/Workspace/epub/static"

    def __init__(self, title, author, toc_class=BasicTOCPage, title_class=BasicTitlePage, cover=None):
        """Create a book containing only a title page and a table of contents.

        `title_class` is called as `title_class(title, author)` and `toc_class`
        as `toc_class(in_toc=False)`. `cover`, when given, is the filesystem
        path of the cover image.
        """
        self.title = title
        self.author = author

        self.title_page = title_class(title, author)
        self.toc = toc_class(in_toc=False)
        self.cover = cover

        self.pages = [self.title_page, self.toc]

    @staticmethod
    def _media_type(path):
        """Guess the media type for `path`, with a generic fallback.

        mimetypes.guess_type() returns a (type, encoding) tuple whose type may
        be None; an XML attribute value has to be a string.
        """
        return mimetypes.guess_type(path)[0] or "application/octet-stream"

    def generate_epub(self, target="out.epub", static_dir=None):
        """Generate the EPUB.

        `target` is the path of the archive to write. `static_dir` is the
        directory holding `page-template.xpgt` and `stylesheet.css`; it
        defaults to `DEFAULT_STATIC_DIR`.

        Returns the `zipfile.ZipFile` that was written. The archive is returned
        still open: the caller has to close it to finish writing the file. If
        writing fails, the archive is closed before the exception propagates.
        """
        if static_dir is None:
            static_dir = self.DEFAULT_STATIC_DIR

        ## zip path -> filesystem path of every static file in the archive
        static = {
            "OEBPS/Styles/page-template.xpgt": os.path.join(static_dir, "page-template.xpgt"),
            "OEBPS/Styles/stylesheet.css": os.path.join(static_dir, "stylesheet.css"),
        }
        ## zip path -> (root element, extra tostring() keyword arguments)
        xmlmap = {}

        ## Generate the container file
        container = etree.Element("container", {"version": "1.0"}, {None: "urn:oasis:names:tc:opendocument:xmlns:container"})
        xmlmap["META-INF/container.xml"] = (container, {})
        rf = etree.SubElement(container, "rootfiles")
        etree.SubElement(rf, "rootfile", {"full-path": "OEBPS/content.opf", "media-type": "application/oebps-package+xml"})

        ## Generate the content file
        content = etree.Element("package", {"unique-identifier" : "BookID", "version": "3.0"}, {None: OPF_NS["opf"]})
        xmlmap["OEBPS/content.opf"] = (content, {})

        ## Format explicitly: slicing isoformat() to drop the microseconds
        ## breaks whenever the microsecond field happens to be zero, because
        ## isoformat() then omits the fraction altogether.
        modified = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        metad = etree.SubElement(content, "metadata", {}, OPF_NS)
        etree.SubElement(metad, DC+"title").text = self.title
        etree.SubElement(metad, DC+"rights").text = "Public Domain"
        etree.SubElement(metad, DC+"language").text = "en-US"
        etree.SubElement(metad, DC+"creator", {"id": "author"}).text = self.author
        etree.SubElement(metad, "meta", {"refines": "#author", "property": "role", "scheme": "marc:relators", "id": "role"}).text = "aut"
        etree.SubElement(metad, "meta", {"property": "dcterms:modified"}).text = modified
        ## uuid3 is deterministic, so the same title/author always yields the same ID
        etree.SubElement(metad, DC+"identifier", {"id": "BookID"}).text = str(uuid.uuid3(uuid.NAMESPACE_OID, self.title+'|'+self.author))

        manif = etree.SubElement(content, "manifest")
        etree.SubElement(manif, "item", {"id": "page-template.xpgt", "href": "Styles/page-template.xpgt", "media-type": "application/vnd.adobe-page-template+xml"})
        etree.SubElement(manif, "item", {"id": "stylesheet.css", "href": "Styles/stylesheet.css", "media-type": "text/css"})

        if self.cover is not None:
            cover_path = "Static/cover" + os.path.splitext(self.cover)[1]
            etree.SubElement(manif, "item", {"id": "cover", "href": cover_path, "media-type": self._media_type(cover_path), "properties": "cover-image"})
            static["OEBPS/"+cover_path] = self.cover

        spine = etree.SubElement(content, "spine")

        ## Generate pages
        self.toc.clear()
        form = "page%06d.xhtml"
        for n, p in enumerate(self.pages, start=1):
            manif_attr = {}

            if p.type == Page.TOC:
                manif_attr["properties"] = "nav"

            if p.fname is None:
                fname = form % n
            else:
                fname = p.fname

            ## Add it to the manifest
            etree.SubElement(manif, "item", {"id": fname, "href": "Text/"+fname, "media-type": "application/xhtml+xml", **manif_attr})
            ## Add it to the spine
            etree.SubElement(spine, "itemref", {"idref": fname})
            ## Add it to the TOC
            self.toc.add_page(p, fname)
            ## Generate the page
            root, kwargs, page_static = p.generate_xhtml()
            xmlmap["OEBPS/Text/"+fname] = (root, kwargs)
            for zpath, path in page_static.items():
                ## A file shared by several pages must be listed in the
                ## manifest and stored in the archive only once
                if zpath in static:
                    continue
                static[zpath] = path
                href = zpath[6:] if zpath.startswith("OEBPS/") else zpath
                etree.SubElement(manif, "item", {"id": href.replace('/', '-'), "href": href, "media-type": self._media_type(href)})

        ## Regenerate the TOC: it was rendered inside the loop before the
        ## pages following it had been added
        xmlmap["OEBPS/Text/"+self.toc.fname] = self.toc.generate_xhtml()[:2]

        epub = zipfile.ZipFile(target, 'w', zipfile.ZIP_DEFLATED)
        try:
            ## The mimetype entry goes first and is stored uncompressed
            epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
            for zpath, path in static.items():
                epub.write(path, zpath)
            for zpath, (root, kwargs) in xmlmap.items():
                epub.writestr(zpath, etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True, **kwargs))
        except Exception:
            ## Do not leak the file handle when writing fails part-way
            epub.close()
            raise

        return epub