diff options
author | Ben Connors <benconnors@outlook.com> | 2019-09-01 16:45:43 -0400 |
---|---|---|
committer | Ben Connors <benconnors@outlook.com> | 2019-09-01 16:45:43 -0400 |
commit | 4eb082506ba9943e63c7a4ac565e98586bc180e6 (patch) | |
tree | bbff71d87077102702e5c3e0e0901f33ec4c1f17 /epub.py |
Initial commit
Diffstat (limited to 'epub.py')
-rw-r--r-- | epub.py | 279 |
1 files changed, 279 insertions, 0 deletions
#!/usr/bin/env python3

"""Module for generating EPUB v3.0 content."""

import datetime as dt
import hashlib
import os
import uuid
import zipfile
import mimetypes

from lxml import etree

NBSP = "\u00A0"
OPF_NS = {"dc": "http://purl.org/dc/elements/1.1/",
          "opf": "http://www.idpf.org/2007/opf"}
OPF = "{%s}" % OPF_NS["opf"]
DC = "{%s}" % OPF_NS["dc"]
EPUB = "{http://www.idpf.org/2007/ops}"

## Directory holding the stylesheet and page template bundled into every book.
## NOTE(review): this is a developer-machine path; pass `static_dir` to `Book`
## to use a different location.
DEFAULT_STATIC_DIR = "/home/ben/Workspace/epub/static"

## Number of bytes read per iteration when hashing a file
_HASH_CHUNK_SIZE = 1048576


def hash_file(fname):
    """Return the hex-encoded SHA-256 hash of the file at path `fname`.

    The file is read in fixed-size chunks so it is never loaded into memory whole.
    """
    sha256 = hashlib.sha256()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b""):
            sha256.update(chunk)
    return sha256.hexdigest()


def _guess_media_type(path):
    """Return the MIME type guessed from the extension of `path`.

    `mimetypes.guess_type` returns a `(type, encoding)` tuple whose type is None for
    unknown extensions; neither a tuple nor None is a usable XML attribute value, so a
    generic binary type is substituted when no guess is available.
    """
    return mimetypes.guess_type(path)[0] or "application/octet-stream"


class Page:
    """Class for creating a single page in an EPUB.

    Note that this is not the conventional, printed concept of a page: a single page should
    be the unit of division of the book, most often an entire chapter (as the actual
    displayed page varies by device and screen size).

    The `content` parameter holds all of the content for the page, stored similarly to
    S-exps. It must be a list whose items are either plain strings (each is wrapped in its
    own `<p>` element) or 2-tuples:

        (tag, data)

    The `tag` element may be a string tag name or a 2-tuple of the form:

        (tag_name, attrib)

    Where `attrib` is a dictionary containing the attributes of the tag and `tag_name` is a
    string.

    If `tag` is the empty string, `data` is interpreted as a list of elements to be added
    to the current element, in the same format as the `content` list. Otherwise, `data` is
    either a string to be stored as the tag's text or another `(tag, data)` tuple, which
    is nested inside the tag.

    For example, to create the following:

        <h1>A List</h1>
        <div class="container">
            <ol>
                <li>Element 1</li>
                <li>Element 2</li>
            </ol>
        </div>

    The `content` would be:

        [
            ("h1", "A List"),
            (
                ("div", {"class": "container"}),
                (
                    "ol",
                    (
                        "",
                        (
                            ("li", "Element 1"),
                            ("li", "Element 2"),
                        )
                    )
                ),
            )
        ]
    """
    TITLE = "title"
    TOC = "toc"
    CONTENT = "content"

    def __init__(self, title, content, in_toc=True, type_="content", fname=None):
        """Create a page.

        `title` is used as the XHTML <title>, `content` is the element list described in
        the class docstring, `in_toc` controls whether a TOC page lists this page, `type_`
        is one of the `TITLE`/`TOC`/`CONTENT` constants and `fname` is the file name of
        the page inside the book (None lets the book pick a numbered name).
        """
        self.content = content
        self.title = title
        self.in_toc = in_toc
        self.type = type_
        self.fname = fname

    @staticmethod
    def _generate_inner(root, content):
        """Append the elements described by `content` as children of `root`.

        Returns a mapping from zip file path to filesystem path for every image that the
        generated elements reference. Raises ValueError if a tag other than `img` has a
        `src` or `href` attribute that names an existing local file.
        """
        static = {}

        for elem in content:
            ## Peel off the chain of nested tags until reaching either the text (a
            ## string) or a child list (marked by an empty tag)
            tstack = []
            while not isinstance(elem, str):
                tag, elem = elem
                if not tag:
                    break
                tstack.append(tag)
            if not tstack:
                ## Untagged items are wrapped in a paragraph
                tstack.append("p")

            tag = root
            for t in tstack:
                if not isinstance(t, str):
                    t, attrib = t
                else:
                    attrib = {}

                if t == "img" and "src" in attrib: ## Store the src
                    ext = os.path.splitext(attrib["src"])[1]
                    h = hash_file(attrib["src"])
                    if ext:
                        h += ext
                    ## Copy so that the caller's dictionary is left untouched
                    attrib = attrib.copy()
                    static["OEBPS/Static/"+h] = attrib["src"]
                    attrib["src"] = "../Static/"+h
                elif ("src" in attrib and os.path.isfile(attrib["src"])) or ("href" in attrib and os.path.isfile(attrib["href"])):
                    raise ValueError("Unknown tag %s for href/src" % t)

                tag = etree.SubElement(tag, t, attrib)

            if not isinstance(elem, str):
                static.update(Page._generate_inner(tag, elem))
            else:
                tag.text = elem

        return static

    def generate_xhtml(self):
        """Generate the XHTML representation of this page.

        Returns a 3-tuple:

            (root, kwargs, static)

        Where `root` is the root Element for the page, `kwargs` are the keyword arguments
        to be used when converting this page to a string, and `static` is a mapping from
        zip file path to filesystem path for storing static files required by this page.
        The file's zip path is determined from its hash, so collisions need not be handled.
        """
        chap = etree.Element("html", {}, {None: "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops"})

        head = etree.SubElement(chap, "head")
        etree.SubElement(head, "title").text = self.title
        etree.SubElement(head, "link", {"href": "../Styles/stylesheet.css", "rel": "stylesheet", "type": "text/css"})
        etree.SubElement(head, "link", {"href": "../Styles/page-template.xpgt", "rel": "stylesheet", "type": "application/vnd.adobe-page-template+xml"})

        div = etree.SubElement(etree.SubElement(chap, "body"), "div")
        static = self._generate_inner(div, self.content)

        return (chap, {"doctype": '<!DOCTYPE html>', "standalone": False}, static)


class BasicTOCPage(Page):
    """Page for generating a TOC."""

    def __init__(self, in_toc=True, fname="toc.xhtml"):
        super().__init__("Table of Contents", [("h1", "Table of Contents")], in_toc, type_=Page.TOC, fname=fname)
        self.chapter_count = 1
        ## This list object is referenced from `self.content`, so it must only ever be
        ## mutated in place, never rebound
        self.toc_ol = []
        self.content.append((("nav", {EPUB+"type": "toc"}), ("ol", ("", self.toc_ol))))

    def add_page(self, p: Page, fname):
        """Add a page to the TOC.

        Pages with `in_toc` unset are skipped. Content pages without an explicit file
        name are listed with a running chapter number; all other pages are listed by
        title only.
        """
        if not p.in_toc:
            return

        if p.type != Page.CONTENT or p.fname is not None:
            label = p.title
        else:
            label = "{num}. {header}".format(num=self.chapter_count, header=p.title)
            self.chapter_count += 1
        self.toc_ol.append(("li", (("a", {"href": fname}), label)))

    def clear(self):
        """Clear the current table of contents and restart the chapter numbering."""
        del self.toc_ol[:]
        ## Without this, generating the same book twice would continue the numbering
        ## from where the previous run stopped
        self.chapter_count = 1


class BasicTitlePage(Page):
    """Basic title page that shows the title and the author."""

    def __init__(self, book_title, author, in_toc=True, fname="title.xhtml"):
        super().__init__("Title Page", [NBSP, ("h1", book_title), NBSP, ("h2", "By "+author)], in_toc=in_toc, type_=Page.TITLE, fname=fname)


class Book:
    """Class representing an EPUB v3.0 container.

    Pages are kept in the `pages` list, which initially holds the title page and the
    table of contents; append further `Page` objects to it before generating.
    """

    def __init__(self, title, author, toc_class=BasicTOCPage, title_class=BasicTitlePage, cover=None, static_dir=DEFAULT_STATIC_DIR):
        """Create a book.

        `toc_class` and `title_class` are the classes instantiated for the table of
        contents and the title page, `cover` is the filesystem path of an optional cover
        image and `static_dir` is the directory containing `stylesheet.css` and
        `page-template.xpgt`.
        """
        self.title = title
        self.author = author

        self.title_page = title_class(title, author)
        self.toc = toc_class(in_toc=False)
        self.cover = cover
        self.static_dir = static_dir

        self.pages = [self.title_page, self.toc]

    def generate_epub(self, target="out.epub"):
        """Generate the EPUB and write it to `target`.

        Returns the `zipfile.ZipFile` that was written. The archive is closed before
        returning so that the file on disk is complete.
        """
        ## Mapping from zip path to filesystem path of every file copied verbatim
        static = {"OEBPS/Styles/page-template.xpgt": os.path.join(self.static_dir, "page-template.xpgt"),
                  "OEBPS/Styles/stylesheet.css": os.path.join(self.static_dir, "stylesheet.css")}
        ## Mapping from zip path to the (root, kwargs, static) tuple of each XML document
        xmlmap = {}

        ## Generate the container file
        container = etree.Element("container", {"version": "1.0"}, {None: "urn:oasis:names:tc:opendocument:xmlns:container"})
        xmlmap["META-INF/container.xml"] = (container, {}, {})
        rf = etree.SubElement(container, "rootfiles")
        etree.SubElement(rf, "rootfile", {"full-path": "OEBPS/content.opf", "media-type": "application/oebps-package+xml"})

        ## Generate the content file
        content = etree.Element("package", {"unique-identifier": "BookID", "version": "3.0"}, {None: OPF_NS["opf"]})
        xmlmap["OEBPS/content.opf"] = (content, {}, {})

        metad = etree.SubElement(content, "metadata", {}, OPF_NS)
        etree.SubElement(metad, DC+"title").text = self.title
        etree.SubElement(metad, DC+"rights").text = "Public Domain"
        etree.SubElement(metad, DC+"language").text = "en-US"
        etree.SubElement(metad, DC+"creator", {"id": "author"}).text = self.author
        etree.SubElement(metad, "meta", {"refines": "#author", "property": "role", "scheme": "marc:relators", "id": "role"}).text = "aut"
        ## Format explicitly: slicing the microseconds off isoformat() cuts into the
        ## seconds whenever the microsecond field happens to be zero
        modified = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        etree.SubElement(metad, "meta", {"property": "dcterms:modified"}).text = modified
        etree.SubElement(metad, DC+"identifier", {"id": "BookID"}).text = str(uuid.uuid3(uuid.NAMESPACE_OID, self.title+'|'+self.author))

        manif = etree.SubElement(content, "manifest")
        etree.SubElement(manif, "item", {"id": "page-template.xpgt", "href": "Styles/page-template.xpgt", "media-type": "application/vnd.adobe-page-template+xml"})
        etree.SubElement(manif, "item", {"id": "stylesheet.css", "href": "Styles/stylesheet.css", "media-type": "text/css"})

        if self.cover is not None:
            cover_path = "Static/cover" + os.path.splitext(self.cover)[1]
            etree.SubElement(manif, "item", {"id": "cover", "href": cover_path, "media-type": _guess_media_type(cover_path), "properties": "cover-image"})
            static["OEBPS/"+cover_path] = self.cover

        spine = etree.SubElement(content, "spine")

        ## Generate pages
        self.toc.clear()
        form = "page%06d.xhtml"
        ## Every page consumes a number, including those with an explicit file name
        for n, p in enumerate(self.pages, 1):
            manif_attr = {}

            if p.type == Page.TOC:
                manif_attr["properties"] = "nav"

            fname = form % n if p.fname is None else p.fname

            ## Add it to the manifest
            etree.SubElement(manif, "item", {"id": fname, "href": "Text/"+fname, "media-type": "application/xhtml+xml", **manif_attr})
            ## Add it to the spine
            etree.SubElement(spine, "itemref", {"idref": fname})
            ## Add it to the TOC
            self.toc.add_page(p, fname)
            ## Generate the page
            xml = p.generate_xhtml()
            xmlmap["OEBPS/Text/"+fname] = xml
            for zpath, path in xml[2].items():
                ## A file shared by several pages must be listed and stored only once:
                ## duplicate manifest ids and duplicate zip entries are both invalid
                if zpath in static:
                    continue
                static[zpath] = path
                href = zpath[6:] if zpath.startswith("OEBPS/") else zpath
                etree.SubElement(manif, "item", {"id": href.replace('/', '-'), "href": href, "media-type": _guess_media_type(href)})

        ## Regenerate the TOC, as it was first generated before the later pages were added
        toc_xml = self.toc.generate_xhtml()
        xmlmap["OEBPS/Text/"+self.toc.fname] = toc_xml
        for zpath, path in toc_xml[2].items():
            static.setdefault(zpath, path)

        with zipfile.ZipFile(target, 'w', zipfile.ZIP_DEFLATED) as epub:
            ## The mimetype entry must come first and be stored uncompressed
            epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
            for zpath, path in static.items():
                epub.write(path, zpath)
            for zpath, (root, kwargs, _) in xmlmap.items():
                epub.writestr(zpath, etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True, **kwargs))

        return epub