Initial commit

author: Ben Connors <benconnors@outlook.com> 2019-09-01 16:45:43 -0400
committer: Ben Connors <benconnors@outlook.com> 2019-09-01 16:45:43 -0400
commit: 4eb082506ba9943e63c7a4ac565e98586bc180e6 (patch)
tree: bbff71d87077102702e5c3e0e0901f33ec4c1f17 /epub.py
1 files changed, 279 insertions, 0 deletions
diff --git a/epub.py b/epub.py
new file mode 100644
index 0000000..971018d
--- /dev/null
+++ b/epub.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+
+"""Module for generating EPUB v3.0 content."""
+
+import datetime as dt
+import hashlib
+import os
+import uuid
+import zipfile
+import mimetypes
+
+from lxml import etree
+
+## Non-breaking space; used as the text of spacer paragraphs.
+NBSP = "\u00A0"
+## Prefix -> namespace URI map for the OPF package document.
+OPF_NS = {
+    "dc": "http://purl.org/dc/elements/1.1/",
+    "opf": "http://www.idpf.org/2007/opf",
+}
+## Namespace prefixes in "{uri}" form, prepended to a local name to qualify it.
+OPF = "{" + OPF_NS["opf"] + "}"
+DC = "{" + OPF_NS["dc"] + "}"
+EPUB = "{http://www.idpf.org/2007/ops}"
+
+def hash_file(fname):
+    """Compute the SHA-256 digest of the file at `fname`.
+
+    The file is opened in binary mode and consumed in 1 MiB chunks, so it is never
+    held in memory all at once. The digest is returned as a hexadecimal string.
+    """
+    digest = hashlib.sha256()
+    with open(fname, "rb") as stream:
+        ## iter() with a sentinel stops at the first empty read, i.e. end of file
+        for chunk in iter(lambda: stream.read(1048576), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+class Page:
+    """One page of an EPUB.
+
+    A page here is a unit of division of the book (typically a whole chapter), not a
+    printed page: what is actually displayed at once depends on the device and its
+    screen size.
+
+    `content` describes the page body in an S-expression-like form. It is a sequence
+    whose items are each either:
+
+      * a plain string, which becomes the text of a `<p>` element, or
+      * a 2-tuple `(tag, data)`.
+
+    `tag` is either a tag name (a string) or a 2-tuple `(tag_name, attrib)` where
+    `attrib` is a dictionary of attributes for the element.
+
+    `data` is one of:
+
+      * a string, stored as the element's text;
+      * another `(tag, data)` pair, which is nested directly inside the element;
+      * when paired with the empty tag `""`: a sequence of children, in the same
+        format as `content`, which are added to the enclosing element.
+
+    An `img` element with a `src` attribute has the referenced file registered as a
+    static resource of the page, and its `src` is rewritten to point at the stored copy.
+
+
+    For example, this markup:
+
+        <h1>A List</h1>
+        <div class="container">
+          <ol>
+            <li>Element 1</li>
+            <li>Element 2</li>
+          </ol>
+        </div>
+
+    is described by this `content`:
+
+        [
+            ("h1", "A List"),
+            (
+                ("div", {"class": "container"}),
+                (
+                    "ol",
+                    (
+                        "",
+                        (
+                            ("li", "Element 1"),
+                            ("li", "Element 2"),
+                        )
+                    )
+                ),
+            )
+        ]
+    """
+    TITLE = "title"
+    TOC = "toc"
+    CONTENT = "content"
+
+    def __init__(self, title, content, in_toc=True, type_="content", fname=None):
+        self.title = title
+        self.content = content
+        self.type = type_
+        self.in_toc = in_toc
+        self.fname = fname
+
+    @staticmethod
+    def _generate_inner(root, content):
+        """Append the elements described by `content` to `root`.
+
+        Returns a mapping from zip path to filesystem path for every image file
+        referenced by the generated elements (including nested ones).
+        """
+        resources = {}
+
+        for node in content:
+            ## Unwrap the chain of nested (tag, data) pairs until reaching either the
+            ## text (a string) or a child list (introduced by an empty tag)
+            chain = []
+            while not isinstance(node, str):
+                head, node = node
+                if not head:
+                    break
+                chain.append(head)
+            if not chain:
+                chain = ["p"]
+
+            ## Build one element per entry of the chain, each inside the previous one
+            parent = root
+            for spec in chain:
+                if isinstance(spec, str):
+                    name, attrib = spec, {}
+                else:
+                    name, attrib = spec
+
+                if name == "img" and "src" in attrib: ## Store the src
+                    source = attrib["src"]
+                    suffix = os.path.splitext(source)[1]
+                    stored = hash_file(source) + suffix
+                    ## Work on a copy so the caller's attribute dict is left alone
+                    attrib = attrib.copy()
+                    resources["OEBPS/Static/"+stored] = source
+                    attrib["src"] = "../Static/"+stored
+                else:
+                    ## Local files are only supported as the src of an img
+                    for key in ("src", "href"):
+                        if key in attrib and os.path.isfile(attrib[key]):
+                            raise ValueError("Unknown tag %s for href/src" % name)
+
+                parent = etree.SubElement(parent, name, attrib)
+
+            if isinstance(node, str):
+                parent.text = node
+            else:
+                resources.update(Page._generate_inner(parent, node))
+
+        return resources
+
+    def generate_xhtml(self):
+        """Build the XHTML document for this page.
+
+        Returns a 3-tuple:
+
+            (root, kwargs, static)
+
+        `root` is the root element of the document, `kwargs` holds the keyword
+        arguments to pass along when serializing it to a string, and `static` maps zip
+        file paths to filesystem paths for the static files this page needs. Zip paths
+        are derived from the hash of the file's content.
+        """
+        nsmap = {None: "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops"}
+        html = etree.Element("html", {}, nsmap)
+
+        head = etree.SubElement(html, "head")
+        title = etree.SubElement(head, "title")
+        title.text = self.title
+        stylesheets = (
+            ("../Styles/stylesheet.css", "text/css"),
+            ("../Styles/page-template.xpgt", "application/vnd.adobe-page-template+xml"),
+        )
+        for href, mime in stylesheets:
+            etree.SubElement(head, "link", {"href": href, "rel": "stylesheet", "type": mime})
+
+        body = etree.SubElement(html, "body")
+        wrapper = etree.SubElement(body, "div")
+        resources = self._generate_inner(wrapper, self.content)
+
+        serialize_kwargs = {"doctype": '<!DOCTYPE html>', "standalone": False}
+        return (html, serialize_kwargs, resources)
+
+class BasicTOCPage(Page):
+    """Page holding the table of contents.
+
+    The page content is an `<h1>` heading followed by a `<nav epub:type="toc">` element
+    wrapping an ordered list. The list items live in `toc_ol`, which is shared with
+    `content`, so entries added or removed through this class show up the next time
+    the page is generated.
+    """
+    def __init__(self, in_toc=True, fname="toc.xhtml"):
+        super().__init__("Table of Contents", [("h1", "Table of Contents")], in_toc, type_=Page.TOC, fname=fname)
+        ## Number given to the next automatically numbered chapter
+        self.chapter_count = 1
+        self.toc_ol = []
+        self.content.append((("nav", {EPUB+"type": "toc"}), ("ol", ("", self.toc_ol))))
+
+    def add_page(self, p: Page, fname):
+        """Add a page to the TOC.
+
+        `p` is the page to list and `fname` is the file name its entry links to. Pages
+        whose `in_toc` is false are skipped. Content pages without an explicit file
+        name are labelled "<n>. <title>" using a running chapter number; every other
+        page is labelled with its title alone.
+        """
+        if not p.in_toc:
+            return
+
+        if p.type != Page.CONTENT or p.fname is not None:
+            label = p.title
+        else:
+            label = "{num}. {header}".format(num=self.chapter_count, header=p.title)
+            self.chapter_count += 1
+        self.toc_ol.append(("li", (("a", {"href": fname}), label)))
+
+    def clear(self):
+        """Clear the current table of contents and restart chapter numbering at 1."""
+        ## Empty the list in place: `content` holds a reference to this same list
+        del self.toc_ol[:]
+        ## Without this, rebuilding the TOC would continue the previous numbering
+        self.chapter_count = 1
+
+class BasicTitlePage(Page):
+    """Minimal title page: the book title as a heading, then a "By <author>" line.
+
+    Each of the two headings is preceded by a paragraph containing only a
+    non-breaking space.
+    """
+    def __init__(self, book_title, author, in_toc=True, fname="title.xhtml"):
+        byline = "By "+author
+        body = [NBSP, ("h1", book_title), NBSP, ("h2", byline)]
+        super().__init__("Title Page", body, in_toc=in_toc, type_=Page.TITLE, fname=fname)
+
+class Book:
+    """Class representing an EPUB v3.0 container.
+
+    A book has a title, an author, an optional cover image and an ordered list of
+    pages (`pages`), which initially holds the title page followed by the table of
+    contents. `generate_epub` writes the pages of that list, in order.
+    """
+    ## Default directory containing `stylesheet.css` and `page-template.xpgt`; it can be
+    ## overridden per call with the `static_dir` argument of `generate_epub`
+    STATIC_DIR = "/home/ben/Workspace/epub/static"
+
+    def __init__(self, title, author, toc_class=BasicTOCPage, title_class=BasicTitlePage, cover=None):
+        self.title = title
+        self.author = author
+
+        self.title_page = title_class(title, author)
+        self.toc = toc_class(in_toc=False)
+        ## Filesystem path of the cover image, or None for no cover
+        self.cover = cover
+
+        self.pages = [self.title_page, self.toc]
+
+    def generate_epub(self, target="out.epub", static_dir=None):
+        """Generate the EPUB.
+
+        `target` is the path of the file to write. `static_dir` is the directory
+        containing `stylesheet.css` and `page-template.xpgt`; when None, `STATIC_DIR`
+        is used.
+
+        Returns the `zipfile.ZipFile` the book was written to, still open: the caller
+        must `close()` it to finish writing the archive. If writing fails, the archive
+        is closed here and the exception is re-raised.
+        """
+        if static_dir is None:
+            static_dir = self.STATIC_DIR
+        ## Maps zip path -> filesystem path for every non-XML file of the book
+        static = {"OEBPS/Styles/page-template.xpgt" : os.path.join(static_dir, "page-template.xpgt"),
+                  "OEBPS/Styles/stylesheet.css": os.path.join(static_dir, "stylesheet.css")}
+        ## Maps zip path -> (root, kwargs, static) for every XML file of the book
+        xmlmap = {}
+
+        ## Generate the container file
+        container = etree.Element("container", {"version": "1.0"}, {None: "urn:oasis:names:tc:opendocument:xmlns:container"})
+        xmlmap["META-INF/container.xml"] = (container, {}, {})
+        rf = etree.SubElement(container, "rootfiles")
+        etree.SubElement(rf, "rootfile", {"full-path": "OEBPS/content.opf", "media-type": "application/oebps-package+xml"})
+
+        ## Generate the content file
+        content = etree.Element("package", {"unique-identifier" : "BookID", "version": "3.0"}, {None: OPF_NS["opf"]})
+        xmlmap["OEBPS/content.opf"] = (content, {}, {})
+
+        metad = etree.SubElement(content, "metadata", {}, OPF_NS)
+        etree.SubElement(metad, DC+"title").text = self.title
+        etree.SubElement(metad, DC+"rights").text = "Public Domain"
+        etree.SubElement(metad, DC+"language").text = "en-US"
+        etree.SubElement(metad, DC+"creator", {"id": "author"}).text = self.author
+        etree.SubElement(metad, "meta", {"refines": "#author", "property": "role", "scheme": "marc:relators", "id": "role"}).text = "aut"
+        ## Format explicitly: slicing isoformat() breaks when microseconds are 0,
+        ## because the fractional part is then omitted from the string
+        modified = dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+        etree.SubElement(metad, "meta", {"property": "dcterms:modified"}).text = modified
+        ## The identifier is derived from title and author, so it is stable across runs
+        etree.SubElement(metad, DC+"identifier", {"id": "BookID"}).text = str(uuid.uuid3(uuid.NAMESPACE_OID, self.title+'|'+self.author))
+        manif = etree.SubElement(content, "manifest")
+        etree.SubElement(manif, "item", {"id": "page-template.xpgt", "href": "Styles/page-template.xpgt", "media-type": "application/vnd.adobe-page-template+xml"})
+        etree.SubElement(manif, "item", {"id": "stylesheet.css", "href": "Styles/stylesheet.css", "media-type": "text/css"})
+
+        def media_type_of(path):
+            ## guess_type returns (type, encoding); type is None for unknown extensions,
+            ## which cannot be used as an attribute value
+            return mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+        def add_static(resources):
+            ## Register page resources once each: a file shared by several pages must
+            ## not yield duplicate manifest ids or duplicate zip entries
+            for zpath, path in resources.items():
+                if zpath in static:
+                    continue
+                static[zpath] = path
+                href = zpath[6:] if zpath.startswith("OEBPS/") else zpath
+                etree.SubElement(manif, "item", {"id": href.replace('/', '-'), "href": href, "media-type": media_type_of(href)})
+
+        if self.cover is not None:
+            cover_path = "Static/cover" + os.path.splitext(self.cover)[1]
+            etree.SubElement(manif, "item", {"id": "cover", "href": cover_path, "media-type": media_type_of(cover_path), "properties": "cover-image"})
+            static["OEBPS/"+cover_path] = self.cover
+
+        spine = etree.SubElement(content, "spine")
+
+        ## Generate pages
+        self.toc.clear()
+        form = "page%06d.xhtml"
+        for n, p in enumerate(self.pages, 1):
+            manif_attr = {}
+
+            if p.type == Page.TOC:
+                manif_attr["properties"] = "nav"
+
+            fname = form % n if p.fname is None else p.fname
+
+            ## Add it to the manifest
+            etree.SubElement(manif, "item", {"id": fname, "href": "Text/"+fname, "media-type": "application/xhtml+xml", **manif_attr})
+            ## Add it to the spine
+            etree.SubElement(spine, "itemref", {"idref": fname})
+            ## Add it to the TOC
+            self.toc.add_page(p, fname)
+            ## Generate the page
+            xml = p.generate_xhtml()
+            xmlmap["OEBPS/Text/"+fname] = xml
+            add_static(xml[2])
+
+        ## Regenerate the TOC, now that every page has been added to it
+        toc_xml = self.toc.generate_xhtml()
+        xmlmap["OEBPS/Text/"+self.toc.fname] = toc_xml
+        add_static(toc_xml[2])
+
+        epub = zipfile.ZipFile(target, 'w', zipfile.ZIP_DEFLATED)
+        try:
+            ## The mimetype entry is written first and stored uncompressed
+            epub.writestr("mimetype", "application/epub+zip", zipfile.ZIP_STORED)
+            for zpath, path in static.items():
+                epub.write(path, zpath)
+            for zpath, (root, kwargs, _) in xmlmap.items():
+                epub.writestr(zpath, etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True, **kwargs))
+        except Exception:
+            ## Do not leak the file handle when writing fails part-way
+            epub.close()
+            raise
+
+        return epub
author	Ben Connors <benconnors@outlook.com>	2019-09-01 16:45:43 -0400
committer	Ben Connors <benconnors@outlook.com>	2019-09-01 16:45:43 -0400
commit	4eb082506ba9943e63c7a4ac565e98586bc180e6 (patch)
tree	bbff71d87077102702e5c3e0e0901f33ec4c1f17 /epub.py