Skip to content

Commit

Permalink
ebook: Check html syntax
Browse files Browse the repository at this point in the history
  • Loading branch information
entorb committed Feb 4, 2025
1 parent 529e9a8 commit ab5df2b
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 17 deletions.
22 changes: 15 additions & 7 deletions .github/workflows/check-and-ebook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,20 @@ jobs:
path: hash-chapters.txt
key: chapter-hash-for-ebook-${{ github.ref_name }}-${{ steps.calculate-hash.outputs.hash }}

- name: Preparations
run: ln -s python-requirements.txt requirements.txt

- name: Python set up
uses: actions/setup-python@v5
with:
python-version: "3.12"
# no packages besides pytest installed, so no caching needed
# cache: "pip"
cache: "pip"

- name: Python packages
run: pip install -r python-requirements.txt

- name: Run pytest unittests
run: |
pip install pytest
pytest
run: pytest

- name: Check chapters for known issues
run: python3 -O scripts/check_chapters.py
Expand All @@ -78,12 +81,17 @@ jobs:
persist-credentials: false
fetch-depth: 1 # 0 if you want to push to repo

- name: Preparations
run: ln -s python-requirements.txt requirements.txt

- name: Python set up
uses: actions/setup-python@v5
with:
python-version: "3.12"
# no packages installed, so no caching
# cache: "pip"
cache: "pip"

- name: Python packages
run: pip install -r python-requirements.txt

# - name: setup environment to DE lang
# run: |
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ ENV TZ=Europe/Berlin
# prevent keyboard input requests in apt install
ENV DEBIAN_FRONTEND=noninteractive

# install packages and cleanup afterwards
RUN apt-get update && apt-get dist-upgrade -y && \
apt-get install -y python3 git texlive-xetex texlive-lang-greek texlive-lang-german latexmk texlive-extra-utils pandoc calibre imagemagick ghostscript && \
apt-get install -y python3 python3-lxml git texlive-xetex texlive-lang-greek texlive-lang-german latexmk texlive-extra-utils pandoc calibre imagemagick ghostscript && \
apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/{apt,dpkg,cache,log}/

# set working directory
Expand Down
2 changes: 2 additions & 0 deletions python-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
lxml
pytest
38 changes: 29 additions & 9 deletions scripts/ebook/step_6.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import sys
from pathlib import Path

from lxml import etree # pip install lxml

sys.path.append(str(Path(__file__).resolve().parent.parent))
from check_chapters_settings import settings

Expand All @@ -19,6 +21,17 @@
target_file = Path("hpmor.html")


def check_html(cont: str) -> None:
    """
    Check that the given markup is well-formed.

    The content is parsed with lxml's strict XML parser (recover=False),
    so it has to be well-formed XML/XHTML; errors are not silently repaired.

    Args:
        cont: the complete document as text.

    On a syntax error the parser message is printed and the script
    terminates with exit code 1.
    """
    parser = etree.XMLParser(recover=False)  # Do not auto-fix errors
    # Hand bytes to lxml instead of str: for a str that carries an XML
    # encoding declaration (<?xml ... encoding="..."?>) lxml raises a
    # ValueError, which would escape the except clause below.
    data = cont.encode("utf-8")
    try:
        etree.fromstring(data, parser)  # noqa: S320
    except etree.XMLSyntaxError as e:
        print("HTML Error:", e)
        sys.exit(1)


def fix_ellipsis(s: str) -> str:
"""
Fix ellipsis spacing for ebooks.
Expand All @@ -41,6 +54,8 @@ def fix_ellipsis(s: str) -> str:
s = re.sub(r"…(?=<em>)", "… ", s)
# before opening EN-quotes: add space
# s = re.sub(r"…(?=[“])", "… ", s)
# NO: before opening DE-quotes: add space
# s = re.sub(r"…(?=[„])", "… ", s)
return s


Expand All @@ -49,6 +64,8 @@ def fix_ellipsis(s: str) -> str:

with source_file.open(encoding="utf-8", newline="\n") as fh_in:
cont = fh_in.read()
print("checking source html")
check_html(cont)

# remove strange leftovers from tex -> html conversion
cont = re.sub(
Expand Down Expand Up @@ -86,15 +103,6 @@ def fix_ellipsis(s: str) -> str:
# count=1,
# )

# remove trailing slashes to satisfy https://validator.w3.org
cont = cont.replace("<br />", "<br>")
cont = cont.replace("<hr />", "<hr>")
cont = re.sub(
r"(<meta [^>]*) />",
r"\1>",
cont,
)

# fix spaces around ellipsis
cont = fix_ellipsis(cont)

Expand Down Expand Up @@ -154,5 +162,17 @@ def fix_ellipsis(s: str) -> str:
css = fh_in.read()
cont = cont.replace("</style>\n", css + "\n</style>\n")

print("checking target html")
check_html(cont)

# remove trailing slashes to satisfy https://validator.w3.org
cont = cont.replace("<br />", "<br>")
cont = cont.replace("<hr />", "<hr>")
cont = re.sub(
r"(<meta [^>]*) />",
r"\1>",
cont,
)

with target_file.open(mode="w", encoding="utf-8", newline="\n") as fh_out:
fh_out.write(cont)
6 changes: 6 additions & 0 deletions scripts/install_requirements_ebook.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
#!/bin/sh

# Install the system and Python packages used for the ebook build.

# ensure we are in the hpmor root dir
# Expansions are quoted so that paths containing spaces work, and the script
# aborts if a cd fails, because python-requirements.txt below is resolved
# relative to the current directory.
script_dir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "$script_dir/.." || exit 1

sudo apt-get install texlive-extra-utils pandoc calibre imagemagick ghostscript
# pandoc calibre : for ebook converting
# texlive-extra-utils : for latexpand
# imagemagick ghostscript : for pdf title page to image conversion

pip install -r python-requirements.txt
4 changes: 4 additions & 0 deletions scripts/install_requirements_pdf.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#!/bin/sh

# Install the TeX packages listed below via apt-get.

# ensure we are in the hpmor root dir
# Expansions are quoted so that paths containing spaces work, and the script
# aborts if a cd fails instead of continuing in an unexpected directory.
script_dir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "$script_dir/.." || exit 1

sudo apt-get install texlive-xetex texlive-lang-greek texlive-lang-german latexmk

0 comments on commit ab5df2b

Please sign in to comment.