diff --git a/docs/app1.rst b/docs/app1.rst index 1a8d7ea2c..3c7ea7a10 100644 --- a/docs/app1.rst +++ b/docs/app1.rst @@ -286,7 +286,7 @@ Text Extraction Flags Defaults ========================= ==== ==== ===== === ==== ======= ===== ====== ====== Indicator text html xhtml xml dict rawdict words blocks search ========================= ==== ==== ===== === ==== ======= ===== ====== ====== -preserve ligatures 1 1 1 1 1 1 1 1 1 +preserve ligatures 1 1 1 1 1 1 1 1 0 preserve whitespace 1 1 1 1 1 1 1 1 1 preserve images n/a 1 1 n/a 1 1 n/a 0 0 inhibit spaces 0 0 0 0 0 0 0 0 0 diff --git a/docs/vars.rst b/docs/vars.rst index 76a6ff32d..8b1a575f4 100644 --- a/docs/vars.rst +++ b/docs/vars.rst @@ -262,7 +262,7 @@ The following constants represent the default combinations of the above for text .. py:data:: TEXTFLAGS_SEARCH - `TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_DEHYPHENATE` + `TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_DEHYPHENATE` .. _linkDest Kinds: diff --git a/src/__init__.py b/src/__init__.py index 9396a3d28..784247f94 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -13312,7 +13312,6 @@ def width(self): TEXTFLAGS_RAWDICT = TEXTFLAGS_DICT TEXTFLAGS_SEARCH = (0 - | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_DEHYPHENATE diff --git a/tests/resources/text-find-ligatures.pdf b/tests/resources/text-find-ligatures.pdf new file mode 100644 index 000000000..40c8e688f Binary files /dev/null and b/tests/resources/text-find-ligatures.pdf differ diff --git a/tests/test_textsearch.py b/tests/test_textsearch.py index a97eda2e0..16bbc2047 100644 --- a/tests/test_textsearch.py +++ b/tests/test_textsearch.py @@ -7,6 +7,7 @@ Text search with 'clip' parameter - clip rectangle contains two occurrences of searched text. Confirm search locations are inside clip. """ + import os import pymupdf @@ -14,6 +15,7 @@ scriptdir = os.path.abspath(os.path.dirname(__file__)) filename1 = os.path.join(scriptdir, "resources", "2.pdf") filename2 = os.path.join(scriptdir, "resources", "github_sample.pdf") +filename3 = os.path.join(scriptdir, "resources", "text-find-ligatures.pdf") def test_search1(): @@ -35,3 +37,16 @@ def test_search2(): assert len(rl) == 2 for r in rl: assert r in clip + + +def test_search3(): + """Ensure we find text whether or not it contains ligatures.""" + doc = pymupdf.open(filename3) + page = doc[0] + needle = "flag" + hits = page.search_for(needle, flags=pymupdf.TEXTFLAGS_SEARCH) + assert len(hits) == 2 # all occurrences found + hits = page.search_for( + needle, flags=pymupdf.TEXTFLAGS_SEARCH | pymupdf.TEXT_PRESERVE_LIGATURES + ) + assert len(hits) == 1 # only found text without ligatures