Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Immunize against empty table headers #3210

Merged
merged 1 commit into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 75 additions & 34 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@

EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
TEXTPAGE = None
Comment on lines 95 to +97
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would like to eventually remove the use of these globals, but ok for now.

white_spaces = set(string.whitespace) # for checking white space only cells
# -------------------------------------------------------------------
# End of PyMuPDF interface code
# -------------------------------------------------------------------
Expand Down Expand Up @@ -1183,7 +1185,7 @@ def find_smallest_cell(points, i: int):
return list(filter(None, cell_gen))


def cells_to_tables(cells) -> list:
def cells_to_tables(page, cells) -> list:
"""
Given a list of bounding boxes (`cells`), return a list of tables that
hold those cells most simply (and contiguously).
Expand Down Expand Up @@ -1237,11 +1239,32 @@ def bbox_to_corners(bbox) -> tuple:
# ... store it.
tables.append(list(current_cells))

# PyMuPDF modification:
# Remove tables without text or having only 1 column
for i in range(len(tables) - 1, -1, -1):
r = EMPTY_RECT()
x1_vals = set()
x0_vals = set()
for c in tables[i]:
r |= c
x1_vals.add(c[2])
x0_vals.add(c[0])
if (
len(x1_vals) < 2
or len(x0_vals) < 2
or white_spaces.issuperset(
page.get_textbox(
r,
textpage=TEXTPAGE,
)
)
):
del tables[i]

# Sort the tables top-to-bottom-left-to-right based on the value of the
# topmost-and-then-leftmost coordinate of a table.
_sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
filtered = [t for t in _sorted if len(t) > 1]
return filtered
return _sorted


class CellGroup(object):
Expand Down Expand Up @@ -1446,6 +1469,8 @@ def recover_top_row_cells(table):

# sort (x0, x1) pairs by x0-values
l_r = sorted(list(l_r), key=lambda c: c[0])
if not l_r:
return [], (0, 0, 0, 0)

# recovered row 0 cells
cells = [(l_r[0][0], y0, l_r[0][1], y1)]
Expand Down Expand Up @@ -1717,7 +1742,8 @@ def __init__(self, page, settings=None):
)
self.cells = intersections_to_cells(self.intersections)
self.tables = [
Table(self.page, cell_group) for cell_group in cells_to_tables(self.cells)
Table(self.page, cell_group)
for cell_group in cells_to_tables(self.page, self.cells)
]

def get_edges(self) -> list:
Expand Down Expand Up @@ -1845,11 +1871,12 @@ def __getitem__(self, i):
# -----------------------------------------------------------------------------
def make_chars(page, clip=None):
"""Extract text as "rawdict" to fill CHARS."""
global CHARS
global CHARS, TEXTPAGE
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
blocks = page.get_text("rawdict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
doctop_base = page_height * page.number
for block in blocks:
for line in block["lines"]:
Expand Down Expand Up @@ -1897,11 +1924,11 @@ def make_chars(page, clip=None):
CHARS.append(char_dict)


# -----------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Extract all page vector graphics to fill the EDGES list.
# We are ignoring Bézier curves completely and are converting everything
# else to lines.
# -----------------------------------------------------------------------------
# ------------------------------------------------------------------------
def make_edges(page, clip=None, tset=None, add_lines=None):
global EDGES
snap_x = tset.snap_x_tolerance
Expand Down Expand Up @@ -1930,23 +1957,38 @@ def are_neighbors(r1, r2):
larger than some delta.

This check supports empty rect-likes and thus also lines.

Note:
This type of check is MUCH faster than native Rect containment checks.
"""
if (
if ( # check if x-coordinates of r1 are within those of r2
r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
) and (
) and ( # ... same for y-coordinates
r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
):
return True

# same check with r1 / r2 exchanging their roles (this is necessary!)
if (
r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
) and (
r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
):
return True
return False

def clean_graphics():
"""Detect and join rectangles of connected vector graphics."""
# exclude irrelevant graphics
paths = []
"""Detect and join rectangles of "connected" vector graphics."""

paths = [] # paths relevant for table detection
for p in page.get_drawings():
if ( # ignore fill-only graphics if they are no lines
# ignore fill-only graphics if they do not simulate lines,
# which means one of width or height is small.
if (
p["type"] == "f"
and lines_strict
and p["rect"].width > snap_x
Expand All @@ -1955,33 +1997,32 @@ def clean_graphics():
continue
paths.append(p)

prects = sorted([p["rect"] for p in paths], key=lambda r: (r.y1, r.x0))
# start with all vector graphics rectangles
prects = sorted(set([p["rect"] for p in paths]), key=lambda r: (r.y1, r.x0))
new_rects = [] # the final list of joined rectangles
# -------------------------------------------------------------------------
# Strategy: Join rectangles that "almost touch" each other,
# Extend first rectangle with any remaining in the list that touches it.
# Then move it to final list and continue with the rest.
# -------------------------------------------------------------------------
# ----------------------------------------------------------------
# Strategy: Join rectangles that "almost touch" each other.
# Extend first rectangle with any other that is a "neighbor".
# Then move it to the final list and continue with the rest.
# ----------------------------------------------------------------
while prects: # the algorithm will empty this list
r = prects[0] # first rectangle
prect0 = prects[0] # copy of first rectangle (performance reasons!)
repeat = True
while repeat: # this loop extends first rect in list
repeat = False # will be set to true if any other rect touches
for i in range(len(prects) - 1, -1, -1): # run backwards
if i == 0: # don't touch first rectangle
continue
if are_neighbors(r, prects[i]): # touches rect 0!
r |= prects[i] # extend first rect
prects[0] = +r # update it in list
repeat = False # set to true again if some other rect touches
for i in range(len(prects) - 1, 0, -1): # run backwards
if are_neighbors(prect0, prects[i]): # close enough to rect 0?
prect0 |= prects[i] # extend rect 0
del prects[i] # delete this rect
repeat = True # check remaining
repeat = True # keep checking the rest

# move first item over to result list
new_rects.append(prects.pop(0))
prects = sorted(list(set(prects)), key=lambda r: (r.y1, r.x0))
# move rect 0 over to result list if there is some text in it
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
# contains text, so accept it as a table bbox candidate
new_rects.append(prect0)
del prects[0] # remove from rect list

new_rects = sorted(list(set(new_rects)), key=lambda r: (r.y1, r.x0))
return [r for r in new_rects if r.width > 5 and r.height > 5], paths
return new_rects, paths

bboxes, paths = clean_graphics()

Expand Down
Binary file added tests/resources/battery-file-22.pdf
Binary file not shown.
18 changes: 15 additions & 3 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,8 @@ def test_add_lines():
filename = os.path.join(scriptdir, "resources", "small-table.pdf")
doc = fitz.open(filename)
page = doc[0]
tab1 = page.find_tables()[0]
assert tab1.col_count == 1
assert tab1.row_count == 5
assert page.find_tables().tables == []

more_lines = [
((238.9949951171875, 200.0), (238.9949951171875, 300.0)),
((334.5559997558594, 200.0), (334.5559997558594, 300.0)),
Expand Down Expand Up @@ -251,3 +250,16 @@ def test_3179():
page = doc[0]
tabs = page.find_tables()
assert len(tabs.tables) == 3


def test_battery_file():
    """Check that a page without any real table yields no tables.

    Earlier versions erroneously tried to identify table headers
    where there existed no table at all.
    """
    pdf_path = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    document = fitz.open(pdf_path)
    first_page = document[0]
    finder = first_page.find_tables()
    # the table finder must come back empty for this page
    assert finder.tables == []
Loading