Skip to content

Commit

Permalink
Immunize against empty table headers
Browse files Browse the repository at this point in the history
Improved the check for empty tables, which fixes bugs when determining table headers.

Improved computation of enveloping vector graphic rectangles.

Ignore more meaningless "pseudo" tables.
  • Loading branch information
JorjMcKie committed Feb 28, 2024
1 parent c41f831 commit 2ddbf01
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 22 deletions.
84 changes: 65 additions & 19 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@

EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
TEXTPAGE = None # text page assigned by make_chars(); passed as textpage= to page.get_textbox()
white_spaces = set(string.whitespace) # for checking white space only cells
# -------------------------------------------------------------------
# End of PyMuPDF interface code
# -------------------------------------------------------------------
Expand Down Expand Up @@ -1183,7 +1185,7 @@ def find_smallest_cell(points, i: int):
return list(filter(None, cell_gen))


def cells_to_tables(cells) -> list:
def cells_to_tables(page, cells) -> list:
"""
Given a list of bounding boxes (`cells`), return a list of tables that
hold those cells most simply (and contiguously).
Expand Down Expand Up @@ -1237,11 +1239,32 @@ def bbox_to_corners(bbox) -> tuple:
# ... store it.
tables.append(list(current_cells))

# PyMuPDF modification:
# Remove tables without text or having only 1 column
for i in range(len(tables) - 1, -1, -1):
r = EMPTY_RECT()
x1_vals = set()
x0_vals = set()
for c in tables[i]:
r |= c
x1_vals.add(c[2])
x0_vals.add(c[0])
if (
len(x1_vals) < 2
or len(x0_vals) < 2
or white_spaces.issuperset(
page.get_textbox(
r,
textpage=TEXTPAGE,
)
)
):
del tables[i]

# Sort the tables top-to-bottom-left-to-right based on the value of the
# topmost-and-then-leftmost coordinate of a table.
_sorted = sorted(tables, key=lambda t: min((c[1], c[0]) for c in t))
filtered = [t for t in _sorted if len(t) > 1]
return filtered
return _sorted


class CellGroup(object):
Expand Down Expand Up @@ -1446,6 +1469,8 @@ def recover_top_row_cells(table):

# sort (x0, x1) pairs by x0-values
l_r = sorted(list(l_r), key=lambda c: c[0])
if not l_r:
return [], (0, 0, 0, 0)

# recovered row 0 cells
cells = [(l_r[0][0], y0, l_r[0][1], y1)]
Expand Down Expand Up @@ -1717,7 +1742,8 @@ def __init__(self, page, settings=None):
)
self.cells = intersections_to_cells(self.intersections)
self.tables = [
Table(self.page, cell_group) for cell_group in cells_to_tables(self.cells)
Table(self.page, cell_group)
for cell_group in cells_to_tables(self.page, self.cells)
]

def get_edges(self) -> list:
Expand Down Expand Up @@ -1845,11 +1871,12 @@ def __getitem__(self, i):
# -----------------------------------------------------------------------------
def make_chars(page, clip=None):
"""Extract text as "rawdict" to fill CHARS."""
global CHARS
global CHARS, TEXTPAGE
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
blocks = page.get_text("rawdict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
TEXTPAGE = page.get_textpage(clip=clip, flags=TEXTFLAGS_TEXT)
blocks = page.get_text("rawdict", textpage=TEXTPAGE)["blocks"]
doctop_base = page_height * page.number
for block in blocks:
for line in block["lines"]:
Expand Down Expand Up @@ -1932,11 +1959,22 @@ def are_neighbors(r1, r2):
This check supports empty rect-likes and thus also lines.
"""
if (
r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
) and (
r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
(
r2.x0 - snap_x <= r1.x0 <= r2.x1 + snap_x
or r2.x0 - snap_x <= r1.x1 <= r2.x1 + snap_x
)
and (
r2.y0 - snap_y <= r1.y0 <= r2.y1 + snap_y
or r2.y0 - snap_y <= r1.y1 <= r2.y1 + snap_y
)
or (
r1.x0 - snap_x <= r2.x0 <= r1.x1 + snap_x
or r1.x0 - snap_x <= r2.x1 <= r1.x1 + snap_x
)
and (
r1.y0 - snap_y <= r2.y0 <= r1.y1 + snap_y
or r1.y0 - snap_y <= r2.y1 <= r1.y1 + snap_y
)
):
return True
return False
Expand All @@ -1959,29 +1997,37 @@ def clean_graphics():
new_rects = [] # the final list of joined rectangles
# -------------------------------------------------------------------------
# Strategy: Join rectangles that "almost touch" each other,
# Extend first rectangle with any remaining in the list that touches it.
# Extend first rectangle with any other that is a "neighbor".
# Then move it to final list and continue with the rest.
# -------------------------------------------------------------------------
while prects: # the algorithm will empty this list
r = prects[0] # first rectangle
r = +prects[0] # first rectangle
repeat = True
while repeat: # this loop extends first rect in list
repeat = False # will be set to true if any other rect touches
for i in range(len(prects) - 1, -1, -1): # run backwards
if i == 0: # don't touch first rectangle
continue
if are_neighbors(r, prects[i]): # touches rect 0!
for i in range(len(prects) - 1, 0, -1): # run backwards
if are_neighbors(r, prects[i]): # close enough to rect 0!
r |= prects[i] # extend first rect
prects[0] = +r # update it in list
del prects[i] # delete this rect
repeat = True # check remaining

# move first item over to result list
prects[0] = +r # update rect 0
new_rects.append(prects.pop(0))
prects = sorted(list(set(prects)), key=lambda r: (r.y1, r.x0))

new_rects = sorted(list(set(new_rects)), key=lambda r: (r.y1, r.x0))
return [r for r in new_rects if r.width > 5 and r.height > 5], paths
# return only rectangles that do contain some text
return [
r
for r in new_rects
if not white_spaces.issuperset(
page.get_textbox(
r,
textpage=TEXTPAGE,
)
)
], paths

bboxes, paths = clean_graphics()

Expand Down
Binary file added tests/resources/battery-file-22.pdf
Binary file not shown.
18 changes: 15 additions & 3 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,8 @@ def test_add_lines():
filename = os.path.join(scriptdir, "resources", "small-table.pdf")
doc = fitz.open(filename)
page = doc[0]
tab1 = page.find_tables()[0]
assert tab1.col_count == 1
assert tab1.row_count == 5
assert page.find_tables().tables == []

more_lines = [
((238.9949951171875, 200.0), (238.9949951171875, 300.0)),
((334.5559997558594, 200.0), (334.5559997558594, 300.0)),
Expand Down Expand Up @@ -251,3 +250,16 @@ def test_3179():
page = doc[0]
tabs = page.find_tables()
assert len(tabs.tables) == 3


def test_battery_file():
    """Verify that non-table suspects on the page are ignored.

    Earlier versions erroneously tried to identify table headers
    although the page contained no table at all.
    """
    pdf_path = os.path.join(scriptdir, "resources", "battery-file-22.pdf")
    # Keep the document bound to a local name for as long as the page is used.
    doc = fitz.open(pdf_path)
    first_page = doc[0]
    finder = first_page.find_tables()
    # The finder must not report any table for this file.
    assert len(finder.tables) == 0

0 comments on commit 2ddbf01

Please sign in to comment.