-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpolisher.py
114 lines (61 loc) · 2.13 KB
/
polisher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import os
import sys

import tabula as tb
# Abort early with a usage hint when no PDF path was given on the command line.
if len(sys.argv) < 2:
    # Every argument after the script name is treated as a PDF path, so the
    # usage line advertises exactly that (no separate page-count argument).
    print("\n Usage : python3 " + sys.argv[0] + " filename.pdf [filename2.pdf ...]")
    # sys.exit is always available (the bare exit() helper is only injected by
    # the site module); a non-zero status signals the usage error to callers.
    sys.exit(1)

# Page indices 0..5. NOTE(review): nothing in the visible part of this file
# reads this name -- confirm whether it can be dropped.
pages = list(range(6))

# Input PDF paths: every command-line argument except the script name.
files = sys.argv[1:]

# Text of the first cell of a table header row; used to recognise header rows.
ID_string = 'N°'
def getRawTable(regions_raw):
    """Flatten the PDF converter's JSON output into a plain list of rows.

    Args:
        regions_raw: iterable of mappings, each with a ``'data'`` entry that
            holds a list of rows; every row is a list of mappings carrying a
            ``'text'`` entry.

    Returns:
        One list of cell texts per row, with the rows of all regions
        concatenated in their original order.
    """
    regions = []
    for region in regions_raw:
        for row in region['data']:
            # Keep only the text of each cell; any other per-cell entry is
            # dropped.
            regions.append([cell['text'] for cell in row])
    return regions
def polishTables(regions):
    """Strip the columns flagged as corrupted and return the cleaned table.

    A row whose first cell equals ``ID_string`` is treated as a table header,
    for example ``["N°", "Nom", "", "Prénom", "", "Moyenne"]``. Such a row is
    not copied to the output; instead the indices of its empty cells (``None``
    or ``""``) are recorded as corrupted columns, replacing any previously
    recorded set. Every other row is copied to the output without the cells
    located one position to the right of a corrupted index.

    The first row of *regions*, with its empty cells removed, is placed at the
    top of the result as the header.

    Args:
        regions: list of rows, each a list of cell values. Cells are expected
            to be strings or ``None``.

    Returns:
        A new list of rows, header first. An empty *regions* yields ``[]``.
        *regions* and its rows are left unmodified.
    """
    if not regions:
        return []

    corruptedZones = set()  # indices of the empty cells of the latest header
    dataRows = []
    for line in regions:
        # The ``line and`` guard keeps an empty row from raising IndexError;
        # it then falls through and is copied as an (empty) data row.
        if line and line[0] == ID_string:
            corruptedZones = {i for i, zone in enumerate(line) if not zone}
        else:
            # NOTE(review): the one-cell shift (zoneId - 1) is kept exactly as
            # the original author wrote it; the original comment admits the
            # reason is unknown -- confirm against real converter output.
            dataRows.append([
                zone
                for zoneId, zone in enumerate(line)
                if (zoneId - 1) not in corruptedZones
            ])

    # Build a fresh header instead of calling remove() while iterating, which
    # skipped consecutive empty cells and mutated the caller's first row.
    header = [zone for zone in regions[0] if zone]
    return [header] + dataRows
def exportTable(fileName, table):
    """Write *table* to *fileName* as JSON, replacing any existing file.

    Args:
        fileName: path of the file to create or overwrite.
        table: JSON-serializable object to store.

    Raises:
        TypeError: if *table* is not JSON-serializable; the target file is
            left untouched in that case.
        OSError: if the file cannot be opened or written.
    """
    # Serialize before opening so a failure cannot leave a truncated file.
    payload = json.dumps(table)
    # The context manager closes the handle even if the write fails.
    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(payload)
# Convert every PDF given on the command line into a JSON file next to it.
for fileName in files:
    regions_raw = tb.read_pdf(fileName, pages="all", output_format="json")
    raw_table = getRawTable(regions_raw)
    print(raw_table)
    table = polishTables(raw_table)
    # Swap only the final extension. A plain str.replace('.pdf', '.json')
    # rewrites every '.pdf' in the path and, for names without a lowercase
    # '.pdf' (e.g. 'X.PDF'), returns the input path itself, so the source PDF
    # would be overwritten with JSON.
    exportTable(os.path.splitext(fileName)[0] + '.json', table)
    print("File : " + fileName + " converted.")
print("Dayen")