-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_pdf.py
59 lines (41 loc) · 1.71 KB
/
process_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import configparser
import os
import sys
import time
from get_doc_analysis_for_table_extraction import GetResults
from start_doc_analysis_for_table_extraction import DocumentProcessor, load_aws_config
def main():
    """Run a Textract table-extraction job for one document and export the result.

    Command-line arguments (read from ``sys.argv``):
        1. ``document``   -- the document handed to ``DocumentProcessor``,
           e.g. ``pdf/AF_Dealer_Pricelist_072020_w.pdf``.
        2. ``csv_output`` -- forwarded to ``GetResults`` once the job has
           succeeded (presumably the CSV file to write; confirm in
           ``get_doc_analysis_for_table_extraction``).

    Flow: load the AWS configuration, build a ``DocumentProcessor``, and, if
    ``uploadFile()`` reports success, start the analysis and poll
    ``get_document_analysis`` until the job reports ``SUCCEEDED`` or
    ``FAILED``, or until the polling budget is used up.

    Raises:
        IndexError: if fewer than two command-line arguments are supplied.
        KeyError: if the loaded configuration lacks one of the expected keys.
    """
    document = sys.argv[1]
    csv_output = sys.argv[2]

    config = load_aws_config()
    role_arn = config["role_arn"]
    bucket = config["bucket_name"]
    region_name = config["region_name"]
    access_key = config["aws_access_key_id"]
    secret_key = config["aws_secret_access_key"]
    pdf_key = config["pdf_key"]

    analyzer = DocumentProcessor(
        access_key, secret_key, role_arn, bucket, document, region_name, pdf_key
    )

    # Nothing to analyse when uploadFile() does not report success.
    if not analyzer.uploadFile():
        return

    analyzer.CreateTopicandQueue()
    analyzer.ProcessDocument()
    job_id = analyzer.textract_response["JobId"]

    # The original loop tested ``counter > 30`` before each poll, i.e. it was
    # meant to allow 31 polls, but ``++counter`` is a double unary plus in
    # Python and never changed the counter, so the loop could spin forever.
    max_polls = 31
    poll_interval_seconds = 3

    for _ in range(max_polls):
        response = analyzer.textract.get_document_analysis(JobId=job_id)
        status = response["JobStatus"]

        if status == "SUCCEEDED":
            GetResults(analyzer.textract, job_id, csv_output)
            return

        if status == "FAILED":
            # Report the failure instead of silently dropping it.
            print(
                f"Textract job {job_id} failed: "
                f"{response.get('StatusMessage', 'no status message')}",
                file=sys.stderr,
            )
            return

        # Any other status: wait before asking again.
        time.sleep(poll_interval_seconds)

    print(
        f"Textract job {job_id} did not finish after {max_polls} polls "
        f"({max_polls * poll_interval_seconds} seconds); giving up.",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()
# Usage: python process_pdf.py <source_pdf> <output_csv>