-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
150 lines (117 loc) · 5.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# main.py for DREF_PARSETAG
from fastapi import FastAPI, Query, HTTPException
from fastapi.responses import StreamingResponse
from fastapi import File, UploadFile
from typing import Optional
from importlib import resources
import io
from enum import Enum
from dref_parsing.parser_utils import *
from dref_tagging.prediction import predict_tags_any_length
app = FastAPI()
# This Enum class allows us to see a dropdown menu with possible choices
class OuputFormat(str, Enum):
    """Response formats accepted for the ``output_format`` path parameter.

    Mixing in ``str`` makes every member compare equal to its plain string
    value, e.g. ``OuputFormat.json == "json"``.
    """

    json = "json"
    csv = "csv"
# Once Subdimension is found, this function helps select the corresponding Dimension
def get_Dimension_from_Subdimension(subdim, spec):
    """Return the Dimension that the given Subdimension belongs to.

    ``spec`` is a table indexed by Subdimension that has a 'Dimension'
    column. When ``subdim`` is not one of the index labels, a fixed error
    string is returned instead of raising.
    """
    known_subdimensions = list(spec.index)
    if subdim not in known_subdimensions:
        return 'ERROR: No Dimension matches this Subdimension :('
    return spec.loc[subdim, 'Dimension']
# ------------------------------------------------------
# Main function for Parsing+Tagging
# Endpoint: parse a DREF Final Report (looked up by Appeal code, or taken from
# an uploaded PDF), tag every excerpt, and return the table as JSON or as a
# CSV download.
# NB: FastAPI publishes the docstring below as the endpoint description,
# so its text (including the HTML tags) is user-facing.
@app.post("/parse_and_tag/{output_format}")
async def parse_and_tag(output_format: OuputFormat,
                        pdf_file: Optional[bytes] = File(None,
                            description='Optional input of PDF file. If given, overwrites MDR code'),
                        Appeal_code: str = Query(
                            'MDRDO013',
                            title="Appeal code",
                            description="Starts with 'MDR' followed by 5 symbols. <br> Some available codes: DO013, BO014, CL014, AR017, VU008, TJ029, SO009, PH040, RS014, FJ004, CD031, MY005, LA007, CU006, AM006",
                            min_length=8,
                            max_length=8)):
    """
    App for Parsing PDFs of DREF Final Reports and Tagging Excerpts.
    <b>Input</b>: Appeal code of the report, MDR***** (or a PDF file)
    <b>Output</b>:
    a list of excerpts extracted from the PDF with its features: 'Learning', 'DREF_Sector',
    and global features: 'Hazard', 'Country', 'Date', 'Region', 'Appeal code'.
    The output can be given as a dictionary in json format, or as a csv file for download
    The app uses IFRC GO API to determine the global features (call 'appeal')
    and to get the URL of the PDF report (call 'appeal_document')
    <b>Possible errors</b>:
    <ul>
    <li> Appeal code doesn't have a DREF Final Report in IFRC GO appeal database
    <li> PDF URL for Appeal code was not found using IFRC GO API call appeal_document
    <li> PDF Parsing didn't work
    </ul>
    """
    # Renaming: In the program we call it 'lead', while IFRC calls it 'Appeal_code'
    lead = Appeal_code

    # if PDF file is given, lead input is ignored
    if pdf_file:
        lead = 'Unknown'

    # ---------------------------------------------------------
    # Parsing PDF
    try:
        # excerpts (and other relevant columns)
        all_parsed = parse_PDF_combined(lead, pdf_file=pdf_file)
    except ExceptionNotInAPI as err:
        raise HTTPException(
            status_code=404,
            detail=f"{lead} doesn't have a DREF Final Report in IFRC GO appeal database",
        ) from err
    except ExceptionNoURLforPDF as err:
        raise HTTPException(
            status_code=404,
            detail=f"PDF URL for Appeal code {lead} was not found using IFRC GO API call appeal_document",
        ) from err
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not converted into an HTTP 500 response.
        raise HTTPException(
            status_code=500,
            detail="PDF Parsing didn't work by some reason",
        ) from err

    parsed_columns = ['Modified Excerpt', 'Learning', 'DREF_Sector', 'lead',
                      'Hazard', 'Country', 'Date', 'Region']
    df = all_parsed[parsed_columns].copy()

    # -----------------------------------------------------------
    # Tagging excerpts and cleaning/renaming
    df.loc[:, 'Subdimension'] = df['Modified Excerpt'].apply(
        lambda x: predict_tags_any_length(x)[0])

    # Split to "row per tag".
    # explode() repeats the original index label on every row produced from
    # one excerpt. The JSON branch below uses to_dict(), which keys rows by
    # index label, so duplicated labels would silently keep only the last tag
    # of each excerpt. Renumbering the rows keeps every tag in the output.
    # NOTE(review): this assumes the index of the parsed table carries no
    # meaning of its own - confirm against parse_PDF_combined.
    df = df.explode('Subdimension').reset_index(drop=True)

    # Define Dimensions from Subdimensions using a dict from csv file
    with resources.path("dref_tagging.config", "DREF_spec.csv") as DREF_spec_file:
        spec = pd.read_csv(DREF_spec_file).set_index('Subdimension')

    df['Dimension'] = df['Subdimension'].apply(
        lambda x: get_Dimension_from_Subdimension(x, spec))

    df = df.fillna('Unknown')
    df = df.rename(columns={'lead': 'Appeal code', 'Modified Excerpt': 'Excerpt'})

    # reorder columns
    cols_order = ['Excerpt', 'Learning', 'DREF_Sector', 'Appeal code', 'Hazard',
                  'Country', 'Date', 'Region', 'Dimension', 'Subdimension']
    df = df[cols_order]

    # -----------------------------------
    # Return DataFrame as Json or Csv:
    if output_format == OuputFormat.json:
        return df.to_dict()

    # prepare csv output
    stream = io.StringIO()
    # NB: comma as a separator works OK even if there exist commas in some excerpts
    # since pandas is smart to insert quotes where needed
    df.to_csv(stream, index=False, sep=',')
    response = StreamingResponse(iter([stream.getvalue()]), media_type="text/csv")
    response.headers["Content-Disposition"] = "attachment; filename=export.csv"
    return response
# *********************************************************************
# Endpoint: force a re-download of the GO API tables used by the parser and
# report how many rows each one now holds.
# NB: FastAPI publishes the docstring below as the endpoint description.
@app.get("/refresh/")
async def reload_GO_API_data():
    """
    Reload data from GO database.
    This may be needed since the Parse-and-Tag app downloads data from GO
    the first time it runs and never checks for updates.
    To refresh data from GO, run this app.
    """
    try:
        initialize_apdo(refresh=True)
        initialize_aadf(refresh=True)
        # NB: we need to do import again,
        # otherwise updated apdo, aadf won't be accessible here
        # (even though they got updated in parser_utils)
        from dref_parsing.parser_utils import apdo, aadf
        output = f'GO API Reload: {len(aadf)} items in appeal, {len(apdo)} items in appeal_documents'
        output += ' (only DREF Final Reports are selected)'
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit propagate instead of being reported as an HTTP 500.
        # The original error is kept as the cause for debugging.
        raise HTTPException(status_code=500, detail="Error while accessing GO API data") from err
    return output
# Command to start API:
# uvicorn main:app --reload