forked from cemac/TMA-data-extraction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_PDF_data.sh
executable file
·83 lines (70 loc) · 1.81 KB
/
extract_PDF_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# Command-line handling for the PDF data-extraction driver.
#
# Usage: extract_PDF_data.sh data_dir out_dir
#   data_dir  existing directory handed to the parsing/extraction steps
#   out_dir   directory for results and the log file; created if missing
#
# Sets:
#   data_dir, out_dir  the two arguments, each ending in a single appended "/"
#
# Exit codes:
#   65 (E_NOARGS)  no arguments were given
#   67 (E_NODIR)   a directory argument is missing, not a directory,
#                  or could not be created
E_NOARGS=65
E_NODIR=67

usage_line="Usage: $(basename -- "$0") data_dir out_dir"

if [ -z "$1" ]; then
  echo "$usage_line" >&2
  exit $E_NOARGS
fi

# Refuse to continue with a bad data directory: otherwise data_dir would stay
# unset and every later step would be run against an empty path.
if [ ! -d "$1" ]; then
  echo "Data directory not found: $1" >&2
  echo "$usage_line" >&2
  exit $E_NODIR
fi
# Strip one trailing slash (if any) and append one, so later code can build
# paths by plain concatenation.
data_dir="${1%/}/"

if [ -z "$2" ]; then
  echo "Output directory not specified" >&2
  echo "$usage_line" >&2
  exit $E_NODIR
fi
out_dir="${2%/}/"
if [ ! -d "$out_dir" ]; then
  # Create output directory
  if ! mkdir -p -- "$out_dir"; then
    echo "Cannot create output directory: $out_dir" >&2
    exit $E_NODIR
  fi
fi
# Environment defaults and working locations.
#
# Reads:   PDF_PARSER_ROOT, R_LIBS_USER, CONDA_PATH, out_dir
# Exports: PDF_PARSER_ROOT, R_LIBS_USER (only when they were unset or empty)
# Sets:    TEMP_DIR (created here), log_file

if [ -z "${PDF_PARSER_ROOT:-}" ]; then
  # Unset or empty: default to the "python" subdirectory of the current
  # working directory (not of the script's own location).
  PDF_PARSER_ROOT="$(pwd)/python"
  export PDF_PARSER_ROOT
fi

if [ -z "${R_LIBS_USER:-}" ]; then
  # Unset or empty: point at the R library inside the conda installation.
  if [ -z "${CONDA_PATH:-}" ]; then
    # CONDA_PATH is not assigned anywhere in this script; it has to come from
    # the caller's environment.
    echo "Warning: CONDA_PATH is not set; R_LIBS_USER defaults to /lib/R/library" >&2
  fi
  R_LIBS_USER="${CONDA_PATH}/lib/R/library"
  export R_LIBS_USER
else
  echo "R_LIBS_USER=${R_LIBS_USER}"
fi

# Set up temporary data for intermediate files and specify log file.
# out_dir is expected to end in "/" so plain concatenation yields valid paths.
TEMP_DIR="${out_dir}temp"
mkdir -p -- "$TEMP_DIR"
log_file="${out_dir}tma_extract.log"
# Make the `conda` command usable in this non-interactive shell and activate
# the tma_data_extraction environment.
#
# Reads: CONDA_EXE (falls back to a `conda` executable on PATH when empty)
# Exits with status 1 if the environment cannot be activated, because the
# later Rscript/python steps would otherwise run in the wrong environment.
conda_exe="${CONDA_EXE:-$(command -v conda 2> /dev/null)}"

if [ -z "$conda_exe" ]; then
  echo "Warning: CONDA_EXE is not set and no conda executable is on PATH" >&2
elif __conda_setup="$("$conda_exe" 'shell.bash' 'hook' 2> /dev/null)"; then
  # Preferred route: let conda emit its own shell integration code.
  eval "$__conda_setup"
else
  # Hook unavailable: fall back to the profile script shipped two directory
  # levels above the executable, or as a last resort extend PATH.
  conda_path="$(dirname -- "$(dirname -- "$conda_exe")")"
  if [ -f "${conda_path}/etc/profile.d/conda.sh" ]; then
    . "${conda_path}/etc/profile.d/conda.sh"
  else
    export PATH="${conda_path}/bin:$PATH"
  fi
fi
unset __conda_setup

if ! conda activate tma_data_extraction; then
  echo "Failed to activate conda environment tma_data_extraction" >&2
  exit 1
fi
# Run the extraction pipeline.
#
# Reads: data_dir, TEMP_DIR, out_dir, log_file
# The helper scripts are addressed by relative path (script/, R/, python/),
# so this must run from the directory that contains them.

# Parse PDFs to get page contents; this truncates and starts the log file.
bash script/parse.sh "$data_dir" "$TEMP_DIR" > "$log_file"

# Extract weather alert metadata
Rscript R/pdf_extract.R "$data_dir" "$TEMP_DIR" >> "$log_file"

# Remove the raw-data CSVs first so the loop below does not pick them up.
rm -rf -- "${TEMP_DIR:?}"/*_raw_data.csv

# Process page contents and metadata and produce netcdf of weather alert data
for csv_path in "$TEMP_DIR"/*.csv; do
  # An unmatched glob stays literal; skip it rather than process "*.csv".
  [ -e "$csv_path" ] || continue

  # File name without directory and without the .csv suffix.
  base="${csv_path##*/}"
  base="${base%.csv}"

  python python/process_content.py \
    "$TEMP_DIR/${base}_page2.txt" "$TEMP_DIR/${base}.csv" >> "$log_file"

  # Move any .nc files found in the current directory to the output
  # directory (presumably written there by process_content.py - confirm).
  for nc_path in *.nc; do
    [ -e "$nc_path" ] || continue
    mv -- "$nc_path" "$out_dir"
  done
done

conda deactivate

# Clean up
rm -rf -- "$TEMP_DIR"