-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbagit_script.py
101 lines (96 loc) · 4.69 KB
/
bagit_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
import os
import shutil
from bdbag import bdbag_api
from bagit import BagValidationError
from bdbag.bdbagit import BaggingInterruptedError
def extract_bags(source_dir='bags_zip', output_dir='bags_extract'):
    """Extract every archived bag found in *source_dir* into *output_dir*.

    Each entry of *source_dir* is handed to ``bdbag_api.extract_bag``. When
    all entries have been handled, a two-line summary with the number of
    entries is printed.

    Args:
        source_dir: Directory holding the zipped bags. Defaults to
            ``'bags_zip'``, the location the script has always used.
        output_dir: Directory the bags are extracted into. Defaults to
            ``'bags_extract'``.
    """
    num_bags = 0
    for file in os.listdir(path=source_dir):
        # converts a zipped bag into a bag
        bdbag_api.extract_bag(
            os.path.join(source_dir, file),
            output_path=output_dir,
            temp=False,
        )
        num_bags += 1
    print('* Bag Extraction Complete *')
    print('* Extracted {0} bags *'.format(str(num_bags)))
def validate_bags(extract_dir='bags_extract',
                  log_path='validation_error_log.txt'):
    """Validate every extracted bag and log the directories with problems.

    Each entry of *extract_dir* is fully validated with
    ``bdbag_api.validate_bag(..., fast=False)``. A validation failure is
    appended to the log as one line ending in ``Directory: <name>``; the
    directory itself is left in place. Afterwards the bag's ``data`` folder
    is checked for ``TECHMD.xml``; a bag without that file is logged and
    removed from disk. A two-line summary is printed at the end.

    Args:
        extract_dir: Directory containing the extracted bags. Defaults to
            ``'bags_extract'``.
        log_path: File the error lines are appended to. Defaults to
            ``'validation_error_log.txt'``.
    """
    filevar = 'TECHMD.xml'
    num_bags = 0
    # ``with`` closes (and flushes) the log even when an exception other than
    # the three handled below escapes the loop.
    with open(log_path, 'a') as error_log_handle:
        for directory in os.listdir(path=extract_dir):
            bag_path = os.path.join(extract_dir, directory)
            # attempts to validate bags and logs any problem directories
            # that raised errors
            try:
                bdbag_api.validate_bag(bag_path, fast=False)
            except BagValidationError:
                error_log_handle.write('Bag Validation Error | Directory: ' + directory + '\n')
            except BaggingInterruptedError:
                error_log_handle.write('Bagging Interruped Error | Directory: ' + directory + '\n')
            except RuntimeError:
                error_log_handle.write('Runtime Error | Directory: ' + directory + '\n')
            subdir = os.listdir(path=os.path.join(bag_path, 'data'))
            if filevar not in subdir:
                error_log_handle.write('TECHMD.xml File Not Found Error | Directory: ' + directory + '\n')
                # a bag without TECHMD.xml is deleted from the extract folder
                shutil.rmtree(bag_path)
            num_bags += 1
    print('* Bag Validation Complete *')
    print('* Validated {0} bags *'.format(str(num_bags)))
def _read_failed_directories(log_path):
    """Return the set of directory names recorded in the validation log."""
    marker = 'Directory: '
    failed = set()
    try:
        with open(log_path, 'r') as error_log_handle:
            for line in error_log_handle:
                _, found, name = line.rstrip('\n').partition(marker)
                if found:
                    failed.add(name)
    except FileNotFoundError:
        # no log file means no directory has been reported as a problem
        pass
    return failed


def _original_file_name(techmd_path):
    """Return the originally uploaded file name recorded in *techmd_path*."""
    pattern = re.compile(r'>(.+\.OBJ\..+)<')
    name = None
    with open(techmd_path) as xml:
        for line in xml:
            matches = pattern.findall(line)
            if matches:
                # the first match in the file is the one that is used
                name = matches[0]
                break
    if name is None:
        raise ValueError('No "<name>.OBJ.<ext>" entry found in ' + techmd_path)
    if name.startswith('/'):
        # NOTE(review): drops the first five characters of an absolute path,
        # presumably a '/tmp/' prefix -- confirm against real TECHMD.xml files
        name = name[5:]
    # split on the '.OBJ.' marker instead of on every dot, so that names and
    # extensions containing dots survive intact
    stem, marker, extension = name.rpartition('.OBJ.')
    if not marker:
        raise ValueError('Cannot derive original file name from ' + techmd_path)
    return stem + '.' + extension


def process_bags(extract_dir='bags_extract',
                 log_path='validation_error_log.txt'):
    """Revert validated bags to plain directories and restore file names.

    Every entry of *extract_dir* whose name is not recorded in the
    validation log is processed as follows:

    1. ``bdbag_api.revert_bag`` turns the bag back into a normal directory.
    2. A fixed list of unnecessary files generated by Islandora is deleted.
    3. The originally uploaded file name is read from ``TECHMD.xml`` and the
       file whose name starts with ``OBJ`` is renamed to it.

    A two-line summary with the number of processed bags is printed at the
    end.

    Args:
        extract_dir: Directory containing the extracted bags. Defaults to
            ``'bags_extract'``.
        log_path: Validation log to consult. Defaults to
            ``'validation_error_log.txt'``. A missing log is treated as
            "no failures recorded".

    Raises:
        ValueError: If ``TECHMD.xml`` holds no usable original file name.
        FileNotFoundError: If a directory contains no ``OBJ`` file.
    """
    num_bags = 0
    # exact directory names, so that e.g. 'bag1' is not skipped merely
    # because 'bag10' appears in the log
    failed_directories = _read_failed_directories(log_path)
    # removes unnecessary files generated by Islandora
    unneccesary_files = frozenset([
        'foo.xml', 'foxml.xml', 'JP2.jp2', 'JPG.jpg', 'POLICY.xml',
        'RELS-EXT.rdf', 'RELS-INT.rdf', 'TN.jpg', 'HOCR.html', 'OCR.txt',
        'MP4.mp4', 'PROXY_MP3.mp3',
    ])
    for directory in os.listdir(path=extract_dir):
        # skips any directories that raised errors during validation
        if directory in failed_directories:
            continue
        bag_path = os.path.join(extract_dir, directory)
        # converts the bags back into normal directories, removing bagit and
        # manifest files
        bdbag_api.revert_bag(bag_path)
        for file in os.listdir(path=bag_path):
            if file in unneccesary_files:
                os.remove(os.path.join(bag_path, file))
        # identify the originally uploaded file name
        orig_file_name = _original_file_name(
            os.path.join(bag_path, 'TECHMD.xml'))
        obj_file_name = ''
        for file in os.listdir(path=bag_path):
            if file.startswith('OBJ'):
                obj_file_name = file
        if not obj_file_name:
            raise FileNotFoundError('No OBJ file found in ' + bag_path)
        # rename the OBJ file to original filename pulled from TECHMD.xml
        os.rename(os.path.join(bag_path, obj_file_name),
                  os.path.join(bag_path, orig_file_name))
        num_bags += 1
    print('* Bag Processing Complete *')
    print('* Processed {0} bags *'.format(str(num_bags)))
def create_bags(extract_dir='bags_extract', upload_dir='bags_upload'):
    """Bag, zip and stage every directory of *extract_dir* for upload.

    For each entry of *extract_dir* a new bag is made with sha256 checksums
    and the fixed contact metadata below, the bag is archived as a zip, and
    the resulting ``<directory>.zip`` is moved into *upload_dir*. One line is
    printed per created archive, followed by a three-line summary.

    Args:
        extract_dir: Directory containing the processed directories.
            Defaults to ``'bags_extract'``.
        upload_dir: Directory the finished zip files are moved into.
            Defaults to ``'bags_upload'``. It is created if it is missing.
    """
    num_bags = 0
    # without this the move below fails when the upload folder does not exist
    os.makedirs(upload_dir, exist_ok=True)
    for directory in os.listdir(path=extract_dir):
        bag_path = os.path.join(extract_dir, directory)
        # creates new well formed bag for Preservica SIP
        bdbag_api.make_bag(bag_path, algs=['sha256'], metadata={
            'Source-Organization': 'University of Rochester',
            'Contact-Name': 'John Dewees',
            'Contact-Email': 'john.dewees@rochester.edu'})
        # zips the bag to prepare for ingest into Preservica
        bdbag_api.archive_bag(bag_path, bag_archiver='zip')
        archive_name = directory + '.zip'
        shutil.move(os.path.join(extract_dir, archive_name),
                    os.path.join(upload_dir, archive_name))
        print('-- Created: {0}.zip'.format(directory))
        num_bags += 1
    print('* Bag Creation Complete *')
    print('* Created {0} bags *'.format(str(num_bags)))
    print('* Check error log for problem assets *')
if __name__ == '__main__':
    # Run the four pipeline stages in order, but only when the file is
    # executed as a script, so that importing it has no side effects.
    extract_bags()
    validate_bags()
    process_bags()
    create_bags()