-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreview_m3.py
242 lines (176 loc) · 6.79 KB
/
review_m3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
from docxtpl import DocxTemplate, RichText
import os
import smtplib
import time
import ssl
import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
import docx.oxml
import docx.oxml.ns as ns
# Function to list all rank .txt files within the specified directory
def list_rank_files(directory):
    """Return every rank file found under *directory*.

    A rank file is any file whose name ends in ``.txt`` and contains the
    substring ``rank``. The search uses ``os.walk`` and therefore descends
    into sub-folders.

    Args:
        directory: Root folder to search.

    Returns:
        A list of paths relative to *directory*. A file located directly in
        *directory* is returned as its bare file name; a file in a
        sub-folder keeps the sub-folder prefix so that joining the entry
        onto *directory* yields a path that really exists.
    """
    rank_files = []
    # Find all rank .txt files within the specified directory
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith('.txt') and 'rank' in name:
                # A bare name would lose the sub-folder for nested files, so
                # record the location relative to the search root instead.
                full_path = os.path.join(root, name)
                rank_files.append(os.path.relpath(full_path, directory))
    return rank_files
def extract_data(directory, rank_file, data_dict):
    """Read one rank file and store its entries in *data_dict*.

    Args:
        directory: Folder that contains the rank file.
        rank_file: Name of the file to read; also used as the dictionary key.
        data_dict: Dictionary to update in place.

    Returns:
        The same *data_dict* object, with ``data_dict[rank_file]`` set to the
        list of non-blank lines of the file (trailing whitespace removed).

    Raises:
        OSError: If the file cannot be opened.
    """
    # os.path.join works whether or not directory ends with a separator
    location = os.path.join(directory, rank_file)
    # Open the file and write contents to list
    with open(location, 'r') as file:
        # Blank lines carry no entry, so they are skipped rather than being
        # stored as empty strings.
        papers_list = [entry.rstrip() for entry in file if entry.strip()]
    data_dict[rank_file] = papers_list
    return data_dict
# Simple function to fetch the literature review template
def import_template(directory):
    """Load ``template.docx`` from *directory*.

    Args:
        directory: Folder that contains ``template.docx``.

    Returns:
        A ``DocxTemplate`` built from that file.
    """
    # os.path.join works whether or not directory ends with a separator
    return DocxTemplate(os.path.join(directory, 'template.docx'))
def create_file(directory, template, data_dict, start_date):
    """Render the literature review and save it as ``lit_review.docx``.

    Args:
        directory: Folder in which ``lit_review.docx`` is written.
        template: The ``DocxTemplate`` to render.
        data_dict: Maps a template variable name to a list of
            ``title|url`` strings.
        start_date: Value placed in the template's ``start_date`` variable.

    Returns:
        The rendered template object.
    """
    doc = template
    # Fetch the current date
    end_date = datetime.date.today().strftime('%d/%m/%Y')
    # Add context to the template
    context = {
        "start_date": start_date,
        "end_date": end_date,
    }
    # Turn every "title|url" entry into a hyperlinked rich text object
    for key, data_list in data_dict.items():
        context[key] = [_build_paper_link(doc, item) for item in data_list]
    # Render exactly once, with the complete context. A separate dates-only
    # render-and-save pass is unnecessary because the file is written below.
    # NOTE(review): some docxtpl versions appear to reload the template on a
    # repeated render, which could discard hyperlink relationships registered
    # in between - confirm against the installed version.
    doc.render(context)
    update_table_of_contents(doc)
    # Save the document in the given directory
    doc.save(os.path.join(directory, 'lit_review.docx'))
    return doc


def _build_paper_link(doc, item):
    """Build the RichText for one ``title|url`` entry of a rank file."""
    # Split on the LAST '|' so that a title containing '|' stays intact.
    title, separator, url = item.rpartition('|')
    title = title.strip()
    url = url.strip()
    rt = RichText()
    if not separator or not url:
        # No usable URL: keep the text visible instead of raising.
        rt.add(title if separator else item.strip())
        return rt
    # Define text as rich text object and hyperlink the url
    rt.add(title, url_id=doc.build_url_id(url))
    return rt
def update_table_of_contents(doc):
    """Set the ``w:updateFields`` flag in the document settings.

    The flag is intended to make the word processor refresh fields such as
    the table of contents when the file is opened.

    Args:
        doc: Document object exposing ``settings.element``.
    """
    # Find the settings element in the document
    settings_element = doc.settings.element
    # Reuse an existing "updateFields" element so that repeated calls do not
    # add duplicates; create it only when it is missing.
    update_fields_element = settings_element.find(ns.qn('w:updateFields'))
    if update_fields_element is None:
        update_fields_element = docx.oxml.shared.OxmlElement('w:updateFields')
        settings_element.append(update_fields_element)
    # Set its "val" attribute to "true"
    update_fields_element.set(ns.qn('w:val'), 'true')
def send_email(directory, project):
    """Email ``lit_review.docx`` from *directory* as an attachment.

    The message is sent through ``smtp.gmail.com`` over SSL on port 465,
    logging in with the password held in the ``SECRET_TOKEN`` environment
    variable.

    Args:
        directory: Folder that contains ``lit_review.docx``.
        project: Project name used in the subject line and body.

    Raises:
        KeyError: If ``SECRET_TOKEN`` is not set.
        OSError: If the attachment cannot be read.
        smtplib.SMTPException: If login or delivery fails.
    """
    # Build the path of the file
    location = os.path.join(directory, 'lit_review.docx')
    # Fetch the current date
    date = datetime.date.today().strftime('%d/%m/%Y')

    email_sender = 'automatedscrapingbot@gmail.com'
    email_password = os.environ["SECRET_TOKEN"]
    email_receiver = 'wl@destinypharma.com'
    subject = f'Literature review ({project} {date})'
    body = f'Please find attached the latest literature review for {project} ({date})'

    em = MIMEMultipart()
    em['From'] = email_sender
    em['To'] = email_receiver
    em['Subject'] = subject
    em.attach(MIMEText(body))

    # Attach the lit review file to the email
    with open(location, 'rb') as attachment_file:
        payload = attachment_file.read()
    # Use the registered media type for .docx files; "application/docx" is
    # not a registered type and may be mishandled by mail gateways.
    attachment = MIMEApplication(
        payload,
        _subtype='vnd.openxmlformats-officedocument.wordprocessingml.document',
    )
    attachment.add_header('Content-Disposition', 'attachment; filename="lit_review.docx"')
    em.attach(attachment)

    context = ssl.create_default_context()
    # Send the email
    with smtplib.SMTP_SSL('smtp.gmail.com', 465, context=context) as smtp:
        smtp.login(email_sender, email_password)
        smtp.sendmail(email_sender, email_receiver, em.as_string())
def clear_ranks(directory, files_list):
    """Delete the given rank files and print whether all were removed.

    Args:
        directory: Folder that contains the rank files.
        files_list: Names of the files to delete, relative to *directory*.

    Raises:
        OSError: For failures other than a missing file (for example a
            permission error), which are not handled here.
    """
    missing = 0
    # Delete all the rank files in the files list
    for file in files_list:
        file_path = os.path.join(directory, file)
        # Attempt the removal directly instead of checking first, so a file
        # that disappears between the check and the removal cannot crash us.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            print(f"File '{file_path}' does not exist.")
            missing += 1
    # Check no errors occurred when deleting files and print outcome
    if missing:
        print('Error in clearing all rank files')
    else:
        print('All rank files successfully cleared')
def review_log(project, log_path='review_log.txt'):
    """Append a timestamped "review sent" line to the log file.

    Args:
        project: Project name written into the log line.
        log_path: File to append to. Defaults to ``review_log.txt`` in the
            current working directory.
    """
    # Create date and log string
    timestamp = datetime.datetime.now().strftime("%H:%M %d/%m/%Y")
    changes_log = f'{timestamp}, {project} review sent'
    # Write to file and print
    with open(log_path, 'a') as log:
        log.write(changes_log + '\n')
    print('Changes logged')
def fetch_start_date(directory):
    """Return the first line of ``start_date.txt`` in *directory*.

    Args:
        directory: Folder that contains ``start_date.txt``.

    Returns:
        The first line of the file with surrounding whitespace removed
        (an empty string if the file is empty).

    Raises:
        OSError: If the file cannot be opened.
    """
    # os.path.join works whether or not directory ends with a separator
    with open(os.path.join(directory, 'start_date.txt'), 'r') as file:
        return file.readline().strip()
def log_start_date(directory):
    """Overwrite ``start_date.txt`` in *directory* with today's date.

    The date is written as ``DD/MM/YYYY`` with no trailing newline.

    Args:
        directory: Folder in which ``start_date.txt`` is written.
    """
    # Log start date for beginning new literature search
    today = datetime.date.today().strftime('%d/%m/%Y')
    # os.path.join works whether or not directory ends with a separator
    with open(os.path.join(directory, 'start_date.txt'), 'w') as start_date_file:
        start_date_file.write(today)
def main():
    """Run one literature-review cycle for the NTCD-M3 project.

    Reads the rank files in ``m3_data/``, renders the review from the
    template, emails it, deletes the rank files, logs the send and records
    today's date as the start of the next search period.
    """
    # Define directory
    directory = 'm3_data/'
    project = 'NTCD-M3'

    # Fetch starting date of the literature reviews
    start_date = fetch_start_date(directory)

    # Collect the contents of every rank file in the directory
    data_dict = {}
    ranks = list_rank_files(directory)
    for rank in ranks:
        data_dict = extract_data(directory, rank, data_dict)

    # Remove the '.txt' extension from the key (not recognised by jinja2).
    # splitext strips only the trailing extension, whereas str.replace would
    # also remove a '.txt' that occurs in the middle of a name.
    new_dict = {os.path.splitext(key)[0]: value for key, value in data_dict.items()}

    # Import the correct template
    doc = import_template(directory)
    print(f'Sourced {project} template')

    # Create the lit review
    create_file(directory, doc, new_dict, start_date)
    print(f'Assembled {project} literature review')

    # Send email with literature review attached
    send_email(directory, project)
    print(f'{project} email delivered')

    # Clear all the rank files
    clear_ranks(directory, ranks)

    # Log review sent
    review_log(project)

    # Log start date for beginning new literature search
    log_start_date(directory)

    # Final print statement
    print('Queries complete, returning to sleep')


if __name__ == '__main__':
    main()