-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch_grants.py
153 lines (106 loc) · 4.09 KB
/
search_grants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
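Written in Python 3.5.
The purpose of this script is to search PubMed for publications that were funded by MIDAS grants,
skip those whose PMIDs are already listed by the OBC publications API, and save the PubMed XML
records of the remaining (new) publications to a dated file.
"""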
import requests
import xml.etree.ElementTree as ET
import datetime as dt
from Bio import Entrez
def grants_prompt():
    """Prompt the user for a text file of grant numbers and open it.

    The prompt is repeated until a file can be opened, so the caller always
    receives a usable file object rather than ``None``. An ``EOFError`` from
    ``input`` (e.g. no interactive stdin) is not caught and ends the loop.

    Returns:
        The opened file object (text mode, read-only). The caller is
        responsible for closing it.
    """
    message = ("Please enter the name of a text file containing the grant numbers. \n\n"
               "Also, please be aware that the grant numbers must end with '[gr]' (without single quotes) in order for the program to process\n"
               "them. \n\n"
               "Input: ")
    while True:
        file_name = input(message)
        try:
            # Only open() is guarded so unrelated errors are not masked.
            return open(file_name)
        except IOError:
            print("Please ensure that you spelled the file name correctly and that you're in the right directory.")
def search(query, retmax=1000):
    """Search PubMed by grant number and return the parsed ESearch result.

    Args:
        query: Search term handed to PubMed unchanged (the script passes
            grant numbers read from the input file).
        retmax: Maximum number of IDs to request. Defaults to 1000, the value
            that was previously hard-coded; matches beyond this limit are
            not returned.

    Returns:
        The object produced by ``Entrez.read`` for the response. The caller
        reads the matching primary IDs from its ``'IdList'`` entry.
    """
    Entrez.email = 'diller17@ufl.edu'
    handle = Entrez.esearch(db='pubmed',
                            term=query,
                            retmax=retmax,
                            )
    try:
        return Entrez.read(handle)
    finally:
        # Release the response even when parsing fails.
        handle.close()
def fetch_doc_sum(id_list):
    """Fetch the PubMed XML records for a collection of primary IDs.

    Args:
        id_list: Iterable of PubMed IDs given as strings.

    Returns:
        The response body as text. An empty string is returned, without
        contacting PubMed, when ``id_list`` yields no IDs.
    """
    ids = ','.join(id_list)
    if not ids:
        # Nothing to look up: avoid sending a request with an empty id.
        return ""
    Entrez.email = 'diller17@ufl.edu'
    handle = Entrez.efetch(db='pubmed',
                           id=ids,
                           retmode='xml',
                           version='2.0')
    try:
        records = handle.read()
    finally:
        # Release the response even when reading fails.
        handle.close()
    if isinstance(records, bytes):
        # NOTE(review): Biopython may hand back a binary handle for XML
        # responses; decode so callers always receive text. UTF-8 is
        # assumed here -- confirm against the Biopython version in use.
        records = records.decode('utf-8')
    return records
def checkXML(XML, path):
    """Return the text of the first element matching *path*, or "" if absent.

    Args:
        XML: Element (any object exposing ``find``) to search beneath.
        path: Path expression passed to ``XML.find``.

    Returns:
        The matched element's ``text``. An empty string is returned when no
        element matches or when the matched element has no text.
    """
    # Look the element up once instead of repeating the search per check.
    node = XML.find(path)
    if node is None or node.text is None:
        return ""
    return node.text
def parse_authors(authors):
    """Return one "LastName Initials" string per author element.

    Each part comes from ``checkXML``, so a missing last name or missing
    initials contributes an empty string around the separating space.
    """
    return [
        checkXML(person, './LastName') + " " + checkXML(person, './Initials')
        for person in authors
    ]
def parse_grants(grants):
    """Return the GrantID text of every grant element, in input order.

    A grant element without a ``GrantID`` child yields an empty string.
    """
    return [checkXML(entry, './GrantID') for entry in grants]
def extract_info(pub_med_XML):
    """Not currently used.

    Extract article title, author list, PMID, DOI, grant numbers, and journal
    from PubMed XML.

    Args:
        pub_med_XML: XML document, in a form accepted by ``ET.fromstring``,
            whose root contains ``PubmedArticle/MedlineCitation`` elements.

    Returns:
        A list with one dict per citation, using the keys ``'TITLE'``,
        ``'AUTHORS'``, ``'PMID'``, ``'DOI'``, ``'GRANT'`` and ``'JOURNAL'``.
        A field that is missing from a record is returned as an empty string
        (or an empty list for authors/grants) instead of raising.
    """
    tree = ET.fromstring(pub_med_XML)
    papers = []
    for article in tree.findall('./PubmedArticle/MedlineCitation'):
        title_node = article.find('./Article/ArticleTitle')
        if title_node is None:
            title = ""
        else:
            # itertext() also collects text nested in inline markup
            # (e.g. <i>, <sub>), which .text alone would cut off.
            title = "".join(title_node.itertext())
        papers.append({
            'TITLE': title,
            'AUTHORS': parse_authors(article.findall('./Article/AuthorList/Author')),
            'PMID': checkXML(article, './PMID'),
            # ELocationID may also carry a non-DOI identifier (e.g. a pii),
            # so select the DOI entry explicitly via its EIdType attribute.
            'DOI': checkXML(article, "./Article/ELocationID[@EIdType='doi']"),
            'GRANT': parse_grants(article.findall('./Article/GrantList/Grant')),
            'JOURNAL': checkXML(article, './Article/Journal/ISOAbbreviation'),
        })
    return papers
obc_api_url = "http://api.onbc.io/publications"


def main():
    """Find grant-funded PubMed publications that the OBC API does not list.

    Steps:
      1. Read one search term per line from the file chosen by the user and
         collect the PubMed IDs returned for each term.
      2. Download the publication list from ``obc_api_url`` and drop every
         ID that is already present there.
      3. Fetch the PubMed XML for the remaining IDs and append it to
         ``most_recent_publications_<YYYY-MM-DD>.xml``.
      4. Print the number of new publications.
    """
    grants_in = grants_prompt()
    if grants_in is None:
        # No file could be opened; grants_prompt() has already told the user.
        return

    all_ids = set()
    # The with-block closes the file even if a search fails midway.
    with grants_in:
        for line in grants_in:
            query = line.strip()
            if not query:
                # Blank lines would otherwise be sent as empty searches.
                continue
            results = search(query)
            all_ids.update(str(pmid) for pmid in results['IdList'])

    # A timeout keeps the script from hanging on an unresponsive server, and
    # raise_for_status() stops an HTTP error page being parsed as data.
    response = requests.get(obc_api_url, timeout=30)
    response.raise_for_status()
    # NOTE(review): assumes the API returns a list of objects with a "pmid"
    # field; values are normalised to str so they compare equal to the
    # PubMed IDs whether the API sends numbers or strings -- confirm.
    known_pmids = {str(publication.get("pmid")) for publication in response.json()}

    # Sorted so the request and the output are reproducible between runs.
    new_pub_ids = sorted(all_ids - known_pmids)

    if new_pub_ids:
        papers = fetch_doc_sum(new_pub_ids)
        today = dt.datetime.today().strftime("%Y-%m-%d")
        fname = "most_recent_publications_{}.xml".format(today)
        # NOTE(review): append mode is kept from the original; running twice
        # on one day puts two XML documents in the same file -- confirm that
        # this is intended.
        with open(fname, "a", encoding="utf-8") as xml_out:
            xml_out.write(papers)

    print("Number of new publications: ", len(new_pub_ids))


# Executes functions
if __name__ == '__main__':
    main()