-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharxiv_scraper.py
179 lines (153 loc) · 6.86 KB
/
arxiv_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import configparser
import dataclasses
import json
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from html import unescape
from typing import Any, List, Optional, Tuple

import arxiv
import feedparser
import requests

from environment import CONFIG, OUTPUT_DEBUG_FILE_FORMAT
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that also serializes dataclass instances.

    Dataclass instances are converted to plain dictionaries with
    ``dataclasses.asdict``; every other object is delegated to the base
    ``json.JSONEncoder``.
    """

    def default(self, o):
        """Return a JSON-serializable stand-in for *o*.

        Raises:
            TypeError: raised by the base encoder when *o* is neither a
                dataclass instance nor otherwise serializable.
        """
        # is_dataclass() is also true for dataclass *types*, which asdict()
        # rejects; only instances are converted, classes fall through to the
        # base encoder's regular "not JSON serializable" error.
        if dataclasses.is_dataclass(o) and not isinstance(o, type):
            return dataclasses.asdict(o)
        return super().default(o)
@dataclass
class Paper:
    """Record describing one arXiv paper.

    Fields, in constructor order: the list of author names, the paper title,
    the abstract text, and the arXiv identifier.
    """

    authors: List[str]
    title: str
    abstract: str
    arxiv_id: str

    def __hash__(self):
        # The hash is derived from the arXiv id alone, so instances can be
        # placed in sets / used as dict keys despite the list-valued field.
        identifier = self.arxiv_id
        return hash(identifier)
def is_earlier(ts1, ts2):
    """Return True if arXiv id *ts1* is strictly older than arXiv id *ts2*.

    Ids are compared numerically after dropping the dot, e.g. ``2301.00001``
    becomes ``230100001``. A trailing version marker (``v2``, or a dangling
    ``v`` left behind by fixed-width slicing) is ignored so that versioned ids
    compare by their base id instead of raising ``ValueError``.
    """

    def _sort_key(arxiv_id):
        # strip an optional trailing version marker before the int conversion
        base_id = re.sub(r"v\d*$", "", arxiv_id)
        return int(base_id.replace(".", ""))

    return _sort_key(ts1) < _sort_key(ts2)
def get_papers_from_arxiv_api(area: str, timestamp, last_id) -> List[Paper]:
    """Fetch papers from the arXiv API that are newer than *last_id*.

    The search covers submissions in *area* from four days before *timestamp*
    up to *timestamp*; only results whose id is later than *last_id*
    (according to ``is_earlier``) are kept.

    Args:
        area: arXiv category expression placed inside the search query.
        timestamp: datetime marking the end of the search window.
        last_id: arXiv id to compare against; only later ids are returned.

    Returns:
        A list of ``Paper`` objects in the order yielded by the API client.
    """
    end_date = timestamp
    start_date = timestamp - timedelta(days=4)
    query = (
        f"({area}) AND submittedDate:"
        f"[{start_date.strftime('%Y%m%d')}* TO {end_date.strftime('%Y%m%d')}*]"
    )
    search = arxiv.Search(
        query=query,
        max_results=None,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    api_papers = []
    for result in arxiv.Client().results(search):
        # Drop the version suffix by pattern rather than by a fixed-width
        # slice, which leaves a dangling "v" on ids with a 4-digit sequence
        # number (e.g. "1412.9999v1"[:10] == "1412.9999v").
        new_id = re.sub(r"v\d+$", "", result.get_short_id())
        if not is_earlier(last_id, new_id):
            continue
        # collapse newlines and decode HTML entities in the abstract
        abstract = unescape(result.summary.replace("\n", " "))
        api_papers.append(
            Paper(
                authors=[author.name for author in result.authors],
                title=result.title,
                abstract=abstract,
                arxiv_id=new_id,
            )
        )
    return api_papers
def _paper_from_rss_entry(entry) -> Paper:
    """Build a ``Paper`` from one parsed entry of an arXiv RSS feed."""
    # author names: split on commas/newlines, strip HTML tags and entities
    authors = [
        unescape(re.sub("<[^<]+?>", "", author)).strip()
        for author in entry.author.replace("\n", ", ").split(",")
    ]
    # strip HTML tags from the summary, flatten newlines, decode entities
    summary = re.sub("<[^<]+?>", "", entry.summary)
    summary = unescape(summary.replace("\n", " "))
    # strip a trailing "(arXiv:xxxx.xxxxxvN [area.XX])" marker from the title
    title = re.sub(r"\(arXiv:[0-9]+\.[0-9]+v[0-9]+ \[.*\]\)$", "", entry.title)
    # keep only the text after the last "Abstract: " marker
    abstract = summary.split("Abstract: ")[-1]
    # the id is the last path component of the entry link
    arxiv_id = entry.link.split("/")[-1]
    return Paper(authors=authors, title=title, abstract=abstract, arxiv_id=arxiv_id)


def get_papers_from_arxiv_rss(
    area: str, config: Optional[dict]
) -> Tuple[List, List[Paper], Optional[datetime], Any]:
    """Read the arXiv RSS feed for *area* and convert its entries to papers.

    Entries are filtered by the ``announce_type`` option of the ``FILTERING``
    section (comma-separated, default ``new``) and, when ``force_primary`` is
    enabled, by whether the entry's first tag equals *area*.

    Args:
        area: arXiv category appended to the RSS URL, e.g. ``cs.CL``.
        config: parsed configuration with ``FILTERING`` and ``OUTPUT``
            sections; when ``None`` the module-level ``CONFIG`` is used for
            the filtering options instead of failing on ``None[...]``.

    Returns:
        A 4-tuple ``(entries, paper_list, timestamp, last_id)``: the raw feed
        entries, the filtered papers, the feed's ``updated`` time and the id
        taken from the first entry's link. When the feed is unchanged (HTTP
        304) or has no entries the result is ``([], [], None, None)``, so the
        tuple always has the same length.
    """
    settings = config if config is not None else CONFIG

    # ask the server only for content modified within the last day;
    # header format: 'Fri, 03 Nov 2023 00:30:00 GMT'
    updated = datetime.now(timezone.utc) - timedelta(days=1)
    url = f"https://export.arxiv.org/rss/{area}"
    updated_string = updated.strftime("%a, %d %b %Y %H:%M:%S GMT")
    print(f"Getting papers from {url}")
    feed = feedparser.parse(url, modified=updated_string)

    # "status" is absent when no HTTP response was obtained, hence .get()
    if feed.get("status") == 304:
        if config is not None:
            print(f"No {config['FILTERING'].get('announce_type', 'new').replace(',', '/')} papers since {updated_string} for {area}")
        return [], [], None, None

    entries = feed.entries
    if not entries:
        print(f"No entries found for {area}")
        return [], [], None, None
    print(f"{len(entries)} entries found for {area}")

    if CONFIG["OUTPUT"].getboolean("dump_debug_file"):
        # Deliberately best-effort: a failed debug dump must never abort the
        # scrape, so any error is reported and swallowed.
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(OUTPUT_DEBUG_FILE_FORMAT.format(f"raw_content_{area}.rss"), "w", encoding="utf-8") as outfile:
                    outfile.write(response.text)
            else:
                print(f"Warning: Failed to fetch RSS content, status code {response.status_code}")
        except Exception as e:
            print(f"Error fetching RSS content: {e}")

    # parse the feed-level last-modified date
    timestamp = datetime.strptime(feed.feed["updated"], "%a, %d %b %Y %H:%M:%S +0000")
    last_id = entries[0].link.split("/")[-1]

    filtering = settings["FILTERING"]
    # tolerate spaces around the commas of the configured list
    announce_types = {
        kind.strip() for kind in filtering.get("announce_type", "new").split(",")
    }
    force_primary = filtering.getboolean("force_primary")

    paper_list = []
    for entry in entries:
        # ignore entries whose announce type is not requested
        if entry["arxiv_announce_type"] not in announce_types:
            if settings["OUTPUT"].getboolean("debug_messages"):
                print(f"Ignoring \"{entry.title}\" by `announce_type` ({entry['arxiv_announce_type']})")
            continue
        # the first tag is treated as the entry's area
        paper_area = entry.tags[0]["term"]
        # ignore papers not in the primary area
        if area != paper_area and force_primary:
            if settings["OUTPUT"].getboolean("debug_messages"):
                print(f"Ignoring \"{entry.title}\" by `paper_area` ({paper_area})")
            continue
        paper_list.append(_paper_from_rss_entry(entry))

    print(f"{len(paper_list)} papers left for {area}")
    return entries, paper_list, timestamp, last_id
def merge_paper_list(paper_list, api_paper_list):
    """Merge two paper lists, preferring the API entries on id clashes.

    The result starts with every item of *api_paper_list* (in order),
    followed by the items of *paper_list* whose ``arxiv_id`` does not occur in
    *api_paper_list*.

    A new list is returned; neither argument is modified. (Previously the
    result aliased *api_paper_list*, so the caller's list grew as a side
    effect.)
    """
    api_ids = {paper.arxiv_id for paper in api_paper_list}
    merged_paper_list = list(api_paper_list)
    merged_paper_list.extend(
        paper for paper in paper_list if paper.arxiv_id not in api_ids
    )
    return merged_paper_list
def get_papers_from_arxiv_rss_api(area: str, config: Optional[dict]) -> Tuple[List, List[Paper]]:
    """Return the raw RSS entries and the filtered papers for *area*.

    Thin wrapper around ``get_papers_from_arxiv_rss`` that drops the feed
    timestamp and last id.

    NOTE: merging with results from the arXiv API is currently disabled; only
    the RSS feed is consulted.

    Returns:
        ``(entries, paper_list)``; both are empty lists when the feed has
        nothing new.
    """
    rss_result = get_papers_from_arxiv_rss(area, config)
    # The RSS helper may answer with a shorter "nothing new" tuple than its
    # success tuple; unpacking that into four names would raise ValueError.
    if len(rss_result) != 4:
        return [], []
    entries, paper_list, _timestamp, _last_id = rss_result
    if paper_list is None:
        paper_list = []
    return entries, paper_list
if __name__ == "__main__":
    # Manual smoke test: read the cs.CL RSS feed, query the API for anything
    # newer, and print the ids from each source and from the merged list.
    config = configparser.ConfigParser()
    config.read("configs/config.ini")
    rss_result = get_papers_from_arxiv_rss("cs.CL", config)
    # On success the RSS helper returns (entries, papers, timestamp, last_id);
    # the last two items are the timestamp and last id in every case, so read
    # them from the end instead of unpacking a fixed (and wrong) arity.
    timestamp, last_id = rss_result[-2:]
    print(timestamp)
    if timestamp is None:
        # no feed timestamp means there is nothing to anchor the API query on
        print("No new papers in the RSS feed; skipping the API lookup")
    else:
        paper_list = rss_result[1]
        api_paper_list = get_papers_from_arxiv_api("cs.CL", timestamp, last_id)
        merged_paper_list = merge_paper_list(paper_list, api_paper_list)
        print([paper.arxiv_id for paper in merged_paper_list])
        print([paper.arxiv_id for paper in paper_list])
        print([paper.arxiv_id for paper in api_paper_list])
        print("success")