# cnn_article_finder.py
import json
from logger import setup_logger
from typing import Dict, List
from base_scraper import BaseScraper
# Custom exceptions for the CNN article finder.
# A shared base class lets callers catch any finder-specific failure with
# a single `except CNNArticleFinderError` while remaining backward
# compatible (every class is still an Exception subclass).
class CNNArticleFinderError(Exception):
    """Base class for all CNN article finder errors."""
    pass
class NoTopicsError(CNNArticleFinderError):
    """Raised when no topics are provided in the user data"""
    pass
class InvalidJSONError(CNNArticleFinderError):
    """Raised when the user data is not a valid JSON string"""
    pass
class NoMatchingTopicsError(CNNArticleFinderError):
    """Raised when none of the provided topics match a known topic page"""
    pass
class TopicNavigationError(CNNArticleFinderError):
    """Raised when there's an error in topic navigation"""
    pass
class PageSoupError(CNNArticleFinderError):
    """Raised when a topic page's soup could not be fetched"""
    pass
class DivNotFoundError(CNNArticleFinderError):
    """Raised when the headline wrapper div is not found in the soup"""
    pass
class AElementNotFoundError(CNNArticleFinderError):
    """Raised when no <a> elements are found in the headline wrapper"""
    pass
class CNNConfig:
    """Configuration class for CNN-related constants and settings"""
    # Root of every CNN section URL.
    BASE_URL = "https://www.cnn.com"
    # Supported topic names mapped to their CNN section pages.
    # CNN section paths are lowercase; the previous 'Travel' entry pointed
    # at '/Travel', inconsistent with every other entry.
    TOPIC_PAGES = {
        'US': f'{BASE_URL}/us',
        'World': f'{BASE_URL}/world',
        'Politics': f'{BASE_URL}/politics',
        'Business': f'{BASE_URL}/business',
        'Health': f'{BASE_URL}/health',
        'Entertainment': f'{BASE_URL}/entertainment',
        'Style': f'{BASE_URL}/style',
        'Travel': f'{BASE_URL}/travel',
        'Science': f'{BASE_URL}/science',
        'Climate': f'{BASE_URL}/climate'
    }
class CNNArticleFinder:
    """
    Extracts URLs for trending CNN articles of the topics of interest.
    """
    def __init__(self, user_data: str):
        """
        Args:
            user_data (str): JSON string containing topics to find articles for
                Example: '{"topics": ["US", "Health"]}'
        Raises:
            InvalidJSONError: If user_data is not valid JSON
            NoTopicsError: If no topics are provided in user_data
            NoMatchingTopicsError: If none of the provided topics are valid
        """
        self.logger = setup_logger(__name__)
        # Keep the try body minimal: only json.loads can raise JSONDecodeError.
        try:
            parsed_data = json.loads(user_data)
        except json.JSONDecodeError as e:
            self.logger.critical(f"Invalid JSON format in user_data: {e}")
            raise InvalidJSONError(f"Invalid JSON format in user_data: {e}") from e
        self.topics = parsed_data.get('topics', [])
        if not self.topics:
            self.logger.critical("No topics provided in user_data")
            raise NoTopicsError("No topics provided in user_data")
        if not set(self.topics) & set(CNNConfig.TOPIC_PAGES.keys()):
            self.logger.critical(f"Invalid topics provided: {self.topics}. Valid topics are: {list(CNNConfig.TOPIC_PAGES.keys())}")
            raise NoMatchingTopicsError('User provided topics do not match the topics in the CNNConfig.TOPIC_PAGES dictionary')
        self.topic_pages = self.topic_navigation()
    def topic_navigation(self) -> List[str]:
        """
        Navigates to the proper URLs based on selected topics.
        Returns:
            List[str]: Topic-page URLs, de-duplicated, in the user's topic order
        Raises:
            NoMatchingTopicsError: If no matching topics are found
        """
        topic_pages = CNNConfig.TOPIC_PAGES
        # Preserve the user's ordering and drop duplicates; the previous set
        # intersection randomized the order between runs. The old blanket
        # `except Exception` also re-wrapped NoMatchingTopicsError in a bare
        # Exception, defeating the custom exception type — let it propagate.
        valid_topics = list(dict.fromkeys(
            topic for topic in self.topics if topic in topic_pages
        ))
        if not valid_topics:
            # This may not be useful right now, but will be once the LLM is
            # creating desired user topics based on political preferences.
            self.logger.error(f"No matching topics found. Provided topics: {self.topics}")
            raise NoMatchingTopicsError(f"No matching topics found. Provided topics: {self.topics}")
        return [topic_pages[topic] for topic in valid_topics]
    def get_page_soup(self) -> Dict[str, object]:
        """
        Fetches and parses HTML content for each valid topic page.
        Returns:
            Dict[str, object]: Dictionary containing BeautifulSoup objects, keyed by topic
        Raises:
            PageSoupError: If page content could not be fetched for any topic
        """
        content = {}
        # Fetch each topic's own page exactly once. The previous nested loop
        # fetched every page for every topic and left each topic mapped to
        # whichever page happened to be fetched last.
        for topic in self.topics:
            page = CNNConfig.TOPIC_PAGES.get(topic)
            if page is None:
                # Unknown topics were already reported in __init__; skip them.
                continue
            base_scraper = BaseScraper(url=page)
            page_soup = base_scraper.get_soup()  # Let the BaseScraper handle the errors
            if not page_soup:
                self.logger.error(f"Could not get soup for {topic} from {page}")
                raise PageSoupError(f"Could not get soup for {topic} from {page}")
            content[topic] = page_soup
        return content
    def hyperlink_search(self) -> Dict[str, List[str]]:
        """
        Searches each topic's soup for article hyperlinks.
        Returns:
            Dict[str, List[str]]: Article hyperlinks (raw href values) by topic
        Raises:
            PageSoupError: If a topic page could not be fetched
            DivNotFoundError: If the headline wrapper div is missing
            AElementNotFoundError: If the wrapper contains no links
        """
        soup = self.get_page_soup()
        # Let the specific custom exceptions propagate instead of re-wrapping
        # them in a bare Exception as before — callers can now catch them.
        content = {}
        for topic, page in soup.items():
            div = page.find('div', class_='container_lead-plus-headlines__cards-wrapper')
            if not div:
                self.logger.warning(f"No headline wrapper found in soup for {topic}")
                raise DivNotFoundError(f"No headline wrapper found in soup for {topic}")
            a_elements = div.find_all('a', href=True)
            if not a_elements:
                self.logger.warning(f"No a elements found for {topic}. Hyperlinks not found")
                raise AElementNotFoundError(f"No a elements found for {topic}. Hyperlinks not found")
            content[topic] = [tag['href'] for tag in a_elements]
        return content
    def get_link(self) -> Dict[str, List[str]]:
        """
        Converts extracted hyperlinks to accessible URLs.
        Returns:
            Dict[str, List[str]]: Complete, de-duplicated URLs by topic
        Raises:
            ValueError: If no hyperlinks are found
        """
        hyperlinks = self.hyperlink_search()
        if not hyperlinks:
            raise ValueError('No hyperlinks found')
        # dict.fromkeys de-duplicates while preserving discovery order.
        return {
            topic: list(dict.fromkeys(
                f"{CNNConfig.BASE_URL}{hyperlink}" for hyperlink in hyperlink_list
            ))
            for topic, hyperlink_list in hyperlinks.items()
        }
def main():
    """Demo entry point: log the trending CNN article URLs for a sample topic."""
    # Create the logger before the try block: the previous version referenced
    # article_finder.logger inside `except`, which raised NameError whenever
    # the CNNArticleFinder constructor itself failed (article_finder unbound).
    logger = setup_logger(__name__)
    try:
        user_data = json.dumps({
            'topics': ['US']
        })
        article_finder = CNNArticleFinder(
            user_data=user_data
        )
        # Execute and log results
        logger.info(f"Selected topics: {article_finder.topics}")
        logger.info(f"Topic pages: {article_finder.topic_pages}")
        logger.info(f"Found hyperlinks: {article_finder.hyperlink_search()}")
        logger.info(f"Complete URLs: {article_finder.get_link()}")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        # Re-raise the original exception rather than masking its type
        # with a bare Exception wrapper.
        raise
if __name__ == '__main__':
    main()