diff --git a/recipes/wapo.recipe.py b/recipes/wapo.recipe.py
index ef4d3120..46753617 100644
--- a/recipes/wapo.recipe.py
+++ b/recipes/wapo.recipe.py
@@ -7,13 +7,12 @@
 import os
 import sys
 from datetime import datetime, timezone
-from urllib.parse import urljoin, urlencode
+from urllib.parse import urljoin, urlencode, urlparse
 
 # custom include to share code between recipes
 sys.path.append(os.environ["recipes_includes"])
 from recipes_shared import BasicNewsrackRecipe, format_title, get_datetime_format
 
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
 
 _name = "Washington Post"
@@ -82,7 +81,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
                 continue
             if node_type == "text":
                 para_ele = soup.new_tag("p")
-                para_ele.append(BeautifulSoup(c["content"]))
+                para_ele.append(self.soup(c["content"]))
                 parent_element.append(para_ele)
             elif node_type == "image":
                 figure_ele = soup.new_tag("figure", attrs={"class": "figure"})
@@ -108,17 +107,17 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
                 parent_element.append(container_ele)
             elif node_type == "header":
                 header_ele = soup.new_tag(f'h{c["level"]}')
-                header_ele.append(BeautifulSoup(c["content"]))
+                header_ele.append(self.soup(c["content"]))
                 parent_element.append(header_ele)
             elif node_type == "correction":
                 para_ele = soup.new_tag("p", attrs={"class": "correction"})
-                para_ele.append(BeautifulSoup(c.get("content") or c.get("text")))
+                para_ele.append(self.soup(c.get("content") or c.get("text")))
                 parent_element.append(para_ele)
             elif node_type == "oembed_response":
-                embed_ele = BeautifulSoup(c["raw_oembed"]["html"])
+                embed_ele = self.soup(c["raw_oembed"]["html"])
                 parent_element.append(embed_ele)
             elif node_type == "raw_html":
-                content = BeautifulSoup(c["content"])
+                content = self.soup(c["content"])
                 container = content.find("div", attrs={"data-fallback-image-url": True})
                 if container:
                     figure_ele = soup.new_tag("figure")
@@ -147,7 +146,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
                 ol_ele = soup.new_tag("ol")
                 for i in c.get("items", []):
                     li_ele = soup.new_tag("li")
-                    li_ele.append(BeautifulSoup(i["content"]))
+                    li_ele.append(self.soup(i["content"]))
                     ol_ele.append(li_ele)
                 container_ele.append(ol_ele)
                 parent_element.append(container_ele)
@@ -160,14 +159,12 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
                 container_ele.append(soup.new_tag("hr", attrs={"class": "story"}))
 
                 header_ele = soup.new_tag("h3")
-                header_ele.append(
-                    BeautifulSoup(c.get("headlines", {}).get("basic", ""))
-                )
+                header_ele.append(self.soup(c.get("headlines", {}).get("basic", "")))
                 container_ele.append(header_ele)
 
                 # Example 2022-04-13T14:04:03.051Z "%Y-%m-%dT%H:%M:%S.%fZ"
                 post_date = self.parse_date(c["display_date"])
-                meta_ele = BeautifulSoup(
+                meta_ele = self.soup(
                     f"""
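
Note: the patch swaps every direct BeautifulSoup(...) call for self.soup(...), which is why the calibre.ebooks.BeautifulSoup import is dropped from the recipe. The soup() helper is assumed to be provided by the shared BasicNewsrackRecipe base class in recipes_shared (not shown in this diff); a minimal sketch of that assumed helper, wrapping calibre's bundled BeautifulSoup, could look like this:

    # Sketch only: assumed shape of the soup() helper on recipes_shared.BasicNewsrackRecipe.
    from calibre.ebooks.BeautifulSoup import BeautifulSoup


    class BasicNewsrackRecipe:
        def soup(self, html):
            # Parse an HTML fragment with calibre's bundled BeautifulSoup so that
            # individual recipes no longer need to import it themselves.
            return BeautifulSoup(html)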