diff --git a/recipes/aeon.recipe.py b/recipes/aeon.recipe.py
index b9ba65eb..b76c4c1d 100644
--- a/recipes/aeon.recipe.py
+++ b/recipes/aeon.recipe.py
@@ -7,7 +7,6 @@
from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
_name = "Aeon"
@@ -55,7 +54,7 @@ def _find_article(self, data):
return False
def preprocess_raw_html_(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
article = self.get_ld_json(soup, filter_fn=self._find_article)
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
diff --git a/recipes/bloomberg-businessweek.recipe.py b/recipes/bloomberg-businessweek.recipe.py
index 8c798f61..3a29cbe9 100644
--- a/recipes/bloomberg-businessweek.recipe.py
+++ b/recipes/bloomberg-businessweek.recipe.py
@@ -12,7 +12,6 @@
from urllib.parse import urljoin, urlparse
from calibre import browser, iswindows, random_user_agent
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe
@@ -176,7 +175,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
- return BeautifulSoup(content["iframeData"]["html"])
+ return self.soup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
@@ -229,7 +228,7 @@ def render_content(self, content, soup, parent):
div.append(img)
if photo.get("caption"):
caption = soup.new_tag("div", attrs={"class": "caption"})
- caption.append(BeautifulSoup(photo["caption"]))
+ caption.append(self.soup(photo["caption"]))
div.append(caption)
if photo.get("credit"):
credit = soup.new_tag("div", attrs={"class": "credit"})
@@ -287,7 +286,7 @@ def nested_render(self, content, soup, parent):
def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
for script in soup.find_all(
"script",
attrs={
@@ -322,7 +321,7 @@ def preprocess_raw_html(self, raw_html, url):
self.abort_article(err_msg)
date_published = parse_date(article["publishedAt"], assume_utc=True)
- soup = BeautifulSoup(
+ soup = self.soup(
"""
@@ -342,9 +341,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
- h1_title.append(
- BeautifulSoup(article.get("headlineText") or article["headline"])
- )
+ h1_title.append(self.soup(article.get("headlineText") or article["headline"]))
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
if article.get("summaryText"):
@@ -359,7 +356,7 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
- BeautifulSoup(f'{article["byline"]}'),
+ self.soup(f'{article["byline"]}'),
)
else:
try:
@@ -367,7 +364,7 @@ def preprocess_raw_html(self, raw_html, url):
if post_authors:
soup.find(class_="article-meta").insert(
0,
- BeautifulSoup(
+ self.soup(
f'{", ".join(post_authors)}'
),
)
@@ -378,7 +375,7 @@ def preprocess_raw_html(self, raw_html, url):
if categories:
soup.body.article.insert(
0,
- BeautifulSoup(
+ self.soup(
f'{" / ".join(categories)}'
),
)
@@ -393,12 +390,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
- caption_ele.append(BeautifulSoup(lede_img_caption_html))
+ caption_ele.append(self.soup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)
if type(article["body"]) == str:
- body_soup = BeautifulSoup(article["body"])
+ body_soup = self.soup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
@@ -408,7 +405,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
- body_soup = BeautifulSoup()
+ body_soup = self.soup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
diff --git a/recipes/bloomberg-news.recipe.py b/recipes/bloomberg-news.recipe.py
index db5700a3..bd5b6488 100644
--- a/recipes/bloomberg-news.recipe.py
+++ b/recipes/bloomberg-news.recipe.py
@@ -15,7 +15,6 @@
from urllib.parse import urlparse
from calibre import browser, iswindows, random_user_agent
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe
@@ -199,7 +198,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
- return BeautifulSoup(content["iframeData"]["html"])
+ return self.soup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
@@ -310,7 +309,7 @@ def nested_render(self, content, soup, parent):
def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
for script in soup.find_all(
"script",
attrs={
@@ -345,7 +344,7 @@ def preprocess_raw_html(self, raw_html, url):
self.abort_article(err_msg)
date_published = parse_date(article["publishedAt"], assume_utc=True)
- soup = BeautifulSoup(
+ soup = self.soup(
"""
@@ -366,9 +365,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
- h1_title.append(
- BeautifulSoup(article.get("headlineText") or article["headline"])
- )
+ h1_title.append(self.soup(article.get("headlineText") or article["headline"]))
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
if article.get("summaryText"):
@@ -383,7 +380,7 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
- BeautifulSoup(f'{article["byline"]}'),
+ self.soup(f'{article["byline"]}'),
)
else:
try:
@@ -391,7 +388,7 @@ def preprocess_raw_html(self, raw_html, url):
if post_authors:
soup.find(class_="article-meta").insert(
0,
- BeautifulSoup(
+ self.soup(
f'{", ".join(post_authors)}'
),
)
@@ -402,7 +399,7 @@ def preprocess_raw_html(self, raw_html, url):
if categories:
soup.body.article.insert(
0,
- BeautifulSoup(
+ self.soup(
f'{" / ".join(categories)}'
),
)
@@ -417,12 +414,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
- caption_ele.append(BeautifulSoup(lede_img_caption_html))
+ caption_ele.append(self.soup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)
if type(article["body"]) == str:
- body_soup = BeautifulSoup(article["body"])
+ body_soup = self.soup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
@@ -432,7 +429,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
- body_soup = BeautifulSoup()
+ body_soup = self.soup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
@@ -442,7 +439,7 @@ def parse_index(self):
feed_items = {}
for feed_name, feed_url in self.feeds:
res = br.open_novisit(feed_url, timeout=self.timeout)
- soup = BeautifulSoup(res.read().decode("utf-8"), "xml")
+ soup = self.soup(res.read().decode("utf-8"), "xml")
articles = []
cutoff_date = datetime.utcnow().replace(tzinfo=timezone.utc) - timedelta(
days=self.oldest_article
diff --git a/recipes/fivebooks.recipe.py b/recipes/fivebooks.recipe.py
index 5442e43b..1addb3ca 100644
--- a/recipes/fivebooks.recipe.py
+++ b/recipes/fivebooks.recipe.py
@@ -15,7 +15,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Five Books"
@@ -94,7 +93,7 @@ def populate_article_metadata(self, article, soup, first):
article.text_summary = description_tag["data-post-description"]
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
content = soup.find(class_="main-content")
data = self.get_ld_json(soup, lambda d: d.get("@graph", []))
if not data:
@@ -120,7 +119,7 @@ def parse_index(self):
raw_html = (
br.open_novisit(feed_url, timeout=self.timeout).read().decode("utf-8")
)
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
interviews = soup.find_all(class_="library-page")
if self.max_articles_per_feed < len(interviews):
interviews = interviews[: self.max_articles_per_feed]
diff --git a/recipes/forbes-editors-picks.recipe.py b/recipes/forbes-editors-picks.recipe.py
index 3e2cb78f..3e821921 100644
--- a/recipes/forbes-editors-picks.recipe.py
+++ b/recipes/forbes-editors-picks.recipe.py
@@ -8,7 +8,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Forbes - Editor's Picks"
@@ -72,7 +71,7 @@ class ForbesEditorsPicks(BasicNewsrackRecipe, BasicNewsRecipe):
"""
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
article = soup.find("article")
meta = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
modified_date = meta.get("dateModified") or meta.get("datePublished")
diff --git a/recipes/foreign-affairs.recipe.py b/recipes/foreign-affairs.recipe.py
index 33848b4a..d7166724 100644
--- a/recipes/foreign-affairs.recipe.py
+++ b/recipes/foreign-affairs.recipe.py
@@ -10,7 +10,6 @@
from recipes_shared import BasicNewsrackRecipe
import mechanize
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
_name = "Foreign Affairs"
@@ -64,7 +63,9 @@ def get_data(data):
feeds = []
def as_article(source):
- title = BeautifulSoup(source["title"][0]).get_text()
+ # as_article() has no `self` parameter, so do not rely on self.soup() here;
+ # import the parser locally now that the module-level import is removed.
+ from calibre.ebooks.BeautifulSoup import BeautifulSoup
+ title = BeautifulSoup(source["title"][0]).get_text()
desc = ""
fs = source.get("field_subtitle")
if fs:
diff --git a/recipes/foreign-policy-magazine.recipe.py b/recipes/foreign-policy-magazine.recipe.py
index 7e74527b..98d4fa41 100644
--- a/recipes/foreign-policy-magazine.recipe.py
+++ b/recipes/foreign-policy-magazine.recipe.py
@@ -13,7 +13,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Foreign Policy Magazine"
@@ -79,7 +78,7 @@ def preprocess_raw_html(self, raw_html, url):
if not self.pub_date or date_published_gmt > self.pub_date:
self.pub_date = date_published_gmt
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -96,7 +95,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- content = BeautifulSoup(post["content"]["rendered"])
+ content = self.soup(post["content"]["rendered"])
# FP doesn't use featuremedia, the first attachment is the lede image
attachment_endpoint = (
post.get("_links", {}).get("wp:attachment", [{}])[0].get("href")
@@ -111,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
- caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
+ caption.append(self.soup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)
diff --git a/recipes/foreign-policy.recipe.py b/recipes/foreign-policy.recipe.py
index 11089a37..ab6a6581 100644
--- a/recipes/foreign-policy.recipe.py
+++ b/recipes/foreign-policy.recipe.py
@@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Foreign Policy"
@@ -76,7 +75,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -93,7 +92,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- content = BeautifulSoup(post["content"]["rendered"])
+ content = self.soup(post["content"]["rendered"])
# FP doesn't use featuremedia, the first attachment is the lede image
attachment_endpoint = (
post.get("_links", {}).get("wp:attachment", [{}])[0].get("href")
@@ -108,7 +107,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
- caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
+ caption.append(self.soup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)
diff --git a/recipes/ft-paper.recipe.py b/recipes/ft-paper.recipe.py
index 3598ede9..1ef84af2 100644
--- a/recipes/ft-paper.recipe.py
+++ b/recipes/ft-paper.recipe.py
@@ -17,7 +17,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
_name = "Financial Times (Print)"
@@ -99,7 +98,7 @@ def ft_parse_index(self, soup):
return feeds
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
diff --git a/recipes/ft.recipe.py b/recipes/ft.recipe.py
index bcdc48bf..d0eeda85 100644
--- a/recipes/ft.recipe.py
+++ b/recipes/ft.recipe.py
@@ -15,7 +15,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Financial Times"
@@ -64,7 +63,7 @@ def print_version(self, url):
return urljoin("https://ft.com", url)
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
diff --git a/recipes/fulcrum-sg.recipe.py b/recipes/fulcrum-sg.recipe.py
index 6acba48f..e3e822d1 100644
--- a/recipes/fulcrum-sg.recipe.py
+++ b/recipes/fulcrum-sg.recipe.py
@@ -11,7 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Fulcrum"
@@ -75,7 +74,7 @@ def _extract_featured_media(self, post, soup):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for img in post_soup.find_all("img", attrs={"data-src": True}):
img["src"] = img["data-src"]
post_content = str(post_soup)
@@ -96,7 +95,7 @@ def _extract_featured_media(self, post, soup):
container_ele.append(img_ele)
if feature_info.get("caption", {}).get("rendered"):
cap_ele = soup.new_tag("div", attrs={"class": "caption"})
- cap_ele.append(BeautifulSoup(feature_info["caption"]["rendered"]))
+ cap_ele.append(self.soup(feature_info["caption"]["rendered"]))
container_ele.append(cap_ele)
post_content = str(container_ele) + post_content
else:
@@ -117,7 +116,7 @@ def preprocess_raw_html(self, raw_html, url):
categories = self.extract_categories(post)
categories.extend(self.extract_tags(post))
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -136,15 +135,11 @@ def preprocess_raw_html(self, raw_html, url):
)
sub_headline = soup.find("h2", class_="sub-headline")
if post.get("excerpt", {}).get("rendered"):
- sub_headline.append(
- BeautifulSoup(post["excerpt"]["rendered"], "html.parser")
- )
+ sub_headline.append(self.soup(post["excerpt"]["rendered"]))
else:
sub_headline.decompose()
- soup.body.article.append(
- BeautifulSoup(self._extract_featured_media(post, soup))
- )
+ soup.body.article.append(self.soup(self._extract_featured_media(post, soup)))
return str(soup)
def parse_index(self):
diff --git a/recipes/harpers-magazine.recipe.py b/recipes/harpers-magazine.recipe.py
index ff499b0e..b3c864f0 100644
--- a/recipes/harpers-magazine.recipe.py
+++ b/recipes/harpers-magazine.recipe.py
@@ -6,7 +6,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Harper's Magazine"
@@ -78,7 +77,7 @@ class HarpersMagazine(BasicCookielessNewsrackRecipe, BasicNewsRecipe):
"""
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
soup.find("meta", attrs={"property": "article:modified_time"})
# Example: 2023-05-16T16:43:24+00:00 "%Y-%m-%dT%H:%M:%S%z"
article_datetime = soup.find(
diff --git a/recipes/harvard-intl-review.recipe.py b/recipes/harvard-intl-review.recipe.py
index e425a7f1..891ccbd7 100644
--- a/recipes/harvard-intl-review.recipe.py
+++ b/recipes/harvard-intl-review.recipe.py
@@ -19,7 +19,6 @@
get_datetime_format,
)
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import Feed
from calibre.web.feeds.news import BasicNewsRecipe
@@ -75,7 +74,7 @@ def parse_feeds(self):
date_published = a.utctime.replace(tzinfo=timezone.utc)
article_index = f"{date_published:{get_date_format()}}"
# add author and pub date
- soup = BeautifulSoup(a.content)
+ soup = self.soup(a.content)
header = None
if soup.body.contents[0].name in ["h1", "h2", "h3"]:
header = soup.body.contents[0]
diff --git a/recipes/hbr.recipe.py b/recipes/hbr.recipe.py
index 82a60485..0baceddf 100644
--- a/recipes/hbr.recipe.py
+++ b/recipes/hbr.recipe.py
@@ -7,7 +7,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
# Original https://github.com/kovidgoyal/calibre/blob/49a1d469ce4f04f79ce786a75b8f4bdcfd32ad2c/recipes/hbr.recipe
@@ -67,7 +66,7 @@ class HBR(BasicCookielessNewsrackRecipe, BasicNewsRecipe):
]
def preprocess_raw_html(self, raw_html, _):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
# set article date
pub_datetime = soup.find("meta", attrs={"property": "article:published_time"})
diff --git a/recipes/lithub.recipe.py b/recipes/lithub.recipe.py
index 74275bc2..11536437 100644
--- a/recipes/lithub.recipe.py
+++ b/recipes/lithub.recipe.py
@@ -11,7 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Literary Hub"
@@ -63,7 +62,7 @@ def _extract_featured_media(self, post, soup):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for img in post_soup.find_all("img", attrs={"data-src": True}):
img["src"] = img["data-src"]
post_content = str(post_soup)
@@ -101,7 +100,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -117,9 +116,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- soup.body.article.append(
- BeautifulSoup(self._extract_featured_media(post, soup))
- )
+ soup.body.article.append(self.soup(self._extract_featured_media(post, soup)))
return str(soup)
def parse_index(self):
diff --git a/recipes/longreads-features.recipe.py b/recipes/longreads-features.recipe.py
index 172a35bd..a630d0a0 100644
--- a/recipes/longreads-features.recipe.py
+++ b/recipes/longreads-features.recipe.py
@@ -11,7 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
_name = "Longreads Features"
@@ -57,7 +56,7 @@ def _extract_featured_media(self, post, soup):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for img in post_soup.find_all("img", attrs={"data-src": True}):
img["src"] = img["data-src"]
post_content = str(post_soup)
@@ -95,7 +94,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -111,9 +110,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- soup.body.article.append(
- BeautifulSoup(self._extract_featured_media(post, soup))
- )
+ soup.body.article.append(self.soup(self._extract_featured_media(post, soup)))
return str(soup)
def parse_index(self):
diff --git a/recipes/mit-tech-review-magazine.recipe.py b/recipes/mit-tech-review-magazine.recipe.py
index 1ceb8450..c93b55a4 100644
--- a/recipes/mit-tech-review-magazine.recipe.py
+++ b/recipes/mit-tech-review-magazine.recipe.py
@@ -18,7 +18,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
@@ -108,7 +107,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
diff --git a/recipes/mit-tech-review.recipe.py b/recipes/mit-tech-review.recipe.py
index 67db3f53..48401804 100644
--- a/recipes/mit-tech-review.recipe.py
+++ b/recipes/mit-tech-review.recipe.py
@@ -11,7 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "MIT Technology Review"
@@ -59,7 +58,7 @@ def _extract_featured_media(self, post):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for img in post_soup.find_all("img", attrs={"data-src": True}):
img["src"] = img["data-src"]
post_content = str(post_soup)
@@ -96,7 +95,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
diff --git a/recipes/new-republic-magazine.recipe.py b/recipes/new-republic-magazine.recipe.py
index 4052fa55..c350305e 100644
--- a/recipes/new-republic-magazine.recipe.py
+++ b/recipes/new-republic-magazine.recipe.py
@@ -8,8 +8,6 @@
from functools import cmp_to_key
from urllib.parse import urljoin, urlencode, urlsplit, urlparse
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
# custom include to share code between recipes
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, get_date_format
@@ -233,7 +231,7 @@ def preprocess_raw_html(self, raw_html, url):
{lede_image_caption}
"""
- body_soup = BeautifulSoup(article["body"])
+ body_soup = self.soup(article["body"])
for img in body_soup.find_all("img", attrs={"data-serialized": True}):
try:
img_info = json.loads(img["data-serialized"])
diff --git a/recipes/newyorker.recipe.py b/recipes/newyorker.recipe.py
index b158ec77..eb035872 100644
--- a/recipes/newyorker.recipe.py
+++ b/recipes/newyorker.recipe.py
@@ -20,7 +20,6 @@
get_datetime_format,
)
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from calibre.ebooks.markdown import Markdown
@@ -85,7 +84,7 @@ class NewYorker(BasicCookielessNewsrackRecipe, BasicNewsRecipe):
remove_attributes = ["style", "sizes", "data-event-click"]
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
preload_state = {}
preload_script_eles = [
@@ -172,7 +171,7 @@ def preprocess_raw_html(self, raw_html, url):
image_html,
)
- interactive_container.append(BeautifulSoup(md.convert(article_body)))
+ interactive_container.append(self.soup(md.convert(article_body)))
interactive_container["class"] = "og"
except Exception as e:
self.log.warning(f"Unable to convert interactive article: {e}")
diff --git a/recipes/noema-magazine.recipe.py b/recipes/noema-magazine.recipe.py
index 467e55e8..e4775063 100644
--- a/recipes/noema-magazine.recipe.py
+++ b/recipes/noema-magazine.recipe.py
@@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Noema Magazine"
@@ -73,7 +72,7 @@ def _extract_featured_media(self, post, soup):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for h in post_soup.find_all("h5"):
h.name = "h3"
post_content = str(post_soup)
@@ -94,9 +93,7 @@ def _extract_featured_media(self, post, soup):
container_ele.append(img_ele)
if feature_info.get("caption", {}).get("rendered"):
container_ele.append(
- BeautifulSoup(
- feature_info["caption"]["rendered"], "html.parser"
- )
+ self.soup(feature_info["caption"]["rendered"], "html.parser")
)
post_content = str(container_ele) + post_content
else:
@@ -113,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -129,9 +126,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- soup.body.article.append(
- BeautifulSoup(self._extract_featured_media(post, soup))
- )
+ soup.body.article.append(self.soup(self._extract_featured_media(post, soup)))
return str(soup)
def parse_index(self):
diff --git a/recipes/scientific-american.recipe.py b/recipes/scientific-american.recipe.py
index 46d3fe77..f6ecdb38 100644
--- a/recipes/scientific-american.recipe.py
+++ b/recipes/scientific-american.recipe.py
@@ -13,7 +13,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
@@ -79,7 +78,7 @@ def get_browser(self, *a, **kw):
return br
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
info = self.get_script_json(soup, r"dataLayer\s*=\s*")
if info:
for i in info:
diff --git a/recipes/scmp.recipe.py b/recipes/scmp.recipe.py
index d059d34e..b937ec1a 100644
--- a/recipes/scmp.recipe.py
+++ b/recipes/scmp.recipe.py
@@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "South China Morning Post"
@@ -154,10 +153,10 @@ def _extract_child_nodes(self, children, ele, soup, level=1):
new_ele["class"] = "caption"
child_html += str(new_ele)
ele["class"] = "article-img"
- ele.append(BeautifulSoup(child_html))
+ ele.append(self.soup(child_html))
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
article = self.get_script_json(soup, r"window.__APOLLO_STATE__\s*=\s*")
if not article:
if os.environ.get("recipe_debug_folder", ""):
@@ -226,7 +225,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
- new_soup = BeautifulSoup(html_output, "html.parser")
+ new_soup = self.soup(html_output)
# sub headline
for c in content.get("subHeadline", {}).get("json", []):
ele = new_soup.new_tag(c["type"])
diff --git a/recipes/sydney-morning-herald.recipe.py b/recipes/sydney-morning-herald.recipe.py
index e2e20a65..4a6dcee3 100644
--- a/recipes/sydney-morning-herald.recipe.py
+++ b/recipes/sydney-morning-herald.recipe.py
@@ -11,8 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
# Original at https://github.com/kovidgoyal/calibre/blob/8bc3d757f4bb78ee002caf2766d7285497349097/recipes/smh.recipe
from calibre.web.feeds.news import BasicNewsRecipe
@@ -77,7 +75,7 @@ def populate_article_metadata(self, article, _, __):
self.title = format_title(_name, self.pub_date)
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
vid_player = soup.find(
"div", attrs={"data-testid": "video-player", "class": "noPrint"}
)
diff --git a/recipes/taipei-times.recipe.py b/recipes/taipei-times.recipe.py
index 33cfc938..f96318ef 100644
--- a/recipes/taipei-times.recipe.py
+++ b/recipes/taipei-times.recipe.py
@@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "Taipei Times"
@@ -48,7 +47,7 @@ def populate_article_metadata(self, article, _, __):
self.title = format_title(_name, post_date_local)
def preprocess_raw_html(self, raw_html, _):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
# replace byline with actual byline element
byline = soup.select_one("ul.as")
diff --git a/recipes/thediplomat.recipe.py b/recipes/thediplomat.recipe.py
index 6044c9b3..38985451 100644
--- a/recipes/thediplomat.recipe.py
+++ b/recipes/thediplomat.recipe.py
@@ -15,7 +15,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "The Diplomat"
@@ -88,7 +87,7 @@ def preprocess_raw_html(self, raw_html, url):
# formulate the api response into html
post = json.loads(raw_html)
post_date = self.parse_date(post["date"], tz_info=None, as_utc=False)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
@@ -106,25 +105,25 @@ def preprocess_raw_html(self, raw_html, url):
title.string = unescape(post["title"]["rendered"])
soup.body.h1.string = unescape(post["title"]["rendered"])
soup.find("div", class_="sub-headline").append(
- BeautifulSoup(post["excerpt"]["rendered"])
+ self.soup(post["excerpt"]["rendered"])
)
# inject authors
post_authors = self.extract_authors(post)
if post_authors:
soup.find(class_="article-meta").insert(
0,
- BeautifulSoup(f'{", ".join(post_authors)}'),
+ self.soup(f'{", ".join(post_authors)}'),
)
# inject categories
categories = self.extract_categories(post)
if categories:
soup.body.article.insert(
0,
- BeautifulSoup(
+ self.soup(
f'{" / ".join(categories)}'
),
)
- soup.body.article.append(BeautifulSoup(self._extract_featured_media(post)))
+ soup.body.article.append(self.soup(self._extract_featured_media(post)))
return str(soup)
def populate_article_metadata(self, article, soup, first):
diff --git a/recipes/thirdpole.recipe.py b/recipes/thirdpole.recipe.py
index c1e2e030..06246327 100644
--- a/recipes/thirdpole.recipe.py
+++ b/recipes/thirdpole.recipe.py
@@ -11,7 +11,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_name = "The Third Pole"
@@ -73,7 +72,7 @@ def _extract_featured_media(self, post, soup):
:param post_content: Extracted post content
:return:
"""
- post_soup = BeautifulSoup(post["content"]["rendered"])
+ post_soup = self.soup(post["content"]["rendered"])
for img in post_soup.find_all("img", attrs={"data-src": True}):
img["src"] = img["data-src"]
post_content = str(post_soup)
@@ -111,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)
- soup = BeautifulSoup(
+ soup = self.soup(
f"""
{post["title"]["rendered"]}
@@ -127,9 +126,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
)
- soup.body.article.append(
- BeautifulSoup(self._extract_featured_media(post, soup))
- )
+ soup.body.article.append(self.soup(self._extract_featured_media(post, soup)))
return str(soup)
def parse_index(self):
diff --git a/recipes/time-magazine.recipe.py b/recipes/time-magazine.recipe.py
index 7e66c583..0c3fbdfd 100644
--- a/recipes/time-magazine.recipe.py
+++ b/recipes/time-magazine.recipe.py
@@ -12,7 +12,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
@@ -58,7 +57,7 @@ def preprocess_raw_html(self, raw_html, url):
if not self.pub_date or date_published_utc > self.pub_date:
self.pub_date = date_published_utc
- content_soup = BeautifulSoup(article["content"])
+ content_soup = self.soup(article["content"])
cover_url = self.canonicalize_internal_url(self.cover_url)
# clean up weirdness
div_gmail = content_soup.find_all(name="div", attrs={"class": "gmail_default"})
diff --git a/recipes/wapo-paper.recipe.py b/recipes/wapo-paper.recipe.py
index 6ea6a472..a471086a 100644
--- a/recipes/wapo-paper.recipe.py
+++ b/recipes/wapo-paper.recipe.py
@@ -13,7 +13,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, get_datetime_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
@@ -73,7 +72,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
continue
if node_type == "text":
para_ele = soup.new_tag("p")
- para_ele.append(BeautifulSoup(c["content"]))
+ para_ele.append(self.soup(c["content"]))
parent_element.append(para_ele)
elif node_type == "image":
figure_ele = soup.new_tag("figure", attrs={"class": "figure"})
@@ -99,17 +98,17 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
parent_element.append(container_ele)
elif node_type == "header":
header_ele = soup.new_tag(f'h{c["level"]}')
- header_ele.append(BeautifulSoup(c["content"]))
+ header_ele.append(self.soup(c["content"]))
parent_element.append(header_ele)
elif node_type == "correction":
para_ele = soup.new_tag("p", attrs={"class": "correction"})
- para_ele.append(BeautifulSoup(c.get("content") or c.get("text")))
+ para_ele.append(self.soup(c.get("content") or c.get("text")))
parent_element.append(para_ele)
elif node_type == "oembed_response":
- embed_ele = BeautifulSoup(c["raw_oembed"]["html"])
+ embed_ele = self.soup(c["raw_oembed"]["html"])
parent_element.append(embed_ele)
elif node_type == "raw_html":
- content = BeautifulSoup(c["content"])
+ content = self.soup(c["content"])
container = content.find("div", attrs={"data-fallback-image-url": True})
if container:
figure_ele = soup.new_tag("figure")
@@ -133,12 +132,12 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
) or c.get("header")
if header_string:
header_ele = soup.new_tag("h3")
- header_ele.append(BeautifulSoup(header_string))
+ header_ele.append(self.soup(header_string))
container_ele.append(header_ele)
ol_ele = soup.new_tag("ol")
for i in c.get("items", []):
li_ele = soup.new_tag("li")
- li_ele.append(BeautifulSoup(i["content"]))
+ li_ele.append(self.soup(i["content"]))
ol_ele.append(li_ele)
container_ele.append(ol_ele)
parent_element.append(container_ele)
@@ -151,14 +150,12 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
container_ele.append(soup.new_tag("hr", attrs={"class": "story"}))
header_ele = soup.new_tag("h3")
- header_ele.append(
- BeautifulSoup(c.get("headlines", {}).get("basic", ""))
- )
+ header_ele.append(self.soup(c.get("headlines", {}).get("basic", "")))
container_ele.append(header_ele)
# Example 2022-04-13T14:04:03.051Z "%Y-%m-%dT%H:%M:%S.%fZ"
post_date = self.parse_date(c["display_date"])
- meta_ele = BeautifulSoup(
+ meta_ele = self.soup(
f"""
{post_date:{get_datetime_format()}}
@@ -182,7 +179,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
self.log.debug(json.dumps(c))
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
data = self.get_script_json(soup, "", {"id": "__NEXT_DATA__", "src": False})
content = data.get("props", {}).get("pageProps", {}).get("globalContent", {})
if not content:
@@ -221,7 +218,7 @@ def preprocess_raw_html(self, raw_html, url):
"""
- new_soup = BeautifulSoup(html)
+ new_soup = self.soup(html)
title_ele = new_soup.new_tag("title")
title_ele.append(title)
new_soup.head.append(title_ele)
diff --git a/recipes/wired.recipe.py b/recipes/wired.recipe.py
index 6767a61f..b42deb30 100644
--- a/recipes/wired.recipe.py
+++ b/recipes/wired.recipe.py
@@ -14,7 +14,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, format_title
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
_name = "Wired Magazine"
@@ -73,7 +72,7 @@ class WiredMagazine(BasicCookielessNewsrackRecipe, BasicNewsRecipe):
]
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
pub_date_meta = soup.find(
name="meta", attrs={"property": "article:published_time"}
)
diff --git a/recipes/world-today.recipe.py b/recipes/world-today.recipe.py
index d1055b5b..8fdd1e90 100644
--- a/recipes/world-today.recipe.py
+++ b/recipes/world-today.recipe.py
@@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
_issue_url = ""
@@ -72,7 +71,7 @@ class WorldToday(BasicNewsrackRecipe, BasicNewsRecipe):
"""
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
# find pub date
mod_date_ele = soup.find("meta", attrs={"property": "article:modified_time"})
# Example: 2022-09-30T12:40:17+0100 "%Y-%m-%dT%H:%M:%S%z"
diff --git a/recipes/wsj-paper.recipe.py b/recipes/wsj-paper.recipe.py
index 7b0ba6aa..252ed99d 100644
--- a/recipes/wsj-paper.recipe.py
+++ b/recipes/wsj-paper.recipe.py
@@ -16,7 +16,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
_name = "Wall Street Journal (Print)"
@@ -60,7 +59,7 @@ class WSJ(BasicNewsrackRecipe, BasicNewsRecipe):
]
def preprocess_raw_html(self, raw_html, url):
- soup = BeautifulSoup(raw_html)
+ soup = self.soup(raw_html)
# find pub date
mod_date_ele = soup.find(
"meta", attrs={"name": "article.updated"}