From 479b072d207e06f7e1cabc511ac4752322a64cdf Mon Sep 17 00:00:00 2001 From: ping Date: Tue, 10 Oct 2023 18:33:07 +0800 Subject: [PATCH] Switch to using self.soup() [skip ci] --- recipes/aeon.recipe.py | 3 +-- recipes/bloomberg-businessweek.recipe.py | 25 ++++++++++------------ recipes/bloomberg-news.recipe.py | 25 ++++++++++------------ recipes/fivebooks.recipe.py | 5 ++--- recipes/forbes-editors-picks.recipe.py | 3 +-- recipes/foreign-affairs.recipe.py | 3 +-- recipes/foreign-policy-magazine.recipe.py | 7 +++--- recipes/foreign-policy.recipe.py | 7 +++--- recipes/ft-paper.recipe.py | 3 +-- recipes/ft.recipe.py | 3 +-- recipes/fulcrum-sg.recipe.py | 15 +++++-------- recipes/harpers-magazine.recipe.py | 3 +-- recipes/harvard-intl-review.recipe.py | 3 +-- recipes/hbr.recipe.py | 3 +-- recipes/lithub.recipe.py | 9 +++----- recipes/longreads-features.recipe.py | 9 +++----- recipes/mit-tech-review-magazine.recipe.py | 3 +-- recipes/mit-tech-review.recipe.py | 5 ++--- recipes/new-republic-magazine.recipe.py | 4 +--- recipes/newyorker.recipe.py | 5 ++--- recipes/noema-magazine.recipe.py | 13 ++++------- recipes/scientific-american.recipe.py | 3 +-- recipes/scmp.recipe.py | 7 +++--- recipes/sydney-morning-herald.recipe.py | 4 +--- recipes/taipei-times.recipe.py | 3 +-- recipes/thediplomat.recipe.py | 11 +++++----- recipes/thirdpole.recipe.py | 9 +++----- recipes/time-magazine.recipe.py | 3 +-- recipes/wapo-paper.recipe.py | 25 ++++++++++------------ recipes/wired.recipe.py | 3 +-- recipes/world-today.recipe.py | 3 +-- recipes/wsj-paper.recipe.py | 3 +-- 32 files changed, 88 insertions(+), 142 deletions(-) diff --git a/recipes/aeon.recipe.py b/recipes/aeon.recipe.py index b9ba65eb..b76c4c1d 100644 --- a/recipes/aeon.recipe.py +++ b/recipes/aeon.recipe.py @@ -7,7 +7,6 @@ from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes -from calibre.ebooks.BeautifulSoup import BeautifulSoup _name = "Aeon" @@ -55,7 +54,7 @@ def _find_article(self, data): return False def preprocess_raw_html_(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) article = self.get_ld_json(soup, filter_fn=self._find_article) if not (article and article.get("articleBody")): err_msg = f"Unable to find article: {url}" diff --git a/recipes/bloomberg-businessweek.recipe.py b/recipes/bloomberg-businessweek.recipe.py index 8c798f61..3a29cbe9 100644 --- a/recipes/bloomberg-businessweek.recipe.py +++ b/recipes/bloomberg-businessweek.recipe.py @@ -12,7 +12,6 @@ from urllib.parse import urljoin, urlparse from calibre import browser, iswindows, random_user_agent -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.utils.date import parse_date from calibre.web.feeds.news import BasicNewsRecipe @@ -176,7 +175,7 @@ def render_content(self, content, soup, parent): if content_type == "aside": return soup.new_tag("blockquote") if content_type == "embed" and content.get("iframeData", {}).get("html"): - return BeautifulSoup(content["iframeData"]["html"]) + return self.soup(content["iframeData"]["html"]) if content_type == "link" and content.get("data", {}).get( "destination", {} ).get("web"): @@ -229,7 +228,7 @@ def render_content(self, content, soup, parent): div.append(img) if photo.get("caption"): caption = soup.new_tag("div", attrs={"class": "caption"}) - caption.append(BeautifulSoup(photo["caption"])) + caption.append(self.soup(photo["caption"])) div.append(caption) if photo.get("credit"): credit = soup.new_tag("div", attrs={"class": "credit"}) @@ -287,7 +286,7 @@ def nested_render(self, content, soup, parent): def preprocess_raw_html(self, raw_html, url): self.download_count += 1 article = None - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) for script in soup.find_all( "script", attrs={ @@ -322,7 +321,7 @@ def preprocess_raw_html(self, raw_html, url): self.abort_article(err_msg) date_published = parse_date(article["publishedAt"], assume_utc=True) - soup = BeautifulSoup( + soup = self.soup( """ @@ -342,9 +341,7 @@ def preprocess_raw_html(self, raw_html, url): soup.head.title.append(article.get("headlineText") or article["headline"]) h1_title = soup.find("h1") - h1_title.append( - BeautifulSoup(article.get("headlineText") or article["headline"]) - ) + h1_title.append(self.soup(article.get("headlineText") or article["headline"])) if article.get("summaryText") or article.get("abstract"): sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"}) if article.get("summaryText"): @@ -359,7 +356,7 @@ def preprocess_raw_html(self, raw_html, url): if article.get("byline"): soup.find(class_="article-meta").insert( 0, - BeautifulSoup(f'{article["byline"]}'), + self.soup(f'{article["byline"]}'), ) else: try: @@ -367,7 +364,7 @@ def preprocess_raw_html(self, raw_html, url): if post_authors: soup.find(class_="article-meta").insert( 0, - BeautifulSoup( + self.soup( f'{", ".join(post_authors)}' ), ) @@ -378,7 +375,7 @@ def preprocess_raw_html(self, raw_html, url): if categories: soup.body.article.insert( 0, - BeautifulSoup( + self.soup( f'{" / ".join(categories)}' ), ) @@ -393,12 +390,12 @@ def preprocess_raw_html(self, raw_html, url): caption_ele = soup.new_tag( "div", attrs={"class": "news-figure-caption-text"} ) - caption_ele.append(BeautifulSoup(lede_img_caption_html)) + caption_ele.append(self.soup(lede_img_caption_html)) img_container.append(caption_ele) soup.body.article.append(img_container) if type(article["body"]) == str: - body_soup = BeautifulSoup(article["body"]) + body_soup = self.soup(article["body"]) for img_div in body_soup.find_all( name="figure", attrs={"data-type": "image"} ): @@ -408,7 +405,7 @@ def preprocess_raw_html(self, raw_html, url): img["src"] = img["src"] soup.body.article.append(body_soup) else: - body_soup = BeautifulSoup() + body_soup = self.soup() self.nested_render(article["body"], body_soup, body_soup) soup.body.article.append(body_soup) return str(soup) diff --git a/recipes/bloomberg-news.recipe.py b/recipes/bloomberg-news.recipe.py index db5700a3..bd5b6488 100644 --- a/recipes/bloomberg-news.recipe.py +++ b/recipes/bloomberg-news.recipe.py @@ -15,7 +15,6 @@ from urllib.parse import urlparse from calibre import browser, iswindows, random_user_agent -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.utils.date import parse_date from calibre.web.feeds.news import BasicNewsRecipe @@ -199,7 +198,7 @@ def render_content(self, content, soup, parent): if content_type == "aside": return soup.new_tag("blockquote") if content_type == "embed" and content.get("iframeData", {}).get("html"): - return BeautifulSoup(content["iframeData"]["html"]) + return self.soup(content["iframeData"]["html"]) if content_type == "link" and content.get("data", {}).get( "destination", {} ).get("web"): @@ -310,7 +309,7 @@ def nested_render(self, content, soup, parent): def preprocess_raw_html(self, raw_html, url): self.download_count += 1 article = None - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) for script in soup.find_all( "script", attrs={ @@ -345,7 +344,7 @@ def preprocess_raw_html(self, raw_html, url): self.abort_article(err_msg) date_published = parse_date(article["publishedAt"], assume_utc=True) - soup = BeautifulSoup( + soup = self.soup( """ @@ -366,9 +365,7 @@ def preprocess_raw_html(self, raw_html, url): soup.head.title.append(article.get("headlineText") or article["headline"]) h1_title = soup.find("h1") - h1_title.append( - BeautifulSoup(article.get("headlineText") or article["headline"]) - ) + h1_title.append(self.soup(article.get("headlineText") or article["headline"])) if article.get("summaryText") or article.get("abstract"): sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"}) if article.get("summaryText"): @@ -383,7 +380,7 @@ def preprocess_raw_html(self, raw_html, url): if article.get("byline"): soup.find(class_="article-meta").insert( 0, - BeautifulSoup(f'{article["byline"]}'), + self.soup(f'{article["byline"]}'), ) else: try: @@ -391,7 +388,7 @@ def preprocess_raw_html(self, raw_html, url): if post_authors: soup.find(class_="article-meta").insert( 0, - BeautifulSoup( + self.soup( f'{", ".join(post_authors)}' ), ) @@ -402,7 +399,7 @@ def preprocess_raw_html(self, raw_html, url): if categories: soup.body.article.insert( 0, - BeautifulSoup( + self.soup( f'{" / ".join(categories)}' ), ) @@ -417,12 +414,12 @@ def preprocess_raw_html(self, raw_html, url): caption_ele = soup.new_tag( "div", attrs={"class": "news-figure-caption-text"} ) - caption_ele.append(BeautifulSoup(lede_img_caption_html)) + caption_ele.append(self.soup(lede_img_caption_html)) img_container.append(caption_ele) soup.body.article.append(img_container) if type(article["body"]) == str: - body_soup = BeautifulSoup(article["body"]) + body_soup = self.soup(article["body"]) for img_div in body_soup.find_all( name="figure", attrs={"data-type": "image"} ): @@ -432,7 +429,7 @@ def preprocess_raw_html(self, raw_html, url): img["src"] = img["src"] soup.body.article.append(body_soup) else: - body_soup = BeautifulSoup() + body_soup = self.soup() self.nested_render(article["body"], body_soup, body_soup) soup.body.article.append(body_soup) return str(soup) @@ -442,7 +439,7 @@ def parse_index(self): feed_items = {} for feed_name, feed_url in self.feeds: res = br.open_novisit(feed_url, timeout=self.timeout) - soup = BeautifulSoup(res.read().decode("utf-8"), "xml") + soup = self.soup(res.read().decode("utf-8"), "xml") articles = [] cutoff_date = datetime.utcnow().replace(tzinfo=timezone.utc) - timedelta( days=self.oldest_article diff --git a/recipes/fivebooks.recipe.py b/recipes/fivebooks.recipe.py index 5442e43b..1addb3ca 100644 --- a/recipes/fivebooks.recipe.py +++ b/recipes/fivebooks.recipe.py @@ -15,7 +15,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, format_title -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Five Books" @@ -94,7 +93,7 @@ def populate_article_metadata(self, article, soup, first): article.text_summary = description_tag["data-post-description"] def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) content = soup.find(class_="main-content") data = self.get_ld_json(soup, lambda d: d.get("@graph", [])) if not data: @@ -120,7 +119,7 @@ def parse_index(self): raw_html = ( br.open_novisit(feed_url, timeout=self.timeout).read().decode("utf-8") ) - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) interviews = soup.find_all(class_="library-page") if self.max_articles_per_feed < len(interviews): interviews = interviews[: self.max_articles_per_feed] diff --git a/recipes/forbes-editors-picks.recipe.py b/recipes/forbes-editors-picks.recipe.py index 3e2cb78f..3e821921 100644 --- a/recipes/forbes-editors-picks.recipe.py +++ b/recipes/forbes-editors-picks.recipe.py @@ -8,7 +8,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, format_title -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Forbes - Editor's Picks" @@ -72,7 +71,7 @@ class ForbesEditorsPicks(BasicNewsrackRecipe, BasicNewsRecipe): """ def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) article = soup.find("article") meta = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle") modified_date = meta.get("dateModified") or meta.get("datePublished") diff --git a/recipes/foreign-affairs.recipe.py b/recipes/foreign-affairs.recipe.py index 33848b4a..d7166724 100644 --- a/recipes/foreign-affairs.recipe.py +++ b/recipes/foreign-affairs.recipe.py @@ -10,7 +10,6 @@ from recipes_shared import BasicNewsrackRecipe import mechanize -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, classes _name = "Foreign Affairs" @@ -64,7 +63,7 @@ def get_data(data): feeds = [] def as_article(source): - title = BeautifulSoup(source["title"][0]).get_text() + title = self.soup(source["title"][0]).get_text() desc = "" fs = source.get("field_subtitle") if fs: diff --git a/recipes/foreign-policy-magazine.recipe.py b/recipes/foreign-policy-magazine.recipe.py index 7e74527b..98d4fa41 100644 --- a/recipes/foreign-policy-magazine.recipe.py +++ b/recipes/foreign-policy-magazine.recipe.py @@ -13,7 +13,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Foreign Policy Magazine" @@ -79,7 +78,7 @@ def preprocess_raw_html(self, raw_html, url): if not self.pub_date or date_published_gmt > self.pub_date: self.pub_date = date_published_gmt - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -96,7 +95,7 @@ def preprocess_raw_html(self, raw_html, url): """ ) - content = BeautifulSoup(post["content"]["rendered"]) + content = self.soup(post["content"]["rendered"]) # FP doesn't use featuremedia, the first attachment is the lede image attachment_endpoint = ( post.get("_links", {}).get("wp:attachment", [{}])[0].get("href") @@ -111,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url): lede.append(img) if attachment.get("caption", {}).get("rendered"): caption = soup.new_tag("div", attrs={"class": "wp-caption-text"}) - caption.append(BeautifulSoup(attachment["caption"]["rendered"])) + caption.append(self.soup(attachment["caption"]["rendered"])) lede.append(caption) soup.body.article.append(lede) diff --git a/recipes/foreign-policy.recipe.py b/recipes/foreign-policy.recipe.py index 11089a37..ab6a6581 100644 --- a/recipes/foreign-policy.recipe.py +++ b/recipes/foreign-policy.recipe.py @@ -10,7 +10,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Foreign Policy" @@ -76,7 +75,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -93,7 +92,7 @@ def preprocess_raw_html(self, raw_html, url): """ ) - content = BeautifulSoup(post["content"]["rendered"]) + content = self.soup(post["content"]["rendered"]) # FP doesn't use featuremedia, the first attachment is the lede image attachment_endpoint = ( post.get("_links", {}).get("wp:attachment", [{}])[0].get("href") @@ -108,7 +107,7 @@ def preprocess_raw_html(self, raw_html, url): lede.append(img) if attachment.get("caption", {}).get("rendered"): caption = soup.new_tag("div", attrs={"class": "wp-caption-text"}) - caption.append(BeautifulSoup(attachment["caption"]["rendered"])) + caption.append(self.soup(attachment["caption"]["rendered"])) lede.append(caption) soup.body.article.append(lede) diff --git a/recipes/ft-paper.recipe.py b/recipes/ft-paper.recipe.py index 3598ede9..1ef84af2 100644 --- a/recipes/ft-paper.recipe.py +++ b/recipes/ft-paper.recipe.py @@ -17,7 +17,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, classes _name = "Financial Times (Print)" @@ -99,7 +98,7 @@ def ft_parse_index(self, soup): return feeds def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle") if not (article and article.get("articleBody")): err_msg = f"Unable to find article: {url}" diff --git a/recipes/ft.recipe.py b/recipes/ft.recipe.py index bcdc48bf..d0eeda85 100644 --- a/recipes/ft.recipe.py +++ b/recipes/ft.recipe.py @@ -15,7 +15,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Financial Times" @@ -64,7 +63,7 @@ def print_version(self, url): return urljoin("https://ft.com", url) def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle") if not (article and article.get("articleBody")): err_msg = f"Unable to find article: {url}" diff --git a/recipes/fulcrum-sg.recipe.py b/recipes/fulcrum-sg.recipe.py index 6acba48f..e3e822d1 100644 --- a/recipes/fulcrum-sg.recipe.py +++ b/recipes/fulcrum-sg.recipe.py @@ -11,7 +11,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Fulcrum" @@ -75,7 +74,7 @@ def _extract_featured_media(self, post, soup): :param post_content: Extracted post content :return: """ - post_soup = BeautifulSoup(post["content"]["rendered"]) + post_soup = self.soup(post["content"]["rendered"]) for img in post_soup.find_all("img", attrs={"data-src": True}): img["src"] = img["data-src"] post_content = str(post_soup) @@ -96,7 +95,7 @@ def _extract_featured_media(self, post, soup): container_ele.append(img_ele) if feature_info.get("caption", {}).get("rendered"): cap_ele = soup.new_tag("div", attrs={"class": "caption"}) - cap_ele.append(BeautifulSoup(feature_info["caption"]["rendered"])) + cap_ele.append(self.soup(feature_info["caption"]["rendered"])) container_ele.append(cap_ele) post_content = str(container_ele) + post_content else: @@ -117,7 +116,7 @@ def preprocess_raw_html(self, raw_html, url): categories = self.extract_categories(post) categories.extend(self.extract_tags(post)) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -136,15 +135,11 @@ def preprocess_raw_html(self, raw_html, url): ) sub_headline = soup.find("h2", class_="sub-headline") if post.get("excerpt", {}).get("rendered"): - sub_headline.append( - BeautifulSoup(post["excerpt"]["rendered"], "html.parser") - ) + sub_headline.append(self.soup(post["excerpt"]["rendered"])) else: sub_headline.decompose() - soup.body.article.append( - BeautifulSoup(self._extract_featured_media(post, soup)) - ) + soup.body.article.append(self.soup(self._extract_featured_media(post, soup))) return str(soup) def parse_index(self): diff --git a/recipes/harpers-magazine.recipe.py b/recipes/harpers-magazine.recipe.py index ff499b0e..b3c864f0 100644 --- a/recipes/harpers-magazine.recipe.py +++ b/recipes/harpers-magazine.recipe.py @@ -6,7 +6,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicCookielessNewsrackRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Harper's Magazine" @@ -78,7 +77,7 @@ class HarpersMagazine(BasicCookielessNewsrackRecipe, BasicNewsRecipe): """ def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) soup.find("meta", attrs={"property": "article:modified_time"}) # Example: 2023-05-16T16:43:24+00:00 "%Y-%m-%dT%H:%M:%S%z" article_datetime = soup.find( diff --git a/recipes/harvard-intl-review.recipe.py b/recipes/harvard-intl-review.recipe.py index e425a7f1..891ccbd7 100644 --- a/recipes/harvard-intl-review.recipe.py +++ b/recipes/harvard-intl-review.recipe.py @@ -19,7 +19,6 @@ get_datetime_format, ) -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds import Feed from calibre.web.feeds.news import BasicNewsRecipe @@ -75,7 +74,7 @@ def parse_feeds(self): date_published = a.utctime.replace(tzinfo=timezone.utc) article_index = f"{date_published:{get_date_format()}}" # add author and pub date - soup = BeautifulSoup(a.content) + soup = self.soup(a.content) header = None if soup.body.contents[0].name in ["h1", "h2", "h3"]: header = soup.body.contents[0] diff --git a/recipes/hbr.recipe.py b/recipes/hbr.recipe.py index 82a60485..0baceddf 100644 --- a/recipes/hbr.recipe.py +++ b/recipes/hbr.recipe.py @@ -7,7 +7,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicCookielessNewsrackRecipe, get_date_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, classes # Original https://github.com/kovidgoyal/calibre/blob/49a1d469ce4f04f79ce786a75b8f4bdcfd32ad2c/recipes/hbr.recipe @@ -67,7 +66,7 @@ class HBR(BasicCookielessNewsrackRecipe, BasicNewsRecipe): ] def preprocess_raw_html(self, raw_html, _): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) # set article date pub_datetime = soup.find("meta", attrs={"property": "article:published_time"}) diff --git a/recipes/lithub.recipe.py b/recipes/lithub.recipe.py index 74275bc2..11536437 100644 --- a/recipes/lithub.recipe.py +++ b/recipes/lithub.recipe.py @@ -11,7 +11,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Literary Hub" @@ -63,7 +62,7 @@ def _extract_featured_media(self, post, soup): :param post_content: Extracted post content :return: """ - post_soup = BeautifulSoup(post["content"]["rendered"]) + post_soup = self.soup(post["content"]["rendered"]) for img in post_soup.find_all("img", attrs={"data-src": True}): img["src"] = img["data-src"] post_content = str(post_soup) @@ -101,7 +100,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -117,9 +116,7 @@ def preprocess_raw_html(self, raw_html, url): """ ) - soup.body.article.append( - BeautifulSoup(self._extract_featured_media(post, soup)) - ) + soup.body.article.append(self.soup(self._extract_featured_media(post, soup))) return str(soup) def parse_index(self): diff --git a/recipes/longreads-features.recipe.py b/recipes/longreads-features.recipe.py index 172a35bd..a630d0a0 100644 --- a/recipes/longreads-features.recipe.py +++ b/recipes/longreads-features.recipe.py @@ -11,7 +11,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes _name = "Longreads Features" @@ -57,7 +56,7 @@ def _extract_featured_media(self, post, soup): :param post_content: Extracted post content :return: """ - post_soup = BeautifulSoup(post["content"]["rendered"]) + post_soup = self.soup(post["content"]["rendered"]) for img in post_soup.find_all("img", attrs={"data-src": True}): img["src"] = img["data-src"] post_content = str(post_soup) @@ -95,7 +94,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -111,9 +110,7 @@ def preprocess_raw_html(self, raw_html, url): """ ) - soup.body.article.append( - BeautifulSoup(self._extract_featured_media(post, soup)) - ) + soup.body.article.append(self.soup(self._extract_featured_media(post, soup))) return str(soup) def parse_index(self): diff --git a/recipes/mit-tech-review-magazine.recipe.py b/recipes/mit-tech-review-magazine.recipe.py index 1ceb8450..c93b55a4 100644 --- a/recipes/mit-tech-review-magazine.recipe.py +++ b/recipes/mit-tech-review-magazine.recipe.py @@ -18,7 +18,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_date_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe @@ -108,7 +107,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} diff --git a/recipes/mit-tech-review.recipe.py b/recipes/mit-tech-review.recipe.py index 67db3f53..48401804 100644 --- a/recipes/mit-tech-review.recipe.py +++ b/recipes/mit-tech-review.recipe.py @@ -11,7 +11,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "MIT Technology Review" @@ -59,7 +58,7 @@ def _extract_featured_media(self, post): :param post_content: Extracted post content :return: """ - post_soup = BeautifulSoup(post["content"]["rendered"]) + post_soup = self.soup(post["content"]["rendered"]) for img in post_soup.find_all("img", attrs={"data-src": True}): img["src"] = img["data-src"] post_content = str(post_soup) @@ -96,7 +95,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} diff --git a/recipes/new-republic-magazine.recipe.py b/recipes/new-republic-magazine.recipe.py index 4052fa55..c350305e 100644 --- a/recipes/new-republic-magazine.recipe.py +++ b/recipes/new-republic-magazine.recipe.py @@ -8,8 +8,6 @@ from functools import cmp_to_key from urllib.parse import urljoin, urlencode, urlsplit, urlparse -from calibre.ebooks.BeautifulSoup import BeautifulSoup - # custom include to share code between recipes sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, get_date_format @@ -233,7 +231,7 @@ def preprocess_raw_html(self, raw_html, url): {lede_image_caption}

""" - body_soup = BeautifulSoup(article["body"]) + body_soup = self.soup(article["body"]) for img in body_soup.find_all("img", attrs={"data-serialized": True}): try: img_info = json.loads(img["data-serialized"]) diff --git a/recipes/newyorker.recipe.py b/recipes/newyorker.recipe.py index b158ec77..eb035872 100644 --- a/recipes/newyorker.recipe.py +++ b/recipes/newyorker.recipe.py @@ -20,7 +20,6 @@ get_datetime_format, ) -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes from calibre.ebooks.markdown import Markdown @@ -85,7 +84,7 @@ class NewYorker(BasicCookielessNewsrackRecipe, BasicNewsRecipe): remove_attributes = ["style", "sizes", "data-event-click"] def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) preload_state = {} preload_script_eles = [ @@ -172,7 +171,7 @@ def preprocess_raw_html(self, raw_html, url): image_html, ) - interactive_container.append(BeautifulSoup(md.convert(article_body))) + interactive_container.append(self.soup(md.convert(article_body))) interactive_container["class"] = "og" except Exception as e: self.log.warning(f"Unable to convert interactive article: {e}") diff --git a/recipes/noema-magazine.recipe.py b/recipes/noema-magazine.recipe.py index 467e55e8..e4775063 100644 --- a/recipes/noema-magazine.recipe.py +++ b/recipes/noema-magazine.recipe.py @@ -10,7 +10,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import WordPressNewsrackRecipe, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Noema Magazine" @@ -73,7 +72,7 @@ def _extract_featured_media(self, post, soup): :param post_content: Extracted post content :return: """ - post_soup = BeautifulSoup(post["content"]["rendered"]) + post_soup = self.soup(post["content"]["rendered"]) for h in post_soup.find_all("h5"): h.name = "h3" post_content = str(post_soup) @@ -94,9 +93,7 @@ def _extract_featured_media(self, post, soup): container_ele.append(img_ele) if feature_info.get("caption", {}).get("rendered"): container_ele.append( - BeautifulSoup( - feature_info["caption"]["rendered"], "html.parser" - ) + self.soup(feature_info["caption"]["rendered"], "html.parser") ) post_content = str(container_ele) + post_content else: @@ -113,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url): post_authors = self.extract_authors(post) categories = self.extract_categories(post) - soup = BeautifulSoup( + soup = self.soup( f""" {post["title"]["rendered"]} @@ -129,9 +126,7 @@ def preprocess_raw_html(self, raw_html, url): """ ) - soup.body.article.append( - BeautifulSoup(self._extract_featured_media(post, soup)) - ) + soup.body.article.append(self.soup(self._extract_featured_media(post, soup))) return str(soup) def parse_index(self): diff --git a/recipes/scientific-american.recipe.py b/recipes/scientific-american.recipe.py index 46d3fe77..f6ecdb38 100644 --- a/recipes/scientific-american.recipe.py +++ b/recipes/scientific-american.recipe.py @@ -13,7 +13,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe @@ -79,7 +78,7 @@ def get_browser(self, *a, **kw): return br def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) info = self.get_script_json(soup, r"dataLayer\s*=\s*") if info: for i in info: diff --git a/recipes/scmp.recipe.py b/recipes/scmp.recipe.py index d059d34e..b937ec1a 100644 --- a/recipes/scmp.recipe.py +++ b/recipes/scmp.recipe.py @@ -10,7 +10,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, format_title, get_datetime_format -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "South China Morning Post" @@ -154,10 +153,10 @@ def _extract_child_nodes(self, children, ele, soup, level=1): new_ele["class"] = "caption" child_html += str(new_ele) ele["class"] = "article-img" - ele.append(BeautifulSoup(child_html)) + ele.append(self.soup(child_html)) def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) article = self.get_script_json(soup, r"window.__APOLLO_STATE__\s*=\s*") if not article: if os.environ.get("recipe_debug_folder", ""): @@ -226,7 +225,7 @@ def preprocess_raw_html(self, raw_html, url): """ - new_soup = BeautifulSoup(html_output, "html.parser") + new_soup = self.soup(html_output) # sub headline for c in content.get("subHeadline", {}).get("json", []): ele = new_soup.new_tag(c["type"]) diff --git a/recipes/sydney-morning-herald.recipe.py b/recipes/sydney-morning-herald.recipe.py index e2e20a65..4a6dcee3 100644 --- a/recipes/sydney-morning-herald.recipe.py +++ b/recipes/sydney-morning-herald.recipe.py @@ -11,8 +11,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, format_title -from calibre.ebooks.BeautifulSoup import BeautifulSoup - # Original at https://github.com/kovidgoyal/calibre/blob/8bc3d757f4bb78ee002caf2766d7285497349097/recipes/smh.recipe from calibre.web.feeds.news import BasicNewsRecipe @@ -77,7 +75,7 @@ def populate_article_metadata(self, article, _, __): self.title = format_title(_name, self.pub_date) def preprocess_raw_html(self, raw_html, url): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) vid_player = soup.find( "div", attrs={"data-testid": "video-player", "class": "noPrint"} ) diff --git a/recipes/taipei-times.recipe.py b/recipes/taipei-times.recipe.py index 33cfc938..f96318ef 100644 --- a/recipes/taipei-times.recipe.py +++ b/recipes/taipei-times.recipe.py @@ -10,7 +10,6 @@ sys.path.append(os.environ["recipes_includes"]) from recipes_shared import BasicNewsrackRecipe, format_title -from calibre.ebooks.BeautifulSoup import BeautifulSoup from calibre.web.feeds.news import BasicNewsRecipe _name = "Taipei Times" @@ -48,7 +47,7 @@ def populate_article_metadata(self, article, _, __): self.title = format_title(_name, post_date_local) def preprocess_raw_html(self, raw_html, _): - soup = BeautifulSoup(raw_html) + soup = self.soup(raw_html) # replace byline