Skip to content

Commit

Permalink
Switch to using self.soup() [skip ci]
Browse files: browse the repository at this point in the history
  • Loading branch information
ping committed Oct 10, 2023
1 parent 178433a commit 479b072
Show file tree
Hide file tree
Showing 32 changed files with 88 additions and 142 deletions.
3 changes: 1 addition & 2 deletions recipes/aeon.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format

from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
from calibre.ebooks.BeautifulSoup import BeautifulSoup

_name = "Aeon"

Expand Down Expand Up @@ -55,7 +54,7 @@ def _find_article(self, data):
return False

def preprocess_raw_html_(self, raw_html, url):
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
article = self.get_ld_json(soup, filter_fn=self._find_article)
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
Expand Down
25 changes: 11 additions & 14 deletions recipes/bloomberg-businessweek.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from urllib.parse import urljoin, urlparse

from calibre import browser, iswindows, random_user_agent
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe

Expand Down Expand Up @@ -176,7 +175,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
return BeautifulSoup(content["iframeData"]["html"])
return self.soup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
Expand Down Expand Up @@ -229,7 +228,7 @@ def render_content(self, content, soup, parent):
div.append(img)
if photo.get("caption"):
caption = soup.new_tag("div", attrs={"class": "caption"})
caption.append(BeautifulSoup(photo["caption"]))
caption.append(self.soup(photo["caption"]))
div.append(caption)
if photo.get("credit"):
credit = soup.new_tag("div", attrs={"class": "credit"})
Expand Down Expand Up @@ -287,7 +286,7 @@ def nested_render(self, content, soup, parent):
def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
for script in soup.find_all(
"script",
attrs={
Expand Down Expand Up @@ -322,7 +321,7 @@ def preprocess_raw_html(self, raw_html, url):
self.abort_article(err_msg)

date_published = parse_date(article["publishedAt"], assume_utc=True)
soup = BeautifulSoup(
soup = self.soup(
"""<html>
<head><title></title></head>
<body>
Expand All @@ -342,9 +341,7 @@ def preprocess_raw_html(self, raw_html, url):

soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
h1_title.append(
BeautifulSoup(article.get("headlineText") or article["headline"])
)
h1_title.append(self.soup(article.get("headlineText") or article["headline"]))
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
if article.get("summaryText"):
Expand All @@ -359,15 +356,15 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
self.soup(f'<span class="author">{article["byline"]}</span>'),
)
else:
try:
post_authors = [a["name"] for a in article.get("authors", [])]
if post_authors:
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
self.soup(
f'<span class="author">{", ".join(post_authors)}</span>'
),
)
Expand All @@ -378,7 +375,7 @@ def preprocess_raw_html(self, raw_html, url):
if categories:
soup.body.article.insert(
0,
BeautifulSoup(
self.soup(
f'<span class="article-section">{" / ".join(categories)}</span>'
),
)
Expand All @@ -393,12 +390,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
caption_ele.append(BeautifulSoup(lede_img_caption_html))
caption_ele.append(self.soup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)

if type(article["body"]) == str:
body_soup = BeautifulSoup(article["body"])
body_soup = self.soup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
Expand All @@ -408,7 +405,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
body_soup = BeautifulSoup()
body_soup = self.soup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
Expand Down
25 changes: 11 additions & 14 deletions recipes/bloomberg-news.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from urllib.parse import urlparse

from calibre import browser, iswindows, random_user_agent
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe

Expand Down Expand Up @@ -199,7 +198,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
return BeautifulSoup(content["iframeData"]["html"])
return self.soup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
Expand Down Expand Up @@ -310,7 +309,7 @@ def nested_render(self, content, soup, parent):
def preprocess_raw_html(self, raw_html, url):
self.download_count += 1
article = None
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
for script in soup.find_all(
"script",
attrs={
Expand Down Expand Up @@ -345,7 +344,7 @@ def preprocess_raw_html(self, raw_html, url):
self.abort_article(err_msg)

date_published = parse_date(article["publishedAt"], assume_utc=True)
soup = BeautifulSoup(
soup = self.soup(
"""<html>
<head><title></title></head>
<body>
Expand All @@ -366,9 +365,7 @@ def preprocess_raw_html(self, raw_html, url):

soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
h1_title.append(
BeautifulSoup(article.get("headlineText") or article["headline"])
)
h1_title.append(self.soup(article.get("headlineText") or article["headline"]))
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
if article.get("summaryText"):
Expand All @@ -383,15 +380,15 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
self.soup(f'<span class="author">{article["byline"]}</span>'),
)
else:
try:
post_authors = [a["name"] for a in article.get("authors", [])]
if post_authors:
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
self.soup(
f'<span class="author">{", ".join(post_authors)}</span>'
),
)
Expand All @@ -402,7 +399,7 @@ def preprocess_raw_html(self, raw_html, url):
if categories:
soup.body.article.insert(
0,
BeautifulSoup(
self.soup(
f'<span class="article-section">{" / ".join(categories)}</span>'
),
)
Expand All @@ -417,12 +414,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
caption_ele.append(BeautifulSoup(lede_img_caption_html))
caption_ele.append(self.soup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)

if type(article["body"]) == str:
body_soup = BeautifulSoup(article["body"])
body_soup = self.soup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
Expand All @@ -432,7 +429,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
body_soup = BeautifulSoup()
body_soup = self.soup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
Expand All @@ -442,7 +439,7 @@ def parse_index(self):
feed_items = {}
for feed_name, feed_url in self.feeds:
res = br.open_novisit(feed_url, timeout=self.timeout)
soup = BeautifulSoup(res.read().decode("utf-8"), "xml")
soup = self.soup(res.read().decode("utf-8"), "xml")
articles = []
cutoff_date = datetime.utcnow().replace(tzinfo=timezone.utc) - timedelta(
days=self.oldest_article
Expand Down
5 changes: 2 additions & 3 deletions recipes/fivebooks.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

_name = "Five Books"
Expand Down Expand Up @@ -94,7 +93,7 @@ def populate_article_metadata(self, article, soup, first):
article.text_summary = description_tag["data-post-description"]

def preprocess_raw_html(self, raw_html, url):
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
content = soup.find(class_="main-content")
data = self.get_ld_json(soup, lambda d: d.get("@graph", []))
if not data:
Expand All @@ -120,7 +119,7 @@ def parse_index(self):
raw_html = (
br.open_novisit(feed_url, timeout=self.timeout).read().decode("utf-8")
)
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
interviews = soup.find_all(class_="library-page")
if self.max_articles_per_feed < len(interviews):
interviews = interviews[: self.max_articles_per_feed]
Expand Down
3 changes: 1 addition & 2 deletions recipes/forbes-editors-picks.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

_name = "Forbes - Editor's Picks"
Expand Down Expand Up @@ -72,7 +71,7 @@ class ForbesEditorsPicks(BasicNewsrackRecipe, BasicNewsRecipe):
"""

def preprocess_raw_html(self, raw_html, url):
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
article = soup.find("article")
meta = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
modified_date = meta.get("dateModified") or meta.get("datePublished")
Expand Down
3 changes: 1 addition & 2 deletions recipes/foreign-affairs.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from recipes_shared import BasicNewsrackRecipe

import mechanize
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes

_name = "Foreign Affairs"
Expand Down Expand Up @@ -64,7 +63,7 @@ def get_data(data):
feeds = []

def as_article(source):
title = BeautifulSoup(source["title"][0]).get_text()
title = self.soup(source["title"][0]).get_text()
desc = ""
fs = source.get("field_subtitle")
if fs:
Expand Down
7 changes: 3 additions & 4 deletions recipes/foreign-policy-magazine.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

_name = "Foreign Policy Magazine"
Expand Down Expand Up @@ -79,7 +78,7 @@ def preprocess_raw_html(self, raw_html, url):
if not self.pub_date or date_published_gmt > self.pub_date:
self.pub_date = date_published_gmt

soup = BeautifulSoup(
soup = self.soup(
f"""<html>
<head><title>{post["title"]["rendered"]}</title></head>
<body>
Expand All @@ -96,7 +95,7 @@ def preprocess_raw_html(self, raw_html, url):
</body></html>"""
)

content = BeautifulSoup(post["content"]["rendered"])
content = self.soup(post["content"]["rendered"])
# FP doesn't use featuremedia, the first attachment is the lede image
attachment_endpoint = (
post.get("_links", {}).get("wp:attachment", [{}])[0].get("href")
Expand All @@ -111,7 +110,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
caption.append(self.soup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)

Expand Down
7 changes: 3 additions & 4 deletions recipes/foreign-policy.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import WordPressNewsrackRecipe, get_datetime_format

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

_name = "Foreign Policy"
Expand Down Expand Up @@ -76,7 +75,7 @@ def preprocess_raw_html(self, raw_html, url):
post_authors = self.extract_authors(post)
categories = self.extract_categories(post)

soup = BeautifulSoup(
soup = self.soup(
f"""<html>
<head><title>{post["title"]["rendered"]}</title></head>
<body>
Expand All @@ -93,7 +92,7 @@ def preprocess_raw_html(self, raw_html, url):
</body></html>"""
)

content = BeautifulSoup(post["content"]["rendered"])
content = self.soup(post["content"]["rendered"])
# FP doesn't use featuremedia, the first attachment is the lede image
attachment_endpoint = (
post.get("_links", {}).get("wp:attachment", [{}])[0].get("href")
Expand All @@ -108,7 +107,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
caption.append(self.soup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)

Expand Down
3 changes: 1 addition & 2 deletions recipes/ft-paper.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes

_name = "Financial Times (Print)"
Expand Down Expand Up @@ -99,7 +98,7 @@ def ft_parse_index(self, soup):
return feeds

def preprocess_raw_html(self, raw_html, url):
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
Expand Down
3 changes: 1 addition & 2 deletions recipes/ft.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicCookielessNewsrackRecipe, format_title, get_date_format

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe

_name = "Financial Times"
Expand Down Expand Up @@ -64,7 +63,7 @@ def print_version(self, url):
return urljoin("https://ft.com", url)

def preprocess_raw_html(self, raw_html, url):
soup = BeautifulSoup(raw_html)
soup = self.soup(raw_html)
article = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
if not (article and article.get("articleBody")):
err_msg = f"Unable to find article: {url}"
Expand Down
Loading

0 comments on commit 479b072

Please sign in to comment.