Skip to content

Commit

Permalink
calibre's bundled BeautifulSoup wrapper is not bs4's BeautifulSoup, and its constructor does not accept keyword arguments such as features="html.parser"
Browse files Browse the repository at this point in the history
  • Loading branch information
ping committed Oct 10, 2023
1 parent 6e50234 commit 61e1e59
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 62 deletions.
30 changes: 9 additions & 21 deletions recipes/bloomberg-businessweek.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
return BeautifulSoup(content["iframeData"]["html"], features="html.parser")
return BeautifulSoup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
Expand Down Expand Up @@ -229,9 +229,7 @@ def render_content(self, content, soup, parent):
div.append(img)
if photo.get("caption"):
caption = soup.new_tag("div", attrs={"class": "caption"})
caption.append(
BeautifulSoup(photo["caption"], features="html.parser")
)
caption.append(BeautifulSoup(photo["caption"]))
div.append(caption)
if photo.get("credit"):
credit = soup.new_tag("div", attrs={"class": "credit"})
Expand Down Expand Up @@ -345,10 +343,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
h1_title.append(
BeautifulSoup(
article.get("headlineText") or article["headline"],
features="html.parser",
)
BeautifulSoup(article.get("headlineText") or article["headline"])
)
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
Expand All @@ -364,10 +359,7 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
f'<span class="author">{article["byline"]}</span>',
features="html.parser",
),
BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
)
else:
try:
Expand All @@ -376,8 +368,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
f'<span class="author">{", ".join(post_authors)}</span>',
features="html.parser",
f'<span class="author">{", ".join(post_authors)}</span>'
),
)
except (KeyError, TypeError):
Expand All @@ -388,8 +379,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
f'<span class="article-section">{" / ".join(categories)}</span>',
features="html.parser",
f'<span class="article-section">{" / ".join(categories)}</span>'
),
)
# inject lede image
Expand All @@ -403,14 +393,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
caption_ele.append(
BeautifulSoup(lede_img_caption_html, features="html.parser")
)
caption_ele.append(BeautifulSoup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)

if type(article["body"]) == str:
body_soup = BeautifulSoup(article["body"], features="html.parser")
body_soup = BeautifulSoup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
Expand All @@ -420,7 +408,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
body_soup = BeautifulSoup(features="html.parser")
body_soup = BeautifulSoup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
Expand Down
26 changes: 8 additions & 18 deletions recipes/bloomberg-news.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def render_content(self, content, soup, parent):
if content_type == "aside":
return soup.new_tag("blockquote")
if content_type == "embed" and content.get("iframeData", {}).get("html"):
return BeautifulSoup(content["iframeData"]["html"], features="html.parser")
return BeautifulSoup(content["iframeData"]["html"])
if content_type == "link" and content.get("data", {}).get(
"destination", {}
).get("web"):
Expand Down Expand Up @@ -367,10 +367,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.head.title.append(article.get("headlineText") or article["headline"])
h1_title = soup.find("h1")
h1_title.append(
BeautifulSoup(
article.get("headlineText") or article["headline"],
features="html.parser",
)
BeautifulSoup(article.get("headlineText") or article["headline"])
)
if article.get("summaryText") or article.get("abstract"):
sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"})
Expand All @@ -386,10 +383,7 @@ def preprocess_raw_html(self, raw_html, url):
if article.get("byline"):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
f'<span class="author">{article["byline"]}</span>',
features="html.parser",
),
BeautifulSoup(f'<span class="author">{article["byline"]}</span>'),
)
else:
try:
Expand All @@ -398,8 +392,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.find(class_="article-meta").insert(
0,
BeautifulSoup(
f'<span class="author">{", ".join(post_authors)}</span>',
features="html.parser",
f'<span class="author">{", ".join(post_authors)}</span>'
),
)
except (KeyError, TypeError):
Expand All @@ -410,8 +403,7 @@ def preprocess_raw_html(self, raw_html, url):
soup.body.article.insert(
0,
BeautifulSoup(
f'<span class="article-section">{" / ".join(categories)}</span>',
features="html.parser",
f'<span class="article-section">{" / ".join(categories)}</span>'
),
)
# inject lede image
Expand All @@ -425,14 +417,12 @@ def preprocess_raw_html(self, raw_html, url):
caption_ele = soup.new_tag(
"div", attrs={"class": "news-figure-caption-text"}
)
caption_ele.append(
BeautifulSoup(lede_img_caption_html), features="html.parser"
)
caption_ele.append(BeautifulSoup(lede_img_caption_html))
img_container.append(caption_ele)
soup.body.article.append(img_container)

if type(article["body"]) == str:
body_soup = BeautifulSoup(article["body"], features="html.parser")
body_soup = BeautifulSoup(article["body"])
for img_div in body_soup.find_all(
name="figure", attrs={"data-type": "image"}
):
Expand All @@ -442,7 +432,7 @@ def preprocess_raw_html(self, raw_html, url):
img["src"] = img["src"]
soup.body.article.append(body_soup)
else:
body_soup = BeautifulSoup(features="html.parser")
body_soup = BeautifulSoup()
self.nested_render(article["body"], body_soup, body_soup)
soup.body.article.append(body_soup)
return str(soup)
Expand Down
6 changes: 1 addition & 5 deletions recipes/foreign-policy-magazine.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
caption.append(
BeautifulSoup(
attachment["caption"]["rendered"], features="html.parser"
)
)
caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)

Expand Down
6 changes: 1 addition & 5 deletions recipes/foreign-policy.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,7 @@ def preprocess_raw_html(self, raw_html, url):
lede.append(img)
if attachment.get("caption", {}).get("rendered"):
caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
caption.append(
BeautifulSoup(
attachment["caption"]["rendered"], features="html.parser"
)
)
caption.append(BeautifulSoup(attachment["caption"]["rendered"]))
lede.append(caption)
soup.body.article.append(lede)

Expand Down
2 changes: 1 addition & 1 deletion recipes/new-republic-magazine.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def preprocess_raw_html(self, raw_html, url):
<img src="{lede_img_url}">{lede_image_caption}
</p>"""

body_soup = BeautifulSoup(article["body"], features="html.parser")
body_soup = BeautifulSoup(article["body"])
for img in body_soup.find_all("img", attrs={"data-serialized": True}):
try:
img_info = json.loads(img["data-serialized"])
Expand Down
17 changes: 6 additions & 11 deletions recipes/wapo-paper.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
continue
if node_type == "text":
para_ele = soup.new_tag("p")
para_ele.append(BeautifulSoup(c["content"], features="html.parser"))
para_ele.append(BeautifulSoup(c["content"]))
parent_element.append(para_ele)
elif node_type == "image":
figure_ele = soup.new_tag("figure", attrs={"class": "figure"})
Expand All @@ -99,7 +99,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
parent_element.append(container_ele)
elif node_type == "header":
header_ele = soup.new_tag(f'h{c["level"]}')
header_ele.append(BeautifulSoup(c["content"], features="html.parser"))
header_ele.append(BeautifulSoup(c["content"]))
parent_element.append(header_ele)
elif node_type == "correction":
para_ele = soup.new_tag("p", attrs={"class": "correction"})
Expand Down Expand Up @@ -133,14 +133,12 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
) or c.get("header")
if header_string:
header_ele = soup.new_tag("h3")
header_ele.append(
BeautifulSoup(header_string, features="html.parser")
)
header_ele.append(BeautifulSoup(header_string))
container_ele.append(header_ele)
ol_ele = soup.new_tag("ol")
for i in c.get("items", []):
li_ele = soup.new_tag("li")
li_ele.append(BeautifulSoup(i["content"], features="html.parser"))
li_ele.append(BeautifulSoup(i["content"]))
ol_ele.append(li_ele)
container_ele.append(ol_ele)
parent_element.append(container_ele)
Expand All @@ -154,9 +152,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):

header_ele = soup.new_tag("h3")
header_ele.append(
BeautifulSoup(
c.get("headlines", {}).get("basic", ""), features="html.parser"
)
BeautifulSoup(c.get("headlines", {}).get("basic", ""))
)
container_ele.append(header_ele)

Expand All @@ -166,8 +162,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
f"""<div class="article-meta">
<span class="author"></span>
<span class="published-dt">{post_date:{get_datetime_format()}}</span>
</div>""",
features="html.parser",
</div>"""
)
authors = [a["name"] for a in c.get("credits", {}).get("by", [])]
meta_ele.find("span", class_="author").append(", ".join(authors))
Expand Down
2 changes: 1 addition & 1 deletion recipes/wapo.recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url):
parent_element.append(container_ele)
elif node_type == "header":
header_ele = soup.new_tag(f'h{c["level"]}')
header_ele.append(BeautifulSoup(c["content"], features="html.parser"))
header_ele.append(BeautifulSoup(c["content"]))
parent_element.append(header_ele)
elif node_type == "correction":
para_ele = soup.new_tag("p", attrs={"class": "correction"})
Expand Down

0 comments on commit 61e1e59

Please sign in to comment.