diff --git a/recipes/bloomberg-businessweek.recipe.py b/recipes/bloomberg-businessweek.recipe.py index 3d64cbac..8c798f61 100644 --- a/recipes/bloomberg-businessweek.recipe.py +++ b/recipes/bloomberg-businessweek.recipe.py @@ -176,7 +176,7 @@ def render_content(self, content, soup, parent): if content_type == "aside": return soup.new_tag("blockquote") if content_type == "embed" and content.get("iframeData", {}).get("html"): - return BeautifulSoup(content["iframeData"]["html"], features="html.parser") + return BeautifulSoup(content["iframeData"]["html"]) if content_type == "link" and content.get("data", {}).get( "destination", {} ).get("web"): @@ -229,9 +229,7 @@ def render_content(self, content, soup, parent): div.append(img) if photo.get("caption"): caption = soup.new_tag("div", attrs={"class": "caption"}) - caption.append( - BeautifulSoup(photo["caption"], features="html.parser") - ) + caption.append(BeautifulSoup(photo["caption"])) div.append(caption) if photo.get("credit"): credit = soup.new_tag("div", attrs={"class": "credit"}) @@ -345,10 +343,7 @@ def preprocess_raw_html(self, raw_html, url): soup.head.title.append(article.get("headlineText") or article["headline"]) h1_title = soup.find("h1") h1_title.append( - BeautifulSoup( - article.get("headlineText") or article["headline"], - features="html.parser", - ) + BeautifulSoup(article.get("headlineText") or article["headline"]) ) if article.get("summaryText") or article.get("abstract"): sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"}) @@ -364,10 +359,7 @@ def preprocess_raw_html(self, raw_html, url): if article.get("byline"): soup.find(class_="article-meta").insert( 0, - BeautifulSoup( - f'{article["byline"]}', - features="html.parser", - ), + BeautifulSoup(f'{article["byline"]}'), ) else: try: @@ -376,8 +368,7 @@ def preprocess_raw_html(self, raw_html, url): soup.find(class_="article-meta").insert( 0, BeautifulSoup( - f'{", ".join(post_authors)}', - features="html.parser", + f'{", ".join(post_authors)}' ), ) except (KeyError, TypeError): @@ -388,8 +379,7 @@ def preprocess_raw_html(self, raw_html, url): soup.body.article.insert( 0, BeautifulSoup( - f'{" / ".join(categories)}', - features="html.parser", + f'{" / ".join(categories)}' ), ) # inject lede image @@ -403,14 +393,12 @@ def preprocess_raw_html(self, raw_html, url): caption_ele = soup.new_tag( "div", attrs={"class": "news-figure-caption-text"} ) - caption_ele.append( - BeautifulSoup(lede_img_caption_html, features="html.parser") - ) + caption_ele.append(BeautifulSoup(lede_img_caption_html)) img_container.append(caption_ele) soup.body.article.append(img_container) if type(article["body"]) == str: - body_soup = BeautifulSoup(article["body"], features="html.parser") + body_soup = BeautifulSoup(article["body"]) for img_div in body_soup.find_all( name="figure", attrs={"data-type": "image"} ): @@ -420,7 +408,7 @@ def preprocess_raw_html(self, raw_html, url): img["src"] = img["src"] soup.body.article.append(body_soup) else: - body_soup = BeautifulSoup(features="html.parser") + body_soup = BeautifulSoup() self.nested_render(article["body"], body_soup, body_soup) soup.body.article.append(body_soup) return str(soup) diff --git a/recipes/bloomberg-news.recipe.py b/recipes/bloomberg-news.recipe.py index 0121f948..db5700a3 100644 --- a/recipes/bloomberg-news.recipe.py +++ b/recipes/bloomberg-news.recipe.py @@ -199,7 +199,7 @@ def render_content(self, content, soup, parent): if content_type == "aside": return soup.new_tag("blockquote") if content_type == "embed" and content.get("iframeData", {}).get("html"): - return BeautifulSoup(content["iframeData"]["html"], features="html.parser") + return BeautifulSoup(content["iframeData"]["html"]) if content_type == "link" and content.get("data", {}).get( "destination", {} ).get("web"): @@ -367,10 +367,7 @@ def preprocess_raw_html(self, raw_html, url): soup.head.title.append(article.get("headlineText") or article["headline"]) h1_title = soup.find("h1") h1_title.append( - BeautifulSoup( - article.get("headlineText") or article["headline"], - features="html.parser", - ) + BeautifulSoup(article.get("headlineText") or article["headline"]) ) if article.get("summaryText") or article.get("abstract"): sub_headline = soup.new_tag("div", attrs={"class": "sub-headline"}) @@ -386,10 +383,7 @@ def preprocess_raw_html(self, raw_html, url): if article.get("byline"): soup.find(class_="article-meta").insert( 0, - BeautifulSoup( - f'{article["byline"]}', - features="html.parser", - ), + BeautifulSoup(f'{article["byline"]}'), ) else: try: @@ -398,8 +392,7 @@ def preprocess_raw_html(self, raw_html, url): soup.find(class_="article-meta").insert( 0, BeautifulSoup( - f'{", ".join(post_authors)}', - features="html.parser", + f'{", ".join(post_authors)}' ), ) except (KeyError, TypeError): @@ -410,8 +403,7 @@ def preprocess_raw_html(self, raw_html, url): soup.body.article.insert( 0, BeautifulSoup( - f'{" / ".join(categories)}', - features="html.parser", + f'{" / ".join(categories)}' ), ) # inject lede image @@ -425,14 +417,12 @@ def preprocess_raw_html(self, raw_html, url): caption_ele = soup.new_tag( "div", attrs={"class": "news-figure-caption-text"} ) - caption_ele.append( - BeautifulSoup(lede_img_caption_html), features="html.parser" - ) + caption_ele.append(BeautifulSoup(lede_img_caption_html)) img_container.append(caption_ele) soup.body.article.append(img_container) if type(article["body"]) == str: - body_soup = BeautifulSoup(article["body"], features="html.parser") + body_soup = BeautifulSoup(article["body"]) for img_div in body_soup.find_all( name="figure", attrs={"data-type": "image"} ): @@ -442,7 +432,7 @@ def preprocess_raw_html(self, raw_html, url): img["src"] = img["src"] soup.body.article.append(body_soup) else: - body_soup = BeautifulSoup(features="html.parser") + body_soup = BeautifulSoup() self.nested_render(article["body"], body_soup, body_soup) soup.body.article.append(body_soup) return str(soup) diff --git a/recipes/foreign-policy-magazine.recipe.py b/recipes/foreign-policy-magazine.recipe.py index 32f3f491..7e74527b 100644 --- a/recipes/foreign-policy-magazine.recipe.py +++ b/recipes/foreign-policy-magazine.recipe.py @@ -111,11 +111,7 @@ def preprocess_raw_html(self, raw_html, url): lede.append(img) if attachment.get("caption", {}).get("rendered"): caption = soup.new_tag("div", attrs={"class": "wp-caption-text"}) - caption.append( - BeautifulSoup( - attachment["caption"]["rendered"], features="html.parser" - ) - ) + caption.append(BeautifulSoup(attachment["caption"]["rendered"])) lede.append(caption) soup.body.article.append(lede) diff --git a/recipes/foreign-policy.recipe.py b/recipes/foreign-policy.recipe.py index db4f4b1f..11089a37 100644 --- a/recipes/foreign-policy.recipe.py +++ b/recipes/foreign-policy.recipe.py @@ -108,11 +108,7 @@ def preprocess_raw_html(self, raw_html, url): lede.append(img) if attachment.get("caption", {}).get("rendered"): caption = soup.new_tag("div", attrs={"class": "wp-caption-text"}) - caption.append( - BeautifulSoup( - attachment["caption"]["rendered"], features="html.parser" - ) - ) + caption.append(BeautifulSoup(attachment["caption"]["rendered"])) lede.append(caption) soup.body.article.append(lede) diff --git a/recipes/new-republic-magazine.recipe.py b/recipes/new-republic-magazine.recipe.py index c2010a97..4052fa55 100644 --- a/recipes/new-republic-magazine.recipe.py +++ b/recipes/new-republic-magazine.recipe.py @@ -233,7 +233,7 @@ def preprocess_raw_html(self, raw_html, url): {lede_image_caption}

""" - body_soup = BeautifulSoup(article["body"], features="html.parser") + body_soup = BeautifulSoup(article["body"]) for img in body_soup.find_all("img", attrs={"data-serialized": True}): try: img_info = json.loads(img["data-serialized"]) diff --git a/recipes/wapo-paper.recipe.py b/recipes/wapo-paper.recipe.py index 88fa9567..6ea6a472 100644 --- a/recipes/wapo-paper.recipe.py +++ b/recipes/wapo-paper.recipe.py @@ -73,7 +73,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): continue if node_type == "text": para_ele = soup.new_tag("p") - para_ele.append(BeautifulSoup(c["content"], features="html.parser")) + para_ele.append(BeautifulSoup(c["content"])) parent_element.append(para_ele) elif node_type == "image": figure_ele = soup.new_tag("figure", attrs={"class": "figure"}) @@ -99,7 +99,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): parent_element.append(container_ele) elif node_type == "header": header_ele = soup.new_tag(f'h{c["level"]}') - header_ele.append(BeautifulSoup(c["content"], features="html.parser")) + header_ele.append(BeautifulSoup(c["content"])) parent_element.append(header_ele) elif node_type == "correction": para_ele = soup.new_tag("p", attrs={"class": "correction"}) @@ -133,14 +133,12 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): ) or c.get("header") if header_string: header_ele = soup.new_tag("h3") - header_ele.append( - BeautifulSoup(header_string, features="html.parser") - ) + header_ele.append(BeautifulSoup(header_string)) container_ele.append(header_ele) ol_ele = soup.new_tag("ol") for i in c.get("items", []): li_ele = soup.new_tag("li") - li_ele.append(BeautifulSoup(i["content"], features="html.parser")) + li_ele.append(BeautifulSoup(i["content"])) ol_ele.append(li_ele) container_ele.append(ol_ele) parent_element.append(container_ele) @@ -154,9 +152,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): header_ele = soup.new_tag("h3") header_ele.append( - BeautifulSoup( - c.get("headlines", {}).get("basic", ""), features="html.parser" - ) + BeautifulSoup(c.get("headlines", {}).get("basic", "")) ) container_ele.append(header_ele) @@ -166,8 +162,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): f"""
{post_date:{get_datetime_format()}} -
""", - features="html.parser", + """ ) authors = [a["name"] for a in c.get("credits", {}).get("by", [])] meta_ele.find("span", class_="author").append(", ".join(authors)) diff --git a/recipes/wapo.recipe.py b/recipes/wapo.recipe.py index 6448b477..ef4d3120 100644 --- a/recipes/wapo.recipe.py +++ b/recipes/wapo.recipe.py @@ -108,7 +108,7 @@ def _extract_child_nodes(self, nodes, parent_element, soup, url): parent_element.append(container_ele) elif node_type == "header": header_ele = soup.new_tag(f'h{c["level"]}') - header_ele.append(BeautifulSoup(c["content"], features="html.parser")) + header_ele.append(BeautifulSoup(c["content"])) parent_element.append(header_ele) elif node_type == "correction": para_ele = soup.new_tag("p", attrs={"class": "correction"})