forked from ping/newsrack
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtaipei-times.recipe.py
65 lines (51 loc) · 2.36 KB
/
taipei-times.recipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Copyright (c) 2022 https://github.com/ping/
#
# This software is released under the GNU General Public License v3.0
# https://opensource.org/licenses/GPL-3.0
import os
import sys
from datetime import timezone, timedelta
# custom include to share code between recipes
sys.path.append(os.environ["recipes_includes"])
from recipes_shared import BasicNewsrackRecipe, format_title
from calibre.web.feeds.news import BasicNewsRecipe
# Publication name: used as the recipe title, as the feed label, and as the
# first argument to format_title() when the dated title is built.
_name = "Taipei Times"
class TaipeiTimes(BasicNewsrackRecipe, BasicNewsRecipe):
    """Recipe that builds a Taipei Times issue from the site's index RSS feed."""

    # Identity and presentation metadata.
    title = _name
    language = "en"
    __author__ = "ping"
    publication_type = "newspaper"
    description = "News from the Taipei Times https://www.taipeitimes.com/"
    masthead_url = "https://www.taipeitimes.com/assets/images/logo.gif"

    # Feed source and article selection limits.
    feeds = [(_name, "https://www.taipeitimes.com/xml/index.rss")]
    oldest_article = 1  # days
    max_articles_per_feed = 50
    ignore_duplicate_articles = {"title", "url"}

    # Markup filters: match <div class="archives"> and the listed ad/share
    # classes (see the calibre recipe docs for how these specs are applied).
    keep_only_tags = [{"name": "div", "class_": "archives"}]
    remove_tags = [{"attrs": {"class": ["ad_mg_t", "ad_mg_b", "sh"]}}]

    # The stylesheet literal is kept flush-left so its bytes stay unchanged.
    extra_css = """
.archives h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
.archives h2 { font-size: 1.2rem; margin-bottom: 0.5rem; font-weight: normal; font-style: italic; }
p.byline { font-weight: bold; color: #444; display: block; margin-top: 1rem; }
.imgboxa img { max-width: 100%; height: auto; }
.imgboxa p { font-size: 0.8rem; margin-top: 0.2rem; display: inline-block; font-weight: normal; }
"""

    def populate_article_metadata(self, article, _, __):
        """Remember the newest article timestamp and retitle the issue from it.

        ``article.utctime`` is compared against ``self.pub_date``; when no
        date is recorded yet, or this article is newer, the date is stored and
        ``self.title`` is rebuilt. The second and third arguments are unused.
        """
        if self.pub_date and article.utctime <= self.pub_date:
            # An equal or newer date is already recorded; nothing to update.
            return

        self.pub_date = article.utctime
        # The title is rendered at UTC+8 (presumably Taiwan local time).
        utc_plus_8 = timezone(timedelta(hours=8))
        self.title = format_title(_name, article.utctime.astimezone(utc_plus_8))

    def preprocess_raw_html(self, raw_html, _):
        """Tidy the byline and image captions of a raw article page.

        Parses ``raw_html`` with ``self.soup``, rewrites the tree in place and
        returns it serialised with ``str()``. The second argument is unused.
        """
        document = self.soup(raw_html)

        # Swap the byline <ul class="as"> for the element inside it that
        # carries the author name, re-classed so the p.byline rule can match.
        byline_list = document.select_one("ul.as")
        author = byline_list.find(attrs={"class": "name"}) if byline_list else None
        if author:
            author["class"] = "byline"
            byline_list.replace_with(author)

        # The site marks image captions up as <h1>; turn them into <p>.
        for caption in document.select(".imgboxa h1"):
            caption.name = "p"

        return str(document)