diff --git a/Makefile b/Makefile index d75ff30..134341c 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ run: poetry run python -m narrator test: + poetry run python -m pytest -m "not slow" --verbosity=2 --showlocals --log-level=DEBUG + +test-all: poetry run python -m pytest --verbosity=2 --showlocals --log-level=DEBUG \ No newline at end of file diff --git a/config.yml b/config.yml index a96b757..afaa438 100644 --- a/config.yml +++ b/config.yml @@ -9,3 +9,19 @@ utils: path: c:\bin\balcon.exe ffmpeg: path: c:\bin\ffmpeg.exe + blb2txt: + path: c:\bin\blb2txt.exe + +to_txt: + max_input_size: 5242880 # 5 * 2 ^ 20 == 5 MiB + +url_parser: + sites: + habr: + url_re: https\://habr\.com/..(/company/[-\w]+/blog|/post)/\d+ + lang: ru + re: + title: main h1[data-test-id="articleTitle"] span + author: main a.tm-user-info__username + publication_date: main div.tm-article-snippet__meta time + text: "#post-content-body" \ No newline at end of file diff --git a/narrator/__main__.py b/narrator/__main__.py index 93f09b5..5e13077 100644 --- a/narrator/__main__.py +++ b/narrator/__main__.py @@ -1,6 +1,6 @@ import asyncio -from narrator.bot import dispatcher, narrator_bot +from narrator.telegram.bot import dispatcher, narrator_bot async def main(): diff --git a/narrator/article.py b/narrator/article.py deleted file mode 100644 index a86cd81..0000000 --- a/narrator/article.py +++ /dev/null @@ -1,41 +0,0 @@ -import os - -from narrator.sub_utils import add_suffix, make_filename - - -class Article: - def __init__(self): - self.title: str = "" - self.meta = {} - self.text: str = "" - - def __setitem__(self, key, val): - self.meta[key] = val - - def __getitem__(self, key): - return self.meta[key] - - def __repr__(self) -> str: - text = self.text[:20] if self.text else "" - return f'Article(text="{text}")' - - def save_to_txt(self, dir_path: str) -> str: - file_name = self.title or self.text[:20] - file_name = make_filename(file_name) - file_name = add_suffix(file_name, ".txt") - - 
file_path = os.path.join(dir_path, file_name) - - text = str(self) - - with open(file_path, "wb") as txt: - txt.write(text.encode("utf-8-sig")) - - return file_path - - def __str__(self) -> str: - meta_text = [] - for k, v in self.meta.items(): - meta_text.append(f"{k}:\n {v}") - meta_text = "\n".join(meta_text) - return f"\n{meta_text}\n\n{self.text}\n" diff --git a/narrator/bot.py b/narrator/bot.py deleted file mode 100644 index 2967807..0000000 --- a/narrator/bot.py +++ /dev/null @@ -1,92 +0,0 @@ -# from asyncio.log import logger -import logging -import tempfile -import os -from typing import Callable, Dict, Any, Awaitable - -from aiogram import Dispatcher, BaseMiddleware, Bot -from aiogram.types import ContentType, Message, FSInputFile, TelegramObject -from aiogram.fsm.storage.memory import MemoryStorage -from aiogram.filters import Command - -from .reader import url as r_url -from .reader import file as r_file -from .speaker import Speaker -import narrator.config as conf - -logger = logging.getLogger(__name__) - - -narrator_bot = Bot(token=conf.bot.token) -mem_storage = MemoryStorage() -dispatcher = Dispatcher(storage=mem_storage) - - -@dispatcher.message(Command(commands=["about"])) -async def cmd_test(message: Message): - await message.answer("🤖: ...") - - -@dispatcher.message(content_types=ContentType.DOCUMENT) -async def take_document(message: Message): - logger.info( - f'{message.from_user.id}: file(id={message.document.file_id}, name="{message.document.file_name}",) {message.text}' - ) - if not r_file.is_readable_ext(message.document.file_name): - await message.answer( - f"🤖: No reader matches the file: {message.document.file_name}" - ) - return - with tempfile.TemporaryDirectory() as temp_dir_path: - file_path = os.path.join(temp_dir_path, message.document.file_name) - await narrator_bot.download(message.document, file_path) - path_to_audio = Speaker.narrate_from_txt_to_file(file_path) - tf = FSInputFile(path_to_audio) - await 
message.answer_document(tf) - - -@dispatcher.message(content_types=ContentType.TEXT) -async def take_text(message: Message): - if r_url.is_url(message.text): - article = r_url.read_text(message.text) - with tempfile.TemporaryDirectory() as temp_dir_path: - file_path = article.save_to_txt(temp_dir_path) - path_to_audio = Speaker.narrate_from_txt_to_file(file_path) - tf = FSInputFile(path_to_audio) - await message.answer_document(tf) - else: - await take_else(message) - - -@dispatcher.message(content_types=ContentType.ANY) -async def take_else(message: Message): - user = message.from_user.id - - logger.info(f"{user}: {message.text}") - await message.answer("🤖: Please, send me a file or a link") - - -############################################################ - - -class Authorizer(BaseMiddleware): - WHITELIST: list[str] = conf.bot.allowed_usernames - - async def __call__( - self, - handler: Callable[[TelegramObject, Dict[str, Any]], Awaitable[Any]], - event: TelegramObject, - data: Dict[str, Any], - ) -> Any: - if event.from_user.username in Authorizer.WHITELIST: - return await handler(event, data) - - -dispatcher.message.middleware(Authorizer()) - - -############################################################ - -narrator_bot = Bot(token=conf.bot.token) - -__all__ = [dispatcher, narrator_bot] diff --git a/narrator/exceptions.py b/narrator/exceptions.py new file mode 100644 index 0000000..2bf8f84 --- /dev/null +++ b/narrator/exceptions.py @@ -0,0 +1,20 @@ +class NarratorException(Exception): + """ + Base class for any Narrator exception + """ + + +class UrlParserException(NarratorException): + pass + + +class UrlInvalid(UrlParserException): + pass + + +class UrlUnreachable(UrlParserException): + pass + + +class TxtTransformerException(NarratorException): + pass diff --git a/narrator/reader/__init__.py b/narrator/reader/__init__.py deleted file mode 100644 index b081291..0000000 --- a/narrator/reader/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Tools to convert 
input to txt, suitable for reading -""" -from . import file, url - - -def from_file(obj) -> str: - return file.read_text(obj) - - -def from_url(obj) -> str: - return url.read_text(obj) - - -__all__ = [from_file, from_url] diff --git a/narrator/reader/base_reader.py b/narrator/reader/base_reader.py deleted file mode 100644 index 052a093..0000000 --- a/narrator/reader/base_reader.py +++ /dev/null @@ -1,11 +0,0 @@ -from narrator.article import Article - - -class BaseReader: - @staticmethod - def read_text(obj) -> Article: - raise NotImplemented() - - @staticmethod - def is_readable(obj) -> bool: - raise NotImplemented() diff --git a/narrator/reader/file/__init__.py b/narrator/reader/file/__init__.py deleted file mode 100644 index 130dee2..0000000 --- a/narrator/reader/file/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from aiogram.types import Document - -from narrator.article import Article -from .txt_reader import TxtReader -from .base_file_reader import BaseFileReader - -FILE_READERS = [TxtReader] - - -def read_text(file_path: str) -> Article: - for Reader in FILE_READERS: - if Reader.is_readable(file_path): - return Reader.read_text(file_path) - return f"No reader matches the file: {file_path}" - - -def is_readable_ext(file_name: str) -> bool: - for Reader in FILE_READERS: - if Reader.is_readable(file_name): - return True - return False - - -__all__ = [read_text, is_readable_ext] diff --git a/narrator/reader/file/base_file_reader.py b/narrator/reader/file/base_file_reader.py deleted file mode 100644 index 6072fb8..0000000 --- a/narrator/reader/file/base_file_reader.py +++ /dev/null @@ -1,13 +0,0 @@ -from aiogram.types import Document - -from ..base_reader import BaseReader - - -class BaseFileReader(BaseReader): - @staticmethod - def read_text(obj: str) -> str: - raise NotImplemented() - - @staticmethod - def is_readable(obj: str) -> bool: - raise NotImplemented() diff --git a/narrator/reader/file/txt_reader.py b/narrator/reader/file/txt_reader.py deleted file mode 
100644 index 4a4fe11..0000000 --- a/narrator/reader/file/txt_reader.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -from aiogram.types import Document - -from narrator.article import Article -from .base_file_reader import BaseFileReader - - -class TxtReader(BaseFileReader): - @staticmethod - def read_text(obj: str) -> Article: - article = Article() - article["file name"] = obj # os.path.splitext(obj)[0] - - return article - - @staticmethod - def is_readable(obj: str) -> bool: - return os.path.splitext(obj)[1] == ".txt" diff --git a/narrator/reader/url/__init__.py b/narrator/reader/url/__init__.py deleted file mode 100644 index 7251949..0000000 --- a/narrator/reader/url/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -import urllib.request as ur - -from narrator.article import Article - -from .habr_reader import HabrReader -from .base_url_reader import BaseUrlReader - -URL_READERS = [HabrReader] - - -def read_text(obj: str) -> Article: - for Reader in URL_READERS: - if Reader.is_readable(obj): - return Reader.read_text(obj) - return f"No reader matches the url: {obj}" - - -def is_url(obj: str) -> bool: - return BaseUrlReader.is_readable(obj) - - -__all__ = [read_text, is_url] diff --git a/narrator/reader/url/base_url_reader.py b/narrator/reader/url/base_url_reader.py deleted file mode 100644 index fd9c274..0000000 --- a/narrator/reader/url/base_url_reader.py +++ /dev/null @@ -1,37 +0,0 @@ -import re -from urllib.request import urlopen -from urllib.parse import urlparse - -from narrator.article import Article -from ..base_reader import BaseReader - -# https://stackoverflow.com/a/38020041/2493536 -def _uri_validator(x): - try: - result = urlparse(x) - return all([result.scheme, result.netloc]) - except: - return False - - -class BaseUrlReader(BaseReader): - @staticmethod - def read_text(obj: str) -> Article: - raise NotImplemented() - - @staticmethod - def is_readable(obj: str) -> bool: - return _uri_validator(obj) - - @staticmethod - def _re_check_url(re_temp: str, url: 
str) -> bool: - if re.match(re_temp, url): - return True - else: - return False - - @staticmethod - def get_html(url: str) -> str: - with urlopen(url) as resp: - html = resp.read().decode() - return html diff --git a/narrator/reader/url/habr_reader.py b/narrator/reader/url/habr_reader.py deleted file mode 100644 index 2a8d387..0000000 --- a/narrator/reader/url/habr_reader.py +++ /dev/null @@ -1,31 +0,0 @@ -from bs4 import BeautifulSoup - -from .base_url_reader import BaseUrlReader -from narrator.article import Article - - -class HabrReader(BaseUrlReader): - @staticmethod - def read_text(obj: str) -> Article: - html = BaseUrlReader.get_html(obj) - soup = BeautifulSoup(html, "html.parser") - article = Article() - article.title = soup.select_one( - "main h1[data-test-id='articleTitle'] span" - ).text - article["author"] = soup.select_one( - "main a.tm-user-info__username" - ).text.strip() - article["date"] = soup.select_one("main div.tm-article-snippet__meta time").get( - "title", "" - ) - article.text = soup.select_one("#post-content-body").text - - return article - - @staticmethod - def is_readable(obj: str) -> bool: - re_temp = r"https\://habr\.com/..(/company/[-\w]+/blog|/post)/\d+" - return BaseUrlReader._re_check_url(re_temp, obj) and BaseUrlReader.is_readable( - obj - ) diff --git a/narrator/speaker/__init__.py b/narrator/speaker/__init__.py deleted file mode 100644 index fc71b49..0000000 --- a/narrator/speaker/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .speaker import Speaker - -__all__ = [Speaker] diff --git a/narrator/speaker/speaker.py b/narrator/speaker/speaker.py deleted file mode 100644 index fb62e96..0000000 --- a/narrator/speaker/speaker.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - -from ..article import Article - -import narrator.sub_utils as utils - - -class Speaker: - @staticmethod - def narrate_from_txt_to_file(path_to_txt_file: str): - - # balcon.exe only works with UTF-8-BOM (or "utf-8-sig") - text = open(path_to_txt_file, 
def blb2txt(from_file: str):
    """Run the external blb2txt converter on ``from_file``.

    The output path is whatever ``add_suffix(from_file, ".txt")`` returns;
    the converter is asked to write UTF-8 text there. The exit status of
    the tool is not inspected.

    Args:
        from_file: path to the document that should be converted.

    Returns:
        The path the converter was asked to write the text to.
    """
    # Equivalent command line: .\blb2txt -f tst.docx -out txt.txt -e utf8
    txt_path = add_suffix(from_file, ".txt")

    command = [
        conf.utils.blb2txt.path,
        "-f",
        from_file,
        "-out",
        txt_path,
        "-e",
        "utf8",
    ]
    subprocess.run(command)

    return txt_path
@dispatcher.message(content_types=ContentType.ANY)
async def take_else(message: Message):
    """Narrate the url or document found in ``message`` and reply with the audio.

    Text messages are handled by ``UrlWorker`` and documents by
    ``DocWorker``; any other content type gets a short hint. The worker
    first validates its input; on success the audio file is produced in a
    temporary directory and sent back as a document. Failures are logged
    and reported to the user instead of propagating.
    """
    # The annotation is "a subclass of BaseWorker", not an instance of it.
    worker_cls: type[BaseWorker]
    obj_name: str

    if message.content_type == ContentType.TEXT:
        worker_cls, obj_name = UrlWorker, "url"
    elif message.content_type == ContentType.DOCUMENT:
        worker_cls, obj_name = DocWorker, "file"
    else:
        await message.answer("🤖: Please, send me a file or a url")
        return

    worker = worker_cls(narrator_bot, message)
    # NOTE(review): UrlWorker.check_validity opens the url synchronously, so
    # this call blocks the event loop while the page is being fetched.
    is_valid, description, user_description = worker.check_validity()
    if not is_valid:
        logger.info(description)
        await message.answer("🤖: " + user_description)
        return

    try:
        # The audio file lives inside the temporary directory, so it has to
        # be sent before the ``with`` block is left.
        with tempfile.TemporaryDirectory() as temp_dir_path:
            path_to_audio = await worker.produce_audio_file(temp_dir_path)
            await message.answer_document(FSInputFile(path_to_audio))
    except NarratorException:
        # Expected, domain-level failure.
        logger.warning("Failed to process %s", obj_name, exc_info=True)
        await message.answer(
            f"🤖: Something went wrong: I failed to process the {obj_name}"
        )
    except Exception:
        # Top-level boundary: anything unexpected is logged with traceback.
        logger.error("Failed to process %s", obj_name, exc_info=True)
        await message.answer(
            f"🤖: Something went terribly wrong: I failed to process the {obj_name}"
        )
class BaseWorker:
    """Base class of the workers that turn a Telegram message into audio.

    A worker is created per message, asked whether its input is usable
    (``check_validity``) and then asked to build the audio file
    (``produce_audio_file``). Both methods must be overridden.
    """

    def __init__(self, bot: Bot, message: Message) -> None:
        """Remember the bot and the message the worker operates on."""
        self._bot = bot
        self._message = message

    def check_validity(self) -> ValidityCheckResult:
        """Tell whether the message can be processed.

        Raises:
            NotImplementedError: always; subclasses provide the check.
        """
        # ``NotImplemented`` is a constant for binary operators and is not
        # callable; the exception to raise here is NotImplementedError.
        raise NotImplementedError

    async def produce_audio_file(self, directory: str) -> str:
        """Create the audio file inside ``directory`` and return its path.

        Raises:
            NotImplementedError: always; subclasses provide the conversion.
        """
        raise NotImplementedError
class DocWorker(BaseWorker):
    """Worker that narrates a document attached to a Telegram message."""

    # UTF-8 byte order mark.
    _UTF8_BOM = b"\xef\xbb\xbf"

    @staticmethod
    def _bytes_to_mib_str(bytes: int, ndigits: int = 2):
        """Format a size in bytes as MiB with ``ndigits`` decimal places.

        Args:
            bytes: size in bytes.
            ndigits: number of digits after the decimal point (>= 0).

        Returns:
            The size in MiB as a string, e.g. ``"0.88"``.

        Raises:
            ValueError: if ``ndigits`` is negative.
        """
        if ndigits < 0:
            raise ValueError(f"ndigits must be non-negative, got {ndigits}")
        mib = round(bytes / 2**20, ndigits)
        return f"{mib:.{ndigits}f}"

    @staticmethod
    def _ensure_utf8_bom(txt_path: str) -> None:
        """Prepend the UTF-8 BOM to the file unless it already starts with it."""
        with open(txt_path, "rb") as src:
            data = src.read()
        if not data.startswith(DocWorker._UTF8_BOM):
            with open(txt_path, "wb") as dst:
                dst.write(DocWorker._UTF8_BOM + data)

    def __init__(self, bot: Bot, message: Message) -> None:
        """Remember the bot, the message and the document attached to it."""
        super().__init__(bot, message)
        self._doc: Document = self._message.document

    def check_validity(self) -> ValidityCheckResult:
        """Check the document's extension and size.

        Returns:
            A ``ValidityCheckResult``; when the document is rejected it
            carries a description for the log and one for the user.
        """
        doc = self._doc
        if not to_txt.has_proper_extention(doc.file_name):
            description = "The file extention is not supported"
            user_description = "The file extention is not supported\n"
            user_description += "List of supported extentions:\n"
            user_description += "  " + ", ".join(to_txt.INPUT_FORMATS)
            return ValidityCheckResult(False, description, user_description)
        # NOTE(review): assumes Telegram always reports file_size - confirm.
        if doc.file_size > to_txt.MAX_INPUT_SIZE:
            s_cur_mib = DocWorker._bytes_to_mib_str(doc.file_size)
            s_max_mib = DocWorker._bytes_to_mib_str(to_txt.MAX_INPUT_SIZE)
            description = (
                f"The file is too big: the file size is {s_cur_mib} MiB, "
                f"and the max size allowed is {s_max_mib} MiB"
            )
            return ValidityCheckResult(False, description, description)
        return ValidityCheckResult(True)

    async def produce_audio_file(self, directory: str) -> str:
        """Download the document into ``directory`` and convert it to mp3.

        The pipeline is: download -> blb2txt (text) -> balcon (wav) ->
        ffmpeg (mp3).

        Returns:
            Path to the resulting mp3 file.
        """
        filename = make_filename(self._doc.file_name)
        file_path = os.path.join(directory, filename)
        await self._bot.download(self._doc, file_path)
        txt_path = blb2txt(file_path)
        # balcon.exe only works with UTF-8-BOM ("utf-8-sig"), while blb2txt
        # is asked for plain "utf8"; add the BOM when it is missing.
        self._ensure_utf8_bom(txt_path)
        wav_path = balcon(txt_path)
        mp3_path = ffmpeg__to_mp3(wav_path)
        return mp3_path
class Text:
    """A titled text that can be saved to, or loaded from, a ``.txt`` file.

    The text is either one string or a list of paragraphs. Its language is
    given explicitly or guessed from the content, and the creation time is
    recorded for the preamble written at the top of the saved file.
    """

    class Language(Enum):
        """Languages the text can be attributed to."""

        RU: str = "ru"
        EN: str = "en"

    # Number of characters / letters inspected when guessing the language.
    _GUESS_SAMPLE_SIZE = 1000
    _EN_LETTERS = frozenset(string.ascii_letters)
    _RU_LOWER = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
    _RU_LETTERS = frozenset(_RU_LOWER + _RU_LOWER.upper())
    # Control characters and characters that are path separators or are
    # rejected in file names on Windows are replaced with "_".
    _UNSAFE_FILENAME_CHARS = str.maketrans(
        dict.fromkeys([*map(chr, range(32)), *'<>:"/\\|?*'], "_")
    )

    @staticmethod
    def guess_language(text: "str | list[str]") -> "Text.Language":
        """Guess whether ``text`` is English or Russian.

        Only a prefix is examined: for a list, paragraphs are taken until
        their total length reaches the sample size; then letters are
        counted until that many letters have been seen.

        Returns:
            ``Language.RU`` when Russian letters outnumber English ones,
            otherwise ``Language.EN`` (ties and letterless text included).
        """
        limit = Text._GUESS_SAMPLE_SIZE
        if isinstance(text, list):
            sample = []
            taken = 0
            for paragraph in text:
                sample.append(paragraph)
                taken += len(paragraph)
                if taken >= limit:
                    break
            text = "".join(sample)

        en_count = ru_count = 0
        for ch in text:
            if ch in Text._EN_LETTERS:
                en_count += 1
            elif ch in Text._RU_LETTERS:
                ru_count += 1
            if en_count + ru_count >= limit:
                break
        return Text.Language.EN if en_count >= ru_count else Text.Language.RU

    def __init__(
        self,
        title: str,
        paragraphs: "list[str] | str",
        lang: "Text.Language | None" = None,
    ) -> None:
        """Create a text.

        Args:
            title: title; also used for the default file name.
            paragraphs: the content, a string or a list of paragraphs.
            lang: language of the text; guessed from ``paragraphs`` if omitted.
        """
        self._title = title
        self._paragraphs = paragraphs
        self._lang: Text.Language = lang or Text.guess_language(paragraphs)
        self._datetime = datetime.datetime.now()

    def _preamble(self) -> str:
        """Return the title and a localized creation line, newline-separated."""
        label = "Создано: " if self._lang == Text.Language.RU else "Created: "
        stamp = self._datetime.strftime("%Y-%m-%d %H:%M:%S")
        return "\n".join([self._title, label + stamp])

    def save_to_txt(
        self, directory: str, filename: str | None = None, encoding: str = "utf-8"
    ) -> str:
        """Write the preamble and the content to a text file.

        Args:
            directory: directory the file is created in.
            filename: file name; defaults to the title plus ``.txt`` with
                characters unusable in file names replaced by ``_``.
            encoding: encoding of the written file.

        Returns:
            Path of the written file.
        """
        if not filename:
            # Titles come from arbitrary sources and may contain separators
            # or characters the file system refuses.
            filename = self._title.translate(Text._UNSAFE_FILENAME_CHARS) + ".txt"
        file_path = os.path.join(directory, filename)
        with open(file_path, mode="w", encoding=encoding) as txt:
            txt.write(self._preamble() + "\n")
            if isinstance(self._paragraphs, list):
                for p in self._paragraphs:
                    txt.write(p + "\n")
            elif isinstance(self._paragraphs, str):
                txt.write(self._paragraphs)
        return file_path

    @staticmethod
    def from_txt(file_path: str, encoding: str = "utf-8") -> "Text":
        """Load a text file; the title is the file name without its extension."""
        # Context manager so the handle is closed deterministically.
        with open(file_path, encoding=encoding) as txt:
            content = txt.read()
        title, _ = os.path.splitext(os.path.basename(file_path))
        return Text(title, content)
def parse(self) -> Text:
    """Download the page and extract its text using the matching site config.

    The result is cached on the instance, so the page is fetched and parsed
    at most once.

    Returns:
        A ``Text`` whose title is the page title and whose paragraphs are
        the author line, the publication date line and the article body.

    Raises:
        UrlInvalid: the url is not a valid url.
        UrlUnreachable: the url cannot be opened.
        UrlParserException: no site config matches the url, or the page
            lacks an element one of the configured selectors points to.
    """
    if not self.is_valid:
        raise UrlInvalid()
    if not self.is_reachable:
        raise UrlUnreachable()

    if self._parsed_text is None:
        # Look the config up first: no point downloading a page that
        # cannot be parsed anyway.
        parse_config = self._pick_parse_config()
        if not parse_config:
            raise UrlParserException(
                f"No parser is configured for the url: {self._url}"
            )
        soup = BeautifulSoup(self._get_html(), "html.parser")

        def pick(selector: str):
            # select_one returns None when nothing matches; report that as
            # a parser error rather than an AttributeError further down.
            tag = soup.select_one(selector)
            if tag is None:
                raise UrlParserException(
                    f"Nothing on the page matches the selector: {selector}"
                )
            return tag

        selectors = parse_config.re
        title = pick(selectors.title).get_text(strip=True)
        author = pick(selectors.author).get_text(strip=True)
        date_tag = pick(selectors.publication_date)
        # Use the element's "title" attribute when it has one, otherwise
        # its text; interpolating the tag itself would emit raw HTML.
        publication_date = date_tag.get("title") or date_tag.get_text(strip=True)

        paragraphs = [
            f"author: {author}",
            f"publication_date: {publication_date}",
            pick(selectors.text).text,
        ]

        self._parsed_text = Text(title, paragraphs)
    return self._parsed_text
def test_doc_worker_bytes_to_mib_str():
    """DocWorker._bytes_to_mib_str renders byte counts as MiB strings."""
    to_mib = w.DocWorker._bytes_to_mib_str

    # Default precision: two digits after the decimal point.
    default_precision_cases = [
        (0, "0.00"),
        (1, "0.00"),
        (23203, "0.02"),
        (925693, "0.88"),
        (2**20, "1.00"),
        (2**20 - 1, "1.00"),
        (2**20 + 1, "1.00"),
        (1234567890, "1177.38"),
    ]
    for size, expected in default_precision_cases:
        assert to_mib(size) == expected

    # Explicit precision.
    for ndigits, expected in [(5, "0.88281"), (1, "0.9"), (0, "1")]:
        assert to_mib(925693, ndigits) == expected

    # A negative precision is rejected.
    with pytest.raises(ValueError):
        to_mib(925693, -1)
urlp.is_url_reachable(url) + url_obj = urlp.Url(url) + assert url_obj.is_reachable + for url in UNREACHABLE_URLS: + assert not urlp.is_url_reachable(url) + url_obj = urlp.Url(url) + assert not url_obj.is_reachable