From 52ecd7ced5a381160cbbc2d4e3a5d5161f7439b5 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Mon, 19 Sep 2022 23:14:00 +0300 Subject: [PATCH 1/7] massive refactoring (in progress) --- config.yml | 3 + narrator/__main__.py | 2 +- narrator/bot.py | 92 -------------------- narrator/exceptions.py | 12 +++ narrator/telegram/bot.py | 85 ++++++++++++++++++ narrator/telegram/worker.py | 82 +++++++++++++++++ narrator/to_txt.py | 13 +++ narrator/url_parser.py | 21 +++++ tests/{test_config => config}/config.yml | 0 tests/{test_config => config}/test_config.py | 2 +- 10 files changed, 218 insertions(+), 94 deletions(-) delete mode 100644 narrator/bot.py create mode 100644 narrator/exceptions.py create mode 100644 narrator/telegram/bot.py create mode 100644 narrator/telegram/worker.py create mode 100644 narrator/to_txt.py create mode 100644 narrator/url_parser.py rename tests/{test_config => config}/config.yml (100%) rename tests/{test_config => config}/test_config.py (96%) diff --git a/config.yml b/config.yml index a96b757..8164cd2 100644 --- a/config.yml +++ b/config.yml @@ -9,3 +9,6 @@ utils: path: c:\bin\balcon.exe ffmpeg: path: c:\bin\ffmpeg.exe + +to_txt: + max_input_size: 5242880 # 5 * 2 ^ 20 == 5 MiB \ No newline at end of file diff --git a/narrator/__main__.py b/narrator/__main__.py index 93f09b5..5e13077 100644 --- a/narrator/__main__.py +++ b/narrator/__main__.py @@ -1,6 +1,6 @@ import asyncio -from narrator.bot import dispatcher, narrator_bot +from narrator.telegram.bot import dispatcher, narrator_bot async def main(): diff --git a/narrator/bot.py b/narrator/bot.py deleted file mode 100644 index 2967807..0000000 --- a/narrator/bot.py +++ /dev/null @@ -1,92 +0,0 @@ -# from asyncio.log import logger -import logging -import tempfile -import os -from typing import Callable, Dict, Any, Awaitable - -from aiogram import Dispatcher, BaseMiddleware, Bot -from aiogram.types import ContentType, Message, FSInputFile, TelegramObject -from aiogram.fsm.storage.memory import MemoryStorage -from aiogram.filters import Command - -from .reader import url as r_url -from .reader import file as r_file -from .speaker import Speaker -import narrator.config as conf - -logger = logging.getLogger(__name__) - - -narrator_bot = Bot(token=conf.bot.token) -mem_storage = MemoryStorage() -dispatcher = Dispatcher(storage=mem_storage) - - -@dispatcher.message(Command(commands=["about"])) -async def cmd_test(message: Message): - await message.answer("🤖: ...") - - -@dispatcher.message(content_types=ContentType.DOCUMENT) -async def take_document(message: Message): - logger.info( - f'{message.from_user.id}: file(id={message.document.file_id}, name="{message.document.file_name}",) {message.text}' - ) - if not r_file.is_readable_ext(message.document.file_name): - await message.answer( - f"🤖: No reader matches the file: {message.document.file_name}" - ) - return - with tempfile.TemporaryDirectory() as temp_dir_path: - file_path = os.path.join(temp_dir_path, message.document.file_name) - await narrator_bot.download(message.document, file_path) - path_to_audio = Speaker.narrate_from_txt_to_file(file_path) - tf = FSInputFile(path_to_audio) - await message.answer_document(tf) - - -@dispatcher.message(content_types=ContentType.TEXT) -async def take_text(message: Message): - if r_url.is_url(message.text): - article = r_url.read_text(message.text) - with tempfile.TemporaryDirectory() as temp_dir_path: - file_path = article.save_to_txt(temp_dir_path) - path_to_audio = Speaker.narrate_from_txt_to_file(file_path) - tf = FSInputFile(path_to_audio) - await message.answer_document(tf) - else: - await take_else(message) - - -@dispatcher.message(content_types=ContentType.ANY) -async def take_else(message: Message): - user = message.from_user.id - - logger.info(f"{user}: {message.text}") - await message.answer("🤖: Please, send me a file or a link") - - -############################################################ - - -class Authorizer(BaseMiddleware): - WHITELIST: list[str] = conf.bot.allowed_usernames - - async def __call__( - self, - handler: Callable[[TelegramObject, Dict[str, Any]], Awaitable[Any]], - event: TelegramObject, - data: Dict[str, Any], - ) -> Any: - if event.from_user.username in Authorizer.WHITELIST: - return await handler(event, data) - - -dispatcher.message.middleware(Authorizer()) - - -############################################################ - -narrator_bot = Bot(token=conf.bot.token) - -__all__ = [dispatcher, narrator_bot] diff --git a/narrator/exceptions.py b/narrator/exceptions.py new file mode 100644 index 0000000..f826697 --- /dev/null +++ b/narrator/exceptions.py @@ -0,0 +1,12 @@ +class NarratorException(Exception): + """ + Base class for any Narrator exception + """ + + +class UrlParserException(NarratorException): + pass + + +class TxtTransformerException(NarratorException): + pass diff --git a/narrator/telegram/bot.py b/narrator/telegram/bot.py new file mode 100644 index 0000000..95c9943 --- /dev/null +++ b/narrator/telegram/bot.py @@ -0,0 +1,85 @@ +import logging +import tempfile +from typing import Callable, Dict, Any, Awaitable + +from aiogram import Dispatcher, BaseMiddleware, Bot +from aiogram.types import ContentType, Message, FSInputFile, TelegramObject +from aiogram.filters import Command + +import narrator.config as conf +from ..exceptions import NarratorException + +from .worker import BaseWorker, DocWorker, UrlWorker + +logger = logging.getLogger(__name__) + +narrator_bot = Bot(token=conf.bot.token) +dispatcher = Dispatcher() + + +@dispatcher.message(Command(commands=["about"])) +async def cmd_test(message: Message): + await message.answer("🤖: ...") + + +@dispatcher.message(content_types=ContentType.ANY) +async def take_else(message: Message): + Worker: BaseWorker | None = None + obj_name: str | None = None + + if message.content_type == ContentType.TEXT: + Worker = UrlWorker + obj_name = "url" + elif message.content_type == ContentType.DOCUMENT: + Worker = DocWorker + obj_name = "file" + else: + await message.answer("🤖: Please, send me a file or a url") + return + + worker = Worker(narrator_bot, message) + is_valid, description, user_description = worker.check_validity() + if not is_valid: + logger.info(description) + await message.answer("🤖: " + user_description) + return + try: + with tempfile.TemporaryDirectory() as temp_dir_path: + path_to_audio = await worker.produce_audio_file(temp_dir_path) + f_wrap = FSInputFile(path_to_audio) + await message.answer_document(f_wrap) + except NarratorException: + logger.warning(f"Faild to process {obj_name}: ", exc_info=True) + await message.answer( + f"🤖: Something went wrong: I faild to process the {obj_name}" + ) + except Exception: + logger.error(f"Faild to process {obj_name}: ", exc_info=True) + await message.answer( + f"🤖: Something went terribly wrong: I faild to process the {obj_name}" + ) + + +############################################################ + + +class Authorizer(BaseMiddleware): + WHITELIST: list[str] = conf.bot.allowed_usernames + + async def __call__( + self, + handler: Callable[[TelegramObject, Dict[str, Any]], Awaitable[Any]], + event: TelegramObject, + data: Dict[str, Any], + ) -> Any: + if event.from_user.username in Authorizer.WHITELIST: + return await handler(event, data) + + +dispatcher.message.middleware(Authorizer()) + +############################################################ + +narrator_bot = Bot(token=conf.bot.token) + +__all__ = [dispatcher, narrator_bot] diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py new file mode 100644 index 0000000..c6850f3 --- /dev/null +++ b/narrator/telegram/worker.py @@ -0,0 +1,82 @@ +from typing import NamedTuple + +from aiogram import Bot +from aiogram.types import Message, Document + +from narrator.exceptions import UrlParserException +import narrator.url_parser as url_parser +import narrator.to_txt as to_txt + + +class ValidityCheckResult(NamedTuple): + is_valid: bool + description: str | None = None + user_description: str | None = None + + +class BaseWorker: + def __init__(self, bot: Bot, message: Message) -> None: + self._bot = bot + self._message = message + + def check_validity(self) -> ValidityCheckResult: + raise NotImplemented() + + async def produce_audio_file(self, directory: str) -> str: + raise NotImplemented() + + +class UrlWorker(BaseWorker): + def __init__(self, bot: Bot, message: Message) -> None: + super().__init__(bot, message) + self._url = url_parser.Url(self._message.text) + + def check_validity(self) -> ValidityCheckResult: + if not self._url.is_valid(): + description = "Not a valid url" + return ValidityCheckResult(False, description, description) + elif not self._url.is_reachable(): + description = "The URL is unreachable (can't open the web page)" + return ValidityCheckResult(False, description, description) + try: + self._url.parse() + except UrlParserException as ex: + description = str(ex) + user_description = "Faild to parse the web page" + return ValidityCheckResult(False, description, user_description) + except Exception as ex: + description = str(ex) + user_description = "Something went terribly wrong" + return ValidityCheckResult(False, description, user_description) + return ValidityCheckResult(True) + + +class DocWorker(BaseWorker): + @staticmethod + def _bytes_to_mib_str(bytes: int, ndigits: int = 2): + mul = 10**ndigits + val = bytes * mul // 2**20 + d, m = divmod(val, mul) + m = str(m) + if (n := ndigits - len(m)) > 0: + m += "0" * n + return f"{d}.{m}" + + def __init__(self, bot: Bot, message: Message) -> None: + super().__init__(bot, message) + self._doc: Document = self._message.document + + def check_validity(self) -> ValidityCheckResult: + doc = self._doc + if not to_txt.has_proper_extention(doc.file_name): + description = "The file extention is not supported" + user_description = "The file extention is not supported\n" + user_description += "List of supported extentions:\n" + user_description += " " + ", ".join(to_txt.INPUT_FORMATS) + return ValidityCheckResult(False, description, user_description) + elif doc.file_size > to_txt.MAX_INPUT_SIZE: + s_cur_mib = DocWorker._bytes_to_mib_str(doc.file_size) + s_max_mib = DocWorker._bytes_to_mib_str(to_txt.MAX_INPUT_SIZE) + description = f"The file is too big: the file size is {s_cur_mib} MiB, and the max size allowed is {s_max_mib} MiB)" + return ValidityCheckResult(False, description, description) + return ValidityCheckResult(True) diff --git a/narrator/to_txt.py b/narrator/to_txt.py new file mode 100644 index 0000000..a346b4e --- /dev/null +++ b/narrator/to_txt.py @@ -0,0 +1,13 @@ +import os + +import narrator.config + +conf = narrator.config.to_txt + +INPUT_FORMATS = [".epub", ".fb2", ".fb3", ".md", ".txt", ".doc", ".docx", ".rtf"] + +MAX_INPUT_SIZE = conf.max_input_size + + +def has_proper_extention(filename): + return os.path.splitext(filename)[1] in INPUT_FORMATS diff --git a/narrator/url_parser.py b/narrator/url_parser.py new file mode 100644 index 0000000..5761766 --- /dev/null +++ b/narrator/url_parser.py @@ -0,0 +1,21 @@ +class Url: + """ + dummy + """ + + def __init__(self, url: str) -> None: + self._url = url + self._parsed_text: list[str] | None = None + + def is_valid(self) -> bool: + return False + + def is_reachable(self) -> bool: + return False + + def parse(self) -> list[str]: + raise NotImplemented() + if not self._parsed_text: + ... + # self._parsed_text = something + return self._parsed_text diff --git a/tests/test_config/config.yml b/tests/config/config.yml similarity index 100% rename from tests/test_config/config.yml rename to tests/config/config.yml diff --git a/tests/test_config/test_config.py b/tests/config/test_config.py similarity index 96% rename from tests/test_config/test_config.py rename to tests/config/test_config.py index 821c891..0890649 100644 --- a/tests/test_config/test_config.py +++ b/tests/config/test_config.py @@ -7,7 +7,7 @@ from narrator.config import _Config -CONF_YML_PATH = os.path.join("tests", "test_config", "config.yml") +CONF_YML_PATH = os.path.join("tests", "config", "config.yml") CONF_YML = yaml.safe_load(open(CONF_YML_PATH)) EMPTY_ENV = { From 97948a46b6168c3a277b8a2d20159a7c2e496da7 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Tue, 20 Sep 2022 13:40:33 +0300 Subject: [PATCH 2/7] some tests --- narrator/telegram/worker.py | 12 +++++------- tests/telegram/test_worker.py | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 tests/telegram/test_worker.py diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py index c6850f3..ec9f8cc 100644 --- a/narrator/telegram/worker.py +++ b/narrator/telegram/worker.py @@ -54,13 +54,11 @@ def check_validity(self) -> ValidityCheckResult: class DocWorker(BaseWorker): @staticmethod def _bytes_to_mib_str(bytes: int, ndigits: int = 2): - mul = 10**ndigits - val = bytes * mul // 2**20 - d, m = divmod(val, mul) - m = str(m) - if (n := ndigits - len(m)) > 0: - m += "0" * n - return f"{d}.{m}" + if ndigits < 0: + raise ValueError + r = round(bytes / 2**20, ndigits) + fstr = "{" + f"0:0.{ndigits}f" + "}" + return fstr.format(r) def __init__(self, bot: Bot, message: Message) -> None: super().__init__(bot, message) diff --git a/tests/telegram/test_worker.py b/tests/telegram/test_worker.py new file mode 100644 index 0000000..590627d --- /dev/null +++ b/tests/telegram/test_worker.py @@ -0,0 +1,21 @@ +from ctypes.wintypes import DWORD +import pytest + +import narrator.telegram.worker as w + + +def test_doc_worker_bytes_to_mib_str(): + trf = w.DocWorker._bytes_to_mib_str + assert trf(0) == "0.00" + assert trf(1) == "0.00" + assert trf(23203) == "0.02" + assert trf(925693) == "0.88" + assert trf(2**20) == trf(2**20 - 1) == trf(2**20 + 1) == "1.00" + assert trf(1234567890) == "1177.38" + + assert trf(925693, 5) == "0.88281" + assert trf(925693, 1) == "0.9" + assert trf(925693, 0) == "1" + + with pytest.raises(ValueError): + _ = trf(925693, -1) From 023096e2e2d1611649dea6e37000fa9b75e339c6 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Tue, 20 Sep 2022 14:25:22 +0300 Subject: [PATCH 3/7] `url_parser`++ + tests --- Makefile | 3 +++ narrator/telegram/worker.py | 7 ++--- narrator/url_parser.py | 33 +++++++++++++++++++++-- pyproject.toml | 5 ++++ tests/test_url_parser.py | 53 +++++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 tests/test_url_parser.py diff --git a/Makefile b/Makefile index d75ff30..134341c 100644 --- a/Makefile +++ b/Makefile @@ -7,4 +7,7 @@ run: poetry run python -m narrator test: + poetry run python -m pytest -m "not slow" --verbosity=2 --showlocals --log-level=DEBUG + +test-all: poetry run python -m pytest --verbosity=2 --showlocals --log-level=DEBUG \ No newline at end of file diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py index ec9f8cc..664eb61 100644 --- a/narrator/telegram/worker.py +++ b/narrator/telegram/worker.py @@ -32,14 +32,15 @@ def __init__(self, bot: Bot, message: Message) -> None: self._url = url_parser.Url(self._message.text) def check_validity(self) -> ValidityCheckResult: - if not self._url.is_valid(): + url: url_parser.Url = self._url + if not url.is_valid: description = "Not a valid url" return ValidityCheckResult(False, description, description) - elif not self._url.is_reachable(): + elif not url.is_reachable: description = "The URL is unreachable (can't open the web page)" return ValidityCheckResult(False, description, description) try: - self._url.parse() + url.parse() except UrlParserException as ex: description = str(ex) user_description = "Faild to parse the web page" diff --git a/narrator/url_parser.py b/narrator/url_parser.py index 5761766..255bce1 100644 --- a/narrator/url_parser.py +++ b/narrator/url_parser.py @@ -1,3 +1,23 @@ +from urllib.parse import urlparse +from urllib.request import urlopen + +# https://stackoverflow.com/a/38020041/2493536 +def is_uri_valid(maybe_url: str) -> bool: + try: + result = urlparse(maybe_url) + return all([result.scheme, result.netloc]) + except: + return False + + +def is_url_reachable(url: str, timeout=3) -> bool: + try: + with urlopen(url, timeout=timeout): + return True + except Exception: + return False + + class Url: """ dummy @@ -5,13 +25,22 @@ class Url: def __init__(self, url: str) -> None: self._url = url + + self._is_valid: bool | None = None + self._is_reachable: bool | None = None self._parsed_text: list[str] | None = None + @property def is_valid(self) -> bool: - return False + if self._is_valid == None: + self._is_valid = is_uri_valid(self._url) + return self._is_valid + @property def is_reachable(self) -> bool: - return False + if self._is_reachable == None: + self._is_reachable = is_url_reachable(self._url) + return self._is_reachable def parse(self) -> list[str]: raise NotImplemented() diff --git a/pyproject.toml b/pyproject.toml index b01780f..311d60c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,3 +24,8 @@ beautifulsoup4 = "^4.11.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", +] \ No newline at end of file diff --git a/tests/test_url_parser.py b/tests/test_url_parser.py new file mode 100644 index 0000000..0ccfc31 --- /dev/null +++ b/tests/test_url_parser.py @@ -0,0 +1,53 @@ +import pytest + +import narrator.url_parser as urlp + + +VALID_URLS = [ + "https://www.reddit.com", + "http://www.reddit.com/hot/", + "https://www.reddit.com/r/Python/", + "http://www.this_site.does_not.exist.xRoU8A5qZF.com", +] + +INVALID_URLS = [ + "", + "some text", + ["not", "a", "string"], + "www.reddit.com", + "ht@tps://www.reddit.com/", + "www.reddit.fake", +] + +REACHABLE_URLS = [ + "https://www.reddit.com", + "https://xkcd.com/1619/", + "http://xkcd.com/2529/", + "https://stackexchange.com/about", +] +UNREACHABLE_URLS = INVALID_URLS + [ + "http://www.this_site.does_not.exist.xRoU8A5qZF.com", +] + + +def test_is_uri_valid(): + for url in VALID_URLS: + assert urlp.is_uri_valid(url) + url_obj = urlp.Url(url) + assert url_obj.is_valid + for url in INVALID_URLS: + assert not urlp.is_uri_valid(url) + url_obj = urlp.Url(url) + assert not url_obj.is_valid + + +@pytest.mark.slow +def test_is_url_reachable(): + for url in REACHABLE_URLS: + assert urlp.is_url_reachable(url) + url_obj = urlp.Url(url) + assert url_obj.is_reachable + for url in UNREACHABLE_URLS: + assert not urlp.is_url_reachable(url) + url_obj = urlp.Url(url) + assert not url_obj.is_reachable From 95829c889abf8230bac47bd2ab22e3c2150e62b0 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Tue, 20 Sep 2022 16:10:41 +0300 Subject: [PATCH 4/7] massive refactoring (in progress) - part 2 --- config.yml | 15 ++++++++- narrator/article.py | 41 ------------------------ narrator/exceptions.py | 8 +++++ narrator/reader/__init__.py | 15 --------- narrator/reader/base_reader.py | 11 ------- narrator/reader/file/__init__.py | 24 -------------- narrator/reader/file/base_file_reader.py | 13 -------- narrator/reader/file/txt_reader.py | 19 ----------- narrator/reader/url/__init__.py | 22 ------------- narrator/reader/url/base_url_reader.py | 37 --------------------- narrator/reader/url/habr_reader.py | 31 ------------------ narrator/sub_utils.py | 15 +++++++++ narrator/telegram/worker.py | 26 +++++++++++++++ narrator/text.py | 15 +++++++++ narrator/to_txt.py | 8 ++++- narrator/url_parser.py | 15 +++++++-- 16 files changed, 97 insertions(+), 218 deletions(-) delete mode 100644 narrator/article.py delete mode 100644 narrator/reader/__init__.py delete mode 100644 narrator/reader/base_reader.py delete mode 100644 narrator/reader/file/__init__.py delete mode 100644 narrator/reader/file/base_file_reader.py delete mode 100644 narrator/reader/file/txt_reader.py delete mode 100644 narrator/reader/url/__init__.py delete mode 100644 narrator/reader/url/base_url_reader.py delete mode 100644 narrator/reader/url/habr_reader.py create mode 100644 narrator/text.py diff --git a/config.yml b/config.yml index 8164cd2..afaa438 100644 --- a/config.yml +++ b/config.yml @@ -9,6 +9,19 @@ utils: path: c:\bin\balcon.exe ffmpeg: path: c:\bin\ffmpeg.exe + blb2txt: + path: c:\bin\blb2txt.exe to_txt: - max_input_size: 5242880 # 5 * 2 ^ 20 == 5 MiB \ No newline at end of file + max_input_size: 5242880 # 5 * 2 ^ 20 == 5 MiB + +url_parser: + sites: + habr: + url_re: https\://habr\.com/..(/company/[-\w]+/blog|/post)/\d+ + lang: ru + re: + title: main h1[data-test-id="articleTitle"] span + author: main a.tm-user-info__username + publication_date: main div.tm-article-snippet__meta time + text: "#post-content-body" \ No newline at end of file diff --git a/narrator/article.py b/narrator/article.py deleted file mode 100644 index a86cd81..0000000 --- a/narrator/article.py +++ /dev/null @@ -1,41 +0,0 @@ -import os - -from narrator.sub_utils import add_suffix, make_filename - - -class Article: - def __init__(self): - self.title: str = "" - self.meta = {} - self.text: str = "" - - def __setitem__(self, key, val): - self.meta[key] = val - - def __getitem__(self, key): - return self.meta[key] - - def __repr__(self) -> str: - text = self.text[:20] if self.text else "" - return f'Article(text="{text}")' - - def save_to_txt(self, dir_path: str) -> str: - file_name = self.title or self.text[:20] - file_name = make_filename(file_name) - file_name = add_suffix(file_name, ".txt") - - file_path = os.path.join(dir_path, file_name) - - text = str(self) - - with open(file_path, "wb") as txt: - txt.write(text.encode("utf-8-sig")) - - return file_path - - def __str__(self) -> str: - meta_text = [] - for k, v in self.meta.items(): - meta_text.append(f"{k}:\n {v}") - meta_text = "\n".join(meta_text) - return f"\n{meta_text}\n\n{self.text}\n" diff --git a/narrator/exceptions.py b/narrator/exceptions.py index f826697..2bf8f84 100644 --- a/narrator/exceptions.py +++ b/narrator/exceptions.py @@ -8,5 +8,13 @@ class UrlParserException(NarratorException): pass +class UrlInvalid(UrlParserException): + pass + + +class UrlUnreachable(UrlParserException): + pass + + class TxtTransformerException(NarratorException): pass diff --git a/narrator/reader/__init__.py b/narrator/reader/__init__.py deleted file mode 100644 index b081291..0000000 --- a/narrator/reader/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Tools to convert input to txt, suitable for reading -""" -from . import file, url - - -def from_file(obj) -> str: - return file.read_text(obj) - - -def from_url(obj) -> str: - return url.read_text(obj) - - -__all__ = [from_file, from_url] diff --git a/narrator/reader/base_reader.py b/narrator/reader/base_reader.py deleted file mode 100644 index 052a093..0000000 --- a/narrator/reader/base_reader.py +++ /dev/null @@ -1,11 +0,0 @@ -from narrator.article import Article - - -class BaseReader: - @staticmethod - def read_text(obj) -> Article: - raise NotImplemented() - - @staticmethod - def is_readable(obj) -> bool: - raise NotImplemented() diff --git a/narrator/reader/file/__init__.py b/narrator/reader/file/__init__.py deleted file mode 100644 index 130dee2..0000000 --- a/narrator/reader/file/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from aiogram.types import Document - -from narrator.article import Article -from .txt_reader import TxtReader -from .base_file_reader import BaseFileReader - -FILE_READERS = [TxtReader] - - -def read_text(file_path: str) -> Article: - for Reader in FILE_READERS: - if Reader.is_readable(file_path): - return Reader.read_text(file_path) - return f"No reader matches the file: {file_path}" - - -def is_readable_ext(file_name: str) -> bool: - for Reader in FILE_READERS: - if Reader.is_readable(file_name): - return True - return False - - -__all__ = [read_text, is_readable_ext] diff --git a/narrator/reader/file/base_file_reader.py b/narrator/reader/file/base_file_reader.py deleted file mode 100644 index 6072fb8..0000000 --- a/narrator/reader/file/base_file_reader.py +++ /dev/null @@ -1,13 +0,0 @@ -from aiogram.types import Document - -from ..base_reader import BaseReader - - -class BaseFileReader(BaseReader): - @staticmethod - def read_text(obj: str) -> str: - raise NotImplemented() - - @staticmethod - def is_readable(obj: str) -> bool: - raise NotImplemented() diff --git a/narrator/reader/file/txt_reader.py b/narrator/reader/file/txt_reader.py deleted file mode 100644 index 4a4fe11..0000000 --- a/narrator/reader/file/txt_reader.py +++ /dev/null @@ -1,19 +0,0 @@ -import os - -from aiogram.types import Document - -from narrator.article import Article -from .base_file_reader import BaseFileReader - - -class TxtReader(BaseFileReader): - @staticmethod - def read_text(obj: str) -> Article: - article = Article() - article["file name"] = obj # os.path.splitext(obj)[0] - - return article - - @staticmethod - def is_readable(obj: str) -> bool: - return os.path.splitext(obj)[1] == ".txt" diff --git a/narrator/reader/url/__init__.py b/narrator/reader/url/__init__.py deleted file mode 100644 index 7251949..0000000 --- a/narrator/reader/url/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -import urllib.request as ur - -from narrator.article import Article - -from .habr_reader import HabrReader -from .base_url_reader import BaseUrlReader - -URL_READERS = [HabrReader] - - -def read_text(obj: str) -> Article: - for Reader in URL_READERS: - if Reader.is_readable(obj): - return Reader.read_text(obj) - return f"No reader matches the url: {obj}" - - -def is_url(obj: str) -> bool: - return BaseUrlReader.is_readable(obj) - - -__all__ = [read_text, is_url] diff --git a/narrator/reader/url/base_url_reader.py b/narrator/reader/url/base_url_reader.py deleted file mode 100644 index fd9c274..0000000 --- a/narrator/reader/url/base_url_reader.py +++ /dev/null @@ -1,37 +0,0 @@ -import re -from urllib.request import urlopen -from urllib.parse import urlparse - -from narrator.article import Article -from ..base_reader import BaseReader - -# https://stackoverflow.com/a/38020041/2493536 -def _uri_validator(x): - try: - result = urlparse(x) - return all([result.scheme, result.netloc]) - except: - return False - - -class BaseUrlReader(BaseReader): - @staticmethod - def read_text(obj: str) -> Article: - raise NotImplemented() - - @staticmethod - def is_readable(obj: str) -> bool: - return _uri_validator(obj) - - @staticmethod - def _re_check_url(re_temp: str, url: str) -> bool: - if re.match(re_temp, url): - return True - else: - return False - - @staticmethod - def get_html(url: str) -> str: - with urlopen(url) as resp: - html = resp.read().decode() - return html diff --git a/narrator/reader/url/habr_reader.py b/narrator/reader/url/habr_reader.py deleted file mode 100644 index 2a8d387..0000000 --- a/narrator/reader/url/habr_reader.py +++ /dev/null @@ -1,31 +0,0 @@ -from bs4 import BeautifulSoup - -from .base_url_reader import BaseUrlReader -from narrator.article import Article - - -class HabrReader(BaseUrlReader): - @staticmethod - def read_text(obj: str) -> Article: - html = BaseUrlReader.get_html(obj) - soup = BeautifulSoup(html, "html.parser") - article = Article() - article.title = soup.select_one( - "main h1[data-test-id='articleTitle'] span" - ).text - article["author"] = soup.select_one( - "main a.tm-user-info__username" - ).text.strip() - article["date"] = soup.select_one("main div.tm-article-snippet__meta time").get( - "title", "" - ) - article.text = soup.select_one("#post-content-body").text - - return article - - @staticmethod - def is_readable(obj: str) -> bool: - re_temp = r"https\://habr\.com/..(/company/[-\w]+/blog|/post)/\d+" - return BaseUrlReader._re_check_url(re_temp, obj) and BaseUrlReader.is_readable( - obj - ) diff --git a/narrator/sub_utils.py b/narrator/sub_utils.py index 65d239f..6cc5c1d 100644 --- a/narrator/sub_utils.py +++ b/narrator/sub_utils.py @@ -82,3 +82,18 @@ def balcon(from_txt_file: str): subprocess.run(balcon_args) return to_file + + +def blb2txt(from_file: str): + # .\blb2txt -f tst.docx -out txt.txt -e utf8 + to_file = add_suffix(from_file, ".txt") + + blb2txt_path = conf.utils.blb2txt.path + blb2txt_args = [blb2txt_path] + blb2txt_args.extend(["-f", from_file]) + blb2txt_args.extend(["-out", to_file]) + blb2txt_args.extend(["-e", "utf8"]) + + subprocess.run(blb2txt_args) + + return to_file diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py index 664eb61..b6985ce 100644 --- a/narrator/telegram/worker.py +++ b/narrator/telegram/worker.py @@ -1,3 +1,4 @@ +import os from typing import NamedTuple from aiogram import Bot @@ -6,6 +7,15 @@ from narrator.exceptions import UrlParserException import narrator.url_parser as url_parser import narrator.to_txt as to_txt +from narrator.text import Text +from narrator.sub_utils import ( + balcon, + blb2txt, + ffmpeg__to_mp3, + add_suffix, + crop_suffix, + make_filename, +) class ValidityCheckResult(NamedTuple): @@ -51,6 +61,13 @@ def check_validity(self) -> ValidityCheckResult: return ValidityCheckResult(False, description, user_description) return ValidityCheckResult(True) + async def produce_audio_file(self, directory: str) -> str: + text: Text = self._url.parse() + txt_path = text.save_to_txt(directory) + wav_path = balcon(txt_path) + mp3_path = ffmpeg__to_mp3(wav_path) + return mp3_path + class DocWorker(BaseWorker): @staticmethod @@ -79,3 +96,12 @@ def check_validity(self) -> ValidityCheckResult: description = f"The file is too big: the file size is {s_cur_mib} MiB, and the max size allowed is {s_max_mib} MiB)" return ValidityCheckResult(False, description, description) return ValidityCheckResult(True) + + async def produce_audio_file(self, directory: str) -> str: + filename = make_filename(self._doc.file_name) + file_path = os.path.join(directory, filename) + self._bot.download(self._doc, filename) + txt_path = blb2txt(file_path) + wav_path = balcon(txt_path) + mp3_path = ffmpeg__to_mp3(wav_path) + return mp3_path diff --git a/narrator/text.py b/narrator/text.py new file mode 100644 index 0000000..0ea8d37 --- /dev/null +++ b/narrator/text.py @@ -0,0 +1,15 @@ +from __future__ import annotations +from enum import Enum + + +class Text: + class Language(Enum): + ru: str = "ru" + en: str = "en" + + def __init__(self, lang: Text.Language, paragraphs: list[str]) -> None: + self._lang = lang + self._paragraphs = paragraphs + + def save_to_txt(self, directory: str, filename: str | None = None) -> str: + return "dummy" diff --git a/narrator/to_txt.py b/narrator/to_txt.py index a346b4e..6711c19 100644 --- a/narrator/to_txt.py +++ b/narrator/to_txt.py @@ -1,6 +1,7 @@ import os import narrator.config +from narrator.sub_utils import blb2txt conf = narrator.config.to_txt @@ -10,4 +11,9 @@ def has_proper_extention(filename): - return os.path.splitext(filename)[1] in INPUT_FORMATS + ext = os.path.splitext(filename)[1] + return ext.lower() in INPUT_FORMATS + + +def to_txt(source, dest_directory) -> str: + return blb2txt(source) diff --git a/narrator/url_parser.py b/narrator/url_parser.py index 255bce1..98f42de 100644 --- a/narrator/url_parser.py +++ b/narrator/url_parser.py @@ -1,6 +1,9 @@ from urllib.parse import urlparse from urllib.request import urlopen +from narrator.exceptions import UrlInvalid, UrlUnreachable +from narrator.text import Text + # https://stackoverflow.com/a/38020041/2493536 def is_uri_valid(maybe_url: str) -> bool: try: @@ -28,7 +31,7 @@ def __init__(self, url: str) -> None: self._is_valid: bool | None = None self._is_reachable: bool | None = None - self._parsed_text: list[str] | None = None + self._parsed_text: Text | None = None @property def is_valid(self) -> bool: @@ -42,9 +45,15 @@ def is_reachable(self) -> bool: self._is_reachable = is_url_reachable(self._url) return self._is_reachable - def parse(self) -> list[str]: - raise NotImplemented() + def parse(self) -> Text: + if not self.is_valid: + raise UrlInvalid() + if not self.is_reachable: + raise UrlUnreachable() + if not self._parsed_text: ... # self._parsed_text = something + # raise NotImplemented() + self._parsed_text = Text(Text.Language.ru, [""]) return self._parsed_text From 34d13fb91b0ca2269608975d50c8844b9008fe6c Mon Sep 17 00:00:00 2001 From: sentenzo Date: Tue, 20 Sep 2022 18:35:19 +0300 Subject: [PATCH 5/7] =?UTF-8?q?=F0=9F=A4=96:=20Something=20went=20terribly?= =?UTF-8?q?=20wrong?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- narrator/text.py | 80 ++++++++++++++++++++++++++++++++++++++---- narrator/url_parser.py | 38 +++++++++++++++++--- 2 files changed, 107 insertions(+), 11 deletions(-) diff --git a/narrator/text.py b/narrator/text.py index 0ea8d37..4f35f43 100644 --- a/narrator/text.py +++ b/narrator/text.py @@ -1,15 +1,83 @@ from __future__ import annotations +import os from enum import Enum +import string +import datetime class Text: class Language(Enum): - ru: str = "ru" - en: str = "en" + # def __init__(self, char_set, name) -> None: - def __init__(self, lang: Text.Language, paragraphs: list[str]) -> None: - self._lang = lang + RU: str = "ru" + EN: str = "en" + + @staticmethod + def guess_language(text: str | list[str]) -> Language: + stop_at = 1000 + if isinstance(text, list): + acc = 0 + to_join = [] + for par in text: + acc += len(par) + to_join.append(par) + if acc >= stop_at: + break + text = "".join(to_join) + + en_set = string.ascii_letters + ru_set = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя" + ru_set += ru_set.upper() + + char_count = 0 + score = {"en": 0, "ru": 0} + for ch in text: + if ch in en_set: + score["en"] += 1 + char_count += 1 + elif ch in ru_set: + score["ru"] += 1 + char_count += 1 + if char_count >= stop_at: + break + return Text.Language.EN if score["en"] >= score["ru"] else Text.Language.RU + + def __init__( + self, title: str, paragraphs: list[str] | str, lang: Text.Language | None = None + ) -> None: + self._title = title self._paragraphs = paragraphs + self._lang: Text.Language = lang or Text.guess_language(paragraphs) + self._datetime = datetime.datetime.now() + + def _preamble(self) -> str: + preamble = [] + + preamble.append(self._title) + if self._lang == Text.Language.RU: + preamble.append("Создано: " + self._datetime.strftime("%Y-%m-%d %H:%M:%S")) + else: + preamble.append("Created: " + self._datetime.strftime("%Y-%m-%d %H:%M:%S")) + return "\n".join(preamble) + + def save_to_txt( + self, directory: str, filename: str | None = None, encoding: str = "utf-8" + ) -> str: + if not filename: + filename = self._title + ".txt" + file_path = os.path.join(directory, filename) + with open(file_path, mode="w", encoding=encoding) as txt: # utf-8-sig + txt.write(self._preamble() + "\n") + if isinstance(self._paragraphs, list): + for p in self._paragraphs: + txt.write(p + "\n") + elif isinstance(self._paragraphs, str): + txt.write(self._paragraphs) + return file_path - def save_to_txt(self, directory: str, filename: str | None = None) -> str: - return "dummy" + @staticmethod + def from_txt(file_path: str, encoding: str = "utf-8") -> Text: + text = open(file_path, encoding="utf-8").read() + filename = os.path.basename(file_path) + title, _ = os.path.splitext(filename) + return Text(title, text) diff --git a/narrator/url_parser.py b/narrator/url_parser.py index 98f42de..a8b4bd4 100644 --- a/narrator/url_parser.py +++ b/narrator/url_parser.py @@ -1,8 +1,14 @@ from urllib.parse import urlparse from urllib.request import urlopen +import re -from narrator.exceptions import UrlInvalid, UrlUnreachable +from bs4 import BeautifulSoup + +from narrator.exceptions import UrlInvalid, UrlUnreachable, UrlParserException from narrator.text import Text +import narrator.config + +conf = narrator.config.url_parser # https://stackoverflow.com/a/38020041/2493536 def is_uri_valid(maybe_url: str) -> bool: @@ -45,6 +51,17 @@ def is_reachable(self) -> bool: self._is_reachable = is_url_reachable(self._url) return self._is_reachable + def _get_html(self) -> str: + with urlopen(self._url) as resp: + html = resp.read().decode() + return html + + def _pick_parse_config(self): + for site in conf.sites: + if re.match(conf.sites[site].url_re, self._url): + return conf.sites[site] + return None + def parse(self) -> Text: if not self.is_valid: raise UrlInvalid() @@ -52,8 +69,19 @@ def parse(self) -> Text: raise UrlUnreachable() if not self._parsed_text: - ... - # self._parsed_text = something - # raise NotImplemented() - self._parsed_text = Text(Text.Language.ru, [""]) + html = self._get_html() + soup = BeautifulSoup(html, "html.parser") + parse_config = self._pick_parse_config() + if not parse_config: + raise UrlParserException() + title = soup.select_one(parse_config.re.title) + author = soup.select_one(parse_config.re.author) + publication_date = soup.select_one(parse_config.re.publication_date) + + paragraphs = [] + paragraphs.append(f"author: {author}") + paragraphs.append(f"publication_date: {publication_date}") + paragraphs.append(soup.select_one(parse_config.re.text).text) + + self._parsed_text = Text(title, paragraphs) return self._parsed_text From 3f3fa7673f1d17dc67713a186baafd83d9b98f77 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Wed, 21 Sep 2022 16:20:54 +0300 Subject: [PATCH 6/7] bugfix --- narrator/sub_utils.py | 4 ++-- narrator/telegram/worker.py | 5 +++-- narrator/text.py | 2 +- narrator/url_parser.py | 8 ++------ 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/narrator/sub_utils.py b/narrator/sub_utils.py index 6cc5c1d..764b395 100644 --- a/narrator/sub_utils.py +++ b/narrator/sub_utils.py @@ -44,14 +44,14 @@ def make_filename(file_name: str): from string import digits, ascii_letters cyrilic = "абвгдеёжзиклмнопрстуфхцчшщьыъэюя" - alphabet = digits + ascii_letters + cyrilic + cyrilic.upper() + " _-&%@#!()" + alphabet = digits + ascii_letters + cyrilic + cyrilic.upper() + " _-&%@#!()." file_name = "".join([c for c in file_name if c in alphabet]) return file_name def ffmpeg__to_mp3( from_file: str, - bitrate: int, + bitrate: int = 96, ): # ffmpeg -i input.wav -vn -ar 44100 -ac 2 -b:a 192k output.mp3 diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py index b6985ce..42ba5df 100644 --- a/narrator/telegram/worker.py +++ b/narrator/telegram/worker.py @@ -63,7 +63,8 @@ def check_validity(self) -> ValidityCheckResult: async def produce_audio_file(self, directory: str) -> str: text: Text = self._url.parse() - txt_path = text.save_to_txt(directory) + # balcon reads only utf-8-sig + txt_path = text.save_to_txt(directory, encoding="utf-8-sig") wav_path = balcon(txt_path) mp3_path = ffmpeg__to_mp3(wav_path) return mp3_path @@ -100,7 +101,7 @@ def check_validity(self) -> ValidityCheckResult: async def produce_audio_file(self, directory: str) -> str: filename = make_filename(self._doc.file_name) file_path = os.path.join(directory, filename) - self._bot.download(self._doc, filename) + await self._bot.download(self._doc, file_path) txt_path = blb2txt(file_path) wav_path = balcon(txt_path) mp3_path = ffmpeg__to_mp3(wav_path) diff --git a/narrator/text.py b/narrator/text.py index 4f35f43..d1bb4a0 100644 --- a/narrator/text.py +++ b/narrator/text.py @@ -77,7 +77,7 @@ def save_to_txt( @staticmethod def from_txt(file_path: str, encoding: str = "utf-8") -> Text: - text = open(file_path, encoding="utf-8").read() + text = open(file_path, encoding=encoding).read() filename = os.path.basename(file_path) title, _ = os.path.splitext(filename) return Text(title, text) diff --git a/narrator/url_parser.py b/narrator/url_parser.py index a8b4bd4..169e986 100644 --- a/narrator/url_parser.py +++ b/narrator/url_parser.py @@ -28,10 +28,6 @@ def is_url_reachable(url: str, timeout=3) -> bool: class Url: - """ - dummy - """ - def __init__(self, url: str) -> None: self._url = url @@ -74,8 +70,8 @@ def parse(self) -> Text: parse_config = self._pick_parse_config() if not parse_config: raise UrlParserException() - title = soup.select_one(parse_config.re.title) - author = soup.select_one(parse_config.re.author) + title = soup.select_one(parse_config.re.title).text + author = soup.select_one(parse_config.re.author).text publication_date = soup.select_one(parse_config.re.publication_date) paragraphs = [] From c02ac8b5e04cfb1df61f21e07ecb9a2fe1d74f96 Mon Sep 17 00:00:00 2001 From: sentenzo Date: Wed, 21 Sep 2022 16:23:31 +0300 Subject: [PATCH 7/7] minor refactoring --- narrator/speaker/__init__.py | 3 --- narrator/speaker/speaker.py | 18 ------------------ narrator/telegram/worker.py | 2 +- 3 files changed, 1 insertion(+), 22 deletions(-) delete mode 100644 narrator/speaker/__init__.py delete mode 100644 narrator/speaker/speaker.py diff --git a/narrator/speaker/__init__.py b/narrator/speaker/__init__.py deleted file mode 100644 index fc71b49..0000000 --- a/narrator/speaker/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .speaker import Speaker - -__all__ = [Speaker] diff --git a/narrator/speaker/speaker.py b/narrator/speaker/speaker.py deleted file mode 100644 index fb62e96..0000000 --- a/narrator/speaker/speaker.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - -from ..article import Article - -import narrator.sub_utils as utils - - -class Speaker: - @staticmethod - def narrate_from_txt_to_file(path_to_txt_file: str): - - # balcon.exe only works with UTF-8-BOM (or "utf-8-sig") - text = open(path_to_txt_file, encoding="utf-8").read() - open(path_to_txt_file, mode="w", encoding="utf-8-sig").write(text) - - path_to_wav = utils.balcon(path_to_txt_file) - path_to_mp3 = utils.ffmpeg__to_mp3(path_to_wav, 48) - return path_to_mp3 diff --git a/narrator/telegram/worker.py b/narrator/telegram/worker.py index 42ba5df..9c161a1 100644 --- a/narrator/telegram/worker.py +++ b/narrator/telegram/worker.py @@ -63,7 +63,7 @@ def check_validity(self) -> ValidityCheckResult: async def produce_audio_file(self, directory: str) -> str: text: Text = self._url.parse() - # balcon reads only utf-8-sig + # balcon.exe only works with UTF-8-BOM (or "utf-8-sig") txt_path = text.save_to_txt(directory, encoding="utf-8-sig") wav_path = balcon(txt_path) mp3_path = ffmpeg__to_mp3(wav_path)