control concurrent requests; a little optimization; upgrade python and deps
MarshalX committed Jun 2, 2024
1 parent 463cb40 commit 1f8a267
Showing 4 changed files with 50 additions and 31 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/make_tracked_links_list.yml
@@ -18,14 +18,14 @@ jobs:
steps:

- name: Clone.
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
token: ${{ secrets.PAT }}

- name: Setup Python.
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: 3.12

- name: Install dependencies.
run: |
67 changes: 44 additions & 23 deletions make_tracked_links_list.py
@@ -2,7 +2,9 @@
import logging
import os
import re
from asyncio import Queue
from asyncio.exceptions import TimeoutError
from functools import cache
from html import unescape
from time import time
from typing import Set
@@ -11,6 +13,7 @@
import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError


PROTOCOL = 'https://'
BASE_URL = 'telegram.org'
# it's necessary to help the crawler find more links
@@ -21,11 +24,12 @@
'corefork.telegram.org/getProxyConfig',

'telegram.org/privacy/gmailbot',
'telegram.org/tos',
'telegram.org/tos/mini-apps',
'telegram.org/tos/p2pl',
'telegram.org/tour',
'telegram.org/evolution',
'telegram.org/tos/bots',
'telegram.org/tos/business',

'desktop.telegram.org/changelog',
'td.telegram.org/current',
@@ -133,6 +137,8 @@
r'apps$',
r'img/emoji/.+',
r'img/StickerExample.psd$',
r'/privacy$', # geolocation dependent
r'/tos$', # geolocation dependent
},
},
'webz.telegram.org': {
@@ -180,15 +186,19 @@
'TE': 'trailers',
}

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

VISITED_LINKS = set()
LINKS_TO_TRACK = set()
LINKS_TO_TRANSLATIONS = set()
LINKS_TO_TRACKABLE_RESOURCES = set()

WORKERS_COUNT = 30
WORKERS_TASK_QUEUE = Queue()


@cache
def should_exclude(url: str) -> bool:
direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
domain_rules = CRAWL_RULES.get(direct_link)
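The logging setup also changes here: the default level moves from DEBUG to INFO, the format gains a timestamp and level name, and, as seen throughout the diff below, hot-path calls switch from f-strings to %-style arguments. The difference is that an f-string is rendered eagerly even when the record is filtered out, while %-style interpolation is deferred until a handler actually emits the record. A minimal illustration (values invented):

import logging

logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

count, url = 42, 'telegram.org/apps'

logger.debug(f'[{count}] Process {url}')     # message string is built, then discarded
logger.debug('[%s] Process %s', count, url)  # interpolation is skipped at INFO level

Note that lazy formatting defers only the string interpolation; the arguments themselves are still evaluated, so it pays off most when the message is formatted often at a disabled level.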
@@ -210,6 +220,9 @@ def should_exclude(url: str) -> bool:
exclude = False
break

if exclude:
logger.debug('Exclude %s by rules', url)

return exclude
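should_exclude is now memoized with functools.cache, so the regex rule matching runs once per distinct URL even though the crawler asks about the same links from many pages; this is presumably the "little optimization" of the commit title. A small sketch of the effect, with a made-up rule set:

import re
from functools import cache

EXCLUDE_PATTERNS = (r'/privacy$', r'/tos$')  # hypothetical rules for illustration

@cache
def should_exclude(url: str) -> bool:
    # the regexes run only on a cache miss; repeats are dictionary lookups
    return any(re.search(pattern, url) for pattern in EXCLUDE_PATTERNS)

for url in ('telegram.org/tos', 'telegram.org/apps', 'telegram.org/tos'):
    should_exclude(url)

print(should_exclude.cache_info())  # CacheInfo(hits=1, misses=2, maxsize=None, currsize=2)

The trade-off is an unbounded cache keyed by the URL string, which is acceptable for a single crawl run that keeps every URL in VISITED_LINKS anyway.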


@@ -254,7 +267,7 @@ def find_relative_scripts(code: str, cur_link: str) -> Set[str]:
# dirty magic for specific cases
if '/' in link: # path to file from the root
url = f'{direct_cur_link}/{link}'
else: # its relative link from current folder. Not from the root
else: # it is a relative link from the current folder, not from the root
current_folder_link, *_ = cur_link.rsplit('/', 1)
url = f'{current_folder_link}/{link}'

@@ -341,17 +354,18 @@ class ServerSideError(Exception):
pass


async def crawl(url: str, session: aiohttp.ClientSession):
while True:
async def crawl_worker(session: aiohttp.ClientSession):
while not WORKERS_TASK_QUEUE.empty():
url = WORKERS_TASK_QUEUE.get_nowait()

try:
await _crawl(url, session)
except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
logger.warning(f'Client or timeout error. Retrying {url}')

WORKERS_TASK_QUEUE.put_nowait(url)
if url in VISITED_LINKS:
VISITED_LINKS.remove(url)
else:
break


async def _crawl(url: str, session: aiohttp.ClientSession):
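This is the heart of "control concurrent requests": the old crawl() scheduled one recursive task per discovered link via asyncio.gather, so concurrency grew without bound, while the new crawl_worker drains a shared queue, and start() (further below) launches exactly WORKERS_COUNT of them, re-queueing a URL after a transient network error. A self-contained sketch of the same pattern, with fetch() standing in for _crawl() and the visited-set guard folded into the worker for brevity:

import asyncio

WORKERS_COUNT = 30
TASK_QUEUE: asyncio.Queue = asyncio.Queue()
VISITED: set = set()

async def fetch(url: str) -> None:
    await asyncio.sleep(0)  # stand-in for the real HTTP request

async def worker() -> None:
    # at most WORKERS_COUNT of these run at once, bounding concurrency
    while not TASK_QUEUE.empty():
        url = TASK_QUEUE.get_nowait()
        if url in VISITED:
            continue
        VISITED.add(url)
        try:
            await fetch(url)
        except (asyncio.TimeoutError, ConnectionError):
            VISITED.discard(url)        # forget the visit...
            TASK_QUEUE.put_nowait(url)  # ...and retry the URL later

async def main(seed):
    for url in seed:
        TASK_QUEUE.put_nowait(url)
    await asyncio.gather(*(worker() for _ in range(WORKERS_COUNT)))

asyncio.run(main(['telegram.org', 'core.telegram.org']))

One caveat of the while-not-empty loop shape: a worker exits the moment the queue is briefly empty, even if sibling workers are about to enqueue more sub-links; Queue.join() paired with task_done() is the usual alternative when that matters.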
@@ -360,7 +374,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
VISITED_LINKS.add(url)

try:
logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
logger.debug('[%s] Process %s', len(VISITED_LINKS), url)
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
content_type = response.headers.get('content-type')

@@ -372,20 +386,20 @@
if response.status not in {200, 304}:
if response.status != 302:
content = await response.text(encoding='UTF-8')
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
logger.warning(f'Skip {url} because status code == {response.status}. Content: {content}')
return

if is_textable_content_type(content_type):
# raw content will be cached by aiohttp. Don't worry about it
# aiohttp will cache the raw content, so we don't need to worry about it
raw_content = await response.read()
content = await response.text(encoding='UTF-8')

if is_translation_url(url):
LINKS_TO_TRANSLATIONS.add(url)
logger.info(f'add {url} to LINKS_TO_TRANSLATIONS')
logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url)
else:
LINKS_TO_TRACK.add(url)
logger.info(f'add {url} to LINKS_TO_TRACK')
logger.debug('Add %s to LINKS_TO_TRACK', url)

absolute_links = cleanup_links(find_absolute_links(content))

@@ -396,33 +410,40 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
relative_links = cleanup_links(relative_links_finder(content, url))

sub_links = absolute_links | relative_links
await asyncio.gather(*[crawl(url, session) for url in sub_links])
for sub_url in sub_links:
if sub_url not in VISITED_LINKS:
WORKERS_TASK_QUEUE.put_nowait(sub_url)
elif is_trackable_content_type(content_type):
LINKS_TO_TRACKABLE_RESOURCES.add(url)
logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES')
logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url)
else:
# for example, a zip with an update of the macOS client
logger.info(f'Unhandled type: {content_type} from {url}')
logger.warning(f'Unhandled type: {content_type} from {url}')

# telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
# so this is a problem when we have random behavior with link will be added
# this if resolve this issue. If available both link we prefer without trailing slash
# telegram urls can work with and without a trailing slash (no redirect).
# note: not on every subdomain ;d
# this causes random behavior in which form of a link gets added.
# this block resolves the issue:
# when both forms are available, we prefer the one without a trailing slash
for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
without_trailing_slash = url[:-1:] if url.endswith('/') else url
if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set:
links_set.remove(f'{without_trailing_slash}/')
logger.info(f'remove {without_trailing_slash}/')
logger.debug('Remove %s/', without_trailing_slash)
except UnicodeDecodeError:
logger.warning(f'Codec can\'t decode bytes. Most likely a tgs file or a response with a broken content type: {url}')

if raw_content.startswith(b'GIF'):
LINKS_TO_TRACKABLE_RESOURCES.add(url)
logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES (raw content)')
logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw content)', url)


async def start(url_list: Set[str]):
for url in url_list:
WORKERS_TASK_QUEUE.put_nowait(url)

async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
await asyncio.gather(*[crawl(url, session) for url in url_list])
await asyncio.gather(*[crawl_worker(session) for _ in range(WORKERS_COUNT)])


if __name__ == '__main__':
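The trailing-slash cleanup above keeps the result sets free of pairs that differ only by a final '/'. A tiny sketch of the normalization rule, with made-up links:

def prefer_no_trailing_slash(links: set) -> None:
    # when both 'x' and 'x/' were collected, drop 'x/'
    for link in [l for l in links if l.endswith('/')]:
        if link[:-1] in links:
            links.discard(link)

tracked = {'telegram.org/apps', 'telegram.org/apps/', 'telegram.org/faq/'}
prefer_no_trailing_slash(tracked)
print(tracked)  # {'telegram.org/apps', 'telegram.org/faq/'}; the lone 'faq/' survives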
@@ -443,8 +464,8 @@ async def start(url_list: Set[str]):
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS

logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
logger.info(f'Added: {CURRENT_URL_LIST - OLD_URL_LIST}')
logger.info(f'Deleted ({len(OLD_URL_LIST - CURRENT_URL_LIST)}): {OLD_URL_LIST - CURRENT_URL_LIST}')
logger.info(f'Added ({len(CURRENT_URL_LIST - OLD_URL_LIST)}): {CURRENT_URL_LIST - OLD_URL_LIST}')
except IOError:
pass
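The last hunk only enriches the summary log with counts; the comparison itself is plain set arithmetic. A toy example with invented URLs:

OLD_URL_LIST = {'telegram.org/tos', 'telegram.org/apps'}
CURRENT_URL_LIST = {'telegram.org/apps', 'telegram.org/tos/bots'}

deleted = OLD_URL_LIST - CURRENT_URL_LIST  # present in the old snapshot only
added = CURRENT_URL_LIST - OLD_URL_LIST    # present in the new snapshot only
print(f'Deleted ({len(deleted)}): {deleted}')  # Deleted (1): {'telegram.org/tos'}
print(f'Added ({len(added)}): {added}')        # Added (1): {'telegram.org/tos/bots'}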

6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,9 +1,9 @@
aiohttp==3.7.4.post0
aiodns==3.0.0
aiohttp==3.9.5
aiodns==3.2.0
aiofiles==0.6.0
git+https://github.com/MarshalX/pyrogram
TgCrypto==1.2.3
beautifulsoup4==4.11.1
cssutils==2.4.2
requests==2.31.0
# uvloop==0.16.0
# uvloop==0.19.0
2 changes: 0 additions & 2 deletions tracked_links.txt
@@ -7558,11 +7558,9 @@ telegram.org/js/tgsticker-worker.js
telegram.org/js/tgsticker.js
telegram.org/js/widget-frame.js
telegram.org/press
telegram.org/privacy
telegram.org/privacy/gmailbot
telegram.org/support
telegram.org/t.me/PremiumBot
telegram.org/tos
telegram.org/tos/bot-developers
telegram.org/tos/bots
telegram.org/tos/business
