From ddf8db54c6abe62175994ad89b347a87c50c0a74 Mon Sep 17 00:00:00 2001 From: Shabbir Hasan <68828793+ShabbirHasan1@users.noreply.github.com> Date: Wed, 27 Dec 2023 11:29:21 +0530 Subject: [PATCH] Update update_user_agents.py --- update_user_agents.py | 77 ++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/update_user_agents.py b/update_user_agents.py index 1d9b476..abb990e 100644 --- a/update_user_agents.py +++ b/update_user_agents.py @@ -3,73 +3,82 @@ from github import Github from lxml import html -user_agents_file_name = 'user_agents.json' -user_agents_file_path = os.path.join( - os.path.dirname(__file__), user_agents_file_name) +user_agents_file_name = "user_agents.json" +user_agents_file_path = os.path.join(os.path.dirname(__file__), user_agents_file_name) _os_field_include_patterns = [ - re.compile(r'^windows nt \d+\.\d+$', flags=re.IGNORECASE), - re.compile(r'^macintosh$', flags=re.IGNORECASE), - re.compile(r'^linux (x86_64|i686)$', flags=re.IGNORECASE), + re.compile(r"^windows nt \d+\.\d+$", flags=re.IGNORECASE), + re.compile(r"^macintosh$", flags=re.IGNORECASE), + re.compile(r"^linux (x86_64|i686)$", flags=re.IGNORECASE), ] _os_field_exclude_patterns = [ - re.compile(r'\bwindows mobile\b', flags=re.IGNORECASE), - re.compile(r'\bxbox\b', flags=re.IGNORECASE), - re.compile(r'\biphone\b', flags=re.IGNORECASE), - re.compile(r'\bipad\b', flags=re.IGNORECASE), - re.compile(r'\bipod\b', flags=re.IGNORECASE), - re.compile(r'\bandroid\b', flags=re.IGNORECASE), + re.compile(r"\bwindows mobile\b", flags=re.IGNORECASE), + re.compile(r"\bxbox\b", flags=re.IGNORECASE), + re.compile(r"\biphone\b", flags=re.IGNORECASE), + re.compile(r"\bipad\b", flags=re.IGNORECASE), + re.compile(r"\bipod\b", flags=re.IGNORECASE), + re.compile(r"\bandroid\b", flags=re.IGNORECASE), ] _saved_user_agents = None + def get_saved_user_agents(): global _saved_user_agents if _saved_user_agents is None: - with open(user_agents_file_path, 'r') as f: + with open(user_agents_file_path, "r") as f: _saved_user_agents = json.load(f) return _saved_user_agents + def get_latest_user_agents(): - user_agents = [] - base_url = 'https://www.whatismybrowser.com/guides/the-latest-user-agent/' - for browser in ('chrome', 'firefox', 'safari', 'edge'): + user_agents, session = [], requests.session() + session.headers.update( + { + "authority": "www.whatismybrowser.com", + "referer": "https://www.whatismybrowser.com/", + "User-Agent": random.choice(get_saved_user_agents()), + } + ) + base_url = "https://www.whatismybrowser.com/guides/the-latest-user-agent/" + for browser in ("chrome", "firefox", "safari", "edge"): time.sleep(1) - response = requests.get( - ''.join((base_url, browser)), - headers={'User-Agent': random.choice(get_saved_user_agents())}, - ) - elems = html.fromstring(response.text).cssselect('td li span.code') + response = session.get("".join((base_url, browser))) + elems = html.fromstring(response.text).cssselect("td li span.code") browser_uas = [] for elem in elems: ua = elem.text_content().strip() - if not ua.startswith('Mozilla/5.0 ('): + if not ua.startswith("Mozilla/5.0 ("): continue browser_uas.append(ua) for ua in browser_uas: - os_type = ua[len('Mozilla/5.0 ('):ua.find(')')].lower() - os_fields = [p.strip() for p in os_type.split(';')] + os_type = ua[len("Mozilla/5.0 (") : ua.find(")")].lower() + os_fields = [p.strip() for p in os_type.split(";")] - if any(p.match(f) for p, f in product( - _os_field_exclude_patterns, os_fields)): + if any( + p.match(f) for p, f in product(_os_field_exclude_patterns, os_fields) + ): continue - if any(p.match(f) for p, f in product( - _os_field_include_patterns, os_fields)): + if any( + p.match(f) for p, f in product(_os_field_include_patterns, os_fields) + ): user_agents.append(ua) return user_agents + def json_dump(obj): - return json.dumps(obj, indent=4).strip() + '\n' + return json.dumps(obj, indent=4).strip() + "\n" + def update_files_on_github(new_user_agents_json): - gh = Github(os.environ['GITHUB_TOKEN']) - repo = gh.get_repo(os.environ['GITHUB_REPOSITORY']) -# for branch in ('main', 'gh-pages'): - branch = 'main' + gh = Github(os.environ["GITHUB_TOKEN"]) + repo = gh.get_repo(os.environ["GITHUB_REPOSITORY"]) + # for branch in ('main', 'gh-pages'): + branch = "main" f = repo.get_contents(user_agents_file_name, ref=branch) repo.update_file( f.path, - message=f'Update {user_agents_file_name} on {branch} branch', + message=f"Update {user_agents_file_name} on {branch} branch", content=new_user_agents_json, sha=f.sha, branch=branch,