Skip to content

Commit

Permalink
feat: allow setting custom user agents
Browse files (browse the repository at this point in the history)
  • Loading branch information
vladiliescu committed Jan 29, 2025
1 parent 1b40c4a commit 7f064a2
Showing 1 changed file with 21 additions and 6 deletions.
27 changes: 21 additions & 6 deletions grabit.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -78,14 +78,15 @@ def can_handle(self, url: str) -> bool:
def grab(
self,
url: str,
user_agent: str,
use_readability_js: bool,
fallback_title: str,
render_flags: RenderFlags,
output_formats: list[OutputFormat],
) -> (str, dict[OutputFormat, str]):
outputs = {}

html_content = download_html_content(url)
html_content = download_html_content(url, user_agent)
if should_output_raw_html(output_formats):
outputs[OutputFormat.RAW_HTML] = html_content

Expand Down Expand Up @@ -141,6 +142,7 @@ def can_handle(self, url: str) -> bool:
def grab(
self,
url: str,
user_agent: str,
use_readability_js: bool,
fallback_title: str,
render_flags: RenderFlags,
Expand All @@ -156,7 +158,7 @@ def grab(
outputs = {}

json_url = self._convert_to_json_url(url)
json_content = json.loads(download_html_content(json_url))
json_content = json.loads(download_html_content(json_url, user_agent))

title = json_content[0]["data"]["children"][0]["data"].get("title", None)
title = self.post_process_title(title, fallback_title)
Expand Down Expand Up @@ -225,6 +227,12 @@ def parse_comments(comments_data, depth=0):

@click.command()
@click.argument("url")
@click.option(
"--user-agent",
default=f"Grabit/{VERSION}",
help="The user agent reported when retrieving web pages",
show_default=True,
)
@click.version_option(
version=VERSION,
prog_name="Grabit",
Expand Down Expand Up @@ -282,11 +290,12 @@ def parse_comments(comments_data, depth=0):
[fmt.value for fmt in OutputFormat],
case_sensitive=False,
),
help="Output format(s) to save the content in. Can be specified multiple times i.e. -f md -f html",
help="Which output format(s) to use when saving the content. Can be specified multiple times i.e. -f md -f html",
show_default=True,
)
def save(
url: str,
user_agent: str,
use_readability_js: bool,
yaml_frontmatter: bool,
include_title: bool,
Expand Down Expand Up @@ -314,7 +323,7 @@ def save(
overwrite=overwrite,
)

title, outputs = grabber.grab(url, use_readability_js, fallback_title, render_flags, output_formats)
title, outputs = grabber.grab(url, user_agent, use_readability_js, fallback_title, render_flags, output_formats)
output(title, outputs, url, output_flags)


Expand Down Expand Up @@ -440,9 +449,15 @@ def extract_readable_content_and_title(html_content, use_readability_js):
)


def download_html_content(url):
def download_html_content(url, user_agent: str) -> str:
try:
response = requests.get(url)
request_headers = {
"User-Agent": user_agent,
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
}

response = requests.get(url, headers=request_headers)
response.raise_for_status()
html_content = response.text
except RequestException as e:
Expand Down

0 comments on commit 7f064a2

Please sign in to comment.