client.py
#!/usr/bin/env python3
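"""Client entry point for pgcrawl.

Crawls a single Tranco-ranked site with pagegraph-crawl, or queries a graph
previously recorded by the `crawl` subcommand.

Example invocations (URL and bucket name are illustrative):
    ./client.py crawl --url https://example.com --rank 1 --s3-bucket my-graphs
    ./client.py query --url https://example.com --rank 1 --s3-bucket my-graphs
"""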
import argparse
import json
import sys
import pgcrawl
from pgcrawl.client.args import DEFAULT_CRAWL_ARGS, ClientCrawlArgs
from pgcrawl.client.args import DEFAULT_QUERY_ARGS, ClientQueryArgs
from pgcrawl.client.commands import crawl_url, query_graph
from pgcrawl.logging import add_logger_argument, Logger
from pgcrawl.types import JSONDict


def crawl_cmd(args: argparse.Namespace, logger: Logger) -> bool:
    """Record a PageGraph graph of the given URL and write it to S3."""
    crawl_args = ClientCrawlArgs(args.client_code_path, args.binary_path,
                                 args.s3_bucket, args.seconds,
                                 args.timeout)
    return crawl_url(args.url, args.rank, crawl_args, logger)


def query_cmd(args: argparse.Namespace, logger: Logger) -> JSONDict | bool:
    """Query a graph previously recorded for the given URL and rank."""
    query_args = ClientQueryArgs(args.s3_bucket, args.timeout)
    return query_graph(args.url, args.rank, query_args, logger)


PARSER = argparse.ArgumentParser(
    prog=f"{pgcrawl.NAME}: client",
    description="Script responsible for crawling a site, as directed by a "
                "dispatching instance of this script, or for querying a graph.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
SUBPARSERS = PARSER.add_subparsers(required=True)
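
# The `crawl` subcommand: record a Tranco-ranked site with pagegraph-crawl
# and write the resulting graph to S3.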
CLIENT_CRAWL_PARSER = SUBPARSERS.add_parser(
"crawl",
help="Crawl a tranco ranked site using pagegraph-crawl.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
CLIENT_CRAWL_PARSER.add_argument(
"-r", "--rank",
required=True,
type=int,
help="The Tranco rank of the site at the time the list was generated.")
CLIENT_CRAWL_PARSER.add_argument(
"-u", "--url",
required=True,
help="The URL to record in PageGraph (must be a fully formed URL).")
CLIENT_CRAWL_PARSER.add_argument(
"-s", "--seconds",
type=int,
default=DEFAULT_CRAWL_ARGS.pagegraph_secs,
help="Number of seconds let the page execute before requesting the graph.")
CLIENT_CRAWL_PARSER.add_argument(
"-t", "--timeout",
type=int,
default=DEFAULT_CRAWL_ARGS.timeout,
help="The maxim number of seconds we'll wait, after requesting the graph, "
"before killing the process and reporting an error.")
CLIENT_CRAWL_PARSER.add_argument(
"--binary-path", "-b",
default=DEFAULT_CRAWL_ARGS.binary_path,
help="The binary to use when calling pagegraph-crawl.")
CLIENT_CRAWL_PARSER.add_argument(
"--s3-bucket",
default=DEFAULT_CRAWL_ARGS.s3_bucket,
help="The S3 bucket to write the resulting graphs into.")
CLIENT_CRAWL_PARSER.add_argument(
"--client-code-path",
default=pgcrawl.DEFAULT_CLIENT_CODE_PATH,
help="Path to pagegraph-tranco-crawl code on the client.")
add_logger_argument(CLIENT_CRAWL_PARSER)
CLIENT_CRAWL_PARSER.set_defaults(func=crawl_cmd)
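
# The `query` subcommand: query a graph previously recorded by `crawl`.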
QUERY_PARSER = SUBPARSERS.add_parser(
"query",
help="Query a graph generated by pagegraph-crawl.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
QUERY_PARSER.add_argument(
"-u", "--url",
required=True,
help="The URL that was recorded using the `crawl` command.")
QUERY_PARSER.add_argument(
"-r", "--rank",
required=True,
type=int,
help="The Tranco rank of the site at the time the list was generated.")
QUERY_PARSER.add_argument(
"-t", "--timeout",
type=int,
default=DEFAULT_QUERY_ARGS.timeout,
help="The maxim number of seconds we'll wait before assuming a query "
"went wrong.")
QUERY_PARSER.add_argument(
"--s3-bucket",
default=DEFAULT_QUERY_ARGS.s3_bucket,
help="The S3 bucket to write the resulting graphs into.")
add_logger_argument(QUERY_PARSER)
QUERY_PARSER.set_defaults(func=query_cmd)
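
# Parse arguments and dispatch to the handler registered via set_defaults above.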
try:
    ARGS = PARSER.parse_args()
    LOGGER = Logger(ARGS.log_level)
    RESULT = ARGS.func(ARGS, LOGGER)
except ValueError as e:
    print(f"Invalid argument: {e}", file=sys.stderr)
    sys.exit(1)

# A falsy result means the command failed; otherwise print the result as JSON.
if not RESULT:
    sys.exit(1)
else:
    print(json.dumps(RESULT))
    sys.exit(0)