client.py
#!/usr/bin/env python3
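"""Client entry point for pgcrawl.

Crawls a single Tranco-ranked site with pagegraph-crawl, or queries a graph
previously recorded by the `crawl` subcommand.

Example invocations (URL and bucket name are illustrative):
    ./client.py crawl --url https://example.com --rank 1 --s3-bucket my-graphs
    ./client.py query --url https://example.com --rank 1 --s3-bucket my-graphs
"""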
import argparse
import json
import sys
import pgcrawl
from pgcrawl.client.args import DEFAULT_CRAWL_ARGS, ClientCrawlArgs
from pgcrawl.client.args import DEFAULT_QUERY_ARGS, ClientQueryArgs
from pgcrawl.client.commands import crawl_url, query_graph
from pgcrawl.logging import add_logger_argument, Logger
from pgcrawl.types import JSONDict


def crawl_cmd(args: argparse.Namespace, logger: Logger) -> bool:
    """Record a PageGraph graph of the given URL and write it to S3."""
    crawl_args = ClientCrawlArgs(args.client_code_path, args.binary_path,
                                 args.s3_bucket, args.seconds,
                                 args.timeout)
    return crawl_url(args.url, args.rank, crawl_args, logger)


def query_cmd(args: argparse.Namespace, logger: Logger) -> JSONDict | bool:
    """Query a graph previously recorded for the given URL and rank."""
    query_args = ClientQueryArgs(args.s3_bucket, args.timeout)
    return query_graph(args.url, args.rank, query_args, logger)


PARSER = argparse.ArgumentParser(
    prog=f"{pgcrawl.NAME}: client",
    description="Script responsible for crawling a site, as directed by a "
                "dispatching instance of this script, or for querying a graph.",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
SUBPARSERS = PARSER.add_subparsers(required=True)
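
# The `crawl` subcommand: record a Tranco-ranked site with pagegraph-crawl
# and write the resulting graph to S3.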
CLIENT_CRAWL_PARSER = SUBPARSERS.add_parser(
"crawl",
help="Crawl a tranco ranked site using pagegraph-crawl.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
CLIENT_CRAWL_PARSER.add_argument(
"-r", "--rank",
required=True,
type=int,
help="The Tranco rank of the site at the time the list was generated.")
CLIENT_CRAWL_PARSER.add_argument(
"-u", "--url",
required=True,
help="The URL to record in PageGraph (must be a fully formed URL).")
CLIENT_CRAWL_PARSER.add_argument(
"-s", "--seconds",
type=int,
default=DEFAULT_CRAWL_ARGS.pagegraph_secs,
help="Number of seconds let the page execute before requesting the graph.")
CLIENT_CRAWL_PARSER.add_argument(
"-t", "--timeout",
type=int,
default=DEFAULT_CRAWL_ARGS.timeout,
help="The maxim number of seconds we'll wait, after requesting the graph, "
"before killing the process and reporting an error.")
CLIENT_CRAWL_PARSER.add_argument(
"--binary-path", "-b",
default=DEFAULT_CRAWL_ARGS.binary_path,
help="The binary to use when calling pagegraph-crawl.")
CLIENT_CRAWL_PARSER.add_argument(
"--s3-bucket",
default=DEFAULT_CRAWL_ARGS.s3_bucket,
help="The S3 bucket to write the resulting graphs into.")
CLIENT_CRAWL_PARSER.add_argument(
"--client-code-path",
default=pgcrawl.DEFAULT_CLIENT_CODE_PATH,
help="Path to pagegraph-tranco-crawl code on the client.")
add_logger_argument(CLIENT_CRAWL_PARSER)
CLIENT_CRAWL_PARSER.set_defaults(func=crawl_cmd)
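
# The `query` subcommand: query a graph previously recorded by `crawl`.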
QUERY_PARSER = SUBPARSERS.add_parser(
"query",
help="Query a graph generated by pagegraph-crawl.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
QUERY_PARSER.add_argument(
"-u", "--url",
required=True,
help="The URL that was recorded using the `crawl` command.")
QUERY_PARSER.add_argument(
"-r", "--rank",
required=True,
type=int,
help="The Tranco rank of the site at the time the list was generated.")
QUERY_PARSER.add_argument(
"-t", "--timeout",
type=int,
default=DEFAULT_QUERY_ARGS.timeout,
help="The maxim number of seconds we'll wait before assuming a query "
"went wrong.")
QUERY_PARSER.add_argument(
"--s3-bucket",
default=DEFAULT_QUERY_ARGS.s3_bucket,
help="The S3 bucket to write the resulting graphs into.")
add_logger_argument(QUERY_PARSER)
QUERY_PARSER.set_defaults(func=query_cmd)
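
# Parse arguments and dispatch to the handler registered via set_defaults above.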
try:
    ARGS = PARSER.parse_args()
    LOGGER = Logger(ARGS.log_level)
    RESULT = ARGS.func(ARGS, LOGGER)
except ValueError as e:
    print(f"Invalid argument: {e}", file=sys.stderr)
    sys.exit(1)

# A falsy result means the command failed; otherwise print the result as JSON.
if not RESULT:
    sys.exit(1)
else:
    print(json.dumps(RESULT))
    sys.exit(0)