-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
108 lines (95 loc) · 3.39 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import argparse
import glob
import os
import subprocess
# Hard-coded data directory and the JSON file paths shared by the fetch steps.
# Directory scanned for *.json files when --force-refetch is given.
DATA_DIR = "./data"
# Passed as --output of fetch_post_list.py (and as its --diff-input in update mode).
POST_LIST = f"{DATA_DIR}/post_list.json"
# Passed as --diff-output of fetch_post_list.py; --input of fetch_post_content.py in update mode.
POST_LIST_DIFF = f"{DATA_DIR}/post_list.diff.json"
# Passed as --output of fetch_post_content.py (and as its --diff-input in update mode).
POST_CONTENT = f"{DATA_DIR}/post_content.json"
# Passed as --diff-output of fetch_post_content.py; --input of fetch_post_file.py in update mode.
POST_CONTENT_DIFF = f"{DATA_DIR}/post_content.diff.json"
def fetch_post_list(is_update: bool) -> None:
    """Run ``fetch_post_list.py`` as a child process.

    Reads the module-level ``args`` namespace (bound in the ``__main__``
    block) for ``base_url``, ``type_list``, ``concurrency`` and ``timeout``.

    Args:
        is_update: When true, additionally pass ``--update`` together with
            ``--diff-input POST_LIST`` and ``--diff-output POST_LIST_DIFF``.
            When false, only the common options are passed.
    """
    command = ["python", "fetch_post_list.py"]
    if is_update:
        # Update-only options go first, matching the original argument order.
        command += [
            "--update",
            "--diff-input", POST_LIST,
            "--diff-output", POST_LIST_DIFF,
        ]
    command += [
        "--output", POST_LIST,
        "--base-url", args.base_url,
        "--type-list", args.type_list,
        "--concurrency", str(args.concurrency),
        "--timeout", str(args.timeout),
    ]
    # Pass the argv list directly (no shell) so that spaces or shell
    # metacharacters in user-supplied values such as --base-url cannot split
    # or inject into the command line. The exit status is deliberately not
    # enforced, matching the previous os.system-based behavior.
    subprocess.run(command, check=False)
def fetch_post_content(is_update: bool) -> None:
    """Run ``fetch_post_content.py`` as a child process.

    Reads the module-level ``args`` namespace (bound in the ``__main__``
    block) for ``base_url``, ``concurrency`` and ``timeout``.

    Args:
        is_update: When true, pass ``--update``, read from ``POST_LIST_DIFF``
            and add ``--diff-input POST_CONTENT`` /
            ``--diff-output POST_CONTENT_DIFF``. When false, read from
            ``POST_LIST``. ``--output`` is ``POST_CONTENT`` in both modes.
    """
    command = ["python", "fetch_post_content.py"]
    if is_update:
        command += [
            "--update",
            "--input", POST_LIST_DIFF,
            "--output", POST_CONTENT,
            "--diff-input", POST_CONTENT,
            "--diff-output", POST_CONTENT_DIFF,
        ]
    else:
        command += [
            "--input", POST_LIST,
            "--output", POST_CONTENT,
        ]
    command += [
        "--base-url", args.base_url,
        "--concurrency", str(args.concurrency),
        "--timeout", str(args.timeout),
    ]
    # Pass the argv list directly (no shell) so that spaces or shell
    # metacharacters in user-supplied values such as --base-url cannot split
    # or inject into the command line. The exit status is deliberately not
    # enforced, matching the previous os.system-based behavior.
    subprocess.run(command, check=False)
def fetch_post_file(is_update: bool) -> None:
    """Run ``fetch_post_file.py`` as a child process.

    Reads the module-level ``args`` namespace (bound in the ``__main__``
    block) for ``base_url`` and ``concurrency``.

    Args:
        is_update: When true, ``--input`` is ``POST_CONTENT_DIFF``; otherwise
            it is ``POST_CONTENT``. All other options are identical.
    """
    # The two modes differ only in which content file is handed to the script.
    input_path = POST_CONTENT_DIFF if is_update else POST_CONTENT
    command = [
        "python", "fetch_post_file.py",
        "--base-url", args.base_url,
        "--input", input_path,
        "--concurrency", str(args.concurrency),
    ]
    # Pass the argv list directly (no shell) so that spaces or shell
    # metacharacters in user-supplied values such as --base-url cannot split
    # or inject into the command line. The exit status is deliberately not
    # enforced, matching the previous os.system-based behavior.
    subprocess.run(command, check=False)
def main():
    """Drive the fetch pipeline: post list, then post content, then post files.

    Reads the module-level ``args`` namespace. With ``force_refetch`` set, the
    user is prompted and every ``*.json`` directly under ``DATA_DIR`` is
    removed first. The run is treated as an update only when both
    ``POST_LIST`` and ``POST_CONTENT`` already exist; otherwise it is an
    initial fetch. The file step is skipped when ``skip_file`` is set.
    """
    if args.force_refetch:
        # Blocks until the user presses Enter, then wipes the cached JSON.
        input(f"This will delete all json in {DATA_DIR}, press Enter to continue")
        for stale_json in glob.glob(os.path.join(DATA_DIR, '*.json')):
            os.remove(stale_json)

    # Update mode requires both outputs of a previous run to be present.
    is_update = all(os.path.exists(path) for path in (POST_LIST, POST_CONTENT))
    print(f"Start {'update' if is_update else 'initial'} fetching")

    fetch_post_list(is_update)
    fetch_post_content(is_update)
    if args.skip_file:
        return
    fetch_post_file(is_update)
if __name__ == "__main__":
    # Command-line options, registered in the order they appear in --help.
    option_specs = (
        ("--base-url", {"type": str, "default": "http://i.whut.edu.cn"}),
        ("--type-list", {"type": str, "default": "xxtg,xytg,bmxw,lgjz"}),
        ("--concurrency", {"type": int, "default": 32}),
        ("--force-refetch", {"action": "store_true"}),
        ("--timeout", {"type": int, "default": 5}),
        ("--skip-file", {"action": "store_true"}),
    )
    cli = argparse.ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    # `args` is intentionally module-level: main() and the fetch_* helpers
    # read it as a global.
    args = cli.parse_args()
    main()