-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph.py
308 lines (257 loc) · 10.1 KB
/
graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
import sys
import io
import duckdb
import pyarrow.csv as csv
import matplotlib.pyplot as plt
import surt
import duck_utils
import graph_utils
import utils
# SQL templates, filled in with str.format by get_values().
# {cols} is a comma-separated column list; {and_tld} is either an empty
# string or an " AND url_host_tld = '...'" filter clause.

# single host: one row per crawl, no aggregation
host_sql = '''
SELECT
{cols}
FROM host_index
WHERE surt_host_name = '{surt_host_name}'{and_tld}
ORDER BY crawl ASC
'''
# whole domain: {surt_host_name} is given a trailing '%' by the caller so
# LIKE matches every subdomain; caller supplies SUM(...) columns for the
# GROUP BY crawl
subdomain_sql = '''
SELECT
{cols}
FROM host_index
WHERE surt_host_name LIKE '{surt_host_name}'{and_tld}
GROUP BY crawl
ORDER BY crawl ASC
'''
# explicit list of hosts ({surt_list}), aggregated per crawl like above
many_host_sql = '''
SELECT
{cols}
FROM host_index
WHERE contains(ARRAY [{surt_list}], surt_host_name){and_tld}
GROUP BY crawl
ORDER BY crawl ASC
'''
# Column sets for single-host graphs, keyed by graph name.
host_columns = {
    # list order does matter for the graphs
    'rank': ('crawl', 'fetch_200', 'fetch_200_lote', 'prank10', 'hcrank10'),
    'fetch': ('crawl', 'fetch_200', 'fetch_gone', 'fetch_redirPerm', 'fetch_redirTemp', 'fetch_notModified', 'fetch_3xx', 'fetch_4xx', 'fetch_5xx', 'fetch_other'),
    'pagesize': ('crawl', 'warc_record_length_av', 'warc_record_length_median'),
    'nutch': ('crawl', 'nutch_numRecords', 'nutch_fetched', 'nutch_unfetched', 'nutch_gone', 'nutch_redirTemp', 'nutch_redirPerm', 'nutch_notModified'),
    'nutch_pct': ('crawl', 'nutch_fetched_pct', 'nutch_unfetched_pct', 'nutch_gone_pct', 'nutch_redirTemp_pct', 'nutch_redirPerm_pct', 'nutch_notModified_pct'),
    'nutch_all': ('crawl', 'nutch_numRecords', 'nutch_fetched', 'nutch_unfetched', 'nutch_gone', 'nutch_redirTemp', 'nutch_redirPerm', 'nutch_notModified',
                  'nutch_fetched_pct', 'nutch_unfetched_pct', 'nutch_gone_pct', 'nutch_redirTemp_pct', 'nutch_redirPerm_pct', 'nutch_notModified_pct'),
    'robots': ('crawl', 'robots_200', 'robots_gone', 'robots_redirPerm', 'robots_redirTemp', 'robots_notModified', 'robots_3xx', 'robots_4xx', 'robots_5xx', 'robots_other'),
}
# Column sets for whole-domain graphs ('org,example,'); these columns are
# SUMmed per crawl by get_values, so rank and _pct columns are excluded
# (do_work rejects them).
domain_columns = {
    'sum': ('crawl', 'fetch_200', 'fetch_200_lote'),
}
# Column sets for graphs over an explicit list of hosts; also SUMmed.
many_host_columns = {
    'sum': ('crawl', 'fetch_200'),
    'sum_lote': ('crawl', 'fetch_200', 'fetch_200_lote'),
    'sum_nutch': ('crawl', 'fetch_200', 'fetch_200_lote', 'nutch_fetched', 'nutch_unfetched', 'nutch_gone', 'nutch_redirTemp', 'nutch_redirPerm', 'nutch_notModified'),
}
def left_right(cols):
    """Partition plot columns onto left/right y-axes by normalization.

    Columns fall into four scale groups: rank10 (hcrank10/prank10),
    rank (hcrank/prank), percentages (*_pct), and everything else
    ("left").  At most two groups may be present at once.

    Returns (left_cols, right_cols, right_axis_label); right_cols is
    empty and the label is None when only one group is in play.
    Raises ValueError when more than two scale groups are present.
    """
    # 'crawl' is the x axis, never a y series
    cols = tuple(c for c in cols if c != 'crawl')
    rank10 = tuple(c for c in cols if c in ('hcrank10', 'prank10'))
    rank = tuple(c for c in cols if c in ('hcrank', 'prank'))
    pct = tuple(c for c in cols if c.endswith('_pct'))
    # everything not claimed by a normalized group goes on the left axis
    left = tuple(set(cols) - set(rank10) - set(rank) - set(pct))
    groups_in_use = sum(1 for group in (rank10, rank, pct, left) if group)
    if groups_in_use > 2:
        raise ValueError('too many scales: ' + repr(cols))
    # label priority for the right-hand axis; this order matters
    if rank10:
        rlabel = 'rank10'
    elif rank:
        rlabel = 'rank'
    elif pct:
        rlabel = 'percent'
    else:
        rlabel = None
    if left:
        right = rank10 or rank or pct
        return (left, right, rlabel) if right else (left, (), None)
    if rank:
        right = rank10 or pct
        return (rank, right, rlabel) if right else (rank, (), None)
    if pct:
        return (pct, rank10, 'rank10') if rank10 else (pct, (), None)
    return rank10, (), None
def surt_host_name_to_title(surt_host_name):
    """Turn a SURT host name ('org,example') into display form ('example.org').

    A trailing comma (whole-domain query) becomes a leading '*' wildcard:
    'org,example,' -> '*.example.org'.
    """
    parts = surt_host_name.split(',')[::-1]
    if not parts[0]:
        parts[0] = '*'
    return '.'.join(parts)
def get_values(host_index, surt_host_name, col_names, verbose=0):
    """Query the host index for col_names, returning a pyarrow Table
    with one row per crawl.

    surt_host_name selects the query shape: a list of strings uses
    many_host_sql (SUM over all hosts), a string ending in ',' uses
    subdomain_sql (SUM over a LIKE subtree), and a plain string uses
    host_sql (raw per-host rows).

    NOTE(review): host_index is never referenced directly in this body --
    presumably duckdb.sql resolves the 'host_index' table name in the SQL
    text via a replacement scan of this local variable; confirm.
    NOTE(review): SQL is assembled by string interpolation, not bound
    parameters -- callers must pass trusted surt host names only.
    """
    if not isinstance(surt_host_name, str):
        # if not a string, it's a list of strings
        surt_list = ','.join(f"'{s}'" for s in surt_host_name)
        # add a TLD filter only when all hosts share one TLD --
        # presumably to narrow the scan; confirm intent
        tlds = set([s.split(',', 1)[0] for s in surt_host_name])
        if len(tlds) == 1:
            tld = next(iter(tlds))
            and_tld = f" AND url_host_tld = '{tld}'"
        else:
            and_tld = ''
        # aggregate every non-key column; 'crawl' stays as the GROUP BY key
        cols = ', '.join(f'CAST(SUM({col}) AS INT64) AS sum_{col}' for col in col_names if col != 'crawl')
        cols = 'crawl, '+cols
        sql = many_host_sql.format(cols=cols, surt_list=surt_list, and_tld=and_tld)
        if verbose:
            print(sql)
        return duckdb.sql(sql).arrow()
    # single string: the TLD is the first SURT component
    tld = surt_host_name.split(',', 1)[0]
    and_tld = f" AND url_host_tld = '{tld}'"
    if surt_host_name.endswith(','):
        # whole-domain query: SUM columns across the LIKE 'domain,%' subtree
        cols = ', '.join(f'CAST(SUM({col}) AS INT64) AS sum_{col}' for col in col_names if col != 'crawl')
        cols = 'crawl, '+cols
        sql = subdomain_sql.format(cols=cols, surt_host_name=surt_host_name+'%', and_tld=and_tld)
    else:
        # exact host: select the raw columns, no aggregation
        cols = ', '.join(col_names)
        sql = host_sql.format(cols=cols, surt_host_name=surt_host_name, and_tld=and_tld)
    if verbose:
        print(sql)
    return duckdb.sql(sql).arrow()
def host_csv(table, fname):
    """Write a pyarrow *table* to *fname* as CSV."""
    with open(fname, 'wb') as sink:
        csv.write_csv(table, sink)
def plot_values(table, col_names, title):
    """Plot every non-crawl column of *table* against crawl.

    Columns are split onto left/right y-axes by left_right(); returns
    the rendered PNG as a BytesIO buffer (see do_plot).
    """
    df = table.to_pandas()
    left, right, rlabel = left_right(list(df.columns))
    # each line spec is [x, y, side, marker, label]
    lines = [
        ['crawl', name, 'l' if name in left else 'r', None, name]
        for name in col_names
        if name != 'crawl'
    ]
    return do_plot(df, lines, title, rlabel)
def do_plot(df, lines, title, rlabel):
    """Render one figure from line specs and return it as PNG bytes.

    df: DataFrame with a 'crawl' column plus the data columns.
    lines: list of [x, y, side, marker, label]; side 'l' or 'r' picks the
        left axis or a lazily-created twinx right axis.
    rlabel: label for the right-hand axis, or None.
    Returns an io.BytesIO holding the PNG image.
    """
    fig, ax1 = plt.subplots()
    ax2 = None  # created lazily, only if a right-side line survives
    our_lines = []
    for i, line in enumerate(lines):
        x, y, side, marker, label = line
        yvalues = df[y].astype(float)
        if yvalues.sum() == 0.0:
            # declutter plots by suppressing all-zero lines and their legends
            continue
        xvalues = df[x].astype(str)
        xvalues = [x.replace('CC-MAIN-', '') for x in xvalues]
        ls = None
        #color, ls = graph_utils.get_color_ls(i)
        color, marker = graph_utils.get_color_marker(i)
        if side == 'l':
            our_line, = ax1.plot(xvalues, yvalues, marker=marker, label=label, color=color, ls=ls)
        else:
            if ax2 is None:
                ax2 = ax1.twinx()
            our_line, = ax2.plot(xvalues, yvalues, marker=marker, label=label, color=color, ls=ls)
        our_lines.append(our_line)
    plt.xlabel('crawl')
    ax1.set_ylim(bottom=0)
    # BUGFIX: ax2 exists only if at least one right-side line was actually
    # drawn; previously this raised AttributeError when rlabel was set but
    # every right-axis series was all-zero (and therefore suppressed above).
    if rlabel and ax2 is not None:
        ax2.set_ylabel(rlabel)
        ax2.set_ylim(bottom=0)
        ###ax2.set_ylim(top=10.0) # the legend tends to get clobbered if you do this
    plt.title(title)
    # more complicated because of the twinx
    labels = [line.get_label() for line in our_lines]
    ax1.legend(our_lines, labels, fontsize='x-small')  # default is medium
    for label in ax1.get_xticklabels():
        label.set_rotation(90)
    plt.setp(plt.gcf(), figwidth=5, figheight=5)  # "inches"
    plt.tight_layout()  # avoid clipping of the x axis labels
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png', dpi=200)  # it's 1000 x 1000
    # <img style="width:500px;" src="..."> for retina
    plt.close()
    return buffer
def get_plots(host_index, surt_host_name, title, config, verbose=0):
    """Build one table and one PNG plot per entry in *config*.

    config maps graph name -> column tuple.  Returns (tables, plots):
    dicts under the same keys holding the query results and the rendered
    PNG bytes respectively.
    """
    tables = {}
    plots = {}
    for key, wanted_cols in config.items():
        result = get_values(host_index, surt_host_name, wanted_cols, verbose=verbose)
        tables[key] = result
        # plot using the table's own column names, preserving query order
        image = plot_values(result, result.column_names, title)
        plots[key] = image.getvalue()
    return tables, plots
def output_stuff(title, tables, plots,
                 do_csv=False, do_png=False, do_html=False, verbose=0, html_template='domain.html'):
    """Write per-key CSV/PNG/HTML outputs for the given tables and plots.

    Output files are named '<title>_<key>.csv'/'.png' and '<title>.html'.
    NOTE: when do_html is set, entries of *plots* are replaced in place
    with HTML <img> embeds -- this mutation is visible to the caller.
    """
    for key in tables:
        out = title + '_' + key
        if do_csv:
            host_csv(tables[key], out + '.csv')
        if do_png:
            with open(out + '.png', 'wb') as fd:
                fd.write(plots[key])
        if do_html:
            plots[key] = graph_utils.png_to_embed(plots[key])
    if do_html:
        from jinja2 import Environment, FileSystemLoader, select_autoescape
        env = Environment(
            loader=FileSystemLoader('./templates'),
            autoescape=select_autoescape(['html']),
        )
        template = env.get_template(html_template)
        page = template.render(title=title, plots=plots)
        # FIX: write utf8 explicitly (matching the utf8 reads elsewhere in
        # this module) instead of the platform default encoding
        with open(title + '.html', 'w', encoding='utf8') as f:
            f.write(page)
def do_work(surt_host_name, host_index, title, verbose=0):
    """Query, plot, and write all outputs for one host/domain/host-list.

    A list selects many_host_columns, a trailing-comma string (whole
    domain) selects domain_columns, and a plain host string selects
    host_columns.  Aggregated (summed) configs are sanity-checked first:
    percentages and rank scores cannot meaningfully be summed.
    Raises ValueError if a summed config contains such a column.
    """
    if not isinstance(surt_host_name, str):
        config = many_host_columns
        check_sums = True
    elif surt_host_name.endswith(','):
        config = domain_columns
        check_sums = True
    else:
        config = host_columns
        check_sums = False
    if check_sums:
        for k, cols in config.items():
            if any(c.endswith('_pct') for c in cols):
                raise ValueError('cannot sum _pct')
            # BUGFIX: include prank/prank10 -- the rank columns this module
            # actually uses (see host_columns and left_right); the old set
            # listed only crank/crank10, which appear nowhere else here
            if any(c in {'hcrank', 'hcrank10', 'prank', 'prank10', 'crank', 'crank10'} for c in cols):
                raise ValueError('cannot sum ranks')
    tables, plots = get_plots(host_index, surt_host_name, title, config, verbose=verbose)
    output_stuff(title, tables, plots, do_csv=True, do_png=True, do_html=True, verbose=verbose)
def main():
    """Command-line driver.

    With '-f <file>', read one host per line from <file> and make a
    single combined set of graphs titled after the file name.  Otherwise
    each argument is converted to a SURT host name and graphed on its own.
    """
    verbose = 1
    duck_utils.init_duckdb_httpfs(verbose=verbose)
    grep = None
    #grep = 'CC-MAIN-2022'
    host_index = duck_utils.open_host_index(grep=grep, verbose=verbose)

    if len(sys.argv) > 2 and sys.argv[1] == '-f':
        assert len(sys.argv) == 3
        fname = sys.argv[2]
        surts = []
        with open(fname, encoding='utf8') as fd:
            for raw_line in fd:
                name = utils.thing_to_surt_host_name(raw_line.rstrip(), verbose=verbose)
                if name:
                    surts.append(name)
        if verbose:
            print(f'making a plot for {len(surts)} hosts')
        do_work(surts, host_index, fname, verbose=verbose)
        return

    for arg in sys.argv[1:]:
        name = utils.thing_to_surt_host_name(arg)
        if not name:
            continue
        do_work(name, host_index, surt_host_name_to_title(name), verbose=verbose)
if __name__ == '__main__':
    main()