#!/usr/bin/env python2.7
# vim: sts=4:sw=4
"""
Approximate the line count of an extremely large file. Quits after a time
limit, or stops early once the error tolerance is met (at 95% confidence).
Default settings favor speed; tweak them if you need more accuracy.
Example:
% approxwc v6/0910.useragg.randorder.partial
1,310,000 lines (1.0% max error, 16311 samples); 17,058,627,584 bytes
"""
# Approach: estimate the Bernoulli parameter p = the proportion of bytes in
# the file that are newlines, then extrapolate NumLines ~= 1 + p*filesize,
# with binomial confidence bounds.
# An alternative approach is to view the problem as estimating average
# bytes-per-line, then extrapolating that to NumLines bounds; but for skewed
# line-length distributions (e.g. edgelists), mean estimation is hard.
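# Worked example of the math below (illustrative numbers, not a real run):
# seeing 100,000 newlines in 10,000,000 sampled bytes gives p = 0.01 and
# se = sqrt(p*(1-p)/bytes_seen) ~= 3.15e-5. For a 10 GB (1e10 byte) file that
# extrapolates to ~100M lines with a 95% CI of +/- 1.96*se*filesize ~= 617K
# lines, i.e. relative error (hi-lo)/estimate ~= 1.2%.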
from __future__ import division
import math, os, random, time
import argparse
def block_positions():
    global args
    global filesize, bufsize
    num_first_blocks = int(args.min_samples)
    # Small file: a single block covers the whole thing.
    if filesize <= num_first_blocks*bufsize:
        yield 0, filesize
        return
    # First, a deterministic pass of evenly spaced blocks.
    interval_size = (filesize - bufsize) / max(1, num_first_blocks - 1)
    for b in range(num_first_blocks):
        yield int(b*interval_size), bufsize
    # It would be better to use a recursive deterministic sampler,
    # e.g. 0, 1, .5, .25, .75, .125, .375, .625, .875 -- see the sketch
    # after this function.
    # After that, uniformly random block starts (sampled with replacement).
    for x in xrange(int(args.max_samples) - num_first_blocks):
        yield random.randrange(bufsize, filesize - 2*bufsize), bufsize
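# A sketch of the recursive deterministic sampler mentioned above: the base-2
# van der Corput (bit-reversal) sequence 0, 1/2, 1/4, 3/4, 1/8, 5/8, 3/8, 7/8,
# ... whose every prefix is spread nearly evenly over [0, 1). It could replace
# the random starts; illustrative only, not wired into block_positions.
def van_der_corput(i):
    """Return the i-th element of the base-2 van der Corput sequence, in [0,1)."""
    x = 0.0
    denom = 1.0
    while i:
        i, bit = divmod(i, 2)
        denom *= 2.0
        x += bit / denom
    return x
# e.g. [van_der_corput(i) for i in range(8)]
#   == [0.0, 0.5, 0.25, 0.75, 0.125, 0.625, 0.375, 0.875]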
def iter_blocks():
    global fp
    for start, size in block_positions():
        fp.seek(start)
        yield fp.read(size)
def print_estimate():
    if args.wc:
        print "{numlines:12.1f}\tNA\t{filesize:12}\t{filename}".format(**globals())
    else:
        print "{numlines_sig:12,.0f} lines (relerr {err_potential_pct:4.1f}%, {num_blocks} samples), {filesize:12,} bytes ({filename})".format(
            numlines_sig=smart_round(numlines, err_potential),
            err_potential_pct=err_potential*100 if numlines > 0 else 0,
            **globals())
def print_totals():
    # Pool the raw counts across all files, then redo the same binomial CI
    # for the grand total.
    if grand_bytes_seen == 0:
        return
    p = grand_newlines_seen / grand_bytes_seen
    se = math.sqrt(p*(1-p) / grand_bytes_seen)
    lo = 1 + grand_total_bytes*(p - 1.96*se)
    hi = 1 + grand_total_bytes*(p + 1.96*se)
    grand_total_lines = p * grand_total_bytes
    err_potential = (hi-lo) / grand_total_lines if grand_total_lines > 0 else 0
    if args.wc:
        d = dict(globals())
        d.update(locals())  # locals take precedence over globals
        print "{grand_total_lines:12.1f}\tNA\t{grand_total_bytes:12d}\ttotal".format(**d)
    else:
        print "{numlines_sig:12,.0f} lines (relerr {err_potential_pct:4.1f}%), {grand_total_bytes:12,} bytes (total)".format(
            numlines_sig=grand_total_lines,
            err_potential_pct=err_potential*100, **globals())
def intround(x):
    # Round half-up to the nearest integer (returned as a float).
    return math.ceil(x) if x - math.floor(x) >= 0.5 else math.floor(x)
def smart_round(n, tol):
    """Round n to the number of significant digits implied by the relative
    error tolerance, plus one guard digit:
    0.9% => 4 sigdig
    1%   => 3 sigdig
    2%   => 3 sigdig
    """
    if tol <= 0 or n <= 0:
        # Degenerate inputs: nothing sensible to round; return n unchanged.
        return n
    sigdig = math.ceil(-math.log10(tol)) + 1
    sigdig = max(sigdig, 2)
    extra = math.ceil(math.log10(n)) - sigdig
    if extra <= 0:
        return n
    else:
        return intround(n / 10**extra) * 10**extra
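# Illustrative behavior of smart_round (values worked out by hand):
#   smart_round(1234567, 0.01) -> 1230000.0   (3 significant digits)
#   smart_round(1234567, 0.10) -> 1200000.0   (2 significant digits)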
parser = argparse.ArgumentParser(description=__doc__.strip(), formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('filenames', nargs='+')
parser.add_argument('--time-limit', '-l', default=5, type=float, help="Wall-clock time limit per file, in seconds (default: 5, deliberately low)")
parser.add_argument('--tolerance', '-t', default=0.1, type=float, help="Relative error tolerance (default: 0.1, i.e. loose but fast)")
parser.add_argument('--blocksize', '-b', default=10e6, type=float, help="Block size in bytes (default: 10M)")
parser.add_argument('--max-samples', default=100e3, type=float, help="Maximum number of sample blocks (default: 100,000)")
parser.add_argument('--min-samples', default=3, type=float, help="Minimum number of sample blocks (default: 3)")
parser.add_argument('--wc', action='store_true', help="Output format similar to 'wc'")
args = parser.parse_args()
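# Illustrative invocations (flags as defined above):
#   approxwc big.log                 # defaults: 10% tolerance, 5s per file
#   approxwc -t 0.01 -l 60 big.log   # tighter tolerance, bigger time budget
#   approxwc --wc *.log              # wc-like tab-separated output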
grand_total_bytes = 0
grand_bytes_seen = 0
grand_newlines_seen = 0
err_potential = 0
for filename in args.filenames:
    filesize = os.stat(filename).st_size
    bufsize = int(args.blocksize)
    try:
        fp = open(filename, 'r', bufsize)
    except IOError as e:
        print e
        continue
    start_time = time.time()
    bytes_seen = 0
    newlines_seen = 0
    num_blocks = 0
    err_potential = 0  # reset per file so a previous file's value can't leak in
    for block in iter_blocks():
        num_blocks += 1
        newlines_seen += block.count('\n')
        bytes_seen += len(block)
        if bytes_seen == 0: continue
        p = newlines_seen / bytes_seen
        se = math.sqrt(p*(1-p) / bytes_seen)
        numlines = 1 + filesize*p
        lo = 1 + filesize*(p - 1.96*se)
        hi = 1 + filesize*(p + 1.96*se)
        # A finite-population adjustment (extrapolating only over the unseen
        # rest of the file) isn't correct here, since seek positions are
        # sampled with replacement.
        if num_blocks < args.min_samples: continue
        # Relative error is calculated as the size of the interval relative to
        # the estimate. Justification: assume we're in the 95% case where the
        # true NumLines is inside the interval. Conditional on that, the worst
        # case is the estimate sitting at one extreme end of the interval;
        # then you want to bound how far away the other extreme end can be --
        # i.e. the size of the interval.
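        # Illustrative arithmetic (made-up numbers): with lo=980,000,
        # hi=1,020,000, and numlines=1,000,000, the relative error is
        # err_potential = 40,000/1,000,000 = 4%.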
        err_potential = (hi-lo) / numlines
        if err_potential < args.tolerance:
            # Confidence satisfied; stop sampling early.
            break
        if time.time() - start_time > args.time_limit:
            break
    else:
        # Sample limit reached without satisfying the tolerance.
        pass
    fp.close()
    grand_total_bytes += filesize
    # Accumulate this file's raw counts once, after sampling finishes.
    grand_bytes_seen += bytes_seen
    grand_newlines_seen += newlines_seen
    if bytes_seen == 0:
        numlines = 0
    print_estimate()

if len(args.filenames) > 1:
    print_totals()