-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmd5sample
executable file
·36 lines (32 loc) · 1.07 KB
/
md5sample
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env python2.7
# Downsample records by md5 hash
from __future__ import division
import sys, hashlib, struct, os
from optparse import OptionParser
# Hash values are taken from the first 4 bytes of the MD5 digest, so they
# range over [0, MAX_UINT32].
MAX_UINT32 = 2**32 - 1

# Explicit little-endian layout: the plain 'I' format uses the machine's
# native byte order, which would make the sampled set differ between
# little- and big-endian hosts.
_UINT32_LE = struct.Struct('<I')

# Separator placed between the key and the salt before hashing.
_SALT_SEPARATOR = '*=*-*'


def build_parser():
    """Return the OptionParser describing the command-line interface."""
    parser = OptionParser()
    parser.add_option('-r', '--rate', type='float', default=1.0/1000,
                      help="sample rate (from 0 to 1)")
    parser.add_option('-f', '--field', type='int', default=1,
                      help="Which field to use as split key (1-indexed). 0 = whole line")
    parser.add_option('-s', '--salt', default='',
                      help="Salt for hashing - essentially a random seed")
    return parser


def rate_threshold(rate):
    """Convert a sample rate in (0, 1] to an inclusive uint32 hash cutoff.

    Raises ValueError when the rate is outside (0, 1] or so small that the
    cutoff rounds down to zero.
    """
    if not 0 < rate <= 1:
        raise ValueError("rate %s out of range" % rate)
    threshold = int(rate * MAX_UINT32)
    if threshold <= 0:
        raise ValueError("too finegrained a rate")
    return threshold


def hash_to_uint32(key):
    """Map a key to an unsigned 32-bit integer via the head of its MD5 digest.

    Text keys are encoded as UTF-8 first because hashlib only accepts bytes
    on Python 3; byte strings (including Python 2 ``str``) are hashed as-is.
    """
    if not isinstance(key, bytes):
        key = key.encode('utf-8')
    return _UINT32_LE.unpack(hashlib.md5(key).digest()[:4])[0]


def _iter_sampled(lines, threshold, index, salt):
    """Yield newline-stripped lines whose key hash is <= threshold."""
    # The salt suffix is loop-invariant, so build it once.
    suffix = _SALT_SEPARATOR + salt if salt else ''
    for line in lines:
        # Strip only a real trailing newline; a final line without one must
        # keep its last character.
        if line.endswith('\n'):
            line = line[:-1]
        # index == -1 means "use the whole line as the key".
        key = line.split('\t')[index] if index > -1 else line
        if hash_to_uint32(key + suffix) <= threshold:
            yield line


def sample_lines(lines, rate, field=1, salt=''):
    """Return an iterator over the lines selected by the hash sample.

    lines -- iterable of text lines, with or without a trailing newline
    rate  -- desired sample rate, 0 < rate <= 1
    field -- 1-indexed tab-separated field used as the key; 0 = whole line
    salt  -- optional string mixed into the key before hashing

    Arguments are validated eagerly (ValueError) before any line is read.
    A line with fewer tab-separated fields than ``field`` raises IndexError
    during iteration.
    """
    threshold = rate_threshold(rate)
    if field < 0:
        raise ValueError("field %s out of range" % field)
    return _iter_sampled(lines, threshold, field - 1, salt)


def main(argv=None, stdin=None, stdout=None):
    """Command-line entry point: copy the sampled lines of stdin to stdout.

    argv defaults to sys.argv[1:]; stdin/stdout default to the process
    streams.  Invalid option values are reported as usage errors.
    """
    parser = build_parser()
    opts, _ = parser.parse_args(argv)
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    try:
        sampled = sample_lines(source, opts.rate, opts.field, opts.salt)
    except ValueError as err:
        # parser.error prints the usage message and exits with status 2.
        parser.error(str(err))
    write = sink.write
    for line in sampled:
        write(line + '\n')


if __name__ == '__main__':
    main()