This repository has been archived by the owner on Aug 2, 2021. It is now read-only.

file, testutil: Add reference file hasher #2099

Merged
merged 7 commits on Feb 24, 2020
Changes from 5 commits
66 changes: 66 additions & 0 deletions file/hasher/common_test.go
@@ -0,0 +1,66 @@
package hasher

import (
	"github.com/ethersphere/swarm/testutil"
)

const (
	sectionSize = 32
	branches    = 128
	chunkSize   = 4096
)

var (
	dataLengths = []int{31, // 0
		32,                     // 1
		33,                     // 2
		63,                     // 3
		64,                     // 4
		65,                     // 5
		chunkSize,              // 6
		chunkSize + 31,         // 7
		chunkSize + 32,         // 8
		chunkSize + 63,         // 9
		chunkSize + 64,         // 10
		chunkSize * 2,          // 11
		chunkSize*2 + 32,       // 12
		chunkSize * 128,        // 13
		chunkSize*128 + 31,     // 14
		chunkSize*128 + 32,     // 15
		chunkSize*128 + 64,     // 16
		chunkSize * 129,        // 17
		chunkSize * 130,        // 18
		chunkSize * 128 * 128,  // 19
		chunkSize*128*128 + 32, // 20
	}
	expected = []string{
		"ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
		"0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
		"3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
		"95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
		"490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
		"541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
		"c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
		"91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
		"73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
		"db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
		"ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
		"29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
		"61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
		"3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
		"e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
		"485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
		"624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
		"b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
		"59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
		"522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
		"ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
	}

	start = 0
	end   = len(dataLengths)
)

func init() {
	testutil.Init()
}
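For orientation: every test vector above is the hash of deterministic input produced by testutil.SerialData(length, 255, 0). A minimal stand-in for that helper, assuming it simply cycles byte values (hedged reconstruction; the real helper returns an additional value that the tests here discard):

// serialData is a hypothetical stand-in for testutil.SerialData: it fills a
// buffer of length l with bytes cycling through 0..mod-1 starting at offset,
// so serialData(l, 255, 0) yields 0,1,...,254,0,1,... for l bytes.
func serialData(l, mod, offset int) []byte {
	data := make([]byte, l)
	for i := 0; i < l; i++ {
		data[i] = byte((i + offset) % mod)
	}
	return data
}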
2 changes: 1 addition & 1 deletion file/hasher/hasher.go
@@ -14,7 +14,7 @@
// You should have received a copy of the GNU Lesser General Public License
// along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.

-package file
+package hasher

import (
	"context"
2 changes: 1 addition & 1 deletion file/hasher/hasher_test.go
@@ -14,7 +14,7 @@
// You should have received a copy of the GNU Lesser General Public License
// along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.

-package file
+package hasher

import (
	"bytes"
56 changes: 56 additions & 0 deletions file/hasher/param.go
@@ -0,0 +1,56 @@
package hasher

import (
	"context"
	"sync"

	"github.com/ethersphere/swarm/file"
)

// defines the boundaries of the hashing job and also contains the hash factory function of the job
// setting Debug means omitting any automatic behavior (for now it means job processing won't auto-start)
type treeParams struct {
	SectionSize int
	Branches    int
	ChunkSize   int
	Spans       []int
	Debug       bool
	hashFunc    file.SectionWriterFunc
	writerPool  sync.Pool
	ctx         context.Context
}

func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {

	h := hashFunc(context.Background())
	p := &treeParams{
		SectionSize: h.SectionSize(),
		Branches:    h.Branches(),
		ChunkSize:   h.SectionSize() * h.Branches(),
		hashFunc:    hashFunc,
	}
	h.Reset()
	p.writerPool.New = func() interface{} {
		hf := p.hashFunc(p.ctx)
		return hf
	}
	p.Spans = generateSpanSizes(p.Branches, 9)
	return p
}

func (p *treeParams) SetContext(ctx context.Context) {
	p.ctx = ctx
}

func (p *treeParams) GetContext() context.Context {
	return p.ctx
}

func (p *treeParams) PutWriter(w file.SectionWriter) {
	w.Reset()
	p.writerPool.Put(w)
}

func (p *treeParams) GetWriter() file.SectionWriter {
	return p.writerPool.Get().(file.SectionWriter)
}
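A sketch of the intended GetWriter/PutWriter round-trip (illustrative only, not part of the diff; the bmt-backed hashFunc is the one used by the tests later in this PR, and exampleWriterPool is a hypothetical name):

package hasher

import (
	"context"

	"github.com/ethersphere/swarm/bmt"
	"github.com/ethersphere/swarm/file"
	"golang.org/x/crypto/sha3"
)

// exampleWriterPool obtains a pooled SectionWriter from treeParams, hashes one
// span of data and releases the writer for reuse.
func exampleWriterPool(data []byte) []byte {
	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, 128, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc)
	params.SetContext(context.Background()) // writerPool.New reads p.ctx when constructing new writers

	w := params.GetWriter()   // from the sync.Pool, or built via hashFunc on a pool miss
	defer params.PutWriter(w) // Reset and return to the pool for reuse

	w.SetSpan(len(data))
	w.Write(data)
	return w.Sum(nil)
}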
115 changes: 115 additions & 0 deletions file/hasher/reference.go
@@ -0,0 +1,115 @@
package hasher

import (
	"github.com/ethersphere/swarm/file"
)

// ReferenceHasher is the source-of-truth implementation of the swarm file hashing algorithm
type ReferenceHasher struct {
	params  *treeParams
	cursors []int              // section write position, indexed per level
	length  int                // number of bytes written to the data level of the hasher
	buffer  []byte             // keeps data and hashes, indexed by cursors
	counts  []int              // number of sums performed, indexed per level
	hasher  file.SectionWriter // underlying hasher
}

// NewReferenceHasher constructs and returns a new ReferenceHasher
// This implementation is limited to a tree of 9 levels, where level 0 is the data level
// With 32 section size and 128 branches (i.e. unencrypted, non erasure-coded content) this means
// a capacity of 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes
func NewReferenceHasher(params *treeParams) *ReferenceHasher {
	// TODO: remove when bmt interface is amended
	h := params.GetWriter()
	return &ReferenceHasher{
		params:  params,
		cursors: make([]int, 9),
		counts:  make([]int, 9),
		buffer:  make([]byte, params.ChunkSize*9),
		hasher:  h,
	}
}

// Hash computes and returns the root hash of arbitrary data
func (r *ReferenceHasher) Hash(data []byte) []byte {
	l := r.params.ChunkSize
	for i := 0; i < len(data); i += r.params.ChunkSize {
		if len(data)-i < r.params.ChunkSize {
			l = len(data) - i
		}
		r.update(0, data[i:i+l])
	}
	return r.digest()
}

// write to the data buffer on the specified level (level 0 is the data level)
// calls sum if chunk boundary is reached and recursively calls this function for the next level with the acquired bmt hash
// adjusts cursors accordingly
func (r *ReferenceHasher) update(lvl int, data []byte) {
	if lvl == 0 {
		r.length += len(data)
	}
	copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
	r.cursors[lvl] += len(data)
	if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
		ref := r.sum(lvl)
		r.update(lvl+1, ref)
		r.cursors[lvl] = r.cursors[lvl+1]
	}
}

[review] Member: it would be nice to write that level 0 is the data layer. especially when this reference hasher is another representation of a tree or trie, in which tree height is measured as the inverse (0 is the root)
[review] Member: agreed.

// calculates and returns the bmt sum of the last written data on the level
func (r *ReferenceHasher) sum(lvl int) []byte {
	r.counts[lvl]++
	spanSize := r.params.Spans[lvl] * r.params.ChunkSize
	span := (r.length-1)%spanSize + 1
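	// worked example (comment added for illustration; assumes generateSpanSizes
	// yields Spans = {1, 128, 128*128, ...}): with a total length of
	// chunkSize*128+32 = 524320 bytes, summing the trailing dangling chunk at
	// level 0 gives spanSize = 1*4096 and span = (524320-1)%4096+1 = 32, while
	// the full level-1 node summed earlier (at length 524288, Spans[1] = 128)
	// gets span = (524288-1)%524288+1 = 524288, i.e. the whole subtree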

	sizeToSum := r.cursors[lvl] - r.cursors[lvl+1]

	r.hasher.Reset()
	r.hasher.SetSpan(span)
	r.hasher.Write(r.buffer[r.cursors[lvl+1] : r.cursors[lvl+1]+sizeToSum])
	ref := r.hasher.Sum(nil)
	return ref
}

// called after all data has been written
// sums the final chunks of each level
// skips intermediate levels that end on span boundary
func (r *ReferenceHasher) digest() []byte {

	// if we did not end on a chunk boundary, the last chunk hasn't been hashed
	// we need to do this first
	if r.length%r.params.ChunkSize != 0 {
		ref := r.sum(0)
		copy(r.buffer[r.cursors[1]:], ref)
		r.cursors[1] += len(ref)
		r.cursors[0] = r.cursors[1]
	}

	// calculate the total number of levels needed to represent the data (including the data level)
	targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)

	// sum every intermediate level and write to the level above it
	for i := 1; i < targetLevel; i++ {

		// if the tree is balanced or if there is a single reference outside a balanced tree on this level
		// don't hash it again but pass it on to the next level
		if r.counts[i] > 0 {
			// TODO: simplify if possible
			if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
				r.cursors[i+1] = r.cursors[i]
				r.cursors[i] = r.cursors[i-1]
				continue
			}
		}

		ref := r.sum(i)
		copy(r.buffer[r.cursors[i+1]:], ref)
		r.cursors[i+1] += len(ref)
		r.cursors[i] = r.cursors[i+1]
	}

	// the first section of the buffer will hold the root hash
	return r.buffer[:r.params.SectionSize]
}
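Putting param.go and reference.go together: a condensed usage sketch that mirrors TestReferenceHasherVector below (illustrative only; exampleReferenceHash is a hypothetical name, while the hashFunc and SerialData call are lifted verbatim from the test):

package hasher

import (
	"context"
	"fmt"

	"github.com/ethersphere/swarm/bmt"
	"github.com/ethersphere/swarm/file"
	"github.com/ethersphere/swarm/testutil"
	"golang.org/x/crypto/sha3"
)

// exampleReferenceHash hashes length bytes of deterministic data and returns
// the hex-encoded root reference, exactly as the vector test does.
func exampleReferenceHash(length int) string {
	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, 128, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc) // derives SectionSize, Branches and ChunkSize from the writer
	rh := NewReferenceHasher(params)
	_, data := testutil.SerialData(length, 255, 0)
	return fmt.Sprintf("%x", rh.Hash(data)) // e.g. length=4096 reproduces expected[6] in common_test.go
}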
140 changes: 140 additions & 0 deletions file/hasher/reference_test.go
@@ -0,0 +1,140 @@
package hasher

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"testing"

	"github.com/ethereum/go-ethereum/common/hexutil"
	"github.com/ethersphere/swarm/bmt"
	"github.com/ethersphere/swarm/file"
	"github.com/ethersphere/swarm/log"
	"github.com/ethersphere/swarm/testutil"
	"golang.org/x/crypto/sha3"
)

// TestManualDanglingChunk is a test script explicitly hashing and writing every individual level in the dangling chunk edge case
// we use a balanced tree with data size of chunkSize*branches, and a single chunk of data
// this case is chosen because it produces the wrong result in the pyramid hasher at the time of writing (master commit hash 4928d989ebd0854d993c10c194e61a5a5455e4f9)
func TestManualDanglingChunk(t *testing.T) {
	pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
	h := bmt.New(pool)

	// to execute the job we need buffers with the following capacities:
	// level 0: chunkSize*branches+chunkSize
	// level 1: chunkSize
	// level 2: sectionSize * 2
	var levels [][]byte
	levels = append(levels, nil)
	levels = append(levels, make([]byte, chunkSize))
	levels = append(levels, make([]byte, sectionSize*2))

	// hash the balanced tree portion of the data level and write to level 1
	_, levels[0] = testutil.SerialData(chunkSize*branches+chunkSize, 255, 0)
	for i := 0; i < chunkSize*branches; i += chunkSize {
		h.Reset()
		h.SetSpan(chunkSize)
		h.Write(levels[0][i : i+chunkSize])
		copy(levels[1][i/branches:], h.Sum(nil))
	}
	refHex := hexutil.Encode(levels[1][:sectionSize])
	correctRefHex := "0xc10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
	}

	// write the dangling chunk
	// hash it and write the reference on the second section of level 2
	h.Reset()
	h.SetSpan(chunkSize)
	h.Write(levels[0][chunkSize*branches:])
	copy(levels[2][sectionSize:], h.Sum(nil))
	refHex = hexutil.Encode(levels[2][sectionSize:])
	correctRefHex = "0x81b31d9a7f6c377523e8769db021091df23edd9fd7bd6bcdf11a22f518db6006"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
	}

	// hash the chunk on level 1 and write into the first section of level 2
	h.Reset()
	h.SetSpan(chunkSize * branches)
	h.Write(levels[1])
	copy(levels[2], h.Sum(nil))
	refHex = hexutil.Encode(levels[2][:sectionSize])
	correctRefHex = "0x3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling balanced tree; expected %s, got %s", correctRefHex, refHex)
	}

	// hash the two sections on level 2 to obtain the root hash
	h.Reset()
	h.SetSpan(chunkSize*branches + chunkSize)
	h.Write(levels[2])
	ref := h.Sum(nil)
	refHex = hexutil.Encode(ref)
	correctRefHex = "0xb8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling root; expected %s, got %s", correctRefHex, refHex)
	}
}

// TestReferenceHasherVector executes the file hasher algorithm on serial input data cycling through byte values 0-254,
// with lengths defined in common_test.go
//
// the "expected" array in common_test.go is generated by this implementation, so a test failure due to
// a result mismatch is nothing other than an indication that something has changed in the reference filehasher
// or the underlying hashing algorithm
func TestReferenceHasherVector(t *testing.T) {

	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc)
	var mismatch int
	for i := start; i < end; i++ {
		dataLength := dataLengths[i]
		log.Info("start", "i", i, "len", dataLength)
		rh := NewReferenceHasher(params)
		_, data := testutil.SerialData(dataLength, 255, 0)
		refHash := rh.Hash(data)
		eq := true
		if expected[i] != fmt.Sprintf("%x", refHash) {
			mismatch++
			eq = false
		}
		t.Logf("[%7d+%4d]\t%v\tref: %x\texpect: %s", dataLength/chunkSize, dataLength%chunkSize, eq, refHash, expected[i])
	}
	if mismatch > 0 {
		t.Fatalf("mismatches: %d/%d", mismatch, end-start)
	}
}

// BenchmarkReferenceHasher establishes a baseline for a fully synchronous file hashing operation
// it will be vastly inefficient
func BenchmarkReferenceHasher(b *testing.B) {
	for i := start; i < end; i++ {
		b.Run(fmt.Sprintf("%d", dataLengths[i]), benchmarkReferenceHasher)
	}
}

func benchmarkReferenceHasher(b *testing.B) {
	benchParams := strings.Split(b.Name(), "/")
	dataLength, err := strconv.ParseInt(benchParams[1], 10, 64)
	if err != nil {
		b.Fatal(err)
	}
	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, data := testutil.SerialData(int(dataLength), 255, 0)
		fh := NewReferenceHasher(params)
		fh.Hash(data)
	}
}
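The vectors and the benchmark can be reproduced with the standard Go tooling, e.g. go test ./file/hasher -run TestReferenceHasherVector -v and go test ./file/hasher -bench BenchmarkReferenceHasher -run '^$'.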
}
Loading