From b0182d7834f4f410b7ba5e59ea594e1e877bda7f Mon Sep 17 00:00:00 2001
From: Alex Sharp <alexsharp@Alexs-MacBook-Pro.local>
Date: Sun, 19 Sep 2021 11:40:23 +0100
Subject: [PATCH] state mphf experiment

---
 cmd/hack/hack.go | 93 ++++++++++++++++++++++++++++++++++++++----------
 go.mod           |  4 +--
 go.sum           |  9 ++---
 3 files changed, 82 insertions(+), 24 deletions(-)

diff --git a/cmd/hack/hack.go b/cmd/hack/hack.go
index c55e31f531..07f34ed5e1 100644
--- a/cmd/hack/hack.go
+++ b/cmd/hack/hack.go
@@ -22,6 +22,7 @@ import (
 	"github.com/holiman/uint256"
 	"github.com/ledgerwatch/erigon-lib/kv"
 	"github.com/ledgerwatch/erigon-lib/kv/mdbx"
+	"github.com/ledgerwatch/erigon-lib/recsplit"
 	"github.com/ledgerwatch/erigon/consensus/ethash"
 	"github.com/ledgerwatch/erigon/consensus/misc"
 	"github.com/ledgerwatch/erigon/core"
@@ -1140,53 +1141,139 @@ func testGetProof(chaindata string, address common.Address, rewind int, regen bo
+// dumpState is an experiment with a minimal perfect hash function (MPHF) over
+// the keys of the kv.PlainState table. Inside one db.View transaction it:
+//   - writes every key as a hex line to "statedump.hex";
+//   - writes every key to "keys.dat" as <len(key)> <shared prefix length>
+//     <remaining key bytes>, the prefix being shared with the previous key;
+//   - builds a RecSplit index over all keys and checks that every lookup is
+//     below the key count and that no two keys get the same index.
+//
+// Both files are created relative to the current working directory.
 func dumpState(chaindata string) error {
 	db := mdbx.MustOpen(chaindata)
 	defer db.Close()
-	f, err := os.Create("statedump")
+	f, err := os.Create("statedump.hex")
 	if err != nil {
 		return err
 	}
 	defer f.Close()
 	w := bufio.NewWriter(f)
 	defer w.Flush()
+	kf, err := os.Create("keys.dat")
+	if err != nil {
+		return err
+	}
+	defer kf.Close()
+	kw := bufio.NewWriter(kf)
+	defer kw.Flush()
 	stAccounts := 0
 	stStorage := 0
-	var varintBuf [10]byte // Buffer for varint number
+	valueSize := 0
+	var rs *recsplit.RecSplit
 	if err := db.View(context.Background(), func(tx kv.Tx) error {
 		c, err := tx.Cursor(kv.PlainState)
 		if err != nil {
 			return err
 		}
+		defer c.Close()
+		var count uint64
+		if count, err = c.Count(); err != nil {
+			return err
+		}
+		if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{
+			KeyCount:   int(count),
+			BucketSize: 2000,
+			Salt:       0,
+			LeafSize:   8,
+			TmpDir:     "",
+			StartSeed:  []uint32{1000000, 2000000, 3000000, 4000000, 5000000, 6000000, 7000000, 8000000, 9000000, 10000000, 11000000, 12000000, 13000000, 14000000},
+		}); err != nil {
+			return err
+		}
+		var prevKey []byte
 		k, v, e := c.First()
 		for ; k != nil && e == nil; k, v, e = c.Next() {
-			keyLen := binary.PutUvarint(varintBuf[:], uint64(len(k)))
-			if _, err = w.Write(varintBuf[:keyLen]); err != nil {
+			if _, err = fmt.Fprintf(w, "%x\n", k); err != nil {
+				return err
+			}
+			// NOTE(review): counts one extra byte per record on top of len(v) — presumably a length prefix; confirm.
+			valueSize++
+			valueSize += len(v)
+
+			// NOTE(review): any value returned by AddKey is discarded here — confirm
+			// against the pinned erigon-lib version whether it reports an error.
+			rs.AddKey(k)
+			// Length of the prefix k shares with the previous key.
+			prefixLen := 0
+			for ; prefixLen < len(prevKey) && prefixLen < len(k) && prevKey[prefixLen] == k[prefixLen]; prefixLen++ {
+			}
+
+			// Keys longer than 28 bytes are counted as storage records, the rest as accounts.
+			if len(k) > 28 {
+				stStorage++
+			} else {
+				stAccounts++
+			}
+			// keys.dat record: len(k), shared prefix length, then the unshared suffix.
+			// Both lengths are stored in one byte, so keys are assumed to be < 256 bytes.
+			if err = kw.WriteByte(byte(len(k))); err != nil {
 				return err
 			}
-			if _, err = w.Write([]byte(k)); err != nil {
+			if err = kw.WriteByte(byte(prefixLen)); err != nil {
 				return err
 			}
-			valLen := binary.PutUvarint(varintBuf[:], uint64(len(v)))
-			if _, err = w.Write(varintBuf[:valLen]); err != nil {
+			if _, err = kw.Write(k[prefixLen:]); err != nil {
 				return err
 			}
-			if len(v) > 0 {
-				if _, err = w.Write(v); err != nil {
-					return err
-				}
+			// Keep a private copy of k for the next iteration, reusing prevKey's
+			// backing array instead of allocating a new slice per record.
+			prevKey = append(prevKey[:0], k...)
+			if (stStorage+stAccounts)%100000 == 0 {
+				log.Info("State", "record", stStorage+stAccounts)
 			}
-			if len(k) > 28 {
-				stStorage++
-			} else {
-				stAccounts++
+		}
+		if e != nil {
+			return e
+		}
+		start := time.Now()
+		log.Info("Building recsplit...")
+		if err = rs.Build(); err != nil {
+			return err
+		}
+		s1, s2 := rs.Stats()
+		log.Info("Done", "time", time.Since(start), "s1", s1, "s2", s2)
+		start = time.Now()
+		log.Info("Testing bijection")
+		// One bit per possible index; seeing a bit twice means two keys collided.
+		bitCount := (count + 63) / 64
+		bits := make([]uint64, bitCount)
+		k, _, e = c.First()
+		for ; k != nil && e == nil; k, _, e = c.Next() {
+			idx := rs.Lookup(k)
+			if idx >= int(count) {
+				return fmt.Errorf("idx %d >= count %d", idx, count)
 			}
-			if (stStorage+stAccounts)%100000 == 0 {
-				fmt.Printf("State records: %d\n", stStorage+stAccounts)
+			mask := uint64(1) << (idx & 63)
+			if bits[idx>>6]&mask != 0 {
+				return fmt.Errorf("no bijection count=%d", count)
 			}
+			bits[idx>>6] |= mask
 		}
+		if e != nil {
+			return e
+		}
+		log.Info("Done", "time", time.Since(start))
 		return e
 	}); err != nil {
 		return err
 	}
-	fmt.Printf("stAccounts = %d, stStorage = %d\n", stAccounts, stStorage)
+	// Flush explicitly so that a failed write is reported to the caller; the
+	// deferred Flush calls above are best-effort and discard their result.
+	if err := w.Flush(); err != nil {
+		return err
+	}
+	if err := kw.Flush(); err != nil {
+		return err
+	}
+	fmt.Printf("stAccounts = %d, stStorage = %d, valueSize = %d\n", stAccounts, stStorage, valueSize)
 	return nil
 }
 
diff --git a/go.mod b/go.mod
index 060da76615..9db0400c1a 100644
--- a/go.mod
+++ b/go.mod
@@ -36,7 +36,7 @@ require (
 	github.com/json-iterator/go v1.1.11
 	github.com/julienschmidt/httprouter v1.3.0
 	github.com/kevinburke/go-bindata v3.21.0+incompatible
-	github.com/ledgerwatch/erigon-lib v0.0.0-20210918130108-95f4ac34fd13
+	github.com/ledgerwatch/erigon-lib v0.0.0-20210919103541-afaed4b09586
 	github.com/ledgerwatch/log/v3 v3.3.0
 	github.com/ledgerwatch/secp256k1 v0.0.0-20210626115225-cd5cd00ed72d
 	github.com/logrusorgru/aurora/v3 v3.0.0
@@ -57,7 +57,7 @@ require (
 	go.uber.org/atomic v1.9.0
 	golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e
 	golang.org/x/sync v0.0.0-20210220032951-036812b2e83c
-	golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0
+	golang.org/x/sys v0.0.0-20210917161153-d61c044b1678
 	golang.org/x/time v0.0.0-20201208040808-7e3f01d25324
 	golang.org/x/tools v0.1.2
 	google.golang.org/grpc v1.39.1
diff --git a/go.sum b/go.sum
index f029bbc07b..77cee52f87 100644
--- a/go.sum
+++ b/go.sum
@@ -492,8 +492,8 @@ github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758 h1:0D5M2HQSGD3P
 github.com/kylelemons/godebug v0.0.0-20170224010052-a616ab194758/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k=
 github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c=
 github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8=
-github.com/ledgerwatch/erigon-lib v0.0.0-20210918130108-95f4ac34fd13 h1:Pgs2rcuUTLmEIuFAiKSKN16cz36CWLfnGLqOntvi+Hs=
-github.com/ledgerwatch/erigon-lib v0.0.0-20210918130108-95f4ac34fd13/go.mod h1:kZsi9wFAOYIkegoiSj10RXOVc0EmDtUxltnNP1f1ZE0=
+github.com/ledgerwatch/erigon-lib v0.0.0-20210919103541-afaed4b09586 h1:57x4CVH9G3EvqQboxMO1U04NiX8y7rRqw4Nb5hS+8RA=
+github.com/ledgerwatch/erigon-lib v0.0.0-20210919103541-afaed4b09586/go.mod h1:darGhVf++67hq/fQQ92zT+1EjE+FDxHd/OU7OKK4uWI=
 github.com/ledgerwatch/log/v3 v3.3.0 h1:k8N/3NQLILr8CKCMyza261vLFKU7VA+nMNNb0wVyQSc=
 github.com/ledgerwatch/log/v3 v3.3.0/go.mod h1:J58eOHHrIYHxl7LKkRsb/0YibKwtLfauUryl5SLRGm0=
 github.com/ledgerwatch/secp256k1 v0.0.0-20210626115225-cd5cd00ed72d h1:/IKMrJdfRsoYNc36PXqP4xMH3vhW/8IQyBKGQbKZUno=
@@ -809,6 +809,7 @@ go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9i
 go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q=
 go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo=
 go4.org v0.0.0-20180809161055-417644f6feb5/go.mod h1:MkTOUMDaeVYJUOUsaDXIhWPZYa1yOyC1qaOBpL57BhE=
+golang.org/dl v0.0.0-20210909185531-e2a88a019121/go.mod h1:IUMfjQLJQd4UTqG1Z90tenwKoCX93Gn3MAQJMOSBsDQ=
 golang.org/x/build v0.0.0-20190111050920-041ab4dc3f9d/go.mod h1:OWs+y06UdEOHN4y+MfF/py+xQ/tYqIWW03b70/CG9Rw=
 golang.org/x/crypto v0.0.0-20170930174604-9419663f5a44/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
@@ -1021,8 +1022,8 @@ golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0 h1:xrCZDmdtoloIiooiA9q0OQb9r8HejIHYoHGhGCe1pGg=
-golang.org/x/sys v0.0.0-20210910150752-751e447fb3d0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210917161153-d61c044b1678 h1:J27LZFQBFoihqXoegpscI10HpjZ7B5WQLLKL2FZXQKw=
+golang.org/x/sys v0.0.0-20210917161153-d61c044b1678/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
-- 
GitLab