From cdbd48d5ab35174c2b6226c1ddde379d9a752d9a Mon Sep 17 00:00:00 2001 From: Dmitry Kolesnikov Date: Thu, 29 Feb 2024 22:22:16 +0200 Subject: [PATCH 1/4] interim release for perf improvments --- .gitignore | 4 + cmd/go.mod | 8 +- cmd/go.sum | 12 ++- cmd/opt/draw.go | 38 +++++---- cmd/opt/query.go | 128 ++++++++++++++++++++++++++++++ cmd/opt/test.go | 35 +++++++-- cmd/try/try.go | 83 ++++++++++++-------- codec.go | 132 +++++++++++++++++++++++++++++++ go.mod | 13 ++- go.sum | 12 ++- hnsw.go | 37 ++++++--- hnsw_test.go | 41 ++++++++-- insert.go | 33 ++++++-- internal/fvecs/decoder.go | 68 ---------------- internal/pq/pq.go | 7 +- kv/kv.go | 161 ++++++++++++++++++++++++++++++++++++++ pipe.go | 29 +++++++ search.go | 34 ++++++-- types.go | 32 ++++++-- vector/type.go | 37 --------- vector/vector32.go | 51 ------------ 21 files changed, 733 insertions(+), 262 deletions(-) create mode 100644 cmd/opt/query.go create mode 100644 codec.go delete mode 100644 internal/fvecs/decoder.go create mode 100644 kv/kv.go create mode 100644 pipe.go delete mode 100644 vector/type.go delete mode 100644 vector/vector32.go diff --git a/.gitignore b/.gitignore index 3905de2..9677afa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ go.work go.work.sum cmd/sift* + *.html +*.fvecs +*.ivecs +*.bvecs diff --git a/cmd/go.mod b/cmd/go.mod index 9727f5d..c446cbc 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -1,19 +1,23 @@ module github.com/fogfish/hnsw/cmd -go 1.21.3 +go 1.22.0 require ( + github.com/bits-and-blooms/bitset v1.13.0 github.com/fogfish/hnsw v0.0.0-00010101000000-000000000000 github.com/go-echarts/go-echarts/v2 v2.3.3 + github.com/kshard/fvecs v0.0.1 + github.com/kshard/vector v0.0.2 github.com/spf13/cobra v1.8.0 ) require github.com/fogfish/golem/pure v0.10.1 // indirect require ( + github.com/chewxy/math32 v1.10.1 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/willf/bitset v1.1.11 + golang.org/x/sys v0.17.0 // indirect ) replace github.com/fogfish/hnsw => ../ diff --git a/cmd/go.sum b/cmd/go.sum index e11f8c2..8643b8d 100644 --- a/cmd/go.sum +++ b/cmd/go.sum @@ -1,3 +1,7 @@ +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU= +github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -10,6 +14,10 @@ github.com/go-echarts/go-echarts/v2 v2.3.3 h1:uImZAk6qLkC6F9ju6mZ5SPBqTyK8xjZKwS github.com/go-echarts/go-echarts/v2 v2.3.3/go.mod h1:56YlvzhW/a+du15f3S2qUGNDfKnFOeJSThBIrVFHDtI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/kshard/fvecs v0.0.1 h1:4FIjuJaiWWv1Q2y20w/1l13WhNlErWXs4yYVLmotNGo= +github.com/kshard/fvecs v0.0.1/go.mod h1:cehO9AfnF3Tb2vOwhOWmoaNUfYqmm4WQrUMyrPGqN6Q= +github.com/kshard/vector v0.0.2 h1:eh6d2XpcSRRZYaJAK2F0l5Ccql1NYpZY4vDBNBRj7qs= +github.com/kshard/vector v0.0.2/go.mod h1:5sauOIat9reamLm+hPc6M7n2oWo3G6z5u1V0lQvpYYE= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -19,8 +27,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/testify v1.6.0 h1:jlIyCplCJFULU/01vCkhKuTyc3OorI3bJFuw6obfgho= github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/willf/bitset v1.1.11 h1:N7Z7E9UvjW+sGsEl7k/SJrvY2reP1A07MrGuCjIOjRE= -github.com/willf/bitset v1.1.11/go.mod h1:83CECat5yLh5zVOf4P1ErAgKA5UDvKtgyUABdr3+MjI= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/cmd/opt/draw.go b/cmd/opt/draw.go index af67fab..fc3e444 100644 --- a/cmd/opt/draw.go +++ b/cmd/opt/draw.go @@ -14,13 +14,14 @@ import ( "os" "strconv" + "github.com/bits-and-blooms/bitset" "github.com/fogfish/hnsw" "github.com/fogfish/hnsw/cmd/try" + "github.com/fogfish/hnsw/kv" "github.com/go-echarts/go-echarts/v2/charts" "github.com/go-echarts/go-echarts/v2/components" "github.com/go-echarts/go-echarts/v2/opts" "github.com/spf13/cobra" - "github.com/willf/bitset" ) func init() { @@ -51,9 +52,9 @@ It is required to obtain the dataset(s) into local environment: } func draw(cmd *cobra.Command, args []string) error { - h := try.New() + h := try.New(128) - if err := try.Create(h, drawDataset); err != nil { + if err := try.Insert(h, 8, drawDataset); err != nil { return err } @@ -68,7 +69,7 @@ func draw(cmd *cobra.Command, args []string) error { return nil } -func drawLevel(h *hnsw.HNSW[try.Node], level int) error { +func drawLevel(h *hnsw.HNSW[kv.Vector], level int) error { nodes, links, kinds := cutLevel(h, level) if len(nodes) == 0 || len(links) == 0 { return nil @@ -79,14 +80,14 @@ func drawLevel(h *hnsw.HNSW[try.Node], level int) error { graph.AddSeries("graph", nodes, links). SetSeriesOptions( charts.WithGraphChartOpts(opts.GraphChart{ - Layout: "force", - Draggable: true, + Layout: "force", + // Draggable: true, Roam: true, FocusNodeAdjacency: true, Force: &opts.GraphForce{ - Repulsion: 200.0, //800.0, - Gravity: 0.05, //0.01, - EdgeLength: 60.0, + Repulsion: 800.0, + Gravity: 0.05, //0.01, + // EdgeLength: 60.0, }, Categories: kinds, @@ -99,7 +100,8 @@ func drawLevel(h *hnsw.HNSW[try.Node], level int) error { }, }), charts.WithLineStyleOpts(opts.LineStyle{ - Curveness: 0.3, + // Curveness: 0.3, + Color: "source", }), ) @@ -117,18 +119,19 @@ func drawLevel(h *hnsw.HNSW[try.Node], level int) error { return page.Render(io.MultiWriter(f)) } -func cutLevel(h *hnsw.HNSW[try.Node], level int) ([]opts.GraphNode, []opts.GraphLink, []*opts.GraphCategory) { +func cutLevel(h *hnsw.HNSW[kv.Vector], level int) ([]opts.GraphNode, []opts.GraphLink, []*opts.GraphCategory) { var visited bitset.BitSet + mrank := level nodes := []opts.GraphNode{} links := []opts.GraphLink{} kinds := []*opts.GraphCategory{} - h.FMap(level, func(rank int, vector try.Node, vertex []try.Node) error { - if visited.Test(uint(vector.ID)) { + h.FMap(level, func(rank int, vector kv.Vector, vertex []kv.Vector) error { + if visited.Test(uint(vector.Key)) { return nil } - visited.Set(uint(vector.ID)) + visited.Set(uint(vector.Key)) if rank > mrank { mrank = rank @@ -136,7 +139,7 @@ func cutLevel(h *hnsw.HNSW[try.Node], level int) ([]opts.GraphNode, []opts.Graph nodes = append(nodes, opts.GraphNode{ - Name: strconv.Itoa(vector.ID), + Name: strconv.Itoa(int(vector.Key)), Category: rank - level - 1, }, ) @@ -144,8 +147,9 @@ func cutLevel(h *hnsw.HNSW[try.Node], level int) ([]opts.GraphNode, []opts.Graph for _, v := range vertex { links = append(links, opts.GraphLink{ - Source: strconv.Itoa(vector.ID), - Target: strconv.Itoa(v.ID), + Source: strconv.Itoa(int(vector.Key)), + Target: strconv.Itoa(int(v.Key)), + // Value: 200.0 * vv.Euclidean.Distance(vector.Vector, v.Vector), }, ) } diff --git a/cmd/opt/query.go b/cmd/opt/query.go new file mode 100644 index 0000000..fd9b18d --- /dev/null +++ b/cmd/opt/query.go @@ -0,0 +1,128 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/fogfish/hnsw +// + +package opt + +import ( + "errors" + "io" + "os" + + "github.com/kshard/fvecs" + "github.com/spf13/cobra" +) + +func init() { + rootCmd.AddCommand(queryCmd) + queryCmd.Flags().StringVarP(&queryVectors, "dataset", "d", "", ".fvecs") + queryCmd.Flags().StringVarP(&queryText, "text", "t", "", ".bvecs") + queryCmd.Flags().IntVarP(&queryVecSize, "vector", "v", 128, "vector size") + queryCmd.Flags().StringVarP(&queryQuery, "query", "q", "", ".fvecs") + + // drawCmd.Flags().StringVarP(&drawOutput, "output", "o", ".", "directory to output rendered layers") +} + +var ( + queryVectors string + queryText string + queryVecSize int + queryQuery string + + // drawOutput string +) + +var queryCmd = &cobra.Command{ + Use: "query", + Short: "query test dataset", + Long: ` +`, + SilenceUsage: true, + RunE: query, +} + +func query(cmd *cobra.Command, args []string) error { + // h := try.New(queryVecSize) + // if err := try.Create(h, queryVectors); err != nil { + // return err + // } + + // text, err := readText() + // if err != nil { + // return err + // } + + // fv, err := os.Open(queryQuery) + // if err != nil { + // return err + // } + // defer fv.Close() + + // // + // t := time.Now() + // c := 1 + // fr := fvecs.NewDecoder[float32](fv) + // for { + // q, err := fr.Read() + // switch { + // case err == nil: + // os.Stdout.WriteString("\n---\n") + + // result := h.Search(try.Node{Vector: q}, 10, 100) + // for _, v := range result { + // d := vector.Cosine.Distance(q, v.Vector) + // os.Stdout.WriteString( + // fmt.Sprintf("%f\n%s\n", d, text[v.ID]), + // ) + // } + + // case errors.Is(err, io.EOF): + // os.Stderr.WriteString( + // fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), + // ) + // return nil + // default: + // return err + // } + + // c++ + + // if c%1000 == 0 { + // os.Stderr.WriteString( + // fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), + // ) + // } + // } + return nil +} + +func readText() (map[int]string, error) { + bv, err := os.Open(queryText) + if err != nil { + return nil, err + } + defer bv.Close() + + id := 1 + text := map[int]string{} + br := fvecs.NewDecoder[byte](bv) + + for { + t, err := br.Read() + + switch { + case err == nil: + text[id] = string(t) + case errors.Is(err, io.EOF): + return text, nil + default: + return nil, err + } + + id++ + } +} diff --git a/cmd/opt/test.go b/cmd/opt/test.go index 70d72f1..e9190a8 100644 --- a/cmd/opt/test.go +++ b/cmd/opt/test.go @@ -9,9 +9,8 @@ package opt import ( - "fmt" - "github.com/fogfish/hnsw/cmd/try" + "github.com/fogfish/hnsw/kv" "github.com/spf13/cobra" ) @@ -28,7 +27,7 @@ var testCmd = &cobra.Command{ Use: "test", Short: "test the algorithm against dataset", Long: ` -'hnsw graw' tests algorithms against datasets for approximate +'hnsw draw' tests algorithms against datasets for approximate nearest neighbor search available at http://corpus-texmex.irisa.fr. It is required to obtain the dataset(s) into local environment: @@ -41,13 +40,35 @@ It is required to obtain the dataset(s) into local environment: } func test(cmd *cobra.Command, args []string) error { - h := try.New() + h := try.New(128) + + // f := fmt.Sprintf("%s/%s_base.fvecs", testDataset, filepath.Base(testDataset)) + + // if err := try.Insert(h, 8, f); err != nil { + // return err + // } + + // if err := kv.Write(h, "test"); err != nil { + // panic(err) + // } - if err := try.Create(h, testDataset); err != nil { - return err + if err := kv.Read(h, "test"); err != nil { + panic(err) } - fmt.Println() + // h.Dump() + + // w, _ := os.Create("test.ivecs") + // e := fvecs.NewEncoder[uint32](w) + + // h.Encode(e) + + // h.FMap(3, func(level int, vector try.Node, vertex []try.Node) error { + // fmt.Printf("ID: %d => %d\n", vector.ID, len(vertex)) + + // return nil + // }) + // return nil return try.Test(h, testDataset) } diff --git a/cmd/try/try.go b/cmd/try/try.go index fbddfa1..cc82522 100644 --- a/cmd/try/try.go +++ b/cmd/try/try.go @@ -13,86 +13,96 @@ import ( "fmt" "io" "os" + "path/filepath" "time" "github.com/fogfish/hnsw" - "github.com/fogfish/hnsw/internal/fvecs" - "github.com/fogfish/hnsw/vector" + "github.com/fogfish/hnsw/kv" + "github.com/kshard/fvecs" + "github.com/kshard/vector" ) -type Node struct { - ID int - Vector vector.V32 +// New HNSW Index +func New(vs int) *hnsw.HNSW[kv.Vector] { + return hnsw.New[kv.Vector]( + kv.Surface(vector.Euclidean()), + kv.Zero(vs), + hnsw.WithEfConstruction(200), + hnsw.WithM(16), + ) } -func New() *hnsw.HNSW[Node] { - surface := vector.ContraMap[vector.V32, Node]{ - Surface: vector.Euclidean, - ContraMap: func(n Node) []float32 { return n.Vector }, - } - - zero := Node{ID: 0, Vector: make(vector.V32, 128)} - - return hnsw.New[Node](surface, zero, hnsw.WithEfConstruction(400), hnsw.WithM(8)) -} +// Insert dataset +func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { + fmt.Printf("==> reading %s\n", dataset) -func Create(h *hnsw.HNSW[Node], dataset string) error { - fmt.Printf("==> reading dataset %s\n", dataset) - - f, err := os.Open(fmt.Sprintf("%s/%s_base.fvecs", dataset, dataset)) + f, err := os.Open(dataset) if err != nil { return err } defer f.Close() t := time.Now() - c := 1 + c := uint32(1) + + progress := func() { + os.Stderr.WriteString( + fmt.Sprintf("==> read %9d vectors in %s (%d ns/op)\n", c, time.Since(t), time.Since(t).Nanoseconds()/int64(c)), + ) + } + d := fvecs.NewDecoder[float32](f) + w := h.Pipe(threads) + for { vec, err := d.Read() switch { case err == nil: - h.Insert(Node{ID: c, Vector: vec}) + w <- kv.Vector{Key: c, Vector: vec} case errors.Is(err, io.EOF): + progress() return nil default: return err } c++ - - if c%1000 == 0 { - fmt.Printf("==> read %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c) + if c%10000 == 0 { + progress() } } } -func Query(h *hnsw.HNSW[Node], query []float32, truth []uint32) { - result := h.Search(Node{Vector: query}, 10, 100) +// Query index comparing with ground truth +func Query(h *hnsw.HNSW[kv.Vector], k int, query []float32, truth []uint32) (int, float64) { + result := h.Search(kv.Vector{Vector: query}, k, 100) errors := 0 + weight := 0.0 for i, vector := range result { - if truth[i] != uint32(vector.ID-1) { + if truth[i] != uint32(vector.Key-1) { errors++ + weight += float64(k) / float64(i+1) } } if errors > 0 { - fmt.Printf("FAIL: %2d of %2d (%.2f %%)\n", errors, len(result), 100.0*float32(errors)/float32(len(result))) + fmt.Printf("FAIL: %2d, %.2f (%.2f %%)\n", errors, weight, 100.0*float32(errors)/float32(len(result))) } + return errors, weight } -func Test(h *hnsw.HNSW[Node], dataset string) error { +func Test(h *hnsw.HNSW[kv.Vector], dataset string) error { fmt.Printf("==> testing dataset %s\n", dataset) - qf, err := os.Open(fmt.Sprintf("%s/%s_query.fvecs", dataset, dataset)) + qf, err := os.Open(fmt.Sprintf("%s/%s_query.fvecs", dataset, filepath.Base(dataset))) if err != nil { return err } defer qf.Close() - tf, err := os.Open(fmt.Sprintf("%s/%s_groundtruth.ivecs", dataset, dataset)) + tf, err := os.Open(fmt.Sprintf("%s/%s_groundtruth.ivecs", dataset, filepath.Base(dataset))) if err != nil { return err } @@ -105,6 +115,9 @@ func Test(h *hnsw.HNSW[Node], dataset string) error { t := time.Now() c := 0 + errors := 0 + weight := 0.0 + for { q, err := query.Read() if err != nil { @@ -116,10 +129,16 @@ func Test(h *hnsw.HNSW[Node], dataset string) error { break } - Query(h, q, t) + e, w := Query(h, 10, q, t) + errors += e + weight += w + c++ } fmt.Printf("\n%d queries in %v (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c) + + fmt.Printf("\n%d failed %v (%v)\n", c*10, errors, weight) + return nil } diff --git a/codec.go b/codec.go new file mode 100644 index 0000000..75ccf37 --- /dev/null +++ b/codec.go @@ -0,0 +1,132 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/fogfish/hnsw +// + +package hnsw + +import ( + "errors" + "io" +) + +func (h *HNSW[Vector]) Write( + nodes interface { + Write(n Vector) error + }, + edges interface { + Write(v []Pointer) error + }, +) error { + h.Lock() + defer h.Unlock() + + for l := h.level - 1; l >= 0; l-- { + hv := []Pointer{0, 0, 0, uint32(l)} + if err := edges.Write(hv); err != nil { + return err + } + + for addr, node := range h.heap { + if len(node.Connections) > l { + iv := make([]Pointer, len(node.Connections[l])+1) + iv[0] = Pointer(addr) + copy(iv[1:], node.Connections[l]) + // for i, edge := range node.Connections[l] { + // iv[i+1] = edge + // } + + if err := edges.Write(iv); err != nil { + return err + } + } + } + } + + for _, node := range h.heap { + if err := nodes.Write(node.Vector); err != nil { + return err + } + } + + return nil +} + +func (h *HNSW[Vector]) Read( + nodes interface { + Read() (Vector, error) + }, + edges interface { + Read() ([]Pointer, error) + }, +) error { + h.Lock() + defer h.Unlock() + + if err := h.readNodes(nodes); err != nil { + return err + } + + if err := h.readEdges(edges); err != nil { + return err + } + + return nil +} + +func (h *HNSW[Vector]) readNodes( + nodes interface { + Read() (Vector, error) + }, +) error { + h.heap = []Node[Vector]{} + + for { + nv, err := nodes.Read() + switch { + case err == nil: + node := Node[Vector]{Vector: nv} + h.heap = append(h.heap, node) + case errors.Is(err, io.EOF): + return nil + default: + return err + } + } +} + +func (h *HNSW[Vector]) readEdges( + edges interface { + Read() ([]Pointer, error) + }, +) error { + lvl := -1 + + // fmt.Printf("%v\n", h.heap) + + for { + iv, err := edges.Read() + switch { + case err == nil: + if len(iv) == 4 && iv[0] == 0 && iv[1] == 0 && iv[2] == 0 { + lvl = int(iv[3]) + } else { + addr := iv[0] + node := h.heap[addr] + if node.Connections == nil { + node.Connections = make([][]Pointer, lvl+1) + } + node.Connections[lvl] = iv[1:] + // fmt.Printf("%v | %v\n", addr, node.Connections[lvl]) + h.heap[addr] = node + } + case errors.Is(err, io.EOF): + return nil + default: + return err + } + } +} diff --git a/go.mod b/go.mod index dd66f7b..2ce99fe 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,16 @@ module github.com/fogfish/hnsw -go 1.21.3 +go 1.22.0 require ( - github.com/fogfish/golem/pure v0.10.1 + github.com/bits-and-blooms/bitset v1.13.0 github.com/fogfish/it/v2 v2.0.1 - github.com/willf/bitset v1.1.11 + github.com/kshard/fvecs v0.0.1 + github.com/kshard/vector v0.0.2 +) + +require ( + github.com/chewxy/math32 v1.10.1 // indirect + github.com/fogfish/golem/pure v0.10.1 // indirect + golang.org/x/sys v0.17.0 // indirect ) diff --git a/go.sum b/go.sum index fc71b00..be78352 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,14 @@ +github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= +github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/chewxy/math32 v1.10.1 h1:LFpeY0SLJXeaiej/eIp2L40VYfscTvKh/FSEZ68uMkU= +github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/fogfish/golem/pure v0.10.1 h1:0+cnvdaV9zF+0NN8SZMgR5bgFM6yNfBHU4rynYSDfmE= github.com/fogfish/golem/pure v0.10.1/go.mod h1:kLPfgu5uKP0CrwVap7jejisRwV7vo1q8Eyqnc/Z0qyw= github.com/fogfish/it/v2 v2.0.1 h1:vu3kV2xzYDPHoMHMABxXeu5CoMcTfRc4gkWkzOUkRJY= github.com/fogfish/it/v2 v2.0.1/go.mod h1:h5FdKaEQT4sUEykiVkB8VV4jX27XabFVeWhoDZaRZtE= -github.com/willf/bitset v1.1.11 h1:N7Z7E9UvjW+sGsEl7k/SJrvY2reP1A07MrGuCjIOjRE= -github.com/willf/bitset v1.1.11/go.mod h1:83CECat5yLh5zVOf4P1ErAgKA5UDvKtgyUABdr3+MjI= +github.com/kshard/fvecs v0.0.1 h1:4FIjuJaiWWv1Q2y20w/1l13WhNlErWXs4yYVLmotNGo= +github.com/kshard/fvecs v0.0.1/go.mod h1:cehO9AfnF3Tb2vOwhOWmoaNUfYqmm4WQrUMyrPGqN6Q= +github.com/kshard/vector v0.0.2 h1:eh6d2XpcSRRZYaJAK2F0l5Ccql1NYpZY4vDBNBRj7qs= +github.com/kshard/vector v0.0.2/go.mod h1:5sauOIat9reamLm+hPc6M7n2oWo3G6z5u1V0lQvpYYE= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/hnsw.go b/hnsw.go index 0513f27..c42a158 100644 --- a/hnsw.go +++ b/hnsw.go @@ -9,11 +9,14 @@ package hnsw import ( + "fmt" "math" "math/rand" + "strings" + "sync" "time" - "github.com/fogfish/hnsw/vector" + "github.com/kshard/vector" ) // Config of the HNSW @@ -71,6 +74,7 @@ func WithRandomSource(random rand.Source) Option { // HNSW data type type HNSW[Vector any] struct { + sync.RWMutex config Config surface vector.Surface[Vector] @@ -123,18 +127,31 @@ func (h *HNSW[Vector]) Level() int { return h.level } // func (h *HNSW[Vector]) Head() Pointer { return h.head } // func (h *HNSW[Vector]) Node(addr Pointer) Node[Vector] { return h.heap[addr] } -// func (h *HNSW[Vector]) Dump() { -// sb := strings.Builder{} +func (h *HNSW[Vector]) Dump() { + sb := strings.Builder{} -// for lvl := h.level - 1; lvl >= 0; lvl-- { -// visited := map[Pointer]struct{}{} + for lvl := h.level - 1; lvl >= 0; lvl-- { + sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) -// sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) -// h.dump(&sb, lvl, visited, h.head) -// } + h.FMap(lvl, func(level int, vector Vector, vertex []Vector) error { -// fmt.Println(sb.String()) -// } + sb.WriteString(fmt.Sprintf("%v | ", vector)) + for _, e := range vertex { + sb.WriteString(fmt.Sprintf("%v ", e)) + } + sb.WriteString("\n") + + return nil + }) + + // visited := map[Pointer]struct{}{} + + // sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) + // h.dump(&sb, lvl, visited, h.head) + } + + fmt.Println(sb.String()) +} // func (h *HNSW[Vector]) dump(sb *strings.Builder, level int, visited map[Pointer]struct{}, addr Pointer) { // if _, has := visited[addr]; has { diff --git a/hnsw_test.go b/hnsw_test.go index f66ffdc..061ebd8 100644 --- a/hnsw_test.go +++ b/hnsw_test.go @@ -9,16 +9,43 @@ package hnsw_test import ( + "math/rand" "testing" + + "github.com/fogfish/hnsw" + "github.com/kshard/vector" ) -func BenchmarkXxx(b *testing.B) { - // TODO: - // h := hnsw.New(nil, hnsw.Vector{0}) +const n = 128 + +func vZero() vector.F32 { + v := make(vector.F32, n) + for i := 0; i < n; i++ { + v[i] = 0 + } + return v +} + +func vRand() vector.F32 { + v := make(vector.F32, n) + for i := 0; i < n; i++ { + v[i] = rand.Float32() + } + return v +} + +func BenchmarkInsert(b *testing.B) { + h := hnsw.New[vector.F32]( + vector.Euclidean(), + // vector.Cosine, + vZero(), + hnsw.WithEfConstruction(400), + hnsw.WithM(16), + ) - // b.ReportAllocs() + b.ReportAllocs() - // for n := b.N; n > 0; n-- { - // h.Insert(hnsw.Vector{float32(rand.Int63n(100000))}) - // } + for n := b.N; n > 0; n-- { + h.Insert(vRand()) + } } diff --git a/insert.go b/insert.go index 2028c05..76d15be 100644 --- a/insert.go +++ b/insert.go @@ -33,17 +33,23 @@ func (h *HNSW[Vector]) Insert(v Vector) { Connections: make([][]Pointer, level+1), } + h.Lock() h.heap = append(h.heap, node) addr := Pointer(len(h.heap) - 1) + h.Unlock() // skip down through layers + h.RLock() head := h.head - for lvl := h.level - 1; lvl > level; lvl-- { + hLevel := h.level + h.RUnlock() + + for lvl := hLevel - 1; lvl > level; lvl-- { head = h.skip(lvl, head, v) } // - for lvl := min(level, h.level-1); lvl >= 0; lvl-- { + for lvl := min(level, hLevel-1); lvl >= 0; lvl-- { M := h.config.mLayerN if lvl == 0 { M = h.config.mLayer0 @@ -60,14 +66,23 @@ func (h *HNSW[Vector]) Insert(v Vector) { // } // Add Bi-Edges - node.Connections[lvl] = make([]Pointer, w.Len()) + //h.Lock() + edges := make([]Pointer, w.Len()) + // node.Connections[lvl] = make([]Pointer, w.Len()) for i := w.Len() - 1; i >= 0; i-- { candidate := w.Deq() - node.Connections[lvl][i] = candidate.Addr - - c := h.heap[candidate.Addr].Connections[lvl] - h.heap[candidate.Addr].Connections[lvl] = append(c, addr) + edges[i] = candidate.Addr + //node.Connections[lvl][i] = candidate.Addr + + n := h.heap[candidate.Addr] + c := n.Connections[lvl] + h.Lock() + n.Connections[lvl] = append(c, addr) + h.Unlock() } + h.Lock() + node.Connections[lvl] = edges + h.Unlock() // Shrink Connection for _, e := range node.Connections[lvl] { @@ -93,16 +108,20 @@ func (h *HNSW[Vector]) Insert(v Vector) { conns[i] = edges.Deq().Addr } + h.Lock() h.heap[e].Connections[lvl] = conns + h.Unlock() } } } + h.Lock() if len(node.Connections) > h.level { h.level = len(node.Connections) h.head = addr } + h.Unlock() } /* diff --git a/internal/fvecs/decoder.go b/internal/fvecs/decoder.go deleted file mode 100644 index 1d30634..0000000 --- a/internal/fvecs/decoder.go +++ /dev/null @@ -1,68 +0,0 @@ -// -// Copyright (C) 2024 Dmitry Kolesnikov -// -// This file may be modified and distributed under the terms -// of the MIT license. See the LICENSE file for details. -// https://github.com/fogfish/hnsw -// - -package fvecs - -import ( - "encoding/binary" - "io" - "math" -) - -type Decoder[T float32 | uint32] struct { - r io.Reader - reader func() (T, error) -} - -func NewDecoder[T float32 | uint32](r io.Reader) Decoder[T] { - d := Decoder[T]{r: r} - - switch any(*new(T)).(type) { - case float32: - d.reader = func() (T, error) { - v, err := d.float32() - return T(v), err - } - case uint32: - d.reader = func() (T, error) { - v, err := d.uint32() - return T(v), err - } - } - - return d -} - -func (d *Decoder[T]) uint32() (uint32, error) { - bs := make([]byte, 4) - _, err := d.r.Read(bs) - return binary.LittleEndian.Uint32(bs), err -} - -func (d *Decoder[T]) float32() (float32, error) { - bs := make([]byte, 4) - _, err := d.r.Read(bs) - return float32(math.Float32frombits(binary.LittleEndian.Uint32(bs))), err -} - -func (d *Decoder[T]) Read() ([]T, error) { - s, err := d.uint32() - if err != nil { - return nil, err - } - - v := make([]T, s) - for i := 0; i < int(s); i++ { - v[i], err = d.reader() - if err != nil { - return nil, err - } - } - - return v, nil -} diff --git a/internal/pq/pq.go b/internal/pq/pq.go index 95ec0d4..1cd1eeb 100644 --- a/internal/pq/pq.go +++ b/internal/pq/pq.go @@ -10,7 +10,6 @@ package pq import ( "container/heap" - "fmt" ) type Ord[T any] interface{ Compare(T, T) int } @@ -20,11 +19,11 @@ type Queue[T any] struct { } func New[T any](ord Ord[T], seq ...T) Queue[T] { - mm := [64]T{} + mm := [400]T{} pq := Queue[T]{ heap: &heaps[T]{ ord: ord, - mem: mm[0:0], // make([]T, 0), + mem: mm[0:0:400], // make([]T, 0), }, } @@ -59,8 +58,6 @@ func (q Queue[T]) Deq() T { return item } -func (q Queue[T]) D() { fmt.Println(q.heap.mem) } - // // // diff --git a/kv/kv.go b/kv/kv.go new file mode 100644 index 0000000..d76351e --- /dev/null +++ b/kv/kv.go @@ -0,0 +1,161 @@ +package kv + +import ( + "encoding/binary" + "os" + "strconv" + + "github.com/fogfish/hnsw" + "github.com/kshard/fvecs" + "github.com/kshard/vector" +) + +type Vector struct { + Key uint32 + Vector vector.F32 +} + +func (v Vector) String() string { return strconv.Itoa(int(v.Key)) } + +func toVector(n Vector) vector.F32 { return n.Vector } + +func Zero(vs int) Vector { + return Vector{Key: 0, Vector: make(vector.F32, vs)} +} + +func Surface(surface vector.Surface[vector.F32]) vector.Surface[Vector] { + return vector.ContraMap[vector.F32, Vector]{ + Surface: surface, + ContraMap: toVector, + } +} + +func Write(h *hnsw.HNSW[Vector], file string) error { + fw, err := os.Create(file + ".fvecs") + if err != nil { + return err + } + defer fw.Close() + + iw, err := os.Create(file + ".ivecs") + if err != nil { + return err + } + defer iw.Close() + + bw, err := os.Create(file + ".bvecs") + if err != nil { + return err + } + defer bw.Close() + + fe := fvecs.NewEncoder[float32](fw) + ie := fvecs.NewEncoder[uint32](iw) + be := fvecs.NewEncoder[byte](bw) + + nw := NewWriter(fe, be) + + return h.Write(nw, ie) +} + +func Read(h *hnsw.HNSW[Vector], file string) error { + fr, err := os.Open(file + ".fvecs") + if err != nil { + return err + } + defer fr.Close() + + ir, err := os.Open(file + ".ivecs") + if err != nil { + return err + } + defer ir.Close() + + br, err := os.Open(file + ".bvecs") + if err != nil { + return err + } + defer br.Close() + + fd := fvecs.NewDecoder[float32](fr) + id := fvecs.NewDecoder[uint32](ir) + bd := fvecs.NewDecoder[byte](br) + + nr := NewReader(fd, bd) + + return h.Read(nr, id) +} + +// +// +// + +type FWriter interface { + Write(vector.F32) error +} + +type BWriter interface { + Write([]byte) error +} + +type Writer struct { + fw FWriter + bw BWriter + b []byte +} + +func NewWriter(fw FWriter, bw BWriter) Writer { + return Writer{fw: fw, bw: bw, b: []byte{0, 0, 0, 0}} +} + +func (w Writer) Write(v Vector) error { + if err := w.fw.Write(v.Vector); err != nil { + return err + } + + binary.LittleEndian.PutUint32(w.b, v.Key) + if err := w.bw.Write(w.b); err != nil { + return err + } + + return nil +} + +// +// +// + +type FReader interface { + Read() (vector.F32, error) +} + +type BReader interface { + Read() ([]byte, error) +} + +type Reader struct { + fr FReader + br BReader +} + +func NewReader(fr FReader, br BReader) Reader { + return Reader{fr: fr, br: br} +} + +func (r Reader) Read() (v Vector, err error) { + v.Vector, err = r.fr.Read() + if err != nil { + return + } + + b, err := r.br.Read() + if err != nil { + return + } + + v.Key = binary.LittleEndian.Uint32(b) + + // fmt.Printf("%v\n", v) + + return +} diff --git a/pipe.go b/pipe.go new file mode 100644 index 0000000..0b769f1 --- /dev/null +++ b/pipe.go @@ -0,0 +1,29 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/fogfish/hnsw +// + +package hnsw + +import "sync" + +func (h *HNSW[Vector]) Pipe(workers int) chan<- Vector { + var wg sync.WaitGroup + + pipe := make(chan Vector, workers) + + for i := 1; i <= workers; i++ { + wg.Add(1) + go func() { + defer wg.Done() + for v := range pipe { + h.Insert(v) + } + }() + } + + return pipe +} diff --git a/search.go b/search.go index 939fc93..b85459d 100644 --- a/search.go +++ b/search.go @@ -9,8 +9,8 @@ package hnsw import ( + "github.com/bits-and-blooms/bitset" "github.com/fogfish/hnsw/internal/pq" - "github.com/willf/bitset" ) // skip the graph to "nearest" node @@ -27,9 +27,10 @@ func (h *HNSW[Vector]) skip(level int, addr Pointer, q Vector) Pointer { // skip to "nearest" connection at the node. // it return input address if no "movements" is possible func (h *HNSW[Vector]) skipToNearest(level int, addr Pointer, q Vector) Pointer { - dist := h.surface.Distance(h.heap[addr].Vector, q) + node := h.heap[addr] + dist := h.surface.Distance(node.Vector, q) - for _, a := range h.heap[addr].Connections[level] { + for _, a := range node.Connections[level] { d := h.surface.Distance(h.heap[a].Vector, q) if d < dist { dist = d @@ -42,7 +43,7 @@ func (h *HNSW[Vector]) skipToNearest(level int, addr Pointer, q Vector) Pointer // Search "nearest" vectors on the layer func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq.Queue[Vertex] { - var visited bitset.BitSet + visited := bitset.New(uint(ef)) this := Vertex{ Distance: h.surface.Distance(q, h.heap[addr].Vector), @@ -60,7 +61,22 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq break } - for _, e := range h.heap[c.Addr].Connections[level] { + // if len(h.heap[c.Addr].Connections[level]) == 0 { + // fmt.Printf("==> %+v\n", h.heap[c.Addr]) + // panic("fuck") + // } + + cnode := h.heap[c.Addr] + + // defer func() { + // if r := recover(); r != nil { + // fmt.Printf("Recovered. Error: lvl = %v, addr = %v, node = %v\n", level, c.Addr, cnode) + // } + // }() + + // if len(cnode.Connections) > level && len(cnode.Connections[level]) > 0 { + // if cnode.Connections != nil && len(cnode.Connections) > level { + for _, e := range cnode.Connections[level] { if !visited.Test(uint(e)) { visited.Set(uint(e)) @@ -78,6 +94,7 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq candidates.Enq(item) } } + // } } } @@ -86,8 +103,13 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq // Search K-nearest vectors from the graph func (h *HNSW[Vector]) Search(q Vector, K int, efSearch int) []Vector { + + h.RLock() head := h.head - for lvl := h.level - 1; lvl >= 0; lvl-- { + hLevel := h.level + h.RUnlock() + + for lvl := hLevel - 1; lvl >= 0; lvl-- { head = h.skip(lvl, head, q) } diff --git a/types.go b/types.go index 6facf6f..62e59a7 100644 --- a/types.go +++ b/types.go @@ -27,14 +27,24 @@ type Vertex struct { type ordForwardVertex string func (ordForwardVertex) Compare(a, b Vertex) int { - if a.Distance < b.Distance { - return -1 - } + d := a.Distance - b.Distance - if a.Distance > b.Distance { + if d > 1e-5 { return 1 } + if d < -1e-5 { + return -1 + } + + // if a.Distance < b.Distance { + // return -1 + // } + + // if a.Distance > b.Distance { + // return 1 + // } + return 0 } @@ -42,13 +52,23 @@ func (ordForwardVertex) Compare(a, b Vertex) int { type ordReverseVertex string func (ordReverseVertex) Compare(a, b Vertex) int { - if a.Distance > b.Distance { + d := a.Distance - b.Distance + + if d > 1e-5 { return -1 } - if a.Distance < b.Distance { + if d < -1e-5 { return 1 } + // if a.Distance > b.Distance { + // return -1 + // } + + // if a.Distance < b.Distance { + // return 1 + // } + return 0 } diff --git a/vector/type.go b/vector/type.go deleted file mode 100644 index e6defa2..0000000 --- a/vector/type.go +++ /dev/null @@ -1,37 +0,0 @@ -// -// Copyright (C) 2024 Dmitry Kolesnikov -// -// This file may be modified and distributed under the terms -// of the MIT license. See the LICENSE file for details. -// https://github.com/fogfish/hnsw -// - -package vector - -import "github.com/fogfish/golem/pure" - -// Generic trait for "distance" estimate between two vectors -type Surface[Vector any] interface { - Distance(Vector, Vector) float32 -} - -// From is a combinator that lifts V ⟼ V ⟼ float32 function to -// an instance of Distance type trait -type From[Vector any] func(Vector, Vector) float32 - -func (f From[Vector]) Distance(a, b Vector) float32 { return f(a, b) } - -// ContraMap is a combinator that build a new instance of type trait Distance[V] using -// existing instance of Distance[A] and f: b ⟼ a -type ContraMap[A, B any] struct { - Surface[A] - pure.ContraMap[A, B] -} - -// Equal implementation of contra variant functor -func (f ContraMap[A, B]) Distance(a, b B) float32 { - return f.Surface.Distance( - f.ContraMap(a), - f.ContraMap(b), - ) -} diff --git a/vector/vector32.go b/vector/vector32.go deleted file mode 100644 index ef05538..0000000 --- a/vector/vector32.go +++ /dev/null @@ -1,51 +0,0 @@ -// -// Copyright (C) 2024 Dmitry Kolesnikov -// -// This file may be modified and distributed under the terms -// of the MIT license. See the LICENSE file for details. -// https://github.com/fogfish/hnsw -// - -package vector - -import ( - "math" -) - -// Vector of Floats32 -type V32 = []float32 - -// Euclidean surface for vector of floats32 -type euclidean string - -func (euclidean) Distance(a V32, b V32) (d float32) { - for i := 0; i < len(a); i++ { - d += (a[i] - b[i]) * (a[i] - b[i]) - } - return -} - -const Euclidean = euclidean("") - -// Cosine surface for vector of floats32 -type cosine string - -func (cosine) Distance(a V32, b V32) (d float32) { - // https://en.wikipedia.org/wiki/Cosine_similarity - - ab := 0.0 - aa := 0.0 - bb := 0.0 - - for i := 0; i < len(a); i++ { - ab += float64(a[i] * b[i]) - aa += float64(a[i] * a[i]) - bb += float64(b[i] * b[i]) - } - - d = float32(ab / (math.Sqrt(aa) * math.Sqrt(bb))) - - return -} - -const Cosine = cosine("") From 9f4216f558e425016027aa6e88f0f983a6b0d2d1 Mon Sep 17 00:00:00 2001 From: Dmitry Kolesnikov Date: Fri, 1 Mar 2024 23:49:46 +0200 Subject: [PATCH 2/4] tune the data structure for performance --- cmd/opt/create.go | 55 ++++++++++++++ cmd/opt/draw.go | 23 +++--- cmd/opt/query.go | 131 ++++++++++++++++---------------- cmd/opt/test.go | 36 ++------- cmd/try/try.go | 27 ++++--- codec.go | 102 ++++++++++++++++++++++--- hnsw.go | 60 ++------------- insert.go | 110 +++++++++++++++++++-------- internal/pq/pq.go | 4 +- iterator.go | 22 ++++++ kv/kv.go | 161 --------------------------------------- pipe.go | 6 -- search.go | 24 ++---- types.go | 32 ++------ vector/vector.go | 190 ++++++++++++++++++++++++++++++++++++++++++++++ 15 files changed, 560 insertions(+), 423 deletions(-) create mode 100644 cmd/opt/create.go delete mode 100644 kv/kv.go create mode 100644 vector/vector.go diff --git a/cmd/opt/create.go b/cmd/opt/create.go new file mode 100644 index 0000000..976a9da --- /dev/null +++ b/cmd/opt/create.go @@ -0,0 +1,55 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/fogfish/hnsw +// + +package opt + +import ( + "fmt" + "runtime" + + "github.com/fogfish/hnsw/cmd/try" + "github.com/fogfish/hnsw/vector" + "github.com/spf13/cobra" +) + +func init() { + rootCmd.AddCommand(createCmd) + createCmd.Flags().StringVarP(&createDataset, "dataset", "d", "", "path to input *.fvecs files") + createCmd.Flags().StringVarP(&createOutput, "output", "o", "test", "output") + createCmd.Flags().IntVarP(&createVecSize, "vector", "v", 128, "vector size") +} + +var ( + createDataset string + createOutput string + createVecSize int +) + +var createCmd = &cobra.Command{ + Use: "create", + Short: "create the dataset", + Long: ` +Creates the dataset from *.fvecs, making it reusable for other tests +`, + SilenceUsage: true, + RunE: create, +} + +func create(cmd *cobra.Command, args []string) error { + h := try.New(createVecSize) + if err := try.Insert(h, runtime.NumCPU(), createDataset); err != nil { + return err + } + + fmt.Printf("==> writing %s\n", createOutput) + if err := vector.Write(h, createOutput); err != nil { + return err + } + + return nil +} diff --git a/cmd/opt/draw.go b/cmd/opt/draw.go index fc3e444..3853934 100644 --- a/cmd/opt/draw.go +++ b/cmd/opt/draw.go @@ -17,7 +17,7 @@ import ( "github.com/bits-and-blooms/bitset" "github.com/fogfish/hnsw" "github.com/fogfish/hnsw/cmd/try" - "github.com/fogfish/hnsw/kv" + kv "github.com/fogfish/hnsw/vector" "github.com/go-echarts/go-echarts/v2/charts" "github.com/go-echarts/go-echarts/v2/components" "github.com/go-echarts/go-echarts/v2/opts" @@ -69,7 +69,7 @@ func draw(cmd *cobra.Command, args []string) error { return nil } -func drawLevel(h *hnsw.HNSW[kv.Vector], level int) error { +func drawLevel(h *hnsw.HNSW[kv.VF32], level int) error { nodes, links, kinds := cutLevel(h, level) if len(nodes) == 0 || len(links) == 0 { return nil @@ -80,14 +80,13 @@ func drawLevel(h *hnsw.HNSW[kv.Vector], level int) error { graph.AddSeries("graph", nodes, links). SetSeriesOptions( charts.WithGraphChartOpts(opts.GraphChart{ - Layout: "force", - // Draggable: true, + Layout: "force", Roam: true, FocusNodeAdjacency: true, Force: &opts.GraphForce{ - Repulsion: 800.0, - Gravity: 0.05, //0.01, - // EdgeLength: 60.0, + Repulsion: 800.0, + Gravity: 0.05, + EdgeLength: 60.0, }, Categories: kinds, @@ -100,8 +99,8 @@ func drawLevel(h *hnsw.HNSW[kv.Vector], level int) error { }, }), charts.WithLineStyleOpts(opts.LineStyle{ - // Curveness: 0.3, - Color: "source", + Curveness: 0.3, + Color: "source", }), ) @@ -119,7 +118,7 @@ func drawLevel(h *hnsw.HNSW[kv.Vector], level int) error { return page.Render(io.MultiWriter(f)) } -func cutLevel(h *hnsw.HNSW[kv.Vector], level int) ([]opts.GraphNode, []opts.GraphLink, []*opts.GraphCategory) { +func cutLevel(h *hnsw.HNSW[kv.VF32], level int) ([]opts.GraphNode, []opts.GraphLink, []*opts.GraphCategory) { var visited bitset.BitSet mrank := level @@ -127,7 +126,7 @@ func cutLevel(h *hnsw.HNSW[kv.Vector], level int) ([]opts.GraphNode, []opts.Grap links := []opts.GraphLink{} kinds := []*opts.GraphCategory{} - h.FMap(level, func(rank int, vector kv.Vector, vertex []kv.Vector) error { + h.FMap(level, func(rank int, vector kv.VF32, vertex []kv.VF32) error { if visited.Test(uint(vector.Key)) { return nil } @@ -149,7 +148,7 @@ func cutLevel(h *hnsw.HNSW[kv.Vector], level int) ([]opts.GraphNode, []opts.Grap opts.GraphLink{ Source: strconv.Itoa(int(vector.Key)), Target: strconv.Itoa(int(v.Key)), - // Value: 200.0 * vv.Euclidean.Distance(vector.Vector, v.Vector), + Value: 200.0 * h.Distance(vector, v), }, ) } diff --git a/cmd/opt/query.go b/cmd/opt/query.go index fd9b18d..8ea9c8f 100644 --- a/cmd/opt/query.go +++ b/cmd/opt/query.go @@ -10,30 +10,31 @@ package opt import ( "errors" + "fmt" "io" "os" + "time" + "github.com/fogfish/hnsw/cmd/try" + "github.com/fogfish/hnsw/vector" + kv "github.com/fogfish/hnsw/vector" "github.com/kshard/fvecs" "github.com/spf13/cobra" ) func init() { rootCmd.AddCommand(queryCmd) - queryCmd.Flags().StringVarP(&queryVectors, "dataset", "d", "", ".fvecs") - queryCmd.Flags().StringVarP(&queryText, "text", "t", "", ".bvecs") + queryCmd.Flags().StringVarP(&queryDataset, "dataset", "d", "", "path to hnsw index") queryCmd.Flags().IntVarP(&queryVecSize, "vector", "v", 128, "vector size") queryCmd.Flags().StringVarP(&queryQuery, "query", "q", "", ".fvecs") - - // drawCmd.Flags().StringVarP(&drawOutput, "output", "o", ".", "directory to output rendered layers") + queryCmd.Flags().StringVarP(&queryText, "text", "t", "", ".bvecs") } var ( - queryVectors string - queryText string + queryDataset string queryVecSize int queryQuery string - - // drawOutput string + queryText string ) var queryCmd = &cobra.Command{ @@ -46,69 +47,71 @@ var queryCmd = &cobra.Command{ } func query(cmd *cobra.Command, args []string) error { - // h := try.New(queryVecSize) - // if err := try.Create(h, queryVectors); err != nil { - // return err - // } - - // text, err := readText() - // if err != nil { - // return err - // } - - // fv, err := os.Open(queryQuery) - // if err != nil { - // return err - // } - // defer fv.Close() - - // // - // t := time.Now() - // c := 1 - // fr := fvecs.NewDecoder[float32](fv) - // for { - // q, err := fr.Read() - // switch { - // case err == nil: - // os.Stdout.WriteString("\n---\n") - - // result := h.Search(try.Node{Vector: q}, 10, 100) - // for _, v := range result { - // d := vector.Cosine.Distance(q, v.Vector) - // os.Stdout.WriteString( - // fmt.Sprintf("%f\n%s\n", d, text[v.ID]), - // ) - // } - - // case errors.Is(err, io.EOF): - // os.Stderr.WriteString( - // fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), - // ) - // return nil - // default: - // return err - // } - - // c++ - - // if c%1000 == 0 { - // os.Stderr.WriteString( - // fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), - // ) - // } - // } - return nil + h := try.New(queryVecSize) + + if err := vector.Read(h, queryDataset); err != nil { + return err + } + + text, err := readText() + if err != nil { + return err + } + + fv, err := os.Open(queryQuery) + if err != nil { + return err + } + defer fv.Close() + + // + t := time.Now() + c := 1 + fr := fvecs.NewDecoder[float32](fv) + + for { + q, err := fr.Read() + switch { + case err == nil: + os.Stdout.WriteString("\n---\n") + + search := kv.VF32{Vector: q} + result := h.Search(search, 5, 100) + for _, v := range result { + d := h.Distance(search, v) + os.Stdout.WriteString( + fmt.Sprintf("%f\n%s\n", d, text[v.Key]), + ) + } + + case errors.Is(err, io.EOF): + os.Stderr.WriteString( + fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), + ) + return nil + default: + return err + } + + c++ + + if c%1000 == 0 { + os.Stderr.WriteString( + fmt.Sprintf("==> query %9d vectors in %s (%d ns/op)\n", c, time.Since(t), int(time.Since(t).Nanoseconds())/c), + ) + } + } } -func readText() (map[int]string, error) { +func readText() (map[uint32]string, error) { bv, err := os.Open(queryText) if err != nil { return nil, err } defer bv.Close() - id := 1 - text := map[int]string{} + id := uint32(0) + text := map[uint32]string{} br := fvecs.NewDecoder[byte](bv) for { @@ -116,6 +119,7 @@ func readText() (map[int]string, error) { switch { case err == nil: + id++ text[id] = string(t) case errors.Is(err, io.EOF): return text, nil @@ -123,6 +127,5 @@ func readText() (map[int]string, error) { return nil, err } - id++ } } diff --git a/cmd/opt/test.go b/cmd/opt/test.go index e9190a8..9a906d5 100644 --- a/cmd/opt/test.go +++ b/cmd/opt/test.go @@ -10,17 +10,19 @@ package opt import ( "github.com/fogfish/hnsw/cmd/try" - "github.com/fogfish/hnsw/kv" + "github.com/fogfish/hnsw/vector" "github.com/spf13/cobra" ) func init() { rootCmd.AddCommand(testCmd) - testCmd.Flags().StringVarP(&testDataset, "dataset", "d", "siftsmall", "name of the dataset from http://corpus-texmex.irisa.fr") + testCmd.Flags().StringVarP(&testDataset, "dataset", "d", "", "name of the dataset from http://corpus-texmex.irisa.fr") + testCmd.Flags().StringVarP(&testSuite, "suite", "s", "siftsmall", "name of the dataset from http://corpus-texmex.irisa.fr") } var ( testDataset string + testSuite string ) var testCmd = &cobra.Command{ @@ -42,33 +44,9 @@ It is required to obtain the dataset(s) into local environment: func test(cmd *cobra.Command, args []string) error { h := try.New(128) - // f := fmt.Sprintf("%s/%s_base.fvecs", testDataset, filepath.Base(testDataset)) - - // if err := try.Insert(h, 8, f); err != nil { - // return err - // } - - // if err := kv.Write(h, "test"); err != nil { - // panic(err) - // } - - if err := kv.Read(h, "test"); err != nil { - panic(err) + if err := vector.Read(h, testDataset); err != nil { + return err } - // h.Dump() - - // w, _ := os.Create("test.ivecs") - // e := fvecs.NewEncoder[uint32](w) - - // h.Encode(e) - - // h.FMap(3, func(level int, vector try.Node, vertex []try.Node) error { - // fmt.Printf("ID: %d => %d\n", vector.ID, len(vertex)) - - // return nil - // }) - - // return nil - return try.Test(h, testDataset) + return try.Test(h, testSuite) } diff --git a/cmd/try/try.go b/cmd/try/try.go index cc82522..390b8b9 100644 --- a/cmd/try/try.go +++ b/cmd/try/try.go @@ -17,23 +17,23 @@ import ( "time" "github.com/fogfish/hnsw" - "github.com/fogfish/hnsw/kv" + kv "github.com/fogfish/hnsw/vector" "github.com/kshard/fvecs" "github.com/kshard/vector" ) -// New HNSW Index -func New(vs int) *hnsw.HNSW[kv.Vector] { - return hnsw.New[kv.Vector]( +// New HNSW Index for given vector's dimension +func New(vs int) *hnsw.HNSW[kv.VF32] { + return hnsw.New[kv.VF32]( kv.Surface(vector.Euclidean()), kv.Zero(vs), - hnsw.WithEfConstruction(200), + hnsw.WithEfConstruction(400), hnsw.WithM(16), ) } // Insert dataset -func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { +func Insert(h *hnsw.HNSW[kv.VF32], threads int, dataset string) error { fmt.Printf("==> reading %s\n", dataset) f, err := os.Open(dataset) @@ -43,7 +43,7 @@ func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { defer f.Close() t := time.Now() - c := uint32(1) + c := uint32(0) progress := func() { os.Stderr.WriteString( @@ -58,7 +58,9 @@ func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { vec, err := d.Read() switch { case err == nil: - w <- kv.Vector{Key: c, Vector: vec} + c++ + // fmt.Printf("%v\n", vec) + w <- kv.VF32{Key: c, Vector: vec} case errors.Is(err, io.EOF): progress() return nil @@ -66,7 +68,6 @@ func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { return err } - c++ if c%10000 == 0 { progress() } @@ -74,8 +75,10 @@ func Insert(h *hnsw.HNSW[kv.Vector], threads int, dataset string) error { } // Query index comparing with ground truth -func Query(h *hnsw.HNSW[kv.Vector], k int, query []float32, truth []uint32) (int, float64) { - result := h.Search(kv.Vector{Vector: query}, k, 100) +func Query(h *hnsw.HNSW[kv.VF32], k int, query []float32, truth []uint32) (int, float64) { + result := h.Search(kv.VF32{Vector: query}, k, 100) + + //fmt.Printf("%v\n", result) errors := 0 weight := 0.0 @@ -92,7 +95,7 @@ func Query(h *hnsw.HNSW[kv.Vector], k int, query []float32, truth []uint32) (int return errors, weight } -func Test(h *hnsw.HNSW[kv.Vector], dataset string) error { +func Test(h *hnsw.HNSW[kv.VF32], dataset string) error { fmt.Printf("==> testing dataset %s\n", dataset) qf, err := os.Open(fmt.Sprintf("%s/%s_query.fvecs", dataset, filepath.Base(dataset))) diff --git a/codec.go b/codec.go index 75ccf37..6e8bfdd 100644 --- a/codec.go +++ b/codec.go @@ -9,11 +9,22 @@ package hnsw import ( + "encoding/json" "errors" "io" ) +type header struct { + EfConstruction int `json:"efConstruction"` + MLayerN int `json:"mLayerN"` + MLayer0 int `json:"mLayer0"` + ML float64 `json:"mL"` + Head Pointer `json:"head"` + Level int `json:"level"` +} + func (h *HNSW[Vector]) Write( + w io.Writer, nodes interface { Write(n Vector) error }, @@ -21,9 +32,34 @@ func (h *HNSW[Vector]) Write( Write(v []Pointer) error }, ) error { - h.Lock() - defer h.Unlock() + h.rwCore.Lock() + defer h.rwCore.Unlock() + + for i := 0; i < heapRWSlots; i++ { + h.rwHeap[i].Lock() + defer h.rwHeap[i].Unlock() + } + + if err := h.writeEdges(edges); err != nil { + return err + } + + if err := h.writeNodes(nodes); err != nil { + return err + } + + if err := h.writeHeader(w); err != nil { + return err + } + return nil +} + +func (h *HNSW[Vector]) writeEdges( + edges interface { + Write(v []Pointer) error + }, +) error { for l := h.level - 1; l >= 0; l-- { hv := []Pointer{0, 0, 0, uint32(l)} if err := edges.Write(hv); err != nil { @@ -35,9 +71,6 @@ func (h *HNSW[Vector]) Write( iv := make([]Pointer, len(node.Connections[l])+1) iv[0] = Pointer(addr) copy(iv[1:], node.Connections[l]) - // for i, edge := range node.Connections[l] { - // iv[i+1] = edge - // } if err := edges.Write(iv); err != nil { return err @@ -46,6 +79,14 @@ func (h *HNSW[Vector]) Write( } } + return nil +} + +func (h *HNSW[Vector]) writeNodes( + nodes interface { + Write(n Vector) error + }, +) error { for _, node := range h.heap { if err := nodes.Write(node.Vector); err != nil { return err @@ -55,7 +96,25 @@ func (h *HNSW[Vector]) Write( return nil } +func (h *HNSW[Vector]) writeHeader(w io.Writer) error { + v := header{ + EfConstruction: h.config.efConstruction, + MLayerN: h.config.mLayerN, + MLayer0: h.config.mLayer0, + ML: h.config.mL, + Head: h.head, + Level: h.level, + } + + if err := json.NewEncoder(w).Encode(v); err != nil { + return err + } + + return nil +} + func (h *HNSW[Vector]) Read( + r io.Reader, nodes interface { Read() (Vector, error) }, @@ -63,8 +122,13 @@ func (h *HNSW[Vector]) Read( Read() ([]Pointer, error) }, ) error { - h.Lock() - defer h.Unlock() + h.rwCore.Lock() + defer h.rwCore.Unlock() + + for i := 0; i < heapRWSlots; i++ { + h.rwHeap[i].Lock() + defer h.rwHeap[i].Unlock() + } if err := h.readNodes(nodes); err != nil { return err @@ -74,6 +138,10 @@ func (h *HNSW[Vector]) Read( return err } + if err := h.readHeader(r); err != nil { + return err + } + return nil } @@ -105,8 +173,6 @@ func (h *HNSW[Vector]) readEdges( ) error { lvl := -1 - // fmt.Printf("%v\n", h.heap) - for { iv, err := edges.Read() switch { @@ -120,7 +186,6 @@ func (h *HNSW[Vector]) readEdges( node.Connections = make([][]Pointer, lvl+1) } node.Connections[lvl] = iv[1:] - // fmt.Printf("%v | %v\n", addr, node.Connections[lvl]) h.heap[addr] = node } case errors.Is(err, io.EOF): @@ -130,3 +195,20 @@ func (h *HNSW[Vector]) readEdges( } } } + +func (h *HNSW[Vector]) readHeader(r io.Reader) error { + var v header + + if err := json.NewDecoder(r).Decode(&v); err != nil { + return err + } + + h.config.efConstruction = v.EfConstruction + h.config.mLayerN = v.MLayerN + h.config.mLayer0 = v.MLayer0 + h.config.mL = v.ML + h.head = v.Head + h.level = v.Level + + return nil +} diff --git a/hnsw.go b/hnsw.go index c42a158..fabe7a3 100644 --- a/hnsw.go +++ b/hnsw.go @@ -9,10 +9,8 @@ package hnsw import ( - "fmt" "math" "math/rand" - "strings" "sync" "time" @@ -72,9 +70,14 @@ func WithRandomSource(random rand.Source) Option { } } +// Slots to coordinate concurrent I/O +const heapRWSlots = 1024 + // HNSW data type type HNSW[Vector any] struct { - sync.RWMutex + rwCore sync.RWMutex + rwHeap [heapRWSlots]sync.RWMutex + config Config surface vector.Surface[Vector] @@ -120,53 +123,6 @@ func New[Vector any]( func (h *HNSW[Vector]) Level() int { return h.level } -// -// -// - -// func (h *HNSW[Vector]) Head() Pointer { return h.head } -// func (h *HNSW[Vector]) Node(addr Pointer) Node[Vector] { return h.heap[addr] } - -func (h *HNSW[Vector]) Dump() { - sb := strings.Builder{} - - for lvl := h.level - 1; lvl >= 0; lvl-- { - sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) - - h.FMap(lvl, func(level int, vector Vector, vertex []Vector) error { - - sb.WriteString(fmt.Sprintf("%v | ", vector)) - for _, e := range vertex { - sb.WriteString(fmt.Sprintf("%v ", e)) - } - sb.WriteString("\n") - - return nil - }) - - // visited := map[Pointer]struct{}{} - - // sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) - // h.dump(&sb, lvl, visited, h.head) - } - - fmt.Println(sb.String()) +func (h *HNSW[Vector]) Distance(a, b Vector) float32 { + return h.surface.Distance(a, b) } - -// func (h *HNSW[Vector]) dump(sb *strings.Builder, level int, visited map[Pointer]struct{}, addr Pointer) { -// if _, has := visited[addr]; has { -// return -// } - -// visited[addr] = struct{}{} - -// sb.WriteString(fmt.Sprintf("%v | ", h.heap[addr].Vector)) -// for _, e := range h.heap[addr].Connections[level] { -// sb.WriteString(fmt.Sprintf("%v ", h.heap[e].Vector)) -// } -// sb.WriteString("\n") - -// for _, e := range h.heap[addr].Connections[level] { -// h.dump(sb, level, visited, e) -// } -// } diff --git a/insert.go b/insert.go index 76d15be..06cddcf 100644 --- a/insert.go +++ b/insert.go @@ -26,29 +26,35 @@ again: // Insert new vector func (h *HNSW[Vector]) Insert(v Vector) { + // + // allocate new node + // + level := int(math.Floor(-math.Log(h.rand() * h.config.mL))) + addr := Pointer(0) node := Node[Vector]{ Vector: v, Connections: make([][]Pointer, level+1), } - h.Lock() - h.heap = append(h.heap, node) - addr := Pointer(len(h.heap) - 1) - h.Unlock() - + // // skip down through layers - h.RLock() + // + + h.rwCore.RLock() head := h.head hLevel := h.level - h.RUnlock() + h.rwCore.RUnlock() for lvl := hLevel - 1; lvl > level; lvl-- { head = h.skip(lvl, head, v) } // + // start building neighborhood + // + for lvl := min(level, hLevel-1); lvl >= 0; lvl-- { M := h.config.mLayerN if lvl == 0 { @@ -61,37 +67,61 @@ func (h *HNSW[Vector]) Insert(v Vector) { for w.Len() > M { w.Deq() } - // if w.Len() > M { - // w = h.SelectNeighboursHeuristic(lvl, v, w, M) - // } - // Add Bi-Edges - //h.Lock() + // Add Edges from new node to existing one edges := make([]Pointer, w.Len()) - // node.Connections[lvl] = make([]Pointer, w.Len()) for i := w.Len() - 1; i >= 0; i-- { candidate := w.Deq() edges[i] = candidate.Addr - //node.Connections[lvl][i] = candidate.Addr - - n := h.heap[candidate.Addr] - c := n.Connections[lvl] - h.Lock() - n.Connections[lvl] = append(c, addr) - h.Unlock() } - h.Lock() node.Connections[lvl] = edges - h.Unlock() + } + + // if w.Len() > M { + // w = h.SelectNeighboursHeuristic(lvl, v, w, M) + // } + + // + // Append new node + // + + h.rwCore.Lock() + addr = Pointer(len(h.heap)) + h.rwHeap[addr%heapRWSlots].Lock() + h.heap = append(h.heap, node) + h.rwHeap[addr%heapRWSlots].Unlock() + h.rwCore.Unlock() + + for lvl, edges := range node.Connections { + for i := 0; i < len(edges); i++ { + h.addConnection(lvl, edges[i], addr) + } + } + + // + // Shrink Connections + // + + for lvl, edges := range node.Connections { + M := h.config.mLayerN + if lvl == 0 { + M = h.config.mLayer0 + } - // Shrink Connection - for _, e := range node.Connections[lvl] { + for _, e := range edges { + slot := e % heapRWSlots + h.rwHeap[slot].RLock() + enode := h.heap[e] + eedges := enode.Connections[lvl] + h.rwHeap[slot].RUnlock() - if len(h.heap[e].Connections[lvl]) > M { + if len(eedges) > M { edges := pq.New(ordReverseVertex("")) - for _, n := range h.heap[e].Connections[lvl] { - dist := h.surface.Distance(h.heap[e].Vector, h.heap[n].Vector) + for _, n := range eedges { + nnode := h.heap[n] + + dist := h.surface.Distance(enode.Vector, nnode.Vector) item := Vertex{Distance: dist, Addr: n} edges.Enq(item) } @@ -108,20 +138,36 @@ func (h *HNSW[Vector]) Insert(v Vector) { conns[i] = edges.Deq().Addr } - h.Lock() + h.rwHeap[slot].Lock() h.heap[e].Connections[lvl] = conns - h.Unlock() + h.rwHeap[slot].Unlock() } } - } - h.Lock() + // + // Update Heap + // + + h.rwCore.Lock() if len(node.Connections) > h.level { h.level = len(node.Connections) h.head = addr } - h.Unlock() + h.rwCore.Unlock() +} + +func (h *HNSW[Vector]) addConnection(level int, src, dst Pointer) { + slot := src % heapRWSlots + + h.rwHeap[slot].RLock() + n := h.heap[src] + c := n.Connections[level] + h.rwHeap[slot].RUnlock() + + h.rwHeap[slot].Lock() + n.Connections[level] = append(c, dst) + h.rwHeap[slot].Unlock() } /* diff --git a/internal/pq/pq.go b/internal/pq/pq.go index 1cd1eeb..370e3e1 100644 --- a/internal/pq/pq.go +++ b/internal/pq/pq.go @@ -19,11 +19,11 @@ type Queue[T any] struct { } func New[T any](ord Ord[T], seq ...T) Queue[T] { - mm := [400]T{} + mm := [64]T{} pq := Queue[T]{ heap: &heaps[T]{ ord: ord, - mem: mm[0:0:400], // make([]T, 0), + mem: mm[0:0:64], // make([]T, 0), }, } diff --git a/iterator.go b/iterator.go index 8113834..96eb4b3 100644 --- a/iterator.go +++ b/iterator.go @@ -8,6 +8,11 @@ package hnsw +import ( + "fmt" + "strings" +) + type FMap[Vector any] func(level int, vector Vector, vertex []Vector) error func (h *HNSW[Vector]) FMap(level int, fmap FMap[Vector]) error { @@ -28,3 +33,20 @@ func (h *HNSW[Vector]) FMap(level int, fmap FMap[Vector]) error { return nil } + +func (h *HNSW[Vector]) Dump(sb *strings.Builder) { + for lvl := h.level - 1; lvl >= 0; lvl-- { + sb.WriteString(fmt.Sprintf("\n\n==> %v\n", lvl)) + + h.FMap(lvl, func(level int, vector Vector, vertex []Vector) error { + + sb.WriteString(fmt.Sprintf("%v | ", vector)) + for _, e := range vertex { + sb.WriteString(fmt.Sprintf("%v ", e)) + } + sb.WriteString("\n") + + return nil + }) + } +} diff --git a/kv/kv.go b/kv/kv.go deleted file mode 100644 index d76351e..0000000 --- a/kv/kv.go +++ /dev/null @@ -1,161 +0,0 @@ -package kv - -import ( - "encoding/binary" - "os" - "strconv" - - "github.com/fogfish/hnsw" - "github.com/kshard/fvecs" - "github.com/kshard/vector" -) - -type Vector struct { - Key uint32 - Vector vector.F32 -} - -func (v Vector) String() string { return strconv.Itoa(int(v.Key)) } - -func toVector(n Vector) vector.F32 { return n.Vector } - -func Zero(vs int) Vector { - return Vector{Key: 0, Vector: make(vector.F32, vs)} -} - -func Surface(surface vector.Surface[vector.F32]) vector.Surface[Vector] { - return vector.ContraMap[vector.F32, Vector]{ - Surface: surface, - ContraMap: toVector, - } -} - -func Write(h *hnsw.HNSW[Vector], file string) error { - fw, err := os.Create(file + ".fvecs") - if err != nil { - return err - } - defer fw.Close() - - iw, err := os.Create(file + ".ivecs") - if err != nil { - return err - } - defer iw.Close() - - bw, err := os.Create(file + ".bvecs") - if err != nil { - return err - } - defer bw.Close() - - fe := fvecs.NewEncoder[float32](fw) - ie := fvecs.NewEncoder[uint32](iw) - be := fvecs.NewEncoder[byte](bw) - - nw := NewWriter(fe, be) - - return h.Write(nw, ie) -} - -func Read(h *hnsw.HNSW[Vector], file string) error { - fr, err := os.Open(file + ".fvecs") - if err != nil { - return err - } - defer fr.Close() - - ir, err := os.Open(file + ".ivecs") - if err != nil { - return err - } - defer ir.Close() - - br, err := os.Open(file + ".bvecs") - if err != nil { - return err - } - defer br.Close() - - fd := fvecs.NewDecoder[float32](fr) - id := fvecs.NewDecoder[uint32](ir) - bd := fvecs.NewDecoder[byte](br) - - nr := NewReader(fd, bd) - - return h.Read(nr, id) -} - -// -// -// - -type FWriter interface { - Write(vector.F32) error -} - -type BWriter interface { - Write([]byte) error -} - -type Writer struct { - fw FWriter - bw BWriter - b []byte -} - -func NewWriter(fw FWriter, bw BWriter) Writer { - return Writer{fw: fw, bw: bw, b: []byte{0, 0, 0, 0}} -} - -func (w Writer) Write(v Vector) error { - if err := w.fw.Write(v.Vector); err != nil { - return err - } - - binary.LittleEndian.PutUint32(w.b, v.Key) - if err := w.bw.Write(w.b); err != nil { - return err - } - - return nil -} - -// -// -// - -type FReader interface { - Read() (vector.F32, error) -} - -type BReader interface { - Read() ([]byte, error) -} - -type Reader struct { - fr FReader - br BReader -} - -func NewReader(fr FReader, br BReader) Reader { - return Reader{fr: fr, br: br} -} - -func (r Reader) Read() (v Vector, err error) { - v.Vector, err = r.fr.Read() - if err != nil { - return - } - - b, err := r.br.Read() - if err != nil { - return - } - - v.Key = binary.LittleEndian.Uint32(b) - - // fmt.Printf("%v\n", v) - - return -} diff --git a/pipe.go b/pipe.go index 0b769f1..3f4ff18 100644 --- a/pipe.go +++ b/pipe.go @@ -8,17 +8,11 @@ package hnsw -import "sync" - func (h *HNSW[Vector]) Pipe(workers int) chan<- Vector { - var wg sync.WaitGroup - pipe := make(chan Vector, workers) for i := 1; i <= workers; i++ { - wg.Add(1) go func() { - defer wg.Done() for v := range pipe { h.Insert(v) } diff --git a/search.go b/search.go index b85459d..a5ee7cb 100644 --- a/search.go +++ b/search.go @@ -61,22 +61,13 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq break } - // if len(h.heap[c.Addr].Connections[level]) == 0 { - // fmt.Printf("==> %+v\n", h.heap[c.Addr]) - // panic("fuck") - // } - + slot := c.Addr % heapRWSlots + h.rwHeap[slot].RLock() cnode := h.heap[c.Addr] + cedge := cnode.Connections[level] + h.rwHeap[slot].RUnlock() - // defer func() { - // if r := recover(); r != nil { - // fmt.Printf("Recovered. Error: lvl = %v, addr = %v, node = %v\n", level, c.Addr, cnode) - // } - // }() - - // if len(cnode.Connections) > level && len(cnode.Connections[level]) > 0 { - // if cnode.Connections != nil && len(cnode.Connections) > level { - for _, e := range cnode.Connections[level] { + for _, e := range cedge { if !visited.Test(uint(e)) { visited.Set(uint(e)) @@ -94,7 +85,6 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq candidates.Enq(item) } } - // } } } @@ -104,10 +94,10 @@ func (h *HNSW[Vector]) SearchLayer(level int, addr Pointer, q Vector, ef int) pq // Search K-nearest vectors from the graph func (h *HNSW[Vector]) Search(q Vector, K int, efSearch int) []Vector { - h.RLock() + h.rwCore.RLock() head := h.head hLevel := h.level - h.RUnlock() + h.rwCore.RUnlock() for lvl := hLevel - 1; lvl >= 0; lvl-- { head = h.skip(lvl, head, q) diff --git a/types.go b/types.go index 62e59a7..6facf6f 100644 --- a/types.go +++ b/types.go @@ -27,23 +27,13 @@ type Vertex struct { type ordForwardVertex string func (ordForwardVertex) Compare(a, b Vertex) int { - d := a.Distance - b.Distance - - if d > 1e-5 { - return 1 - } - - if d < -1e-5 { + if a.Distance < b.Distance { return -1 } - // if a.Distance < b.Distance { - // return -1 - // } - - // if a.Distance > b.Distance { - // return 1 - // } + if a.Distance > b.Distance { + return 1 + } return 0 } @@ -52,23 +42,13 @@ func (ordForwardVertex) Compare(a, b Vertex) int { type ordReverseVertex string func (ordReverseVertex) Compare(a, b Vertex) int { - d := a.Distance - b.Distance - - if d > 1e-5 { + if a.Distance > b.Distance { return -1 } - if d < -1e-5 { + if a.Distance < b.Distance { return 1 } - // if a.Distance > b.Distance { - // return -1 - // } - - // if a.Distance < b.Distance { - // return 1 - // } - return 0 } diff --git a/vector/vector.go b/vector/vector.go new file mode 100644 index 0000000..22f6db0 --- /dev/null +++ b/vector/vector.go @@ -0,0 +1,190 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/fogfish/hnsw +// + +package vector + +import ( + "encoding/binary" + "os" + "strconv" + + "github.com/fogfish/hnsw" + "github.com/kshard/fvecs" + "github.com/kshard/vector" +) + +// Vector of float32 annotated with uint32 key +type VF32 struct { + Key uint32 + Vector vector.F32 +} + +func (v VF32) String() string { return strconv.Itoa(int(v.Key)) } + +func toVector(n VF32) vector.F32 { return n.Vector } + +// Zero vector of float32 +func Zero(vs int) VF32 { + return VF32{Key: 0, Vector: make(vector.F32, vs)} +} + +// Surface defines distance measurement rules +func Surface(surface vector.Surface[vector.F32]) vector.Surface[VF32] { + return vector.ContraMap[vector.F32, VF32]{ + Surface: surface, + ContraMap: toVector, + } +} + +// Write HNSW index to file. The structure is written into three files. +// *.fvecs - sequence of vectors (vector.F32), ordered by insert time (as preserved by data structure) +// *.bvecs - sequence of vectors ids (uint32), ordered by insert time (as sequence of vectors) +// *.ivecs - sequence of edges ([]Pointer) for each level between vectors +func Write(h *hnsw.HNSW[VF32], file string) error { + hw, err := os.Create(file + ".json") + if err != nil { + return err + } + defer hw.Close() + + fw, err := os.Create(file + ".fvecs") + if err != nil { + return err + } + defer fw.Close() + + iw, err := os.Create(file + ".ivecs") + if err != nil { + return err + } + defer iw.Close() + + bw, err := os.Create(file + ".bvecs") + if err != nil { + return err + } + defer bw.Close() + + fe := fvecs.NewEncoder[float32](fw) + ie := fvecs.NewEncoder[uint32](iw) + be := fvecs.NewEncoder[byte](bw) + + if err := h.Write(hw, newWriter(fe, be), ie); err != nil { + return err + } + + return nil +} + +// Read HNSW index from file. +func Read(h *hnsw.HNSW[VF32], file string) error { + hr, err := os.Open(file + ".json") + if err != nil { + return err + } + defer hr.Close() + + fr, err := os.Open(file + ".fvecs") + if err != nil { + return err + } + defer fr.Close() + + ir, err := os.Open(file + ".ivecs") + if err != nil { + return err + } + defer ir.Close() + + br, err := os.Open(file + ".bvecs") + if err != nil { + return err + } + defer br.Close() + + fd := fvecs.NewDecoder[float32](fr) + id := fvecs.NewDecoder[uint32](ir) + bd := fvecs.NewDecoder[byte](br) + + if err := h.Read(hr, newReader(fd, bd), id); err != nil { + return err + } + + return nil +} + +// +// +// + +type Writer[T any] interface { + Write(T) error +} + +type writer struct { + floats Writer[vector.F32] + bytes Writer[[]byte] + b []byte +} + +func newWriter(floats Writer[vector.F32], bytes Writer[[]byte]) Writer[VF32] { + return writer{ + floats: floats, + bytes: bytes, + b: []byte{0, 0, 0, 0}, + } +} + +func (w writer) Write(v VF32) error { + if err := w.floats.Write(v.Vector); err != nil { + return err + } + + binary.LittleEndian.PutUint32(w.b, v.Key) + if err := w.bytes.Write(w.b); err != nil { + return err + } + + return nil +} + +// +// +// + +type Reader[T any] interface { + Read() (T, error) +} + +type reader struct { + floats Reader[vector.F32] + bytes Reader[[]byte] +} + +func newReader(floats Reader[vector.F32], bytes Reader[[]byte]) Reader[VF32] { + return reader{ + floats: floats, + bytes: bytes, + } +} + +func (r reader) Read() (v VF32, err error) { + v.Vector, err = r.floats.Read() + if err != nil { + return + } + + b, err := r.bytes.Read() + if err != nil { + return + } + + v.Key = binary.LittleEndian.Uint32(b) + + return +} From f19e770f3d7f8eb6e563086e4431f4d32e6fe76c Mon Sep 17 00:00:00 2001 From: Dmitry Kolesnikov Date: Sun, 3 Mar 2024 20:45:21 +0200 Subject: [PATCH 3/4] update ci/cd pipeline --- .github/workflows/build.yml | 48 ++++++++++++++++++++++++++++++++ .github/workflows/check-code.yml | 25 +++++++++++++++++ .github/workflows/check-test.yml | 34 ++++++++++++++++++++++ cmd/go.mod | 2 +- cmd/go.sum | 4 +-- go.mod | 4 +-- go.sum | 4 +-- 7 files changed, 114 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/check-code.yml create mode 100644 .github/workflows/check-test.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..df30005 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,48 @@ +## +## Build the main branch +## +name: build +on: + push: + branches: + - main + - /refs/heads/main + +jobs: + + build: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4.1.1 + + - name: go build + run: | + go build ./... + + - name: go test + run: | + go test -v -coverprofile=profile.cov $(go list ./... | grep -v /examples/) + + - uses: shogo82148/actions-goveralls@v1 + continue-on-error: true + with: + path-to-profile: profile.cov + + - uses: reecetech/version-increment@2023.10.2 + id: version + with: + scheme: semver + increment: patch + + - name: publish + run: | + git config user.name "GitHub Actions" + git config user.email "github-actions@users.noreply.github.com" + git tag ${{ steps.version.outputs.v-version }} + git push origin -u ${{ steps.version.outputs.v-version }} + diff --git a/.github/workflows/check-code.yml b/.github/workflows/check-code.yml new file mode 100644 index 0000000..ab9b7dc --- /dev/null +++ b/.github/workflows/check-code.yml @@ -0,0 +1,25 @@ +## +## Quality checks +## +name: check +on: + pull_request: + types: + - opened + - synchronize + +jobs: + + code: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4.1.1 + + - uses: dominikh/staticcheck-action@v1.3.0 + with: + install-go: false diff --git a/.github/workflows/check-test.yml b/.github/workflows/check-test.yml new file mode 100644 index 0000000..5f8119a --- /dev/null +++ b/.github/workflows/check-test.yml @@ -0,0 +1,34 @@ +## +## Unit Tests +## +name: test +on: + pull_request: + types: + - opened + - synchronize + +jobs: + + unit: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4.1.1 + + - name: go build + run: | + go build ./... + + - name: go test + run: | + go test -v -coverprofile=profile.cov $(go list ./... | grep -v /examples/) + + - uses: shogo82148/actions-goveralls@v1 + continue-on-error: true + with: + path-to-profile: profile.cov diff --git a/cmd/go.mod b/cmd/go.mod index c446cbc..7307891 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -7,7 +7,7 @@ require ( github.com/fogfish/hnsw v0.0.0-00010101000000-000000000000 github.com/go-echarts/go-echarts/v2 v2.3.3 github.com/kshard/fvecs v0.0.1 - github.com/kshard/vector v0.0.2 + github.com/kshard/vector v0.0.3 github.com/spf13/cobra v1.8.0 ) diff --git a/cmd/go.sum b/cmd/go.sum index 8643b8d..a41e54f 100644 --- a/cmd/go.sum +++ b/cmd/go.sum @@ -16,8 +16,8 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/kshard/fvecs v0.0.1 h1:4FIjuJaiWWv1Q2y20w/1l13WhNlErWXs4yYVLmotNGo= github.com/kshard/fvecs v0.0.1/go.mod h1:cehO9AfnF3Tb2vOwhOWmoaNUfYqmm4WQrUMyrPGqN6Q= -github.com/kshard/vector v0.0.2 h1:eh6d2XpcSRRZYaJAK2F0l5Ccql1NYpZY4vDBNBRj7qs= -github.com/kshard/vector v0.0.2/go.mod h1:5sauOIat9reamLm+hPc6M7n2oWo3G6z5u1V0lQvpYYE= +github.com/kshard/vector v0.0.3 h1:8/4t3uuv4N6gXOxgnN4RPfIel4Uq/BBSwnOjxmMWEXc= +github.com/kshard/vector v0.0.3/go.mod h1:l5c902GqrnE4/LvJAmSCbtGorXeMe9si5SHtwckr8jc= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/go.mod b/go.mod index 2ce99fe..77230e5 100644 --- a/go.mod +++ b/go.mod @@ -1,12 +1,12 @@ module github.com/fogfish/hnsw -go 1.22.0 +go 1.21.3 require ( github.com/bits-and-blooms/bitset v1.13.0 github.com/fogfish/it/v2 v2.0.1 github.com/kshard/fvecs v0.0.1 - github.com/kshard/vector v0.0.2 + github.com/kshard/vector v0.0.3 ) require ( diff --git a/go.sum b/go.sum index be78352..4a7a39d 100644 --- a/go.sum +++ b/go.sum @@ -8,7 +8,7 @@ github.com/fogfish/it/v2 v2.0.1 h1:vu3kV2xzYDPHoMHMABxXeu5CoMcTfRc4gkWkzOUkRJY= github.com/fogfish/it/v2 v2.0.1/go.mod h1:h5FdKaEQT4sUEykiVkB8VV4jX27XabFVeWhoDZaRZtE= github.com/kshard/fvecs v0.0.1 h1:4FIjuJaiWWv1Q2y20w/1l13WhNlErWXs4yYVLmotNGo= github.com/kshard/fvecs v0.0.1/go.mod h1:cehO9AfnF3Tb2vOwhOWmoaNUfYqmm4WQrUMyrPGqN6Q= -github.com/kshard/vector v0.0.2 h1:eh6d2XpcSRRZYaJAK2F0l5Ccql1NYpZY4vDBNBRj7qs= -github.com/kshard/vector v0.0.2/go.mod h1:5sauOIat9reamLm+hPc6M7n2oWo3G6z5u1V0lQvpYYE= +github.com/kshard/vector v0.0.3 h1:8/4t3uuv4N6gXOxgnN4RPfIel4Uq/BBSwnOjxmMWEXc= +github.com/kshard/vector v0.0.3/go.mod h1:l5c902GqrnE4/LvJAmSCbtGorXeMe9si5SHtwckr8jc= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= From 67907a0425eaded022a6d77825d318cc0dbb915d Mon Sep 17 00:00:00 2001 From: Dmitry Kolesnikov Date: Sun, 3 Mar 2024 20:47:01 +0200 Subject: [PATCH 4/4] update go version --- cmd/go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/go.mod b/cmd/go.mod index 7307891..72b2e89 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -1,6 +1,6 @@ module github.com/fogfish/hnsw/cmd -go 1.22.0 +go 1.21.3 require ( github.com/bits-and-blooms/bitset v1.13.0