From 4133762868acbe5d7561d9e85d82c532d6a8baff Mon Sep 17 00:00:00 2001 From: fogfish Date: Tue, 20 Aug 2024 19:30:09 +0300 Subject: [PATCH] Initial client release (#1) --- .github/workflows/build.yml | 48 ++++ .github/workflows/check-code.yml | 25 ++ .github/workflows/check-test.yml | 34 +++ LICENSE | 2 +- README.md | 237 +++++++++++++++++- cmd/optimum/encoding/encoding.go | 66 +++++ cmd/optimum/go.mod | 43 ++++ cmd/optimum/go.sum | 81 ++++++ cmd/optimum/main.go | 17 ++ cmd/optimum/opt/common/commit.go | 58 +++++ cmd/optimum/opt/common/create.go | 72 ++++++ cmd/optimum/opt/common/list.go | 55 +++++ cmd/optimum/opt/common/remove.go | 34 +++ cmd/optimum/opt/common/spinner.go | 40 +++ cmd/optimum/opt/hnsw.go | 393 ++++++++++++++++++++++++++++++ cmd/optimum/opt/root.go | 107 ++++++++ doc/hnsw.md | 96 ++++++++ doc/optimum.svg | 43 ++++ go.mod | 16 ++ go.sum | 18 ++ restapi.go | 113 +++++++++ stream.go | 133 ++++++++++ types.go | 88 +++++++ 23 files changed, 1816 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/check-code.yml create mode 100644 .github/workflows/check-test.yml create mode 100644 cmd/optimum/encoding/encoding.go create mode 100644 cmd/optimum/go.mod create mode 100644 cmd/optimum/go.sum create mode 100644 cmd/optimum/main.go create mode 100644 cmd/optimum/opt/common/commit.go create mode 100644 cmd/optimum/opt/common/create.go create mode 100644 cmd/optimum/opt/common/list.go create mode 100644 cmd/optimum/opt/common/remove.go create mode 100644 cmd/optimum/opt/common/spinner.go create mode 100644 cmd/optimum/opt/hnsw.go create mode 100644 cmd/optimum/opt/root.go create mode 100644 doc/hnsw.md create mode 100644 doc/optimum.svg create mode 100644 go.mod create mode 100644 go.sum create mode 100644 restapi.go create mode 100644 stream.go create mode 100644 types.go diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..f103097 --- /dev/null 
+++ b/.github/workflows/build.yml @@ -0,0 +1,48 @@ +## +## Build the main branch +## +name: build +on: + push: + branches: + - main + - /refs/heads/main + +jobs: + + build: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4 + + - name: go build + run: | + go build ./... + + - name: go test + run: | + go test -v -coverprofile=profile.cov $(go list ./... | grep -v /examples/) + + - uses: shogo82148/actions-goveralls@v1 + continue-on-error: true + with: + path-to-profile: profile.cov + + - uses: reecetech/version-increment@2023.10.2 + id: version + with: + scheme: semver + increment: patch + + - name: publish + run: | + git config user.name "GitHub Actions" + git config user.email "github-actions@users.noreply.github.com" + git tag ${{ steps.version.outputs.v-version }} + git push origin -u ${{ steps.version.outputs.v-version }} + diff --git a/.github/workflows/check-code.yml b/.github/workflows/check-code.yml new file mode 100644 index 0000000..213bb8a --- /dev/null +++ b/.github/workflows/check-code.yml @@ -0,0 +1,25 @@ +## +## Quality checks +## +name: check +on: + pull_request: + types: + - opened + - synchronize + +jobs: + + code: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4 + + - uses: dominikh/staticcheck-action@v1.3.0 + with: + install-go: false diff --git a/.github/workflows/check-test.yml b/.github/workflows/check-test.yml new file mode 100644 index 0000000..7b05093 --- /dev/null +++ b/.github/workflows/check-test.yml @@ -0,0 +1,34 @@ +## +## Unit Tests +## +name: test +on: + pull_request: + types: + - opened + - synchronize + +jobs: + + unit: + runs-on: ubuntu-latest + steps: + + - uses: actions/setup-go@v5 + with: + go-version: "1.21" + + - uses: actions/checkout@v4 + + - name: go build + run: | + go build ./... + + - name: go test + run: | + go test -v -coverprofile=profile.cov $(go list ./... 
| grep -v /examples/) + + - uses: shogo82148/actions-goveralls@v1 + continue-on-error: true + with: + path-to-profile: profile.cov diff --git a/LICENSE b/LICENSE index 02d8ff4..6c312e9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2024 kshard +Copyright (c) 2024 Dmitry Kolesnikov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 352300f..2d59ed6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,235 @@ -# optimum -Golang Client +

+ +

optimum

+

data structures management client

+ +

+ + + + + + + + + + + + + + + + + + + + + + + + +

+

+ +--- + +The library is both Golang api and command-line client for managing data +structures. + +## What is this about? + +> "A data structure is a data organization, and storage format that is usually chosen for efficient access to data" - Wikipedia says. + +Data structures are widely utilized across various domains in Computer Science and Software Engineering. Unlike key-value or relational datastores, data structures are algebraic abstractions that implement unique properties tailored to meet the specific needs of applications. This library eliminates the extra cost of converting application objects into database entities for each database operation. + +This library provides remote access to sophisticated data structures, giving the simplicity of developing applications with fewer lines of code to store and access data. + +## Getting Started + +- [What is this about?](#what-is-this-about) +- [Getting Started](#getting-started) + - [Getting access](#getting-access) + - [Typical workflow](#typical-workflow) + - [List data structures](#list-data-structures) + - [Create data structure instance](#create-data-structure-instance) + - [Writing to data structure instance (batch mode)](#writing-to-data-structure-instance-batch-mode) + - [Reading from data structure instance](#reading-from-data-structure-instance) + - [Remove data structure instance](#remove-data-structure-instance) + - [Supported data structures](#supported-data-structures) +- [Using Golang API](#using-golang-api) + - [Quick Example](#quick-example) +- [How To Contribute](#how-to-contribute) + - [commit message](#commit-message) + - [bugs](#bugs) +- [License](#license) + + +Install the command-line utility from source code. It requires [Golang](https://go.dev) to be installed: + +```bash +go install github.com/kshard/optimum/cmd/optimum@latest +``` + +### Getting access + +The library usage requires access to an API that provisions and operates data structures for you. 
Contact your provider for API details. + +It is recommended to configure environment variables for client usage: + +```bash +export HOST=https://example.com +export ROLE=arn:aws:iam::000000000000:role/example-access-role +``` + + +### Typical workflow + +Using data structures typically involves the following workflow: +1. List existing data structures. +2. Create a new instance of data structure. +3. Write data. +4. Read data. +5. Remove the data structure instance. + +A data structure can be seen as a typed algebraic abstraction that encompasses a collection of data values, the relationships between those values, and the operations or functions that can be applied to manipulate the data. In practical application development, each data structure must be uniquely identifiable to allow efficient access and manipulation. To facilitate this, the application uses a unique reference name called a CURIE (Compact Uniform Resource Identifier). The CURIE combines both the data structure type and a unique identifier, ensuring that the correct data structure is referenced throughout the workflow, enabling smooth interactions within the system. + +See [tutorials](./examples/) for example usage. + +#### List data structures + +List all data structure instances. It fetches data structure instances of the same type. For each provisioned instance it reports NAME, active VERSION, UPDATED AT timestamp, instance STATUS, PENDING version if any, and initialization PARAMS. + +```bash +optimum list -u $HOST + +NAME VERSION UPDATED AT | STATUS PENDING | PARAMS +example1 NjqOYyOkpMHfg3.6 2024-08-18 10:40:34 | ACTIVE | {} +example2 2024-08-18 10:38:13 | PENDING NjqOYyOkpMHfg3.6 | {} +``` + + +#### Create data structure instance + +Create new instance of data structure. See either documentation of supported +data structure or `optimum help` for details about configuration parameters. 
+ +```bash +optimum create -u $HOST -n -j path/to/config.json +``` + + +#### Writing to data structure instance (batch mode) + +The batch writing consists of two phases - data upload followed by a commit. +See either documentation of supported data structure or `optimum help` for +details about upload file format. + +```bash +# Upload data into server. +optimum upload -u $HOST -n path/to/data.txt + +# Commit uploaded data, making it available online. +optimum commit -u $HOST -n +``` + + +#### Reading from data structure instance + +Use the REST API for any advanced reading use cases, as the client only supports +basic read operations. See either documentation of supported data structure or +`optimum help` for details about query formats. + +```bash +optimum query -u $HOST -n path/to/query.txt +``` + + +#### Remove data structure instance + +The command removes data structure instance. The operation is irreversible and +results in the permanent destruction of all data. + +```bash +optimum remove -u $HOST -n +``` + +### Supported data structures + +The library supports the following data structures: +* `hnsw` [Hierarchical Navigable Small World](./doc/hnsw.md) + + +Continue with [examples and tutorials](./examples/). + +Note: the command line supports only basic operations for data structure manipulation. Use Golang API for any advanced scenario. + + +## Using Golang API + +The latest version of the module is available at `main` branch. All development, including new features and bug fixes, takes place on the `main` branch using forking and pull requests as described in contribution guidelines. The stable version is available via Golang modules. + +Use `go get` to retrieve the library and add it as dependency to your application. + +```bash +go get -u github.com/kshard/optimum +``` + +### Quick Example + +The example below shows usage of client for Hierarchical Navigable Small World. 
+ +```go +package main + +import ( + "github.com/kshard/optimum" + "github.com/fogfish/gurl/v2/http" + "github.com/fogfish/curie" +) + +const ( + host = "https://example.com" + cask = curie.IRI("hnsw:example") +) + +func main() { + // Create client, the library depends on + api := optimum.New(http.New(), host) + + // Query the data structure + neighbors, err := api.Query(context.Background(), cask, + optimum.Query{Query: []float32{0.1, 0.2, /* ... */ 0.128}}, + ) + + // Print results + fmt.Println("Nearest neighbors:", neighbors) +} +``` + + +## How To Contribute + +The library is [MIT](LICENSE) licensed and accepts contributions via GitHub pull requests: + +1. Fork it +2. Create your feature branch (`git checkout -b my-new-feature`) +3. Commit your changes (`git commit -am 'Added some feature'`) +4. Push to the branch (`git push origin my-new-feature`) +5. Create new Pull Request + +The build and testing process requires [Go](https://golang.org) version 1.21 or later. + + +### commit message + +The commit message helps us to write a good release note and speed up the review process. The message should address two questions: what changed and why. The project follows the template defined by chapter [Contributing to a Project](http://git-scm.com/book/ch5-2.html) of the Git book. + +### bugs + +If you experience any issues with the library, please let us know via [GitHub issues](https://github.com/kshard/optimum/issues). We appreciate detailed and accurate reports that help us to identify and replicate the issue. + + +## License + +[![See LICENSE](https://img.shields.io/github/license/kshard/optimum.svg?style=for-the-badge)](LICENSE) + diff --git a/cmd/optimum/encoding/encoding.go b/cmd/optimum/encoding/encoding.go new file mode 100644 index 0000000..516659e --- /dev/null +++ b/cmd/optimum/encoding/encoding.go @@ -0,0 +1,66 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. 
See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package encoding + +import ( + "bufio" + "encoding/hex" + "io" + "strconv" + "strings" +) + +type Scanner struct { + r *bufio.Scanner + err error + uniqueKey []byte + vector []float32 +} + +func New(r io.Reader) *Scanner { + return &Scanner{ + r: bufio.NewScanner(r), + } +} + +// Err returns the first parse error recorded by Scan; if there is none, it +// returns the error of the underlying line scanner. Without the s.err check a +// malformed line is indistinguishable from a clean end of input. +func (s *Scanner) Err() error { + if s.err != nil { + return s.err + } + return s.r.Err() +} + +func (s *Scanner) UniqueKey() []byte { return s.uniqueKey } +func (s *Scanner) Vector() []float32 { return s.vector } + +func (s *Scanner) Scan() bool { + if !s.r.Scan() { + return false + } + + seq := strings.Split(s.r.Text(), " ") + + f32 := make([]float32, len(seq)-1) + for i := 1; i < len(seq); i++ { + v, err := strconv.ParseFloat(seq[i], 32) + if err != nil { + s.err = err + return false + } + f32[i-1] = float32(v) + } + s.vector = f32 + + if strings.HasPrefix(seq[0], "0x") { + x, err := hex.DecodeString(seq[0][2:]) + if err != nil { + s.err = err + return false + } + s.uniqueKey = x + } else { + s.uniqueKey = []byte(seq[0]) + } + + return true +} diff --git a/cmd/optimum/go.mod b/cmd/optimum/go.mod new file mode 100644 index 0000000..6397678 --- /dev/null +++ b/cmd/optimum/go.mod @@ -0,0 +1,43 @@ +module github.com/kshard/optimum/cmd/optimum + +go 1.22.2 + +replace github.com/kshard/optimum => ../../ + +require ( + github.com/aws/aws-sdk-go-v2 v1.30.3 + github.com/aws/aws-sdk-go-v2/config v1.27.27 + github.com/aws/aws-sdk-go-v2/credentials v1.17.27 + github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 + github.com/fogfish/curie v1.8.2 + github.com/fogfish/gurl/v2 v2.9.0 + github.com/fogfish/gurl/x/awsapi v0.0.2 + github.com/kshard/optimum v0.0.0-00010101000000-000000000000 + github.com/schollz/progressbar/v3 v3.14.5 + github.com/spf13/cobra v1.8.1 +) + +require ( + github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 
// indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 // indirect + github.com/aws/smithy-go v1.20.3 // indirect + github.com/fogfish/schemaorg v1.22.0 // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/kshard/embeddings v0.0.3 // indirect + github.com/kshard/wreck v0.0.1 // indirect + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect + github.com/rivo/uniseg v0.4.7 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/stretchr/testify v1.9.0 // indirect + golang.org/x/net v0.26.0 // indirect + golang.org/x/sys v0.22.0 // indirect + golang.org/x/term v0.22.0 // indirect +) diff --git a/cmd/optimum/go.sum b/cmd/optimum/go.sum new file mode 100644 index 0000000..2512c75 --- /dev/null +++ b/cmd/optimum/go.sum @@ -0,0 +1,81 @@ +github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1 h1:8Qzi+0Uch1VJvdrOhJ8U8FqoPLbUdETPgMqGJ6DSMSQ= +github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= +github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP8dY= +github.com/aws/aws-sdk-go-v2 v1.30.3/go.mod h1:nIQjQVp5sfpQcTc9mPSr1B0PaWK5ByX9MOoDadSN4lc= +github.com/aws/aws-sdk-go-v2/config v1.27.27 h1:HdqgGt1OAP0HkEDDShEl0oSYa9ZZBSOmKpdpsDMdO90= +github.com/aws/aws-sdk-go-v2/config v1.27.27/go.mod h1:MVYamCg76dFNINkZFu4n4RjDixhVr51HLj4ErWzrVwg= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27 h1:2raNba6gr2IfA0eqqiP2XiQ0UVOpGPgDSi0I9iAP+UI= +github.com/aws/aws-sdk-go-v2/credentials v1.17.27/go.mod 
h1:gniiwbGahQByxan6YjQUMcW4Aov6bLC3m+evgcoN4r4= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 h1:KreluoV8FZDEtI6Co2xuNk/UqI9iwMrOx/87PBNIKqw= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvHE0Tjvn7kbxaUhl75CJi1sbfhMxkU= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15/go.mod h1:U9ke74k1n2bf+RIgoX1SXFed1HLs51OgUSs+Ph0KJP8= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 h1:C6WHdGnTDIYETAm5iErQUiVNsclNx9qbJVPIt03B6bI= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15/go.mod h1:ZQLZqhcu+JhSrA9/NXRm8SkDvsycE+JkV3WGY41e+IM= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 h1:hT8rVHwugYE2lEfdFE0QWVo81lF7jMrYJVDWI+f+VxU= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0/go.mod h1:8tu/lYfQfFe6IGnaOdrpVgEL2IrrDOf6/m9RQum4NkY= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3 h1:dT3MqvGhSoaIhRseqw2I0yH81l7wiR2vjs57O51EAm8= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.11.3/go.mod h1:GlAeCkHwugxdHaueRr4nhPuY+WW+gR8UjlcqzPr1SPI= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrxZlQ044RiM+WdoZxp0p+EGM62y3L6pwA4olE= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 h1:BXx0ZIxvrJdSgSvKTZ+yRBeSqqgPM89VPlulEcl37tM= +github.com/aws/aws-sdk-go-v2/service/sso v1.22.4/go.mod h1:ooyCOXjvJEsUw7x+ZDHeISPMhtwI3ZCB7ggFMcFfWLU= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4 h1:yiwVzJW2ZxZTurVbYWA7QOrAaCYQR72t0wrSBfoesUE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.26.4/go.mod h1:0oxfLkpz3rQ/CHlx5hB7H69YUpFiI1tql6Q6Ne+1bCw= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3 h1:ZsDKRLXGWHk8WdtyYMoGNO7bTudrvuKpDKgMVRlepGE= +github.com/aws/aws-sdk-go-v2/service/sts v1.30.3/go.mod 
h1:zwySh8fpFyXp9yOr/KVzxOl8SRqgf/IDw5aUt9UKFcQ= +github.com/aws/smithy-go v1.20.3 h1:ryHwveWzPV5BIof6fyDvor6V3iUL7nTfiTKXHiW05nE= +github.com/aws/smithy-go v1.20.3/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E= +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fogfish/curie v1.8.2 h1:+4CezyjZ5uszSXUZAV27gfKwv58w3lKTH0JbQwh3S9A= +github.com/fogfish/curie v1.8.2/go.mod h1:jPv7pg4hHd8Ug/USG29ZA2bAwlRfh/iinY90/30ATGg= +github.com/fogfish/gurl/v2 v2.9.0 h1:IZlOxZte9y+NFgKHkPYw5jS0VCNG9avu2Ywrm2c3S6k= +github.com/fogfish/gurl/v2 v2.9.0/go.mod h1:vBqw+SCrfOPNllWDCwPnuotrNeuuyTsYZ28iP13qF3Y= +github.com/fogfish/gurl/x/awsapi v0.0.2 h1:NOGFY4mPjL6DlB0xLug0LRk4xorMDls3o2pt68s8mx8= +github.com/fogfish/gurl/x/awsapi v0.0.2/go.mod h1:0iIzNQpeu/xX7ThmH4gviUD/mMY7rQao+4bWNngLcs0= +github.com/fogfish/it v0.9.1 h1:Pu+qgqBV2ilZDzZzPIbUIhMIkdpHgbGUsdEwVQvBxNQ= +github.com/fogfish/it v0.9.1/go.mod h1:NQJG4Ygvek85y7zGj0Gny8+6ygAnHjfBORhI7TdQhp4= +github.com/fogfish/it/v2 v2.0.1 h1:vu3kV2xzYDPHoMHMABxXeu5CoMcTfRc4gkWkzOUkRJY= +github.com/fogfish/it/v2 v2.0.1/go.mod h1:h5FdKaEQT4sUEykiVkB8VV4jX27XabFVeWhoDZaRZtE= +github.com/fogfish/schemaorg v1.22.0 h1:0laPbToW8lVxdx7hPgc8qukZfrewBJYNf4ffpZn/6HQ= +github.com/fogfish/schemaorg v1.22.0/go.mod h1:CDOmEVSdag/o66Y3qjFROm0mUjJxDvSzAOXQwd+ZFrs= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= 
+github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= +github.com/kshard/embeddings v0.0.3 h1:gtFTT7RyzfVrMyKNo9ZgvVdu1mdhmI4UjEiQbCBdjVY= +github.com/kshard/embeddings v0.0.3/go.mod h1:S+rTzXPtBSXrtiXoVzOkLhcsPMHIpbucTNSksJ9F3uE= +github.com/kshard/wreck v0.0.1 h1:U/vucQnpA7IgIG01xiq/6/cy57lhn3T/vQXUI4lOXSU= +github.com/kshard/wreck v0.0.1/go.mod h1:rT4tAEOaZhozTekFxTUhclfu4mLnqFgdrgrMtXw+KAI= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= +github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/schollz/progressbar/v3 v3.14.5 h1:97RrSxbBASxQuZN9yemnyGrFZ/swnG6IrEe2R0BseX8= +github.com/schollz/progressbar/v3 v3.14.5/go.mod h1:Nrzpuw3Nl0srLY0VlTvC4V6RL50pcEymjy6qyJAaLa0= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.9.0 
h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/cmd/optimum/main.go b/cmd/optimum/main.go new file mode 100644 index 0000000..ec1fdbf --- /dev/null +++ b/cmd/optimum/main.go @@ -0,0 +1,17 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package main + +import ( + "github.com/kshard/optimum/cmd/optimum/opt" +) + +func main() { + opt.Execute() +} diff --git a/cmd/optimum/opt/common/commit.go b/cmd/optimum/opt/common/commit.go new file mode 100644 index 0000000..d46d7a0 --- /dev/null +++ b/cmd/optimum/opt/common/commit.go @@ -0,0 +1,58 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. 
+// https://github.com/kshard/optimum +// + +package common + +import ( + "context" + "fmt" + "time" + + "github.com/fogfish/curie" + "github.com/kshard/optimum" + "github.com/schollz/progressbar/v3" +) + +func AboutCommit(kind, extension string) string { + return fmt.Sprintf(` +Batch writing to "%s" data structure requires commit after successful dataset +upload before dataset is available to reads. +%s +`, kind, extension) +} + +// Commit requests a commit for the given data structure, then polls the job status every IDLE_TIME until it is SUCCEEDED or FAILED; either terminal status returns nil. +func Commit(api *optimum.Client, id curie.IRI) (err error) { + receipt, err := api.Commit(context.Background(), id) + if err != nil { + return err + } + + bar := progressbar.NewOptions(-1, + progressbar.OptionSpinnerType(14), + progressbar.OptionSetDescription( + fmt.Sprintf("%s (vsn %s) | %s ...", curie.Reference(id), receipt.Version, "COMMITTING"), + ), + ) + + return spinner(bar, func() error { + for { + time.Sleep(IDLE_TIME) + + status, err := api.Status(context.Background(), receipt.Job) + if err != nil { + return err + } + + bar.Describe(fmt.Sprintf("%s (vsn %s) | %s ...", curie.Reference(id), receipt.Version, status.Status)) + if status.Status == "SUCCEEDED" || status.Status == "FAILED" { + return nil + } + } + }) +} diff --git a/cmd/optimum/opt/common/create.go b/cmd/optimum/opt/common/create.go new file mode 100644 index 0000000..38a0fe5 --- /dev/null +++ b/cmd/optimum/opt/common/create.go @@ -0,0 +1,72 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package common + +import ( + "context" + "encoding/json" + "fmt" + "os" + "time" + + "github.com/fogfish/curie" + "github.com/kshard/optimum" + "github.com/schollz/progressbar/v3" +) + +func AboutCreate(kind, extension string) string { + return fmt.Sprintf(` +Creates new instance of "%s" data structure. Omitting the configuration +parameters causes usage of default params. 
+%s +`, kind, extension) +} + +func Create(api *optimum.Client, id curie.IRI, fopts string) (err error) { + opts := map[string]any{} + + if fopts != "" { + b, err := os.ReadFile(fopts) + if err != nil { + return err + } + + if err := json.Unmarshal(b, &opts); err != nil { + return err + } + } + + receipt, err := api.Create(context.Background(), id, opts) + if err != nil { + return err + } + + bar := progressbar.NewOptions(-1, + progressbar.OptionSpinnerType(14), + progressbar.OptionSetDescription( + fmt.Sprintf("%s (vsn %s) | %s ... opts: %+v", curie.Reference(id), receipt.Version, "CREATING", opts), + ), + ) + + return spinner(bar, func() error { + for { + time.Sleep(IDLE_TIME) + + status, err := api.Status(context.Background(), receipt.Job) + if err != nil { + return err + } + + bar.Describe(fmt.Sprintf("%s (vsn %s) | %s ...", curie.Reference(id), receipt.Version, status.Status)) + if status.Status == "SUCCEEDED" || status.Status == "FAILED" { + return nil + } + } + }) +} diff --git a/cmd/optimum/opt/common/list.go b/cmd/optimum/opt/common/list.go new file mode 100644 index 0000000..53e6b33 --- /dev/null +++ b/cmd/optimum/opt/common/list.go @@ -0,0 +1,55 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package common + +import ( + "context" + "fmt" + "time" + + "github.com/fogfish/curie" + "github.com/kshard/optimum" +) + +func AboutList(kind, extension string) string { + return fmt.Sprintf(` +List all data structure instances. It fetches data structure instances of +type "%s". For each provisioned instance it reports NAME, active VERSION, +UPDATED AT timestamp, instance STATUS, PENDING version if any, and initialization +PARAMS. 
+ + optimum %[1]s list -u $HOST + + NAME VERSION UPDATED AT | STATUS PENDING | PARAMS + example1 NjqOYyOkpMHfg3.6 2024-08-18 10:40:34 | ACTIVE | {} + example2 2024-08-18 10:38:13 | PENDING NjqOYyOkpMHfg3.6 | {} + +The STATUS reflect both status of the instance and ongoing update operation: +- "UNAVAILABLE" the instance is not ready for use. +- "PENDING" the instance is pending updates, the VERSION is available online. +- "ACTIVE" the instance is active, all past updates successfully completed. +- "FAILED" PENDING update is failed, the VERSION is available online. +%s +`, kind, extension) +} + +// List all data structures of given type +func List(api *optimum.Client, kind string) (err error) { + seq, err := api.Casks(context.Background(), kind) + if err != nil { + return err + } + + fmt.Printf("%-10s\t%-16s %-19s | %-11s %-16s | %s\n", "NAME", "VERSION", "UPDATED AT", "STATUS", "PENDING", "PARAMS") + for _, x := range seq.Items { + fmt.Printf("%-10s\t%-16s %-19s | %-11s %-16s | %s\n", curie.Reference(x.ID), x.Version, x.Updated.Format(time.DateTime), x.Status, x.Pending, x.Opts) + } + + return nil +} diff --git a/cmd/optimum/opt/common/remove.go b/cmd/optimum/opt/common/remove.go new file mode 100644 index 0000000..4a979a9 --- /dev/null +++ b/cmd/optimum/opt/common/remove.go @@ -0,0 +1,34 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package common + +import ( + "context" + "fmt" + + "github.com/fogfish/curie" + "github.com/kshard/optimum" +) + +func AboutRemove(kind, extension string) string { + return fmt.Sprintf(` +The command removes "%s" data structure instance. The operation is irreversible and +results in the permanent destruction of all data. 
+%s +`, kind, extension) +} + +func Remove(api *optimum.Client, id curie.IRI) (err error) { + err = api.Remove(context.Background(), id) + if err != nil { + return err + } + + return nil +} diff --git a/cmd/optimum/opt/common/spinner.go b/cmd/optimum/opt/common/spinner.go new file mode 100644 index 0000000..28345af --- /dev/null +++ b/cmd/optimum/opt/common/spinner.go @@ -0,0 +1,40 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package common + +import ( + "time" + + "github.com/schollz/progressbar/v3" +) + +const IDLE_TIME = 20 * time.Second + +func spinner(bar *progressbar.ProgressBar, f func() error) error { + ch := make(chan bool) + + go func() { + for { + select { + case <-ch: + return + default: + bar.Add(1) + time.Sleep(40 * time.Millisecond) + } + } + }() + + err := f() + + ch <- false + bar.Finish() + + return err +} diff --git a/cmd/optimum/opt/hnsw.go b/cmd/optimum/opt/hnsw.go new file mode 100644 index 0000000..096e81d --- /dev/null +++ b/cmd/optimum/opt/hnsw.go @@ -0,0 +1,393 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. 
+// https://github.com/kshard/optimum +// + +package opt + +import ( + "bufio" + "context" + "fmt" + "io" + "os" + "strings" + + "github.com/fogfish/curie" + "github.com/kshard/optimum" + "github.com/kshard/optimum/cmd/optimum/encoding" + "github.com/kshard/optimum/cmd/optimum/opt/common" + "github.com/schollz/progressbar/v3" + "github.com/spf13/cobra" +) + +const HNSW_TYPE = "hnsw" + +func init() { + rootCmd.AddCommand(hnswCmd) + + hnswCmd.AddCommand(hnswListCmd) + + hnswCmd.AddCommand(hnswCreateCmd) + hnswCreateCmd.Flags().StringVarP(&hnswOpts, "json", "j", "", "json config file") + + hnswCmd.AddCommand(hnswCommitCmd) + + hnswCmd.AddCommand(hnswUploadCmd) + hnswUploadCmd.Flags().IntVar(&hnswUploadBuf, "buf", 4, "upload buffer in MB (default 4MB)") + + hnswCmd.AddCommand(hnswQueryCmd) + hnswQueryCmd.Flags().StringVarP(&hnswQueryContent, "text", "t", "", "hash to text associated list, useful for debug purposes") + + hnswCmd.AddCommand(hnswRemoveCmd) +} + +var ( + hnswOpts string + hnswUploadBuf int + hnswQueryContent string +) + +var hnswCmd = &cobra.Command{ + Use: "hnsw", + Short: "Operates `hnsw` data structures.", + Long: ` +The HNSW (Hierarchical Navigable Small World) algorithm is widely applicable in +areas that require efficient nearest neighbor searches, particularly in +high-dimensional spaces. Below are some key areas where HNSW is applicable: + +* Text Search and Retrieval: HNSW can be used in search engines and document +retrieval systems to quickly find similar documents on high-dimensional +text embeddings. + +* Content-Based Recommendations: HNSW is useful for finding similar items in +recommendation systems, such as finding related products, movies, or music +tracks based on embeddings vectors. It helps quickly locate users or items +with similar behavior patterns. 
+ +* Personalized Content: When a system needs to recommend personalized content +(e.g., news articles, blog posts), HNSW can quickly find the most relevant +content based on a user's preferences or behavior. + +* Image and Video Retrieval: In tasks like image search or video retrieval, +HNSW is used to find images or frames similar to a given query image, +based on feature vectors extracted from deep learning models. + +* Semantic Search: In NLP, HNSW is used to find semantically similar phrases, +sentences, or documents by comparing embeddings generated by text models. + +* Chatbots and Conversational AI: It can be used to match user queries to a set +of predefined responses or intents based on vector similarity. + +* Fraud Detection: HNSW can be used to detect anomalies in financial transactions +by identifying transactions that are distant from normal patterns. + +* Intrusion Detection: In cybersecurity, it helps to find unusual patterns in +network traffic that might indicate security breaches. 
+`, + SilenceUsage: true, + Run: hnsw, +} + +func hnsw(cmd *cobra.Command, args []string) { + cmd.Help() +} + +//------------------------------------------------------------------------------ + +var hnswListCmd = &cobra.Command{ + Use: "list", + Short: "List all instances of `hnsw` data structure.", + Long: common.AboutList("hnsw", ""), + Example: ` +optimum hnsw list -u $HOST +optimum hnsw list -u $HOST -r $ROLE +`, + SilenceUsage: true, + RunE: hnswList, +} + +func hnswList(cmd *cobra.Command, args []string) (err error) { + cli, err := stack() + if err != nil { + return err + } + + return common.List(optimum.New(cli, host), HNSW_TYPE) +} + +//------------------------------------------------------------------------------ + +var hnswCreateCmd = &cobra.Command{ + Use: "create", + Short: "Create new instance of `hnsw` data structure.", + Long: common.AboutCreate("hnsw", ` +The algorithm "hnsw" is an efficient and scalable method for approximate nearest +neighbor search in high-dimensional spaces. + +Config algorithm through primary parameters: + - "M" and "M0" controls the maximum number of connections per node, balancing + between memory usage and search efficiency. M0 defines the connection + density on the graph's base layer, while M regulates it on the intermediate + layers. + + - "efConstruction" determines the number of candidate nodes evaluated during + graph construction, influencing both the construction time and the accuracy + of the graph. + + - "surface" is vector distance function. 
+ +Example configuration: + { + "m": 8, // number in range of [4, 1024] + "m0": 64, // number in range of [4, 1024] + "efConstruction": 200, // number in range of [200, 1000] + "surface": "cosine" // enum {"cosine", "euclidean"} + } + +`), + Example: ` +optimum hnsw create -u $HOST -n example -j path/to/config.json +optimum hnsw create -u $HOST -r $ROLE -n example -j path/to/config.json +`, + SilenceUsage: true, + RunE: hnswCreate, +} + +func hnswCreate(cmd *cobra.Command, args []string) (err error) { + cli, err := stack() + if err != nil { + return err + } + + return common.Create(optimum.New(cli, host), curie.New("%s:%s", HNSW_TYPE, name), hnswOpts) +} + +//------------------------------------------------------------------------------ + +var hnswCommitCmd = &cobra.Command{ + Use: "commit", + Short: "Commit earlier uploaded datasets into `hnsw` instance.", + Long: common.AboutCommit("hnsw", ""), + Example: ` +optimum hnsw commit -u $HOST -n example +optimum hnsw commit -u $HOST -r $ROLE -n example +`, + SilenceUsage: true, + RunE: hnswCommit, +} + +func hnswCommit(cmd *cobra.Command, args []string) (err error) { + cli, err := stack() + if err != nil { + return err + } + + return common.Commit(optimum.New(cli, host), curie.New("%s:%s", HNSW_TYPE, name)) +} + +//------------------------------------------------------------------------------ + +var hnswUploadCmd = &cobra.Command{ + Use: "upload", + Short: "Upload `hnsw` datasets.", + Long: ` +Upload "hnsw" dataset to server. It accepts only textual format to represent +embedding vectors. Each line of the file should start with unique key, followed +by the corresponding vector. The unique key length should not exceeding 32 bytes: + + example_key_a 0.24116 ... -0.26098 -0.0079604 + example_key_b 0.34601 ... -0.66865 -0.0486001 + +We recommend using sha1, uuid or https://github.com/fogfish/guid as unique key. +The format allows hexadecimal encoding for keys, if it starts with "0x" prefix. 
+ + 0xd857f9dc157c28e8e07c569c5992dee4f3486b4c -0.097231 ... -0.001681 0.154977 + 0xaeb3e05ab60520cd947455f2130d6cf1f6103243 -0.008007 ... -0.098503 0.057056 + +`, + Example: ` +optimum hnsw upload -u $HOST -n example path/to/data.txt +optimum hnsw upload -u $HOST -r $ROLE -n example path/to/data.txt +`, + SilenceUsage: true, + Args: cobra.ExactArgs(1), + RunE: hnswUpload, +} + +func hnswUpload(cmd *cobra.Command, args []string) (err error) { + fd, err := os.Open(args[0]) + if err != nil { + return err + } + defer fd.Close() + + fi, err := fd.Stat() + if err != nil { + return err + } + + cli, err := stack() + if err != nil { + return err + } + + stream := optimum.NewStream(cli, host, curie.New("%s:%s", HNSW_TYPE, name), hnswUploadBuf*1024*1024) + + r := io.TeeReader(fd, + progressbar.DefaultBytes( + fi.Size(), + "==> uploading", + ), + ) + + scanner := encoding.New(r) + for scanner.Scan() { + err := stream.Write(context.Background(), + optimum.Vector{ + UniqueKey: scanner.UniqueKey(), + Vec: scanner.Vector(), + }, + ) + if err != nil { + return err + } + } + + if err := scanner.Err(); err != nil { + return err + } + + if err := stream.Sync(context.Background()); err != nil { + return err + } + + return nil +} + +//------------------------------------------------------------------------------ + +var hnswQueryCmd = &cobra.Command{ + Use: "query", + Short: "Query instance of `hnsw` data structure.", + Long: ` +Query "hnsw" data structure instance. It accepts textual format as input, where +each line is embedding vector to query. Each line of the file should start with +identity of query, followed by the corresponding vector: + + example_query_a 0.24116 ... -0.26098 -0.0079604 + example_query_b 0.34601 ... -0.66865 -0.0486001 + +The file format is identical to the upload and can be re-used as is. 
+`, + Example: ` +optimum hnsw query -u $HOST -n example path/to/query.txt +optimum hnsw query -u $HOST -r $ROLE -n example path/to/query.txt +optimum hnsw query -u $HOST -r $ROLE -n example -t path/to/text-map.txt path/to/query.txt +`, + SilenceUsage: true, + Args: cobra.ExactArgs(1), + RunE: hnswQuery, +} + +func hnswQuery(cmd *cobra.Command, args []string) (err error) { + hashmap := hnswTextHashMap() + + fd, err := os.Open(args[0]) + if err != nil { + return err + } + defer fd.Close() + + cli, err := stack() + if err != nil { + return err + } + + api := optimum.New(cli, host) + + scanner := encoding.New(fd) + for scanner.Scan() { + query := optimum.Query{Query: scanner.Vector()} + rs, err := api.Query(context.Background(), curie.New("%s:%s", HNSW_TYPE, name), query) + if err != nil { + return err + } + + id := fmt.Sprintf("0x%x", scanner.UniqueKey()) + fmt.Printf("Query %s (took %s) | %s (vsn %s, size %d)\n", id, rs.Took, rs.Version.Cask, rs.Version.Version, rs.Version.Size) + for _, hit := range rs.Hits { + hid := fmt.Sprintf("0x%x", hit.UniqueKey) + fmt.Printf(" %f : %32s \n", hit.Rank, hid) + } + + if hashmap != nil { + fmt.Printf("\n\nQuery (took %s) > %s\n", rs.Took, hnswTextValue(hashmap, id)) + for _, hit := range rs.Hits { + hid := fmt.Sprintf("0x%x", hit.UniqueKey) + fmt.Printf(" %f : %s\n", hit.Rank, hnswTextValue(hashmap, hid)) + } + } + } + + if err := scanner.Err(); err != nil { + return err + } + + return nil +} + +func hnswTextHashMap() map[string]string { + if hnswQueryContent == "" { + return nil + } + + fd, err := os.Open(hnswQueryContent) + if err != nil { + return nil + } + defer fd.Close() + + hashmap := map[string]string{} + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + seq := strings.SplitN(scanner.Text(), " ", 2) + hashmap[seq[0]] = seq[1] + } + + return hashmap +} + +func hnswTextValue(hashmap map[string]string, key string) string { + if val, has := hashmap[key]; has { + return val + } + + return key +} + 
+//------------------------------------------------------------------------------ + +var hnswRemoveCmd = &cobra.Command{ + Use: "remove", + Short: "Remove instance of `hnsw` data structure.", + Long: common.AboutRemove("hnsw", ""), + Example: ` +optimum hnsw commit -u $HOST -n example +optimum hnsw commit -u $HOST -r $ROLE -n example +`, + SilenceUsage: true, + RunE: hnswRemove, +} + +func hnswRemove(cmd *cobra.Command, args []string) (err error) { + cli, err := stack() + if err != nil { + return err + } + + return common.Remove(optimum.New(cli, host), curie.New("%s:%s", HNSW_TYPE, name)) +} diff --git a/cmd/optimum/opt/root.go b/cmd/optimum/opt/root.go new file mode 100644 index 0000000..db7b1b2 --- /dev/null +++ b/cmd/optimum/opt/root.go @@ -0,0 +1,107 @@ +// +// Copyright (C) 2024 Dmitry Kolesnikov +// +// This file may be modified and distributed under the terms +// of the MIT license. See the LICENSE file for details. +// https://github.com/kshard/optimum +// + +package opt + +import ( + "context" + "fmt" + "os" + "strings" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/credentials/stscreds" + "github.com/aws/aws-sdk-go-v2/service/sts" + "github.com/fogfish/gurl/v2/http" + "github.com/fogfish/gurl/x/awsapi" + "github.com/spf13/cobra" +) + +// Execute is entry point for cobra cli application +func Execute() { + if err := rootCmd.Execute(); err != nil { + e := err.Error() + fmt.Println(strings.ToUpper(e[:1]) + e[1:]) + os.Exit(1) + } +} + +func init() { + rootCmd.PersistentFlags().StringVarP(&host, "url", "u", "", "url to remote data structure management server") + rootCmd.PersistentFlags().StringVarP(&name, "name", "n", "", "unique name of data structure, use only alpha-numeric symbols.") + rootCmd.PersistentFlags().StringVarP(&role, "role", "r", "", "access identity, ARN of AWS IAM Role") + rootCmd.PersistentFlags().BoolVar(&debug, "debug", false, "enable debug output") +} + +var ( + host string + 
name string + role string + debug bool +) + +var rootCmd = &cobra.Command{ + Use: "optimum", + Short: "client for managing cloud data structures", + Long: ` +The command line client for managing cloud data structures. The data structure +is a collection, referred to as casks. Each cask is implemented based on +a specific data structure algorithm (class) and is assigned a unique name +along with configuration properties. This utility helps cask management on your +behalf. + +The command line utility requires access to remote server that provisions and +operates data structures for you. Contact your provided for details. + +It is recommended to config environment variables for client usage: + + export HOST=https://example.com + export ROLE=arn:aws:iam::000000000000:role/example-access-role + + `, + Run: root, +} + +func root(cmd *cobra.Command, args []string) { + cmd.Help() +} + +//------------------------------------------------------------------------------ + +func stack() (http.Stack, error) { + opts := []http.Config{} + + if debug { + opts = append(opts, http.WithDebugPayload()) + } + + cfg, err := config.LoadDefaultConfig(context.Background()) + if err != nil { + return nil, err + } + + if role == "" { + opts = append(opts, awsapi.WithSignatureV4(cfg)) + } else { + assumed, err := config.LoadDefaultConfig(context.Background(), + config.WithCredentialsProvider( + aws.NewCredentialsCache( + stscreds.NewAssumeRoleProvider(sts.NewFromConfig(cfg), role), + ), + ), + ) + if err != nil { + return nil, err + } + + opts = append(opts, awsapi.WithSignatureV4(assumed)) + } + + return http.New(opts...), nil +} diff --git a/doc/hnsw.md b/doc/hnsw.md new file mode 100644 index 0000000..8304229 --- /dev/null +++ b/doc/hnsw.md @@ -0,0 +1,96 @@ +# Hierarchical Navigable Small World + +## Example use-cases + +The HNSW (Hierarchical Navigable Small World) algorithm is widely applicable in areas that require efficient nearest neighbor searches, particularly in high-dimensional 
spaces. Below are some key areas where HNSW is applicable: + +- **Information Retrieval** + - *Text Search and Retrieval*: HNSW can be used in search engines and document retrieval systems to quickly find similar documents on high-dimensional text embeddings. +- **Recommendation Systems** + - *Content-Based Recommendations*: HNSW is useful for finding similar items in recommendation systems, such as finding related products, movies, or music tracks based on item embeddings. + - *Collaborative Filtering*: It can also be applied in collaborative filtering to quickly locate users or items with similar behavior patterns. + - *Personalized Content*: When a system needs to recommend personalized content (e.g., news articles, blog posts), HNSW can quickly find the most relevant content based on a user’s preferences or behavior. +- **Computer Vision** + - *Image and Video Retrieval*: In tasks like image search or video retrieval, HNSW is used to find images or frames similar to a given query image, based on feature vectors extracted from deep learning models. + - *Object Detection and Recognition*: For recognizing objects in real-time, HNSW helps to match objects in a database of known features quickly. +- **Natural Language Processing (NLP)** + - *Semantic Search*: In NLP, HNSW is used to find semantically similar phrases, sentences, or documents by comparing embeddings generated by text models. + - *Chatbots and Conversational AI*: It can be used to match user queries to a set of predefined responses or intents based on vector similarity. +- **Anomaly Detection** + - *Fraud Detection*: HNSW can be used to detect anomalies in financial transactions by identifying transactions that are distant from normal patterns. + - *Intrusion Detection*: In cybersecurity, it helps to find unusual patterns in network traffic that might indicate security breaches. 
+- **Audio and Signal Processing** + - *Music Recommendation*: HNSW is applied to match audio tracks with similar sound characteristics in music recommendation services. + - *Speaker Verification and Identification*: In voice recognition systems, HNSW helps match voice embeddings to identify speakers. +- **Geospatial Applications** + - *Location-Based Services*: In geospatial data analysis, HNSW can be used to find similar geographical regions or locations based on feature vectors representing various attributes like terrain, climate, etc. +- **Robotics and Autonomous Systems** + - *Simultaneous Localization and Mapping (SLAM)*: HNSW aids in real-time mapping by finding similar points in a high-dimensional space, which is crucial for navigation and obstacle avoidance in robotics. +- **Gaming and Virtual Reality** + - *Player Matching*: HNSW can be used to match players with similar skill levels in multiplayer games. + - *Virtual Object Recognition*: In augmented and virtual reality systems, HNSW aids in real-time recognition and placement of virtual objects based on feature similarity. +- **Healthcare** + - *Medical Image Retrieval*: HNSW can be used to retrieve similar medical images (e.g., X-rays, MRIs) for diagnostic purposes. + - *Patient Similarity Search*: It helps in identifying patients with similar conditions by comparing patient profiles based on multiple factors. + +Overall, HNSW is well-suited for applications that require fast and accurate similarity searches in large and high-dimensional datasets: +- **Scalability**: HNSW is highly scalable, capable of handling large datasets efficiently. +- **High Accuracy**: It provides close approximations to exact nearest neighbors, making it suitable for real-time applications. +- **Efficiency**: Its hierarchical graph structure allows for fast queries, especially in high-dimensional spaces where traditional methods struggle. 
+ +## Create data structure instance + +```bash +optimum hnsw create -u $HOST -n example -j path/to/config.json +``` + +The algorithm "hnsw" is an efficient and scalable method for approximate nearest neighbor search in high-dimensional spaces. It has a few configuration parameters to steer its accuracy: + +- "M" and "M0" control the maximum number of connections per node, balancing between memory usage and search efficiency. M0 defines the connection density on the graph's base layer, while M regulates it on the intermediate layers. + +- "efConstruction" determines the number of candidate nodes evaluated during + graph construction, influencing both the construction time and the accuracy + of the graph. + +- "surface" is the vector distance function. Only cosine and euclidean distances are supported. + +Example configuration: + +```json +{ + "m": 8, // number in range of [4, 1024] + "m0": 64, // number in range of [4, 1024] + "efConstruction": 200, // number in range of [200, 1000] + "surface": "cosine" // enum {"cosine", "euclidean"} +} +``` + +## Writing to data structure instance (batch mode) + +Batch writing consists of two phases - data upload followed by a commit. + +```bash +# Upload data into server. +optimum hnsw upload -u $HOST -n example path/to/data.txt + +# Commit uploaded data, making it available online. +optimum hnsw commit -u $HOST -n example +``` + +The uploaded **file format** is a textual representation of embedding vectors. Each line of the file should start with a unique key, followed by the corresponding vector. The unique key length should not exceed 32 bytes. We recommend using sha1, uuid or https://github.com/fogfish/guid as the unique key. + +``` +example_key_a 0.24116 ... -0.26098 -0.0079604 +example_key_b 0.34601 ... -0.66865 -0.0486001 +``` + +The format allows hexadecimal encoding for keys, if it starts with "0x" prefix. + +``` +0xd857f9dc157c28e8e07c569c5992dee4f3486b4c -0.097231 ... -0.001681 0.154977 +0xaeb3e05ab60520cd947455f2130d6cf1f6103243 -0.008007 ... 
-0.098503 0.057056 +``` + +## Other operations + +See Golang interface for details about data retrieval. \ No newline at end of file diff --git a/doc/optimum.svg b/doc/optimum.svg new file mode 100644 index 0000000..22d10f5 --- /dev/null +++ b/doc/optimum.svg @@ -0,0 +1,43 @@ + + + + diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..266024d --- /dev/null +++ b/go.mod @@ -0,0 +1,16 @@ +module github.com/kshard/optimum + +go 1.22.2 + +require ( + github.com/fogfish/curie v1.8.2 + github.com/fogfish/gurl/v2 v2.9.0 + github.com/fogfish/schemaorg v1.22.0 + github.com/kshard/wreck v0.0.1 +) + +require ( + github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1 // indirect + github.com/google/go-cmp v0.6.0 // indirect + golang.org/x/net v0.26.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..447be8b --- /dev/null +++ b/go.sum @@ -0,0 +1,18 @@ +github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1 h1:8Qzi+0Uch1VJvdrOhJ8U8FqoPLbUdETPgMqGJ6DSMSQ= +github.com/ajg/form v1.5.2-0.20200323032839-9aeb3cf462e1/go.mod h1:uL1WgH+h2mgNtvBq0339dVnzXdBETtL2LeUXaIv25UY= +github.com/fogfish/curie v1.8.2 h1:+4CezyjZ5uszSXUZAV27gfKwv58w3lKTH0JbQwh3S9A= +github.com/fogfish/curie v1.8.2/go.mod h1:jPv7pg4hHd8Ug/USG29ZA2bAwlRfh/iinY90/30ATGg= +github.com/fogfish/gurl/v2 v2.9.0 h1:IZlOxZte9y+NFgKHkPYw5jS0VCNG9avu2Ywrm2c3S6k= +github.com/fogfish/gurl/v2 v2.9.0/go.mod h1:vBqw+SCrfOPNllWDCwPnuotrNeuuyTsYZ28iP13qF3Y= +github.com/fogfish/it v0.9.1 h1:Pu+qgqBV2ilZDzZzPIbUIhMIkdpHgbGUsdEwVQvBxNQ= +github.com/fogfish/it v0.9.1/go.mod h1:NQJG4Ygvek85y7zGj0Gny8+6ygAnHjfBORhI7TdQhp4= +github.com/fogfish/it/v2 v2.0.1 h1:vu3kV2xzYDPHoMHMABxXeu5CoMcTfRc4gkWkzOUkRJY= +github.com/fogfish/it/v2 v2.0.1/go.mod h1:h5FdKaEQT4sUEykiVkB8VV4jX27XabFVeWhoDZaRZtE= +github.com/fogfish/schemaorg v1.22.0 h1:0laPbToW8lVxdx7hPgc8qukZfrewBJYNf4ffpZn/6HQ= +github.com/fogfish/schemaorg v1.22.0/go.mod h1:CDOmEVSdag/o66Y3qjFROm0mUjJxDvSzAOXQwd+ZFrs= +github.com/google/go-cmp 
// Client is the REST client of the data structure management server. It
// embeds the HTTP stack used to execute requests and remembers the server
// authority that every request URL is built from.
type Client struct {
	http.Stack

	host ø.Authority
}

// New creates a Client that talks to host using the given HTTP stack.
func New(stack http.Stack, host string) *Client {
	return &Client{
		Stack: stack,
		host:  ø.Authority(host),
	}
}

// Casks lists the instances of the given schema (e.g. "hnsw").
//
// It issues GET {host}/ds/{schema}, expects 200 OK and decodes the JSON
// response into Instances.
func (api *Client) Casks(ctx context.Context, schema string) (*Instances, error) {
	return http.IO[Instances](
		api.WithContext(ctx),
		http.GET(
			ø.URI("%s/ds/%s", api.host, schema),
			ø.Accept.JSON,

			ƒ.Status.OK,
		),
	)
}

// Create requests a new instance identified by cask.
//
// It issues POST {host}/ds/{prefix of cask} with a JSON body carrying the
// reference part of cask as the name and opts as the configuration, expects
// 202 Accepted and decodes the JSON response into Created.
func (api *Client) Create(ctx context.Context, cask curie.IRI, opts map[string]any) (*Created, error) {
	return http.IO[Created](
		api.WithContext(ctx),
		http.POST(
			ø.URI("%s/ds/%s", api.host, curie.Prefix(cask)),
			ø.Accept.JSON,
			ø.ContentType.JSON,
			ø.Send(create{
				Name: curie.Reference(cask),
				Opts: opts,
			}),

			ƒ.Status.Accepted,
		),
	)
}

// Commit requests a commit of the cask.
//
// It issues POST {host}/ds/{prefix}/{reference} with the JSON body
// {"cursor": "latest"}, expects 202 Accepted and decodes the JSON response
// into Committed.
func (api *Client) Commit(ctx context.Context, cask curie.IRI) (*Committed, error) {
	return http.IO[Committed](
		api.WithContext(ctx),
		http.POST(
			ø.URI("%s/ds/%s/%s", api.host, curie.Prefix(cask), curie.Reference(cask)),
			ø.Accept.JSON,
			ø.ContentType.JSON,
			ø.Send(commit{Cursor: "latest"}),

			ƒ.Status.Accepted,
		),
	)
}

// Status fetches the status of a job.
//
// The job URL is used as a path appended to the client's host. The request
// is a GET that expects 200 OK; the JSON response is decoded into JobStatus.
func (api *Client) Status(ctx context.Context, job schemaorg.Url) (*JobStatus, error) {
	return http.IO[JobStatus](
		api.WithContext(ctx),
		http.GET(
			ø.URI("%s%s", api.host, ø.Path(job)),
			ø.Accept.JSON,

			ƒ.Status.OK,
		),
	)
}

// Remove requests removal of the cask.
//
// It issues DELETE {host}/ds/{prefix}/{reference} and expects 202 Accepted.
// No response body is decoded.
func (api *Client) Remove(ctx context.Context, cask curie.IRI) error {
	return api.IO(ctx,
		http.DELETE(
			ø.URI("%s/ds/%s/%s", api.host, curie.Prefix(cask), curie.Reference(cask)),
			ø.Accept.JSON,

			ƒ.Status.Accepted,
		),
	)
}

// Query runs q against the cask.
//
// It issues GET {host}/ds/{prefix}/{reference} with q as the JSON body,
// expects 200 OK and decodes the JSON response into Result.
//
// NOTE(review): the query travels in the body of a GET request; confirm that
// the server and any intermediaries in front of it accept GET bodies.
func (api *Client) Query(ctx context.Context, cask curie.IRI, q Query) (*Result, error) {
	return http.IO[Result](
		api.WithContext(ctx),
		http.GET(
			ø.URI("%s/ds/%s/%s", api.host, curie.Prefix(cask), curie.Reference(cask)),
			ø.Accept.JSON,
			ø.ContentType.JSON,
			ø.Send(q),

			ƒ.Status.OK,
		),
	)
}
// Stream is a batching upload client for a single cask. Vectors are encoded
// by a wreck writer, compressed with gzip into an in-memory buffer and sent
// to the server by Sync with an HTTP PUT.
type Stream struct {
	http.Stack

	host ø.Authority // server authority the upload URL is built from
	cask curie.IRI   // target cask; its prefix and reference form the URL path

	chunk int                    // buffer length at which Write triggers Sync
	buf   *bytes.Buffer          // receives the gzip output; sent as the request body
	zip   *gzip.Writer           // compresses into buf
	seq   *wreck.Writer[float32] // encodes vectors into zip
}

// NewStream creates an upload stream for cask on host using the given HTTP
// stack. chunk is the buffer length, in bytes, at which Write flushes the
// accumulated data to the server.
func NewStream(stack http.Stack, host string, cask curie.IRI, chunk int) *Stream {
	stream := &Stream{
		Stack: stack,
		host:  ø.Authority(host),
		cask:  cask,
		chunk: chunk,
	}

	// Writers are chained: seq -> zip -> buf.
	stream.buf = &bytes.Buffer{}
	stream.zip = gzip.NewWriter(stream.buf)
	stream.seq = wreck.NewWriter[float32](stream.zip)

	return stream
}

// Write appends the vector to the local buffer and calls Sync once the
// buffer length has reached the configured chunk size.
//
// NOTE(review): buf only holds what the gzip writer has emitted so far, so
// the threshold is presumably compared against compressed, already flushed
// bytes rather than the amount of data written — confirm this is intended.
func (stream *Stream) Write(ctx context.Context, v Vector) error {
	if err := stream.seq.Write(v.UniqueKey, v.SortKey, v.Vec); err != nil {
		return err
	}

	if stream.buf.Len() >= stream.chunk {
		return stream.Sync(ctx)
	}

	return nil
}

// Sync closes the gzip stream and uploads the buffer with
// PUT {host}/ds/{prefix}/{reference} as application/octet-stream, expecting
// 202 Accepted. The last step of the request pipeline resets the buffer and
// re-arms the gzip writer on it so the stream can be written to again.
//
// NOTE(review): gzip.Writer.Close emits the gzip header and trailer even if
// nothing was written, so the empty-buffer guard below looks unreachable
// after a successful Close — confirm. Also, the reset presumably runs only
// when the preceding steps succeed, which would leave the gzip writer closed
// after a failed upload — verify against the gurl pipeline semantics.
func (stream *Stream) Sync(ctx context.Context) (err error) {
	if err := stream.zip.Close(); err != nil {
		return err
	}

	if stream.buf.Len() == 0 {
		return nil
	}

	return stream.Stack.IO(ctx,
		http.PUT(
			ø.URI("%s/ds/%s/%s", stream.host, curie.Prefix(stream.cask), curie.Reference(stream.cask)),
			ø.Accept.JSON,
			ø.ContentType.Set("application/octet-stream"),
			ø.Send(stream.buf),

			ƒ.Status.Accepted,
			// Prepare the buffer and the gzip writer for the next batch.
			func(ctx *http.Context) error {
				stream.buf.Reset()
				stream.zip.Reset(stream.buf)
				return nil
			},
		),
	)
}

// TextStream is an upload stream for texts: it pairs an embeddings provider
// with a Stream that receives the resulting vectors.
type TextStream struct {
	api    Embeddings
	stream *Stream
}

// Embeddings converts a text into its embedding vector.
type Embeddings interface {
	Embedding(ctx context.Context, text string) ([]float32, error)
}

// NewTextStream creates a TextStream that obtains embeddings from api and
// writes them to stream.
func NewTextStream(api Embeddings, stream *Stream) *TextStream {
	return &TextStream{
		api:    api,
		stream: stream,
	}
}
// Instances is the response of Client.Casks: the list of instances.
type Instances struct {
	Items []Instance `json:"items,omitempty"`
}

// Instance describes a single data structure instance as reported by the
// server.
type Instance struct {
	ID      curie.IRI `json:"id"`
	Opts    string    `json:"opts"`
	Status  string    `json:"status"`
	Updated time.Time `json:"updated"`
	Version string    `json:"version"`
	Pending string    `json:"pending"`
}

// create is the request body of Client.Create.
type create struct {
	Name string         `json:"name"`
	Opts map[string]any `json:"opts"`
}

// Created is the response of Client.Create. Job is a URL that can be passed
// to Client.Status.
type Created struct {
	Version string        `json:"version,omitempty"`
	Job     schemaorg.Url `json:"job"`
}

// commit is the request body of Client.Commit.
type commit struct {
	Cursor string `json:"cursor"`
}

// Committed is the response of Client.Commit. Job is a URL that can be
// passed to Client.Status.
type Committed struct {
	Version string        `json:"version,omitempty"`
	Job     schemaorg.Url `json:"job"`
}

// JobStatus is the response of Client.Status.
// NOTE(review): the timestamps are plain strings; their format is defined by
// the server — confirm before parsing them.
type JobStatus struct {
	Status  string `json:"status,omitempty"`
	Reason  string `json:"reason,omitempty"`
	Created string `json:"created,omitempty"`
	Started string `json:"started,omitempty"`
	Stopped string `json:"stopped,omitempty"`
}

// Vector is a single record accepted by Stream.Write: the keys identifying
// the record and the vector itself.
type Vector struct {
	UniqueKey []uint8   `json:"id,omitempty"`
	SortKey   []uint8   `json:"sk,omitempty"`
	Vec       []float32 `json:"v"`
}

// Query is the request body of Client.Query. Query holds the vector to
// search for.
// NOTE(review): K, EfSearch and Distance presumably tune the number of hits,
// the search breadth and a distance cut-off; zero values are omitted from
// the JSON — confirm the server-side defaults.
type Query struct {
	K        int       `json:"k,omitempty"`
	EfSearch int       `json:"efSearch,omitempty"`
	Distance float32   `json:"distance,omitempty"`
	Query    []float32 `json:"query"`
}

// Result is the response of Client.Query: the time taken, the version of the
// cask that answered and the list of hits.
type Result struct {
	Took    time.Duration `json:"took,omitempty"`
	Version Version       `json:"version,omitempty"`
	Hits    []Hit         `json:"hits,omitempty"`
}

// Hit is a single entry of a query result: the keys of the matched record
// and its rank.
type Hit struct {
	UniqueKey []uint8 `json:"key,omitempty"`
	SortKey   []uint8 `json:"sort,omitempty"`
	Rank      float32 `json:"rank"`
}

// Version identifies the cask snapshot that served a query.
type Version struct {
	Cask    string `json:"cask"`
	Version string `json:"version"`
	Size    int    `json:"size"`
}