Skip to content

Commit

Permalink
Improve: Support multi-column vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Dec 2, 2023
1 parent 21db294 commit 66f1716
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 57 deletions.
156 changes: 103 additions & 53 deletions python/lib_sqlite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
*/
#include <stringzilla.h>

#include <charconv>
#include <charconv> // `std::from_chars`
#include <cstdlib> // `std::strtod`

#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1
Expand All @@ -24,28 +25,37 @@ template <> struct parsed_scalar_kind_gt<scalar_kind_t::f64_k> {

template <scalar_kind_t scalar_kind_ak, metric_kind_t metric_kind_ak>
static void sqlite_dense(sqlite3_context* context, int argc, sqlite3_value** argv) {
if (argc != 2) {
sqlite3_result_error(context, "Wrong number of arguments", -1);

if (argc < 2) {
sqlite3_result_error(context, "Distance function expects at least two arguments", -1);
return;
}

int type1 = sqlite3_value_type(argv[0]);
int type2 = sqlite3_value_type(argv[1]);
if (type1 != type2) {
sqlite3_result_error(context, "Arguments types don't match", -1);
return;

// Our primary case is having two BLOBs containing dense vector representations.
if (argc == 2 && type1 == SQLITE_BLOB && type2 == SQLITE_BLOB) {
void const* vec1 = sqlite3_value_blob(argv[0]);
void const* vec2 = sqlite3_value_blob(argv[1]);
int bytes1 = sqlite3_value_bytes(argv[0]);
int bytes2 = sqlite3_value_bytes(argv[1]);
if (bytes1 != bytes2) {
sqlite3_result_error(context, "Vectors have different number of dimensions", -1);
return;
}

std::size_t dimensions = (size_t)(bytes1)*CHAR_BIT / bits_per_scalar(scalar_kind_ak);
metric_t metric = metric_t(dimensions, metric_kind_ak, scalar_kind_ak);
distance_punned_t distance =
metric(reinterpret_cast<byte_t const*>(vec1), reinterpret_cast<byte_t const*>(vec2));
sqlite3_result_double(context, distance);
}

if (type1 == SQLITE_NULL)
sqlite3_result_error(context, "Only BLOB types are supported, received a NULL", -1);
else if (type1 == SQLITE_INTEGER)
sqlite3_result_error(context, "Only BLOB types are supported, received an INTEGER", -1);
else if (type1 == SQLITE_FLOAT)
sqlite3_result_error(context, "Only BLOB types are supported, received a FLOAT", -1);
// Textual JSON objects
else if (type1 == SQLITE_TEXT) {
char const* vec1 = (char const*)sqlite3_value_text(argv[0]);
char const* vec2 = (char const*)sqlite3_value_text(argv[1]);
// Worst case is to have JSON arrays or comma-separated values
else if (argc == 2 && type1 == SQLITE_TEXT && type2 == SQLITE_TEXT) {
char* vec1 = (char*)sqlite3_value_text(argv[0]);
char* vec2 = (char*)sqlite3_value_text(argv[1]);
size_t bytes1 = (size_t)sqlite3_value_bytes(argv[0]);
size_t bytes2 = (size_t)sqlite3_value_bytes(argv[1]);
size_t commas1 = sz_count_char_swar(vec1, bytes1, ",");
Expand Down Expand Up @@ -77,19 +87,35 @@ static void sqlite_dense(sqlite3_context* context, int argc, sqlite3_value** arg
while (bytes2 && vec2[0] == ' ')
++vec2, --bytes2;

// Parse the floating-point numbers
// Parse the floating-point numbers
// Sadly, many standard library implementations don't provide the floating-point overloads of `std::from_chars` yet
#if __cpp_lib_to_chars
std::from_chars_result result1 = std::from_chars(vec1, vec1 + bytes1, parsed1[i]);
std::from_chars_result result2 = std::from_chars(vec2, vec2 + bytes2, parsed2[i]);
if (result1.ec != std::errc() || result2.ec != std::errc()) {
sqlite3_result_error(context, "Number can't be parsed", -1);
return;
}

// Skip the number, whitespaces, and commas
bytes1 -= result1.ptr - vec1;
bytes2 -= result2.ptr - vec2;
vec1 = result1.ptr;
vec2 = result2.ptr;
vec1 = (char*)result1.ptr;
vec2 = (char*)result2.ptr;
#else
char* parsed1_end = vec1 + bytes1;
parsed1[i] = std::strtod(vec1, &parsed1_end);
char* parsed2_end = vec2 + bytes2;
parsed2[i] = std::strtod(vec2, &parsed2_end);
if (vec1 == parsed1_end || vec2 == parsed2_end) {
sqlite3_result_error(context, "Number can't be parsed", -1);
return;
}
bytes1 -= parsed1_end - vec1;
bytes2 -= parsed2_end - vec2;
vec1 = parsed1_end;
vec2 = parsed2_end;
#endif

// Skip the whitespaces and commas
while (bytes1 && (vec1[0] == ' ' || vec1[0] == ','))
++vec1, --bytes1;
while (bytes2 && (vec2[0] == ' ' || vec2[0] == ','))
Expand All @@ -102,24 +128,43 @@ static void sqlite_dense(sqlite3_context* context, int argc, sqlite3_value** arg
metric(reinterpret_cast<byte_t const*>(parsed1), reinterpret_cast<byte_t const*>(parsed2));
sqlite3_result_double(context, distance);
}
// Binary objects
else if (type1 == SQLITE_BLOB) {
void const* vec1 = sqlite3_value_blob(argv[0]);
void const* vec2 = sqlite3_value_blob(argv[1]);
int bytes1 = sqlite3_value_bytes(argv[0]);
int bytes2 = sqlite3_value_bytes(argv[1]);
if (bytes1 != bytes2) {
sqlite3_result_error(context, "Vectors have different number of dimensions", -1);
return;

// Less efficient, yet still common case is to have many scalar columns
else if (argc % 2 == 0) {

// Allocate vectors on stack and parse floating-point values into them
using scalar_t = typename parsed_scalar_kind_gt<scalar_kind_ak>::type;
size_t dimensions = argc / 2;
scalar_t parsed1[dimensions], parsed2[dimensions];
for (size_t i = 0; i != dimensions; ++i) {
switch (sqlite3_value_type(argv[i])) {
case SQLITE_FLOAT: parsed1[i] = sqlite3_value_double(argv[i]); break;
case SQLITE_INTEGER: parsed1[i] = sqlite3_value_int(argv[i]); break;
case SQLITE_NULL: parsed1[i] = 0; break;
default:
sqlite3_result_error(context, "Scalar columns may only contain 32-bit integers, floats, or NULLs.", -1);
return;
}
switch (sqlite3_value_type(argv[dimensions + i])) {
case SQLITE_FLOAT: parsed2[i] = sqlite3_value_double(argv[dimensions + i]); break;
case SQLITE_INTEGER: parsed2[i] = sqlite3_value_int(argv[dimensions + i]); break;
case SQLITE_NULL: parsed2[i] = 0; break;
default:
sqlite3_result_error(context, "Scalar columns may only contain 32-bit integers, floats, or NULLs.", -1);
return;
}
}

std::size_t dimensions = (size_t)(bytes1)*CHAR_BIT / bits_per_scalar(scalar_kind_ak);
metric_t metric = metric_t(dimensions, metric_kind_ak, scalar_kind_ak);
// Compute the distance itself
metric_t metric = metric_t(dimensions, metric_kind_ak, parsed_scalar_kind_gt<scalar_kind_ak>::kind);
distance_punned_t distance =
metric(reinterpret_cast<byte_t const*>(vec1), reinterpret_cast<byte_t const*>(vec2));
metric(reinterpret_cast<byte_t const*>(parsed1), reinterpret_cast<byte_t const*>(parsed2));
sqlite3_result_double(context, distance);
} else
sqlite3_result_error(context, "Unknown argument types", -1);
}
// Unsupported arguments combination
else {
sqlite3_result_error(context, "Number of columns in two vectors must be divisible by two", -1);
}
}

extern "C" PYBIND11_MAYBE_UNUSED PYBIND11_EXPORT int sqlite3_compiled_init( //
Expand All @@ -129,46 +174,51 @@ extern "C" PYBIND11_MAYBE_UNUSED PYBIND11_EXPORT int sqlite3_compiled_init( //
SQLITE_EXTENSION_INIT2(api)

int flags = SQLITE_UTF8 | SQLITE_DETERMINISTIC | SQLITE_INNOCUOUS;
int num_params = -1; // Any number will be accepted

sqlite3_create_function(db, "distance_hamming_binary", 2, flags, NULL,
sqlite3_create_function(db, "distance_hamming_binary", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::b1x8_k, metric_kind_t::hamming_k>, NULL, NULL);
sqlite3_create_function(db, "distance_jaccard_binary", 2, flags, NULL,
sqlite3_create_function(db, "distance_jaccard_binary", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::b1x8_k, metric_kind_t::jaccard_k>, NULL, NULL);
sqlite3_create_function(db, "distance_haversine_meters", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f64_k, metric_kind_t::haversine_k>, NULL, NULL);
// sqlite3_create_function(db, "distance_levenshtein", num_params, flags, NULL,
// sqlite_dense<scalar_kind_t::u8_k, metric_kind_t::haversine_k>, NULL, NULL);

sqlite3_create_function(db, "distance_sqeuclidean_f64", 2, flags, NULL,
sqlite3_create_function(db, "distance_sqeuclidean_f64", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f64_k, metric_kind_t::l2sq_k>, NULL, NULL);
sqlite3_create_function(db, "distance_cosine_f64", 2, flags, NULL,
sqlite3_create_function(db, "distance_cosine_f64", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f64_k, metric_kind_t::cos_k>, NULL, NULL);
sqlite3_create_function(db, "distance_inner_f64", 2, flags, NULL,
sqlite3_create_function(db, "distance_inner_f64", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f64_k, metric_kind_t::ip_k>, NULL, NULL);
sqlite3_create_function(db, "distance_divergence_f64", 2, flags, NULL,
sqlite3_create_function(db, "distance_divergence_f64", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f64_k, metric_kind_t::divergence_k>, NULL, NULL);

sqlite3_create_function(db, "distance_sqeuclidean_f32", 2, flags, NULL,
sqlite3_create_function(db, "distance_sqeuclidean_f32", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f32_k, metric_kind_t::l2sq_k>, NULL, NULL);
sqlite3_create_function(db, "distance_cosine_f32", 2, flags, NULL,
sqlite3_create_function(db, "distance_cosine_f32", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f32_k, metric_kind_t::cos_k>, NULL, NULL);
sqlite3_create_function(db, "distance_inner_f32", 2, flags, NULL,
sqlite3_create_function(db, "distance_inner_f32", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f32_k, metric_kind_t::ip_k>, NULL, NULL);
sqlite3_create_function(db, "distance_divergence_f32", 2, flags, NULL,
sqlite3_create_function(db, "distance_divergence_f32", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f32_k, metric_kind_t::divergence_k>, NULL, NULL);

sqlite3_create_function(db, "distance_sqeuclidean_f16", 2, flags, NULL,
sqlite3_create_function(db, "distance_sqeuclidean_f16", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f16_k, metric_kind_t::l2sq_k>, NULL, NULL);
sqlite3_create_function(db, "distance_cosine_f16", 2, flags, NULL,
sqlite3_create_function(db, "distance_cosine_f16", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f16_k, metric_kind_t::cos_k>, NULL, NULL);
sqlite3_create_function(db, "distance_inner_f16", 2, flags, NULL,
sqlite3_create_function(db, "distance_inner_f16", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f16_k, metric_kind_t::ip_k>, NULL, NULL);
sqlite3_create_function(db, "distance_divergence_f16", 2, flags, NULL,
sqlite3_create_function(db, "distance_divergence_f16", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::f16_k, metric_kind_t::divergence_k>, NULL, NULL);

sqlite3_create_function(db, "distance_sqeuclidean_i8", 2, flags, NULL,
sqlite3_create_function(db, "distance_sqeuclidean_i8", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::i8_k, metric_kind_t::l2sq_k>, NULL, NULL);
sqlite3_create_function(db, "distance_cosine_i8", 2, flags, NULL,
sqlite3_create_function(db, "distance_cosine_i8", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::i8_k, metric_kind_t::cos_k>, NULL, NULL);
sqlite3_create_function(db, "distance_inner_i8", 2, flags, NULL,
sqlite3_create_function(db, "distance_inner_i8", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::i8_k, metric_kind_t::ip_k>, NULL, NULL);
sqlite3_create_function(db, "distance_divergence_i8", 2, flags, NULL,
sqlite3_create_function(db, "distance_divergence_i8", num_params, flags, NULL,
sqlite_dense<scalar_kind_t::i8_k, metric_kind_t::divergence_k>, NULL, NULL);

return SQLITE_OK;
Expand Down
69 changes: 65 additions & 4 deletions python/scripts/test_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
from usearch.index import Match, Matches, BatchMatches, Index, Indexes


batch_sizes = [1, 3, 20]
dimensions = [3, 97, 256]
batch_sizes = [1, 77, 100]


def test_sqlite_distances():
@pytest.mark.parametrize("num_vectors", batch_sizes)
@pytest.mark.parametrize("ndim", dimensions)
def test_sqlite_distances_in_high_dimensions(num_vectors: int, ndim: int):
conn = sqlite3.connect(":memory:")
conn.enable_load_extension(True)
conn.load_extension(usearch.sqlite)
Expand All @@ -36,8 +38,6 @@ def test_sqlite_distances():
)

# Generate and insert random vectors
num_vectors = 3 # Number of vectors to generate
dim = 4 # Dimension of each vector
vectors = []

for i in range(num_vectors):
Expand Down Expand Up @@ -78,3 +78,64 @@ def test_sqlite_distances():
for a, b, similarity_json, similarity_f32, similarity_f16 in cursor.fetchall():
assert math.isclose(similarity_json, similarity_f32, abs_tol=0.1)
assert math.isclose(similarity_json, similarity_f16, abs_tol=0.1)

@pytest.mark.parametrize("num_vectors", batch_sizes)
def test_sqlite_distances_in_low_dimensions(num_vectors: int):
    """Exercise the SQLite distance functions on vectors stored as scalar columns.

    Every vector dimension lives in its own FLOAT column, so the distance
    functions are invoked with four scalar arguments: the first two columns
    of row `a`, followed by the first two columns of row `b`.

    :param num_vectors: Number of random 4-dimensional rows to insert.
    """
    conn = sqlite3.connect(":memory:")
    conn.enable_load_extension(True)
    conn.load_extension(usearch.sqlite)

    cursor = conn.cursor()

    # Create a table where each vector dimension is stored in a separate FLOAT column
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS vector_table (
            id INTEGER PRIMARY KEY,
            vector_d0 FLOAT,
            vector_d1 FLOAT,
            vector_d2 FLOAT,
            vector_d3 FLOAT,
            description TEXT
        )
        """
    )

    # Generate and insert random vectors
    for _ in range(num_vectors):
        # Generate a random 4-dimensional vector
        vector = np.random.rand(4)

        # Insert the vector into the database as scalars. The parameters must be
        # a real sequence: `(*vector)` without a trailing comma is a syntax error,
        # and `.tolist()` hands plain Python floats to the `sqlite3` bindings.
        cursor.execute(
            """
            INSERT INTO vector_table (vector_d0, vector_d1, vector_d2, vector_d3) VALUES (?, ?, ?, ?)
            """,
            tuple(vector.tolist()),
        )

    # Commit changes
    conn.commit()

    similarities = """
    SELECT
        a.id AS id1,
        b.id AS id2,
        distance_cosine_f32(a.vector_d0, a.vector_d1, b.vector_d0, b.vector_d1) AS cosine_similarity_f32,
        distance_cosine_f16(a.vector_d0, a.vector_d1, b.vector_d0, b.vector_d1) AS cosine_similarity_f16,
        distance_haversine_meters(a.vector_d0, a.vector_d1, b.vector_d0, b.vector_d1) AS haversine_meters
    FROM
        vector_table AS a,
        vector_table AS b
    WHERE
        a.id < b.id;
    """
    cursor.execute(similarities)
    rows = cursor.fetchall()

    # The `a.id < b.id` self-join yields every unordered pair of rows exactly once
    assert len(rows) == num_vectors * (num_vectors - 1) // 2

    for _id1, _id2, similarity_f32, similarity_f16, _haversine_meters in rows:
        # Both precisions receive identical scalar inputs, so the results should
        # agree within the same tolerance the high-dimensional test uses.
        assert math.isclose(similarity_f32, similarity_f16, abs_tol=0.1)

    conn.close()

0 comments on commit 66f1716

Please sign in to comment.