Skip to content

Commit

Permalink
Add support for numpy.ndarray for vector field (#904) (#937)
Browse files Browse the repository at this point in the history
See also: #890

Signed-off-by: XuanYang-cn <xuan.yang@zilliz.com>
  • Loading branch information
XuanYang-cn authored Apr 2, 2022
1 parent 1195bbd commit 7073536
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 84 deletions.
177 changes: 110 additions & 67 deletions examples/hello_milvus.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
"source": [
"# hello_milvus Demo\n",
"\n",
" hello_milvus.ipynb demonstrates the basic operations of PyMilvus, a Python SDK of Milvus.\n",
" Before running, make sure that you have a running Milvus instance.\n",
"`hello_milvus.ipynb` demonstrates the basic operations of PyMilvus, a Python SDK of Milvus.\n",
"Before running, make sure that you have a running Milvus instance.\n",
"\n",
"1. connect to Milvus\n",
"2. create collection\n",
Expand All @@ -24,7 +24,7 @@
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import numpy as np\n",
"import time\n",
"\n",
"from pymilvus import (\n",
Expand All @@ -35,7 +35,8 @@
")\n",
"\n",
"fmt = \"\\n=== {:30} ===\\n\"\n",
"search_latency_fmt = \"search latency = {:.4f}s\""
"search_latency_fmt = \"search latency = {:.4f}s\"\n",
"num_entities, dim = 3000, 8"
]
},
{
Expand All @@ -44,8 +45,9 @@
"source": [
"## 1. connect to Milvus\n",
"\n",
"Add a new connection alias `default` for Milvus server in `localhost:19530`. Actually the \"default\" alias is a buildin in PyMilvus. If the address of Milvus is the same as `localhost:19530`, you can omit all\n",
"parameters and call the method as: `connections.connect()`.\n",
"Add a new connection alias `default` for Milvus server in `localhost:19530`. \n",
"\n",
"Actually the `default` alias is built into PyMilvus. If the address of Milvus is the same as `localhost:19530`, you can omit all parameters and call the method as: `connections.connect()`.\n",
"\n",
"Note: the `using` parameter of the following methods defaults to \"default\"."
]
Expand Down Expand Up @@ -77,11 +79,11 @@
"## 2. create collection\n",
"We're going to create a collection with 3 fields.\n",
"\n",
"| | field name | field type | other attributes | field description |\n",
"|---| :--------: | :----------: | :----------------: | :----------------------------: |\n",
"|1| \"pk\" | Int64 | is_primary=True, auto_id=False | \"primary field\" |\n",
"|2| \"random\" | Double | | \"a double field\" |\n",
"|3|\"embeddings\"| FloatVector| dim=8 | \"float vector with dim 8\" |"
"| |field name |field type |other attributes | field description |\n",
"|---|:----------:|:---------:|:----------------------------:|:-----------------------:|\n",
"|1 | \"pk\" | Int64 |is_primary=True, auto_id=False| \"primary field\" |\n",
"|2 | \"random\" | Double | | \"a double field\" |\n",
"|3 |\"embeddings\"|FloatVector| dim=8 |\"float vector with dim 8\"|"
]
},
{
Expand All @@ -93,12 +95,12 @@
"fields = [\n",
" FieldSchema(name=\"pk\", dtype=DataType.INT64, is_primary=True, auto_id=False),\n",
" FieldSchema(name=\"random\", dtype=DataType.DOUBLE),\n",
" FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=8)\n",
" FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=dim)\n",
"]\n",
"\n",
"schema = CollectionSchema(fields, \"hello_milvus is the simplest demo to introduce the APIs\")\n",
"\n",
"hello_milvus = Collection(\"hello_milvus\", schema, consistency_level=\"Strong\")\n"
"hello_milvus = Collection(\"hello_milvus\", schema, consistency_level=\"Strong\")"
]
},
{
Expand Down Expand Up @@ -128,11 +130,12 @@
}
],
"source": [
"rng = np.random.default_rng(seed=19530)\n",
"entities = [\n",
" # provide the pk field because `auto_id` is set to False\n",
" [i for i in range(3000)],\n",
" [float(random.randrange(-20, -10)) for _ in range(3000)], # field random\n",
" [[random.random() for _ in range(8)] for _ in range(3000)], # field embeddings\n",
" [i for i in range(num_entities)],\n",
" rng.random(num_entities).tolist(), # field random, only supports list\n",
" rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list\n",
"]\n",
"\n",
"insert_result = hello_milvus.insert(entities)\n",
Expand Down Expand Up @@ -193,45 +196,38 @@
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"hello_milvus.load()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Search based on vector similarity**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Start searching based on vector similarity ===\n",
"\n",
"hit: (distance: 0.0, id: 2998), random field: -20.0\n",
"hit: (distance: 0.1339951753616333, id: 1871), random field: -13.0\n",
"hit: (distance: 0.16615478694438934, id: 1180), random field: -16.0\n",
"hit: (distance: 0.0, id: 2999), random field: -16.0\n",
"hit: (distance: 0.10607236623764038, id: 764), random field: -11.0\n",
"hit: (distance: 0.14412546157836914, id: 750), random field: -11.0\n",
"search latency = 0.3159s\n",
"\n",
"=== Start querying with `random > -14` ===\n",
"\n",
"query result:\n",
"-{'pk': 0, 'random': -13.0, 'embeddings': [0.07525, 0.534547, 0.778204, 0.646336, 0.800183, 0.998726, 0.545411, 0.631751]}\n",
"search latency = 0.2571s\n",
"\n",
"=== Start hybrid searching with `random > -12` ===\n",
"\n",
"hit: (distance: 0.3116421699523926, id: 801), random field: -11.0\n",
"hit: (distance: 0.34958416223526, id: 568), random field: -11.0\n",
"hit: (distance: 0.3618723750114441, id: 1105), random field: -11.0\n",
"hit: (distance: 0.10607236623764038, id: 764), random field: -11.0\n",
"hit: (distance: 0.14412546157836914, id: 750), random field: -11.0\n",
"hit: (distance: 0.29973354935646057, id: 2716), random field: -11.0\n",
"search latency = 0.1434s\n"
"hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911\n",
"hit: (distance: 0.08883658051490784, id: 1262), random field: 0.2978858685751561\n",
"hit: (distance: 0.09590047597885132, id: 1265), random field: 0.3042039939240304\n",
"hit: (distance: 0.0, id: 2999), random field: 0.02316334456872482\n",
"hit: (distance: 0.05628091096878052, id: 1580), random field: 0.3855988746044062\n",
"hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368\n",
"search latency = 0.3126s\n"
]
}
],
"source": [
"hello_milvus.load()\n",
"\n",
"# search based on vector similarity\n",
"print(fmt.format(\"Start searching based on vector similarity\"))\n",
"vectors_to_search = entities[-1][-2:]\n",
"search_params = {\n",
" \"metric_type\": \"l2\",\n",
Expand All @@ -245,31 +241,79 @@
"for hits in result:\n",
" for hit in hits:\n",
" print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
"print(search_latency_fmt.format(end_time - start_time))\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# query based on scalar filtering(boolean, int, etc.)\n",
"print(fmt.format(\"Start querying with `random > -14`\"))\n",
"print(search_latency_fmt.format(end_time - start_time))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Query based on scalar filtering (boolean, int, etc.)**\n",
"\n",
"Start querying with `random > 0.5`"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query result:\n",
"-{'pk': 7, 'random': 0.6025374094941409, 'embeddings': [0.234543, 0.100673, 0.30042, 0.142694, 0.296201, 0.546544, 0.157546, 0.373746]}\n",
"search latency = 0.4626s\n"
]
}
],
"source": [
"start_time = time.time()\n",
"result = hello_milvus.query(expr=\"random > -14\", output_fields=[\"random\", \"embeddings\"])\n",
"result = hello_milvus.query(expr=\"random > 0.5\", output_fields=[\"random\", \"embeddings\"])\n",
"end_time = time.time()\n",
"\n",
"print(f\"query result:\\n-{result[0]}\")\n",
"print(search_latency_fmt.format(end_time - start_time))\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# hybrid search\n",
"print(fmt.format(\"Start hybrid searching with `random > -12`\"))\n",
"print(search_latency_fmt.format(end_time - start_time))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Hybrid search**\n",
"\n",
"Start hybrid searching with `random > 0.5`"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911\n",
"hit: (distance: 0.14606499671936035, id: 747), random field: 0.5648774800635661\n",
"hit: (distance: 0.1530652642250061, id: 2527), random field: 0.8928974315571507\n",
"hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368\n",
"hit: (distance: 0.20354536175727844, id: 2034), random field: 0.5526117606328499\n",
"hit: (distance: 0.21908017992973328, id: 958), random field: 0.6647383716417955\n",
"search latency = 0.3048s\n"
]
}
],
"source": [
"start_time = time.time()\n",
"result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, expr=\"random > -12\", output_fields=[\"random\"])\n",
"result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, expr=\"random > 0.5\", output_fields=[\"random\"])\n",
"end_time = time.time()\n",
"\n",
"for hits in result:\n",
" for hit in hits:\n",
" print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
"print(search_latency_fmt.format(end_time - start_time))\n"
"print(search_latency_fmt.format(end_time - start_time))"
]
},
{
Expand All @@ -282,16 +326,16 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"query before delete by expr=`pk in [2, 3]` -> result: \n",
"-{'pk': 2, 'random': -14.0, 'embeddings': [0.976175, 0.088528, 0.806287, 0.004207, 0.30336, 0.298667, 0.279592, 0.421679]}\n",
"-{'pk': 3, 'random': -20.0, 'embeddings': [0.230225, 0.149853, 0.704977, 0.938874, 0.092708, 0.104514, 0.839864, 0.235236]}\n",
"-{'pk': 3, 'random': 0.468666676812172, 'embeddings': [0.602599, 0.836988, 0.148322, 0.704132, 0.42856, 0.797848, 0.985722, 0.404425]}\n",
"-{'pk': 2, 'random': 0.1321158395732429, 'embeddings': [0.108341, 0.722564, 0.648116, 0.045293, 0.330672, 0.009141, 0.455942, 0.407452]}\n",
"\n",
"query after delete by expr=`pk in [2, 3]` -> result: []\n",
"\n"
Expand Down Expand Up @@ -321,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -334,7 +378,7 @@
"hash": "08729c1b09183f4f38e81bb10929f98dd3c2fb886eeaa68ef9ddc9d2071f5c86"
},
"kernelspec": {
"display_name": "Python 3.7.0 64-bit ('pymilvus': conda)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand All @@ -348,9 +392,8 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"orig_nbformat": 4
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
Expand Down
21 changes: 11 additions & 10 deletions examples/hello_milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
# 5. search, query, and hybrid search on entities
# 6. delete entities by PK
# 7. drop collection

import random
import time

import numpy as np
from pymilvus import (
connections,
utility,
Expand All @@ -19,6 +18,7 @@

fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim = 3000, 8

#################################################################################
# 1. connect to Milvus
Expand Down Expand Up @@ -50,7 +50,7 @@
fields = [
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=8)
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
]

schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")
Expand All @@ -66,13 +66,14 @@
# The insert() method returns:
# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
# - or the existing primary key field from the entities if auto_id=False in the schema.

print(fmt.format("Start inserting entities"))
num_entities = 3000
rng = np.random.default_rng(seed=19530)
entities = [
# provide the pk field because `auto_id` is set to False
[i for i in range(num_entities)],
[float(random.randrange(-20, -10)) for _ in range(num_entities)], # field random
[[random.random() for _ in range(8)] for _ in range(num_entities)], # field embeddings
rng.random(num_entities).tolist(), # field random, only supports list
rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list
]

insert_result = hello_milvus.insert(entities)
Expand Down Expand Up @@ -124,21 +125,21 @@

# -----------------------------------------------------------------------------
# query based on scalar filtering(boolean, int, etc.)
print(fmt.format("Start querying with `random > -14`"))
print(fmt.format("Start querying with `random > 0.5`"))

start_time = time.time()
result = hello_milvus.query(expr="random > -14", output_fields=["random", "embeddings"])
result = hello_milvus.query(expr="random > 0.5", output_fields=["random", "embeddings"])
end_time = time.time()

print(f"query result:\n-{result[0]}")
print(search_latency_fmt.format(end_time - start_time))

# -----------------------------------------------------------------------------
# hybrid search
print(fmt.format("Start hybrid searching with `random > -12`"))
print(fmt.format("Start hybrid searching with `random > 0.5`"))

start_time = time.time()
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > -12", output_fields=["random"])
result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"])
end_time = time.time()

for hits in result:
Expand Down
Loading

0 comments on commit 7073536

Please sign in to comment.