Add support for numpy.ndarray for vector field (#904) (#937)

See also: #890 Signed-off-by: XuanYang-cn <xuan.yang@zilliz.com>
milvus-io · Apr 2, 2022 · 7073536 · 7073536
1 parent 1195bbd
commit 7073536
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 84 deletions.
diff --git a/examples/hello_milvus.ipynb b/examples/hello_milvus.ipynb
@@ -6,8 +6,8 @@
    "source": [
     "# hello_milvus Demo\n",
     "\n",
-    "    hello_milvus.ipynb demonstrates the basic operations of PyMilvus, a Python SDK of Milvus.\n",
-    "    Before running, make sure that you have a running Milvus instance.\n",
+    "`hello_milvus.ipynb` demonstrates the basic operations of PyMilvus, a Python SDK of Milvus.\n",
+    "Before running, make sure that you have a running Milvus instance.\n",
     "\n",
     "1. connect to Milvus\n",
     "2. create collection\n",
@@ -24,7 +24,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import random\n",
+    "import numpy as np\n",
     "import time\n",
     "\n",
     "from pymilvus import (\n",
@@ -35,7 +35,8 @@
     ")\n",
     "\n",
     "fmt = \"\\n=== {:30} ===\\n\"\n",
-    "search_latency_fmt = \"search latency = {:.4f}s\""
+    "search_latency_fmt = \"search latency = {:.4f}s\"\n",
+    "num_entities, dim = 3000, 8"
    ]
   },
   {
@@ -44,8 +45,9 @@
    "source": [
     "## 1. connect to Milvus\n",
     "\n",
-    "Add a new connection alias `default` for Milvus server in `localhost:19530`. Actually the \"default\" alias is a buildin in PyMilvus. If the address of Milvus is the same as `localhost:19530`, you can omit all\n",
-    "parameters and call the method as: `connections.connect()`.\n",
+    "Add a new connection alias `default` for Milvus server in `localhost:19530`. \n",
+    "\n",
+    "Actually the `default` alias is built into PyMilvus. If the address of Milvus is the same as `localhost:19530`, you can omit all parameters and call the method as: `connections.connect()`.\n",
     "\n",
     "Note: the `using` parameter of the following methods is default to \"default\"."
    ]
@@ -77,11 +79,11 @@
     "## 2. create collection\n",
     "We're going to create a collection with 3 fields.\n",
     "\n",
-    "| | field name | field type | other attributes |       field description      |\n",
-    "|---| :--------: | :----------: | :----------------: | :----------------------------: |\n",
-    "|1|    \"pk\"    |    Int64   |  is_primary=True, auto_id=False |      \"primary field\"         |\n",
-    "|2|  \"random\"  |    Double  |                  |      \"a double field\"        |\n",
-    "|3|\"embeddings\"| FloatVector|     dim=8        |  \"float vector with dim 8\"   |"
+    "|   |field name  |field type |other attributes              |  field description      |\n",
+    "|---|:----------:|:---------:|:----------------------------:|:-----------------------:|\n",
+    "|1  |    \"pk\"    |   Int64   |is_primary=True, auto_id=False|      \"primary field\"    |\n",
+    "|2  |  \"random\"  |   Double  |                              |      \"a double field\"   |\n",
+    "|3  |\"embeddings\"|FloatVector|     dim=8                    |\"float vector with dim 8\"|"
    ]
   },
   {
@@ -93,12 +95,12 @@
     "fields = [\n",
     "    FieldSchema(name=\"pk\", dtype=DataType.INT64, is_primary=True, auto_id=False),\n",
     "    FieldSchema(name=\"random\", dtype=DataType.DOUBLE),\n",
-    "    FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=8)\n",
+    "    FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=dim)\n",
     "]\n",
     "\n",
     "schema = CollectionSchema(fields, \"hello_milvus is the simplest demo to introduce the APIs\")\n",
     "\n",
-    "hello_milvus = Collection(\"hello_milvus\", schema, consistency_level=\"Strong\")\n"
+    "hello_milvus = Collection(\"hello_milvus\", schema, consistency_level=\"Strong\")"
    ]
   },
   {
@@ -128,11 +130,12 @@
     }
    ],
    "source": [
+    "rng = np.random.default_rng(seed=19530)\n",
     "entities = [\n",
     "    # provide the pk field because `auto_id` is set to False\n",
-    "    [i for i in range(3000)],\n",
-    "    [float(random.randrange(-20, -10)) for _ in range(3000)],  # field random\n",
-    "    [[random.random() for _ in range(8)] for _ in range(3000)],  # field embeddings\n",
+    "    [i for i in range(num_entities)],\n",
+    "    rng.random(num_entities).tolist(),  # field random, only supports list\n",
+    "    rng.random((num_entities, dim)),    # field embeddings, supports numpy.ndarray and list\n",
     "]\n",
     "\n",
     "insert_result = hello_milvus.insert(entities)\n",
@@ -193,45 +196,38 @@
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "hello_milvus.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Search based on vector similarity**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "=== Start searching based on vector similarity ===\n",
-      "\n",
-      "hit: (distance: 0.0, id: 2998), random field: -20.0\n",
-      "hit: (distance: 0.1339951753616333, id: 1871), random field: -13.0\n",
-      "hit: (distance: 0.16615478694438934, id: 1180), random field: -16.0\n",
-      "hit: (distance: 0.0, id: 2999), random field: -16.0\n",
-      "hit: (distance: 0.10607236623764038, id: 764), random field: -11.0\n",
-      "hit: (distance: 0.14412546157836914, id: 750), random field: -11.0\n",
-      "search latency = 0.3159s\n",
-      "\n",
-      "=== Start querying with `random > -14` ===\n",
-      "\n",
-      "query result:\n",
-      "-{'pk': 0, 'random': -13.0, 'embeddings': [0.07525, 0.534547, 0.778204, 0.646336, 0.800183, 0.998726, 0.545411, 0.631751]}\n",
-      "search latency = 0.2571s\n",
-      "\n",
-      "=== Start hybrid searching with `random > -12` ===\n",
-      "\n",
-      "hit: (distance: 0.3116421699523926, id: 801), random field: -11.0\n",
-      "hit: (distance: 0.34958416223526, id: 568), random field: -11.0\n",
-      "hit: (distance: 0.3618723750114441, id: 1105), random field: -11.0\n",
-      "hit: (distance: 0.10607236623764038, id: 764), random field: -11.0\n",
-      "hit: (distance: 0.14412546157836914, id: 750), random field: -11.0\n",
-      "hit: (distance: 0.29973354935646057, id: 2716), random field: -11.0\n",
-      "search latency = 0.1434s\n"
+      "hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911\n",
+      "hit: (distance: 0.08883658051490784, id: 1262), random field: 0.2978858685751561\n",
+      "hit: (distance: 0.09590047597885132, id: 1265), random field: 0.3042039939240304\n",
+      "hit: (distance: 0.0, id: 2999), random field: 0.02316334456872482\n",
+      "hit: (distance: 0.05628091096878052, id: 1580), random field: 0.3855988746044062\n",
+      "hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368\n",
+      "search latency = 0.3126s\n"
      ]
     }
    ],
    "source": [
-    "hello_milvus.load()\n",
-    "\n",
-    "# search based on vector similarity\n",
-    "print(fmt.format(\"Start searching based on vector similarity\"))\n",
     "vectors_to_search = entities[-1][-2:]\n",
     "search_params = {\n",
     "    \"metric_type\": \"l2\",\n",
@@ -245,31 +241,79 @@
     "for hits in result:\n",
     "    for hit in hits:\n",
     "        print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
-    "print(search_latency_fmt.format(end_time - start_time))\n",
-    "\n",
-    "# -----------------------------------------------------------------------------\n",
-    "# query based on scalar filtering(boolean, int, etc.)\n",
-    "print(fmt.format(\"Start querying with `random > -14`\"))\n",
+    "print(search_latency_fmt.format(end_time - start_time))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Query based on scalar filtering(boolean, int, etc.)**\n",
     "\n",
+    "Start querying with `random > 0.5`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "query result:\n",
+      "-{'pk': 7, 'random': 0.6025374094941409, 'embeddings': [0.234543, 0.100673, 0.30042, 0.142694, 0.296201, 0.546544, 0.157546, 0.373746]}\n",
+      "search latency = 0.4626s\n"
+     ]
+    }
+   ],
+   "source": [
     "start_time = time.time()\n",
-    "result = hello_milvus.query(expr=\"random > -14\", output_fields=[\"random\", \"embeddings\"])\n",
+    "result = hello_milvus.query(expr=\"random > 0.5\", output_fields=[\"random\", \"embeddings\"])\n",
     "end_time = time.time()\n",
     "\n",
     "print(f\"query result:\\n-{result[0]}\")\n",
-    "print(search_latency_fmt.format(end_time - start_time))\n",
-    "\n",
-    "# -----------------------------------------------------------------------------\n",
-    "# hybrid search\n",
-    "print(fmt.format(\"Start hybrid searching with `random > -12`\"))\n",
+    "print(search_latency_fmt.format(end_time - start_time))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Hybrid search**\n",
     "\n",
+    "Start hybrid searching with `random > 0.5`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "hit: (distance: 0.0, id: 2998), random field: 0.9728033590489911\n",
+      "hit: (distance: 0.14606499671936035, id: 747), random field: 0.5648774800635661\n",
+      "hit: (distance: 0.1530652642250061, id: 2527), random field: 0.8928974315571507\n",
+      "hit: (distance: 0.08096685260534286, id: 2377), random field: 0.8745922204004368\n",
+      "hit: (distance: 0.20354536175727844, id: 2034), random field: 0.5526117606328499\n",
+      "hit: (distance: 0.21908017992973328, id: 958), random field: 0.6647383716417955\n",
+      "search latency = 0.3048s\n"
+     ]
+    }
+   ],
+   "source": [
     "start_time = time.time()\n",
-    "result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, expr=\"random > -12\", output_fields=[\"random\"])\n",
+    "result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, expr=\"random > 0.5\", output_fields=[\"random\"])\n",
     "end_time = time.time()\n",
     "\n",
     "for hits in result:\n",
     "    for hit in hits:\n",
     "        print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
-    "print(search_latency_fmt.format(end_time - start_time))\n"
+    "print(search_latency_fmt.format(end_time - start_time))"
    ]
   },
   {
@@ -282,16 +326,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "query before delete by expr=`pk in [2, 3]` -> result: \n",
-      "-{'pk': 2, 'random': -14.0, 'embeddings': [0.976175, 0.088528, 0.806287, 0.004207, 0.30336, 0.298667, 0.279592, 0.421679]}\n",
-      "-{'pk': 3, 'random': -20.0, 'embeddings': [0.230225, 0.149853, 0.704977, 0.938874, 0.092708, 0.104514, 0.839864, 0.235236]}\n",
+      "-{'pk': 3, 'random': 0.468666676812172, 'embeddings': [0.602599, 0.836988, 0.148322, 0.704132, 0.42856, 0.797848, 0.985722, 0.404425]}\n",
+      "-{'pk': 2, 'random': 0.1321158395732429, 'embeddings': [0.108341, 0.722564, 0.648116, 0.045293, 0.330672, 0.009141, 0.455942, 0.407452]}\n",
       "\n",
       "query after delete by expr=`pk in [2, 3]` -> result: []\n",
       "\n"
@@ -321,7 +365,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -334,7 +378,7 @@
    "hash": "08729c1b09183f4f38e81bb10929f98dd3c2fb886eeaa68ef9ddc9d2071f5c86"
   },
   "kernelspec": {
-   "display_name": "Python 3.7.0 64-bit ('pymilvus': conda)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -348,9 +392,8 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  },
-  "orig_nbformat": 4
+   "version": "3.6.9"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2

diff --git a/examples/hello_milvus.py b/examples/hello_milvus.py
@@ -6,10 +6,9 @@
 # 5. search, query, and hybrid search on entities
 # 6. delete entities by PK
 # 7. drop collection
-
-import random
 import time
 
+import numpy as np
 from pymilvus import (
     connections,
     utility,
@@ -19,6 +18,7 @@
 
 fmt = "\n=== {:30} ===\n"
 search_latency_fmt = "search latency = {:.4f}s"
+num_entities, dim = 3000, 8
 
 #################################################################################
 # 1. connect to Milvus
@@ -50,7 +50,7 @@
 fields = [
     FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
     FieldSchema(name="random", dtype=DataType.DOUBLE),
-    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=8)
+    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
 ]
 
 schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")
@@ -66,13 +66,14 @@
 # The insert() method returns:
 # - either automatically generated primary keys by Milvus if auto_id=True in the schema;
 # - or the existing primary key field from the entities if auto_id=False in the schema.
+
 print(fmt.format("Start inserting entities"))
-num_entities = 3000
+rng = np.random.default_rng(seed=19530)
 entities = [
     # provide the pk field because `auto_id` is set to False
     [i for i in range(num_entities)],
-    [float(random.randrange(-20, -10)) for _ in range(num_entities)],  # field random
-    [[random.random() for _ in range(8)] for _ in range(num_entities)],  # field embeddings
+    rng.random(num_entities).tolist(),  # field random, only supports list
+    rng.random((num_entities, dim)),    # field embeddings, supports numpy.ndarray and list
 ]
 
 insert_result = hello_milvus.insert(entities)
@@ -124,21 +125,21 @@
 
 # -----------------------------------------------------------------------------
 # query based on scalar filtering(boolean, int, etc.)
-print(fmt.format("Start querying with `random > -14`"))
+print(fmt.format("Start querying with `random > 0.5`"))
 
 start_time = time.time()
-result = hello_milvus.query(expr="random > -14", output_fields=["random", "embeddings"])
+result = hello_milvus.query(expr="random > 0.5", output_fields=["random", "embeddings"])
 end_time = time.time()
 
 print(f"query result:\n-{result[0]}")
 print(search_latency_fmt.format(end_time - start_time))
 
 # -----------------------------------------------------------------------------
 # hybrid search
-print(fmt.format("Start hybrid searching with `random > -12`"))
+print(fmt.format("Start hybrid searching with `random > 0.5`"))
 
 start_time = time.time()
-result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > -12", output_fields=["random"])
+result = hello_milvus.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"])
 end_time = time.time()
 
 for hits in result: