zilliztech · alwayslove2013 · May 15, 2024 · May 3, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,13 +59,14 @@ all = [
     "redis",
     "chromadb",
     "psycopg2",
+    "psycopg",
 ]
 
 qdrant = [ "qdrant-client" ]
 pinecone = [ "pinecone-client" ]
 weaviate = [ "weaviate-client" ]
 elastic = [ "elasticsearch" ]
-pgvector = [ "pgvector", "psycopg2" ]
+pgvector = [ "pgvector", "psycopg" ]
 pgvecto_rs = [ "psycopg2" ]
 redis = [ "redis" ]
 chromadb = [ "chromadb" ]

diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py
@@ -20,6 +20,7 @@ class IndexType(str, Enum):
     Flat = "FLAT"
     AUTOINDEX = "AUTOINDEX"
     ES_HNSW = "hnsw"
+    ES_IVFFlat = "ivfflat"
     GPU_IVF_FLAT = "GPU_IVF_FLAT"
     GPU_IVF_PQ = "GPU_IVF_PQ"
     GPU_CAGRA = "GPU_CAGRA"

diff --git a/vectordb_bench/backend/clients/pgvector/config.py b/vectordb_bench/backend/clients/pgvector/config.py
@@ -1,29 +1,62 @@
+from abc import abstractmethod
+from typing import Any, Mapping, Optional, Sequence, TypedDict
 from pydantic import BaseModel, SecretStr
-from ..api import DBConfig, DBCaseConfig, IndexType, MetricType
+from typing_extensions import LiteralString
+from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
 
 POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
 
+
+class PgVectorConfigDict(TypedDict):
+    """These keys will be directly used as kwargs in psycopg connection string,
+        so the names must match exactly psycopg API"""
+
+    user: str
+    password: str
+    host: str
+    port: int
+    dbname: str
+
+
 class PgVectorConfig(DBConfig):
-    user_name: SecretStr = "postgres"
+    user_name: SecretStr = SecretStr("postgres")
     password: SecretStr
     host: str = "localhost"
     port: int = 5432
     db_name: str
 
-    def to_dict(self) -> dict:
+    def to_dict(self) -> PgVectorConfigDict:
         user_str = self.user_name.get_secret_value()
         pwd_str = self.password.get_secret_value()
         return {
-            "host" : self.host,
-            "port" : self.port,
-            "dbname" : self.db_name,
-            "user" : user_str,
-            "password" : pwd_str
+            "host": self.host,
+            "port": self.port,
+            "dbname": self.db_name,
+            "user": user_str,
+            "password": pwd_str,
         }
 
+
+class PgVectorIndexParam(TypedDict):
+    metric: str
+    index_type: str
+    index_creation_with_options: Sequence[dict[str, Any]]
+    maintenance_work_mem: Optional[str]
+    max_parallel_workers: Optional[int]
+
+
+class PgVectorSearchParam(TypedDict):
+    metric_fun_op: LiteralString
+
+
+class PgVectorSessionCommands(TypedDict):
+    session_options: Sequence[dict[str, Any]]
+
+
 class PgVectorIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
-    index: IndexType
+    create_index_before_load: bool = False
+    create_index_after_load: bool = True
 
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
@@ -32,7 +65,7 @@ def parse_metric(self) -> str:
             return "vector_ip_ops"
         return "vector_cosine_ops"
 
-    def parse_metric_fun_op(self) -> str:
+    def parse_metric_fun_op(self) -> LiteralString:
         if self.metric_type == MetricType.L2:
             return "<->"
         elif self.metric_type == MetricType.IP:
@@ -46,48 +79,137 @@ def parse_metric_fun_str(self) -> str:
             return "max_inner_product"
         return "cosine_distance"
 
+    @abstractmethod
+    def index_param(self) -> PgVectorIndexParam:
+        ...
+
+    @abstractmethod
+    def search_param(self) -> PgVectorSearchParam:
+        ...
+
+    @abstractmethod
+    def session_param(self) -> PgVectorSessionCommands:
+        ...
+
+    @staticmethod
+    def _optionally_build_with_options(with_options: Mapping[str, Any]) -> Sequence[dict[str, Any]]:
+        """Walk through mappings, creating a List of {key1 = value} pairs. That will be used to build a where clause"""
+        options = []
+        for option_name, value in with_options.items():
+            if value is not None:
+                options.append(
+                    {
+                        "option_name": option_name,
+                        "val": str(value),
+                    }
+                )
+        return options
+
+    @staticmethod
+    def _optionally_build_set_options(
+        set_mapping: Mapping[str, Any]
+    ) -> Sequence[dict[str, Any]]:
+        """Walk through options, creating 'SET 'key1 = "value1";' commands"""
+        session_options = []
+        for setting_name, value in set_mapping.items():
+            if value:
+                session_options.append(
+                    {"parameter": {
+                            "setting_name": setting_name,
+                            "val": str(value),
+                        },
+                    }
+                )
+        return session_options
+
+
+class PgVectorIVFFlatConfig(PgVectorIndexConfig):
+    """
+    An IVFFlat index divides vectors into lists, and then searches a subset of those lists that are
+    closest to the query vector. It has faster build times and uses less memory than HNSW,
+    but has lower query performance (in terms of speed-recall tradeoff).
+
+    Three keys to achieving good recall are:
+
+    Create the index after the table has some data
+    Choose an appropriate number of lists - a good place to start is rows / 1000 for up to 1M rows and sqrt(rows) for
+    over 1M rows.
+    When querying, specify an appropriate number of probes (higher is better for recall, lower is better for speed) -
+    a good place to start is sqrt(lists)
+    """
+
+    lists: int | None
+    probes: int | None
+    index: IndexType = IndexType.ES_IVFFlat
+    maintenance_work_mem: Optional[str] = None
+    max_parallel_workers: Optional[int] = None
+
+    def index_param(self) -> PgVectorIndexParam:
+        index_parameters = {"lists": self.lists}
+        return {
+            "metric": self.parse_metric(),
+            "index_type": self.index.value,
+            "index_creation_with_options": self._optionally_build_with_options(
+                index_parameters
+            ),
+            "maintenance_work_mem": self.maintenance_work_mem,
+            "max_parallel_workers": self.max_parallel_workers,
+        }
 
-
-class HNSWConfig(PgVectorIndexConfig):
-    M: int
-    efConstruction: int
-    ef: int | None = None
-    index: IndexType = IndexType.HNSW
-
-    def index_param(self) -> dict:
+    def search_param(self) -> PgVectorSearchParam:
         return {
-            "m" : self.M,
-            "ef_construction" : self.efConstruction,
-            "metric" : self.parse_metric()
+            "metric_fun_op": self.parse_metric_fun_op(),
         }
 
-    def search_param(self) -> dict:
+    def session_param(self) -> PgVectorSessionCommands:
+        session_parameters = {"ivfflat.probes": self.probes}
         return {
-            "ef" : self.ef,
-            "metric_fun" : self.parse_metric_fun_str(),
-            "metric_fun_op" : self.parse_metric_fun_op(),
+            "session_options": self._optionally_build_set_options(session_parameters)
         }
 
 
-class IVFFlatConfig(PgVectorIndexConfig):
-    lists: int | None = 1000
-    probes: int | None = 10
-    index: IndexType = IndexType.IVFFlat
+class PgVectorHNSWConfig(PgVectorIndexConfig):
+    """
+    An HNSW index creates a multilayer graph. It has better query performance than IVFFlat (in terms of
+    speed-recall tradeoff), but has slower build times and uses more memory. Also, an index can be
+    created without any data in the table since there isn't a training step like IVFFlat.
+    """
+
+    m: int | None  # DETAIL:  Valid values are between "2" and "100".
+    ef_construction: (
+        int | None
+    )  # ef_construction must be greater than or equal to 2 * m
+    ef_search: int | None
+    index: IndexType = IndexType.ES_HNSW
+    maintenance_work_mem: Optional[str] = None
+    max_parallel_workers: Optional[int] = None
+
+    def index_param(self) -> PgVectorIndexParam:
+        index_parameters = {"m": self.m, "ef_construction": self.ef_construction}
+        return {
+            "metric": self.parse_metric(),
+            "index_type": self.index.value,
+            "index_creation_with_options": self._optionally_build_with_options(
+                index_parameters
+            ),
+            "maintenance_work_mem": self.maintenance_work_mem,
+            "max_parallel_workers": self.max_parallel_workers,
+        }
 
-    def index_param(self) -> dict:
+    def search_param(self) -> PgVectorSearchParam:
         return {
-            "lists" : self.lists,
-            "metric" : self.parse_metric()
+            "metric_fun_op": self.parse_metric_fun_op(),
         }
 
-    def search_param(self) -> dict:
+    def session_param(self) -> PgVectorSessionCommands:
+        session_parameters = {"hnsw.ef_search": self.ef_search}
         return {
-            "probes" : self.probes,
-            "metric_fun" : self.parse_metric_fun_str(),
-            "metric_fun_op" : self.parse_metric_fun_op(),
+            "session_options": self._optionally_build_set_options(session_parameters)
         }
 
+
 _pgvector_case_config = {
-    IndexType.HNSW: HNSWConfig,
-    IndexType.IVFFlat: IVFFlatConfig,
+        IndexType.HNSW: PgVectorHNSWConfig,
+        IndexType.ES_HNSW: PgVectorHNSWConfig,
+        IndexType.IVFFlat: PgVectorIVFFlatConfig,
 }