Skip to content

Commit

Permalink
Merge #974
Browse files Browse the repository at this point in the history
974: Allow passing a custom serializer for documents r=sanders41 a=sanders41

# Pull Request

`@LaundroMat` FYI in case you want to test this to see if it solves your issue.

## Related issue
Fixes #973

## What does this PR do?
- Allows passing a custom JSONEncoder to serialize documents with types that the default encoder can't handle.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Paul Sanders <paul@paulsanders.dev>
  • Loading branch information
meili-bors[bot] and sanders41 authored Jun 10, 2024
2 parents b3d914e + 7f7d066 commit 739a22e
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 18 deletions.
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,34 @@ index.update_filterable_attributes([
])
```

#### Custom Serializer for documents <!-- omit in toc -->

If your documents contain fields that the default Python JSON serializer does not know how to
handle, you can use your own custom serializer.

```py
from datetime import datetime
from json import JSONEncoder
from uuid import UUID, uuid4  # UUID is needed for the isinstance check below


class CustomEncoder(JSONEncoder):
    """JSON encoder that also knows how to serialize UUID and datetime values."""

    def default(self, o):
        # Represent UUIDs and datetimes as their string form.
        if isinstance(o, (UUID, datetime)):
            return str(o)

        # Let the base class default method raise the TypeError
        return super().default(o)


documents = [
    {"id": uuid4(), "title": "test 1", "when": datetime.now()},
    {"id": uuid4(), "title": "Test 2", "when": datetime.now()},
]
# `client` is a meilisearch.Client instance created earlier.
index = client.index("movies")
index.add_documents(documents, serializer=CustomEncoder)
```

You only need to perform this operation once.

Note that Meilisearch will rebuild your index whenever you update `filterableAttributes`. Depending on the size of your dataset, this might take time. You can track the process using the [task](https://www.meilisearch.com/docs/reference/api/tasks#get-tasks).
Expand Down Expand Up @@ -205,7 +233,6 @@ index.search(

This package guarantees compatibility with [version v1.x of Meilisearch](https://github.com/meilisearch/meilisearch/releases/latest), but some features may not be present. Please check the [issues](https://github.com/meilisearch/meilisearch-python/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22+label%3Aenhancement) for more info.


## 💡 Learn more

The following sections in our main documentation website may interest you:
Expand Down
19 changes: 12 additions & 7 deletions meilisearch/_httprequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
from functools import lru_cache
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Type, Union

import requests

Expand Down Expand Up @@ -39,6 +39,8 @@ def send_request(
]
] = None,
content_type: Optional[str] = None,
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
if content_type:
self.headers["Content-Type"] = content_type
Expand All @@ -58,11 +60,10 @@ def send_request(
data=body,
)
else:
data = json.dumps(body, cls=serializer) if body else "" if body == "" else "null"

request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=json.dumps(body) if body else "" if body == "" else "null",
request_path, timeout=self.config.timeout, headers=self.headers, data=data
)
return self.__validate(request)

Expand All @@ -81,8 +82,10 @@ def post(
Union[Mapping[str, Any], Sequence[Mapping[str, Any]], List[str], str]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.post, path, body, content_type)
return self.send_request(requests.post, path, body, content_type, serializer=serializer)

def patch(
self,
Expand All @@ -108,8 +111,10 @@ def put(
]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.put, path, body, content_type)
return self.send_request(requests.put, path, body, content_type, serializer=serializer)

def delete(
self,
Expand Down
80 changes: 70 additions & 10 deletions meilisearch/index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Sequence, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Type,
Union,
)
from urllib import parse
from warnings import warn

Expand All @@ -26,6 +38,9 @@
from meilisearch.models.task import Task, TaskInfo, TaskResults
from meilisearch.task import TaskHandler

if TYPE_CHECKING:
from json import JSONEncoder


# pylint: disable=too-many-public-methods, too-many-lines
class Index:
Expand Down Expand Up @@ -403,6 +418,8 @@ def add_documents(
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add documents to the index.
Expand All @@ -412,6 +429,9 @@ def add_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -425,14 +445,16 @@ def add_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
add_document_task = self.http.post(url, documents)
add_document_task = self.http.post(url, documents, serializer=serializer)
return TaskInfo(**add_document_task)

def add_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Add documents to the index in batches.
Expand All @@ -444,6 +466,9 @@ def add_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -461,7 +486,7 @@ def add_documents_in_batches(
tasks: List[TaskInfo] = []

for document_batch in self._batch(documents, batch_size):
task = self.add_documents(document_batch, primary_key)
task = self.add_documents(document_batch, primary_key, serializer=serializer)
tasks.append(task)

return tasks
Expand All @@ -470,6 +495,8 @@ def add_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents from JSON file to the index.
Expand All @@ -479,6 +506,9 @@ def add_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -491,7 +521,9 @@ def add_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.add_documents_raw(str_documents, primary_key, "application/json")
return self.add_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def add_documents_csv(
self,
Expand Down Expand Up @@ -556,6 +588,8 @@ def add_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents to the index.
Expand All @@ -570,6 +604,9 @@ def add_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -583,11 +620,15 @@ def add_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.post(url, str_documents, content_type)
response = self.http.post(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents(
self, documents: Sequence[Mapping[str, Any]], primary_key: Optional[str] = None
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents in the index.
Expand All @@ -597,6 +638,9 @@ def update_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -610,7 +654,7 @@ def update_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
response = self.http.put(url, documents)
response = self.http.put(url, documents, serializer=serializer)
return TaskInfo(**response)

def update_documents_ndjson(
Expand Down Expand Up @@ -644,6 +688,8 @@ def update_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a json string in the index.
Expand All @@ -653,6 +699,9 @@ def update_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -665,7 +714,9 @@ def update_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.update_documents_raw(str_documents, primary_key, "application/json")
return self.update_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def update_documents_csv(
self,
Expand Down Expand Up @@ -703,6 +754,8 @@ def update_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a string in the index.
Expand All @@ -717,6 +770,9 @@ def update_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -730,14 +786,15 @@ def update_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.put(url, str_documents, content_type)
response = self.http.put(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Update documents to the index in batches.
Expand All @@ -749,6 +806,9 @@ def update_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -766,7 +826,7 @@ def update_documents_in_batches(
tasks = []

for document_batch in self._batch(documents, batch_size):
update_task = self.update_documents(document_batch, primary_key)
update_task = self.update_documents(document_batch, primary_key, serializer=serializer)
tasks.append(update_task)

return tasks
Expand Down
Loading

0 comments on commit 739a22e

Please sign in to comment.