Skip to content

Commit

Permalink
Merge #974
Browse files Browse the repository at this point in the history
974: Allow passing a custom serializer for documents r=sanders41 a=sanders41

# Pull Request

`@LaundroMat` FYI in case you want to test this to see if it solves your issue.

## Related issue
Fixes #973

## What does this PR do?
- Allows passing a custom JSONEncoder to serialize documents with types that the default encoder can't handle.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Paul Sanders <paul@paulsanders.dev>
  • Loading branch information
meili-bors[bot] and sanders41 authored Jun 10, 2024
2 parents b3d914e + 7f7d066 commit 739a22e
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 18 deletions.
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,34 @@ index.update_filterable_attributes([
])
```

#### Custom Serializer for documents <!-- omit in toc -->

If your documents contain fields that the default Python JSON serializer does not know how to
handle, you can use your own custom serializer.

```py
from datetime import datetime
from json import JSONEncoder
from uuid import UUID, uuid4  # UUID is needed for the isinstance check below


class CustomEncoder(JSONEncoder):
    """JSON encoder that also knows how to serialize UUID and datetime values."""

    def default(self, o):
        # Represent UUIDs and datetimes as their string form.
        if isinstance(o, (UUID, datetime)):
            return str(o)

        # Let the base class default method raise the TypeError
        return super().default(o)


documents = [
    {"id": uuid4(), "title": "test 1", "when": datetime.now()},
    {"id": uuid4(), "title": "Test 2", "when": datetime.now()},
]
# `client` is a meilisearch.Client instance created earlier.
index = client.index("movies")
index.add_documents(documents, serializer=CustomEncoder)
```

You only need to perform this operation once.

Note that Meilisearch will rebuild your index whenever you update `filterableAttributes`. Depending on the size of your dataset, this might take time. You can track the process using the [task](https://www.meilisearch.com/docs/reference/api/tasks#get-tasks).
Expand Down Expand Up @@ -205,7 +233,6 @@ index.search(

This package guarantees compatibility with [version v1.x of Meilisearch](https://github.com/meilisearch/meilisearch/releases/latest), but some features may not be present. Please check the [issues](https://github.com/meilisearch/meilisearch-python/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22+label%3Aenhancement) for more info.


## 💡 Learn more

The following sections in our main documentation website may interest you:
Expand Down
19 changes: 12 additions & 7 deletions meilisearch/_httprequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import json
from functools import lru_cache
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union
from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Type, Union

import requests

Expand Down Expand Up @@ -39,6 +39,8 @@ def send_request(
]
] = None,
content_type: Optional[str] = None,
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
if content_type:
self.headers["Content-Type"] = content_type
Expand All @@ -58,11 +60,10 @@ def send_request(
data=body,
)
else:
data = json.dumps(body, cls=serializer) if body else "" if body == "" else "null"

request = http_method(
request_path,
timeout=self.config.timeout,
headers=self.headers,
data=json.dumps(body) if body else "" if body == "" else "null",
request_path, timeout=self.config.timeout, headers=self.headers, data=data
)
return self.__validate(request)

Expand All @@ -81,8 +82,10 @@ def post(
Union[Mapping[str, Any], Sequence[Mapping[str, Any]], List[str], str]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.post, path, body, content_type)
return self.send_request(requests.post, path, body, content_type, serializer=serializer)

def patch(
self,
Expand All @@ -108,8 +111,10 @@ def put(
]
] = None,
content_type: Optional[str] = "application/json",
*,
serializer: Optional[Type[json.JSONEncoder]] = None,
) -> Any:
return self.send_request(requests.put, path, body, content_type)
return self.send_request(requests.put, path, body, content_type, serializer=serializer)

def delete(
self,
Expand Down
80 changes: 70 additions & 10 deletions meilisearch/index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Sequence, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
Type,
Union,
)
from urllib import parse
from warnings import warn

Expand All @@ -26,6 +38,9 @@
from meilisearch.models.task import Task, TaskInfo, TaskResults
from meilisearch.task import TaskHandler

if TYPE_CHECKING:
from json import JSONEncoder


# pylint: disable=too-many-public-methods, too-many-lines
class Index:
Expand Down Expand Up @@ -403,6 +418,8 @@ def add_documents(
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add documents to the index.
Expand All @@ -412,6 +429,9 @@ def add_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -425,14 +445,16 @@ def add_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
add_document_task = self.http.post(url, documents)
add_document_task = self.http.post(url, documents, serializer=serializer)
return TaskInfo(**add_document_task)

def add_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Add documents to the index in batches.
Expand All @@ -444,6 +466,9 @@ def add_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -461,7 +486,7 @@ def add_documents_in_batches(
tasks: List[TaskInfo] = []

for document_batch in self._batch(documents, batch_size):
task = self.add_documents(document_batch, primary_key)
task = self.add_documents(document_batch, primary_key, serializer=serializer)
tasks.append(task)

return tasks
Expand All @@ -470,6 +495,8 @@ def add_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents from JSON file to the index.
Expand All @@ -479,6 +506,9 @@ def add_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -491,7 +521,9 @@ def add_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.add_documents_raw(str_documents, primary_key, "application/json")
return self.add_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def add_documents_csv(
self,
Expand Down Expand Up @@ -556,6 +588,8 @@ def add_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Add string documents to the index.
Expand All @@ -570,6 +604,9 @@ def add_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -583,11 +620,15 @@ def add_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.post(url, str_documents, content_type)
response = self.http.post(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents(
self, documents: Sequence[Mapping[str, Any]], primary_key: Optional[str] = None
self,
documents: Sequence[Mapping[str, Any]],
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents in the index.
Expand All @@ -597,6 +638,9 @@ def update_documents(
List of documents. Each document should be a dictionary.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -610,7 +654,7 @@ def update_documents(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key)
response = self.http.put(url, documents)
response = self.http.put(url, documents, serializer=serializer)
return TaskInfo(**response)

def update_documents_ndjson(
Expand Down Expand Up @@ -644,6 +688,8 @@ def update_documents_json(
self,
str_documents: str,
primary_key: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a json string in the index.
Expand All @@ -653,6 +699,9 @@ def update_documents_json(
String of document from a JSON file.
primary_key (optional):
The primary-key used in index. Ignored if already set up
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -665,7 +714,9 @@ def update_documents_json(
MeilisearchApiError
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
return self.update_documents_raw(str_documents, primary_key, "application/json")
return self.update_documents_raw(
str_documents, primary_key, "application/json", serializer=serializer
)

def update_documents_csv(
self,
Expand Down Expand Up @@ -703,6 +754,8 @@ def update_documents_raw(
primary_key: Optional[str] = None,
content_type: Optional[str] = None,
csv_delimiter: Optional[str] = None,
*,
serializer: Optional[Type[JSONEncoder]] = None,
) -> TaskInfo:
"""Update documents as a string in the index.
Expand All @@ -717,6 +770,9 @@ def update_documents_raw(
csv_delimiter:
One ASCII character used to customize the delimiter for CSV.
Note: The csv delimiter can only be used with the Content-Type text/csv.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -730,14 +786,15 @@ def update_documents_raw(
An error containing details about why Meilisearch can't process your request. Meilisearch error codes are described here: https://www.meilisearch.com/docs/reference/errors/error_codes#meilisearch-errors
"""
url = self._build_url(primary_key=primary_key, csv_delimiter=csv_delimiter)
response = self.http.put(url, str_documents, content_type)
response = self.http.put(url, str_documents, content_type, serializer=serializer)
return TaskInfo(**response)

def update_documents_in_batches(
self,
documents: Sequence[Mapping[str, Any]],
batch_size: int = 1000,
primary_key: Optional[str] = None,
serializer: Optional[Type[JSONEncoder]] = None,
) -> List[TaskInfo]:
"""Update documents to the index in batches.
Expand All @@ -749,6 +806,9 @@ def update_documents_in_batches(
The number of documents that should be included in each batch. Default = 1000
primary_key (optional):
The primary-key used in index. Ignored if already set up.
serializer (optional):
A custom JSONEncoder to handle serializing fields that the built-in json.dumps
cannot handle, for example UUID and datetime.
Returns
-------
Expand All @@ -766,7 +826,7 @@ def update_documents_in_batches(
tasks = []

for document_batch in self._batch(documents, batch_size):
update_task = self.update_documents(document_batch, primary_key)
update_task = self.update_documents(document_batch, primary_key, serializer=serializer)
tasks.append(update_task)

return tasks
Expand Down
Loading

0 comments on commit 739a22e

Please sign in to comment.