From 45bde9439fa498be978e1951279dfc9ccc32d524 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Fri, 20 Oct 2023 10:50:38 -0400 Subject: [PATCH 1/4] save --- .env | 4 ++ .../data/dataset_compute_signal_chain_test.py | 3 +- lilac/data/dataset_compute_signal_test.py | 3 +- lilac/data/dataset_labels_test.py | 3 +- lilac/data/dataset_map_test.py | 3 +- lilac/data/dataset_select_rows_udf_test.py | 7 ++-- lilac/data/dataset_test.py | 3 +- lilac/load_dataset_test.py | 3 +- lilac/schema.py | 2 +- lilac/signals/cluster_dbscan.py | 5 +-- lilac/signals/cluster_hdbscan.py | 5 +-- lilac/sources/csv_source.py | 2 +- lilac/sources/duckdb_utils.py | 26 ++++++++---- lilac/sources/json_source.py | 5 +-- lilac/sources/parquet_source.py | 41 ++++++++++++++----- lilac/sources/sqlite_source.py | 5 +-- 16 files changed, 70 insertions(+), 50 deletions(-) diff --git a/.env b/.env index dbe252117..cfa3ad74c 100644 --- a/.env +++ b/.env @@ -19,6 +19,10 @@ DUCKDB_USE_VIEWS=0 # GCS_REGION= # GCS_ACCESS_KEY= # GCS_SECRET_KEY= +# S3_REGION= +# S3_ENDPOINT= +# S3_ACCESS_KEY= +# S3_SECRET_KEY= # Get key from https://platform.openai.com/account/api-keys # OPENAI_API_KEY= diff --git a/lilac/data/dataset_compute_signal_chain_test.py b/lilac/data/dataset_compute_signal_chain_test.py index 44c3568d9..b292b441a 100644 --- a/lilac/data/dataset_compute_signal_chain_test.py +++ b/lilac/data/dataset_compute_signal_chain_test.py @@ -8,8 +8,6 @@ from pytest_mock import MockerFixture from typing_extensions import override -from lilac.sources.source_registry import clear_source_registry, register_source - from ..embeddings.vector_store import VectorDBIndex from ..schema import ( EMBEDDING_KEY, @@ -31,6 +29,7 @@ clear_signal_registry, register_signal, ) +from ..sources.source_registry import clear_source_registry, register_source from .dataset import DatasetManifest from .dataset_test_utils import ( TEST_DATASET_NAME, diff --git a/lilac/data/dataset_compute_signal_test.py b/lilac/data/dataset_compute_signal_test.py index 0f54d63e3..1d69bac69 100644 --- a/lilac/data/dataset_compute_signal_test.py +++ b/lilac/data/dataset_compute_signal_test.py @@ -7,8 +7,6 @@ from pytest_mock import MockerFixture from typing_extensions import override -from lilac.sources.source_registry import clear_source_registry, register_source - from ..concepts.concept import ExampleIn from ..concepts.db_concept import ConceptUpdate, DiskConceptDB from ..schema import ( @@ -30,6 +28,7 @@ register_signal, ) from ..signals.concept_scorer import ConceptSignal +from ..sources.source_registry import clear_source_registry, register_source from . 
import dataset_utils as dataset_utils_module from .dataset import Column, DatasetManifest, GroupsSortBy, SortOrder from .dataset_test_utils import ( diff --git a/lilac/data/dataset_labels_test.py b/lilac/data/dataset_labels_test.py index 0392383ac..fbee59232 100644 --- a/lilac/data/dataset_labels_test.py +++ b/lilac/data/dataset_labels_test.py @@ -7,9 +7,8 @@ from freezegun import freeze_time from pytest_mock import MockerFixture -from lilac.sources.source_registry import clear_source_registry, register_source - from ..schema import PATH_WILDCARD, ROWID, Item, field, schema +from ..sources.source_registry import clear_source_registry, register_source from .dataset import DatasetManifest, SelectGroupsResult, SortOrder from .dataset_test_utils import TestDataMaker, TestSource diff --git a/lilac/data/dataset_map_test.py b/lilac/data/dataset_map_test.py index 36767162f..3134e4901 100644 --- a/lilac/data/dataset_map_test.py +++ b/lilac/data/dataset_map_test.py @@ -8,10 +8,9 @@ from freezegun import freeze_time from typing_extensions import override -from lilac.sources.source_registry import clear_source_registry, register_source - from ..schema import PATH_WILDCARD, VALUE_KEY, Field, Item, MapInfo, RichData, field, schema from ..signal import TextSignal, clear_signal_registry, register_signal +from ..sources.source_registry import clear_source_registry, register_source from .dataset import DatasetManifest from .dataset_test_utils import ( TEST_DATASET_NAME, diff --git a/lilac/data/dataset_select_rows_udf_test.py b/lilac/data/dataset_select_rows_udf_test.py index 988768354..b752e2922 100644 --- a/lilac/data/dataset_select_rows_udf_test.py +++ b/lilac/data/dataset_select_rows_udf_test.py @@ -7,10 +7,8 @@ from pytest import approx from typing_extensions import override -from lilac.concepts.concept import ExampleIn -from lilac.concepts.db_concept import ConceptUpdate, DiskConceptDB -from lilac.signals.concept_scorer import ConceptSignal - +from ..concepts.concept import ExampleIn +from ..concepts.db_concept import ConceptUpdate, DiskConceptDB from ..embeddings.vector_store import VectorDBIndex from ..schema import ( ROWID, @@ -32,6 +30,7 @@ clear_signal_registry, register_signal, ) +from ..signals.concept_scorer import ConceptSignal from .dataset import BinaryFilterTuple, Column, SortOrder from .dataset_test_utils import TestDataMaker, enriched_item diff --git a/lilac/data/dataset_test.py b/lilac/data/dataset_test.py index 670c4e9c7..5ef3f4c57 100644 --- a/lilac/data/dataset_test.py +++ b/lilac/data/dataset_test.py @@ -6,11 +6,10 @@ import pytest from typing_extensions import override -from lilac.sources.source_registry import clear_source_registry, register_source - from ..config import DatasetConfig, EmbeddingConfig, SignalConfig from ..schema import EMBEDDING_KEY, ROWID, Field, Item, RichData, field, lilac_embedding, schema from ..signal import TextEmbeddingSignal, TextSignal, clear_signal_registry, register_signal +from ..sources.source_registry import clear_source_registry, register_source from .dataset import Column, DatasetManifest, dataset_config_from_manifest from .dataset_test_utils import ( TEST_DATASET_NAME, diff --git a/lilac/load_dataset_test.py b/lilac/load_dataset_test.py index 47ad5e37d..a495cb229 100644 --- a/lilac/load_dataset_test.py +++ b/lilac/load_dataset_test.py @@ -9,8 +9,6 @@ from pytest_mock import MockerFixture from typing_extensions import override -from lilac.sources.source_registry import clear_source_registry, register_source - from .config import Config, 
DatasetConfig, DatasetSettings, DatasetUISettings from .data.dataset import SourceManifest from .data.dataset_duckdb import read_source_manifest @@ -19,6 +17,7 @@ from .project import read_project_config from .schema import PARQUET_FILENAME_PREFIX, ROWID, Item, schema from .source import Source, SourceSchema +from .sources.source_registry import clear_source_registry, register_source from .test_utils import fake_uuid, read_items from .utils import DATASETS_DIR_NAME diff --git a/lilac/schema.py b/lilac/schema.py index 3c07f44d8..834af0685 100644 --- a/lilac/schema.py +++ b/lilac/schema.py @@ -20,7 +20,7 @@ ) from typing_extensions import TypedDict -from lilac.utils import is_primitive, log +from .utils import is_primitive, log MANIFEST_FILENAME = 'manifest.json' PARQUET_FILENAME_PREFIX = 'data' diff --git a/lilac/signals/cluster_dbscan.py b/lilac/signals/cluster_dbscan.py index 10d355189..17ecfb0d5 100644 --- a/lilac/signals/cluster_dbscan.py +++ b/lilac/signals/cluster_dbscan.py @@ -6,12 +6,11 @@ from sklearn.cluster import DBSCAN from typing_extensions import override -from lilac.embeddings.vector_store import VectorDBIndex -from lilac.utils import DebugTimer - from ..embeddings.embedding import get_embed_fn +from ..embeddings.vector_store import VectorDBIndex from ..schema import Field, Item, PathKey, RichData, SignalInputType, SpanVector, field, lilac_span from ..signal import VectorSignal +from ..utils import DebugTimer CLUSTER_ID = 'cluster_id' MIN_SAMPLES = 5 diff --git a/lilac/signals/cluster_hdbscan.py b/lilac/signals/cluster_hdbscan.py index a073f0e9e..ebfe2593e 100644 --- a/lilac/signals/cluster_hdbscan.py +++ b/lilac/signals/cluster_hdbscan.py @@ -7,12 +7,11 @@ from sklearn.cluster import HDBSCAN from typing_extensions import override -from lilac.embeddings.vector_store import VectorDBIndex -from lilac.utils import DebugTimer - from ..embeddings.embedding import get_embed_fn +from ..embeddings.vector_store import VectorDBIndex from ..schema import Field, Item, PathKey, RichData, SignalInputType, SpanVector, field, lilac_span from ..signal import VectorSignal +from ..utils import DebugTimer CLUSTER_ID = 'cluster_id' MIN_CLUSTER_SIZE = 5 diff --git a/lilac/sources/csv_source.py b/lilac/sources/csv_source.py index ffc121fc1..11e98b6ee 100644 --- a/lilac/sources/csv_source.py +++ b/lilac/sources/csv_source.py @@ -42,6 +42,7 @@ def setup(self) -> None: filepaths = download_http_files(self.filepaths) self._con = duckdb.connect(database=':memory:') + duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] @@ -49,7 +50,6 @@ def setup(self) -> None: # NOTE: We use duckdb here to increase parallelism for multiple files. # NOTE: We turn off the parallel reader because of https://github.com/lilacai/lilac/issues/373. self._con.execute(f""" - {duckdb_setup(self._con)} CREATE SEQUENCE serial START 1; CREATE VIEW t as (SELECT nextval('serial') as "{LINE_NUMBER_COLUMN}", * FROM read_csv_auto( {s3_filepaths}, diff --git a/lilac/sources/duckdb_utils.py b/lilac/sources/duckdb_utils.py index 2a869e078..9717df8ed 100644 --- a/lilac/sources/duckdb_utils.py +++ b/lilac/sources/duckdb_utils.py @@ -6,17 +6,25 @@ from ..env import env, get_project_dir -def duckdb_setup(con: duckdb.DuckDBPyConnection) -> str: +def duckdb_setup(con: duckdb.DuckDBPyConnection) -> None: """Setup DuckDB. 
This includes setting up the extensions directory and GCS access.""" con.execute(f""" SET extension_directory='{os.path.join(get_project_dir(), '.duckdb')}'; """) - if env('GCS_REGION'): - return f""" - SET s3_region='{env('GCS_REGION')}'; - SET s3_access_key_id='{env('GCS_ACCESS_KEY')}'; - SET s3_secret_access_key='{env('GCS_SECRET_KEY')}'; - SET s3_endpoint='storage.googleapis.com'; - """ - return '' + region = env('GCS_REGION') or env('S3_REGION') + if region: + con.execute(f"SET s3_region='{region}'") + + access_key = env('GCS_ACCESS_KEY') or env('S3_ACCESS_KEY') + if access_key: + con.execute(f"SET s3_access_key_id='{access_key}'") + + secret_key = env('GCS_SECRET_KEY') or env('S3_SECRET_KEY') + if secret_key: + con.execute(f"SET s3_secret_access_key='{secret_key}'") + + gcs_endpoint = 'storage.googleapis.com' + endpoint = env('S3_ENDPOINT') or (gcs_endpoint if env('GCS_REGION') else None) + if endpoint: + con.execute(f"SET s3_endpoint='{endpoint}'") diff --git a/lilac/sources/json_source.py index a046cf3c7..7a5d51184 100644 --- a/lilac/sources/json_source.py +++ b/lilac/sources/json_source.py @@ -36,15 +36,14 @@ class JSONSource(Source): def setup(self) -> None: # Download JSON files to local cache if they are via HTTP to speed up duckdb. filepaths = download_http_files(self.filepaths) - self._con = duckdb.connect(database=':memory:') + duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. self._con.execute(f""" - {duckdb_setup(self._con)} CREATE VIEW t as (SELECT * FROM read_json_auto( {s3_filepaths}, IGNORE_ERRORS=true @@ -62,7 +61,7 @@ def setup(self) -> None: @override def source_schema(self) -> SourceSchema: """Return the source schema.""" - assert self._source_schema is not None + assert self._source_schema is not None, 'setup() must be called first.' return self._source_schema @override diff --git a/lilac/sources/parquet_source.py index 7e91dde2d..c0c63b4bb 100644 --- a/lilac/sources/parquet_source.py +++ b/lilac/sources/parquet_source.py @@ -1,13 +1,15 @@ """Parquet source.""" -from typing import ClassVar, Iterable, Optional +from typing import ClassVar, Iterable, Optional, cast +import duckdb import pyarrow as pa -import pyarrow.parquet as pq from pydantic import Field from typing_extensions import override from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema +from ..sources.duckdb_utils import duckdb_setup +from ..utils import download_http_files class ParquetSource(Source): """Parquet data loader @@ -24,15 +26,28 @@ class ParquetSource(Source): 'A list of paths to parquet files which live locally or remotely on GCS, S3, or Hadoop.') _source_schema: Optional[SourceSchema] = None - _table: Optional[pa.Table] = None + _reader: Optional[pa.RecordBatchReader] = None + _con: Optional[duckdb.DuckDBPyConnection] = None @override def setup(self) -> None: - assert self.filepaths, 'filepaths must be specified.'
- self._table = pa.concat_tables([pq.read_table(f) for f in self.filepaths]) - self._source_schema = SourceSchema( - fields=arrow_schema_to_schema(pq.read_schema(self.filepaths[0])).fields, - num_items=self._table.num_rows) + filepaths = download_http_files(self.filepaths) + self._con = duckdb.connect(database=':memory:') + duckdb_setup(self._con) + + # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. + s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] + + # NOTE: We use duckdb here to increase parallelism for multiple files. + self._con.execute(f""" + CREATE VIEW t as (SELECT * FROM read_parquet({s3_filepaths})); + """) + res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() + num_items = cast(tuple[int], res)[0] + self._reader = self._con.execute('SELECT * from t').fetch_record_batch(rows_per_batch=10_000) + # Create the source schema in prepare to share it between process and source_schema. + schema = arrow_schema_to_schema(self._reader.schema) + self._source_schema = SourceSchema(fields=schema.fields, num_items=num_items) @override def source_schema(self) -> SourceSchema: @@ -43,6 +58,10 @@ def source_schema(self) -> SourceSchema: @override def process(self) -> Iterable[Item]: """Process the source.""" - assert self._table is not None, 'setup() must be called first.' - for row in self._table.to_pylist(): - yield row + assert self._reader and self._con, 'setup() must be called first.' + + for batch in self._reader: + yield from batch.to_pylist() + + self._reader.close() + self._con.close() diff --git a/lilac/sources/sqlite_source.py b/lilac/sources/sqlite_source.py index 1670cce83..9503c1591 100644 --- a/lilac/sources/sqlite_source.py +++ b/lilac/sources/sqlite_source.py @@ -10,10 +10,9 @@ from pydantic import Field from typing_extensions import override -from lilac.utils import file_exists - from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema +from ..utils import file_exists from .duckdb_utils import duckdb_setup router = APIRouter() @@ -48,12 +47,12 @@ class SQLiteSource(Source): @override def setup(self) -> None: self._con = duckdb.connect(database=':memory:') + duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. db_file = self.db_file.replace('gs://', 's3://') self._con.execute(f""" - {duckdb_setup(self._con)} CREATE VIEW t as (SELECT * FROM sqlite_scan('{db_file}', '{self.table}')); """) From 44eef5dfef2a2257047b7531e8a6b43028fefee4 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Fri, 20 Oct 2023 10:53:38 -0400 Subject: [PATCH 2/4] save --- pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/pytest.ini b/pytest.ini index 3a5d5aeb2..7de155e6c 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,6 +11,7 @@ filterwarnings = ignore::DeprecationWarning:tornado.*: ignore::DeprecationWarning:pkg_resources.*: ignore::DeprecationWarning:google.rpc.*: + ignore::DeprecationWarning:scipy.*: markers = largedownload: Marks a test as having a large download. Wont run on github. 
(deselect with '-m "not largedownload"') asyncio_mode = auto From 3d570ea4595b106de6742f3dc94a7aab9d84a778 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Fri, 20 Oct 2023 11:15:36 -0400 Subject: [PATCH 3/4] save --- lilac/sources/parquet_source.py | 23 ++++++++++++++++++-- lilac/sources/parquet_source_test.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/lilac/sources/parquet_source.py b/lilac/sources/parquet_source.py index c0c63b4bb..fe7867784 100644 --- a/lilac/sources/parquet_source.py +++ b/lilac/sources/parquet_source.py @@ -3,7 +3,7 @@ import duckdb import pyarrow as pa -from pydantic import Field +from pydantic import Field, field_validator from typing_extensions import override from ..schema import Item, arrow_schema_to_schema @@ -24,11 +24,29 @@ class ParquetSource(Source): filepaths: list[str] = Field( description= 'A list of paths to parquet files which live locally or remotely on GCS, S3, or Hadoop.') + sample_size: Optional[int] = Field( + title='Sample size', description='Number of rows to sample from the dataset', default=None) _source_schema: Optional[SourceSchema] = None _reader: Optional[pa.RecordBatchReader] = None _con: Optional[duckdb.DuckDBPyConnection] = None + @field_validator('filepaths') + @classmethod + def validate_filepaths(cls, filepaths: list[str]) -> list[str]: + """Validate filepaths.""" + if not filepaths: + raise ValueError('filepaths must be non-empty.') + return filepaths + + @field_validator('sample_size') + @classmethod + def validate_sample_size(cls, sample_size: int) -> int: + """Validate sample size.""" + if sample_size < 1: + raise ValueError('sample_size must be greater than 0.') + return sample_size + @override def setup(self) -> None: filepaths = download_http_files(self.filepaths) @@ -39,8 +57,9 @@ def setup(self) -> None: s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. 
+ sample_suffix = f'USING SAMPLE {self.sample_size}' if self.sample_size else '' self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM read_parquet({s3_filepaths})); + CREATE VIEW t as (SELECT * FROM read_parquet({s3_filepaths}) {sample_suffix}); """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() num_items = cast(tuple[int], res)[0] diff --git a/lilac/sources/parquet_source_test.py b/lilac/sources/parquet_source_test.py index 62835f7b9..ef5cdebd8 100644 --- a/lilac/sources/parquet_source_test.py +++ b/lilac/sources/parquet_source_test.py @@ -5,6 +5,8 @@ import pyarrow as pa import pyarrow.parquet as pq +import pytest +from pydantic import ValidationError from ..schema import schema from ..source import SourceSchema @@ -37,3 +39,33 @@ def test_simple_rows(tmp_path: pathlib.Path) -> None: items = list(source.process()) assert items == [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + + +def test_sampling(tmp_path: pathlib.Path) -> None: + table = pa.Table.from_pylist([{ + 'name': 'a', + 'age': 1 + }, { + 'name': 'b', + 'age': 2 + }, { + 'name': 'c', + 'age': 3 + }]) + + out_file = os.path.join(tmp_path, 'test.parquet') + pq.write_table(table, out_file) + + for sample_size in range(1, 4): + source = ParquetSource(filepaths=[out_file], sample_size=sample_size) + source.setup() + items = list(source.process()) + assert len(items) == sample_size + + +def test_validation() -> None: + with pytest.raises(ValidationError, match='filepaths must be non-empty'): + ParquetSource(filepaths=[]) + + with pytest.raises(ValidationError, match='sample_size must be greater than 0'): + ParquetSource(filepaths=['gs://lilac/test.parquet'], sample_size=0) From ebcbe931c35e14dd50db7af2b634f67dbf4328a3 Mon Sep 17 00:00:00 2001 From: Daniel Smilkov Date: Fri, 20 Oct 2023 13:27:40 -0400 Subject: [PATCH 4/4] save --- .env | 7 ------- lilac/env.py | 10 ++++++++++ lilac/sources/parquet_source.py | 6 +++++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.env b/.env index cfa3ad74c..e252c665d 100644 --- a/.env +++ b/.env @@ -16,13 +16,6 @@ DUCKDB_USE_VIEWS=0 # Get key from https://dashboard.cohere.ai/api-keys # COHERE_API_KEY= -# GCS_REGION= -# GCS_ACCESS_KEY= -# GCS_SECRET_KEY= -# S3_REGION= -# S3_ENDPOINT= -# S3_ACCESS_KEY= -# S3_SECRET_KEY= # Get key from https://platform.openai.com/account/api-keys # OPENAI_API_KEY= diff --git a/lilac/env.py b/lilac/env.py index 67f031908..f6b16627e 100644 --- a/lilac/env.py +++ b/lilac/env.py @@ -88,6 +88,16 @@ class LilacEnvironment(BaseModel): LILAC_LOAD_ON_START_SERVER: str = PydanticField( description='When true, will load from lilac.yml upon startup.') + GCS_REGION: str = PydanticField(description='The GCS region for GCS operations.') + GCS_ACCESS_KEY: str = PydanticField(description='The GCS access key for GCS operations.') + GCS_SECRET_KEY: str = PydanticField(description='The GCS secret key for GCS operations.') + + S3_REGION: str = PydanticField(description='The S3 region for S3 operations.') + S3_ACCESS_KEY: str = PydanticField(description='The S3 access key for S3 operations.') + S3_SECRET_KEY: str = PydanticField(description='The S3 secret key for S3 operations.') + S3_ENDPOINT: str = PydanticField( + description='The S3 endpoint URL for S3-like operations, including GCS and Azure.') + def _init_env() -> None: in_test = os.environ.get('LILAC_TEST', None) diff --git a/lilac/sources/parquet_source.py b/lilac/sources/parquet_source.py index fe7867784..d523e18a1 100644 --- a/lilac/sources/parquet_source.py +++ 
b/lilac/sources/parquet_source.py @@ -11,6 +11,9 @@ from ..sources.duckdb_utils import duckdb_setup from ..utils import download_http_files +# Number of rows to read per batch. +ROWS_PER_BATCH_READ = 10_000 + class ParquetSource(Source): """Parquet data loader @@ -63,7 +66,8 @@ def setup(self) -> None: """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() num_items = cast(tuple[int], res)[0] - self._reader = self._con.execute('SELECT * from t').fetch_record_batch(rows_per_batch=10_000) + self._reader = self._con.execute('SELECT * from t').fetch_record_batch( + rows_per_batch=ROWS_PER_BATCH_READ) # Create the source schema in prepare to share it between process and source_schema. schema = arrow_schema_to_schema(self._reader.schema) self._source_schema = SourceSchema(fields=schema.fields, num_items=num_items)
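
A minimal usage sketch of what this series enables, end to end: the bucket path, endpoint, and credential values are placeholders, and it assumes the S3_*/GCS_* variables are exported (or set in .env) before setup() runs so that duckdb_setup() can pick them up via env(). It illustrates the intended flow rather than a recorded run.

  import os

  from lilac.sources.parquet_source import ParquetSource

  # Hypothetical endpoint and credentials; any S3-compatible store (AWS, GCS, Azure) works.
  os.environ['S3_ENDPOINT'] = 's3.us-east-1.amazonaws.com'
  os.environ['S3_REGION'] = 'us-east-1'
  os.environ['S3_ACCESS_KEY'] = '<access key>'
  os.environ['S3_SECRET_KEY'] = '<secret key>'

  # setup() opens an in-memory DuckDB connection, calls duckdb_setup() to issue the
  # SET s3_* statements from the env vars above, rewrites gs:// paths to s3://, and
  # creates a sampled view over the parquet files.
  source = ParquetSource(filepaths=['s3://my-bucket/data.parquet'], sample_size=1_000)
  source.setup()

  print(source.source_schema().num_items)  # at most 1_000, because of USING SAMPLE
  for item in source.process():
    # Rows stream in record batches of ROWS_PER_BATCH_READ (10k) rather than one
    # materialized table; each item is a plain dict keyed by column name.
    print(item)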