diff --git a/docs/datasets/dataset_load.md b/docs/datasets/dataset_load.md
index 9769d1e44..8c5b4344e 100644
--- a/docs/datasets/dataset_load.md
+++ b/docs/datasets/dataset_load.md
@@ -63,6 +63,117 @@ You will be redirected to the dataset view once your data is loaded.

 ## From Python

+### Creating a dataset
+
+You can create a dataset from Python using [](#lilac.create_dataset). Lilac supports a variety of
+data sources, including CSV, JSON, HuggingFace datasets, Parquet, Pandas and more. See
+[](#lilac.sources) for details on available sources. All the file-based readers support reading
+from local files, S3 (`s3://...`), GCS (`gs://...`) and HTTP(S) URLs.
+
+Before we load any dataset, we should set the project directory, which will be used to store all
+the datasets we import. If not set, it defaults to the current working directory.
+
+```python
+import lilac as ll
+ll.set_project_dir('~/my_project')
+```
+
+#### HuggingFace
+
+You can load any HuggingFace dataset by passing the dataset name and config name. We use the HF
+dataset loader, which will fetch and cache the dataset in your HF cache dir. Then Lilac will convert
+that to our internal format and store it in the Lilac project dir. To read private datasets, either
+log in via the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/quick-start#login) or
+provide a `token` to the `HuggingFaceSource`.
+
+```python
+config = ll.DatasetConfig(
+  namespace='local',
+  name='glue',
+  source=ll.HuggingFaceSource(dataset_name='glue', config_name='ax'))
+# NOTE: You can pass a `project_dir` to `create_dataset` as the second argument.
+dataset = ll.create_dataset(config)
+```
+
+#### CSV
+
+The CSV reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can use
+a glob pattern to load multiple files.
+
+```python
+url = 'https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv'
+config = ll.DatasetConfig(
+  namespace='local', name='the_movies_dataset', source=ll.CSVSource(filepaths=[url]))
+dataset = ll.create_dataset(config)
+```
+
+#### Parquet
+
+The Parquet reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can
+use a glob pattern to load multiple files.
+
+**Sampling**
+
+The `ParquetSource` takes a few optional arguments related to sampling:
+
+- `sample_size`, the number of rows to sample.
+- `approximate_shuffle`, defaulting to `False`. When `False`, we take an entire pass over the
+  dataset with reservoir sampling. When `True`, we read a fraction of rows from the start of each
+  shard, to avoid shard skew, without doing a full pass over the entire dataset. This is useful when
+  your dataset is very large and consists of a large number of shards.
+- `seed`, the random seed to use for sampling.
+
+```python
+source = ll.ParquetSource(
+  filepaths=['s3://lilac-public-data/test-*.parquet'],
+  sample_size=100,
+  approximate_shuffle=True)
+config = ll.DatasetConfig(namespace='local', name='parquet-test', source=source)
+dataset = ll.create_dataset(config)
+```
+
+#### JSON
+
+The JSON reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can use
+a glob pattern to load multiple files. The reader supports both JSON and JSONL files.
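+
+For example, a sharded JSONL dataset can be loaded with a single glob pattern. The bucket and file
+names below are placeholders; substitute your own paths:
+
+```python
+# Hypothetical sharded JSONL files on S3; any local or remote glob the reader supports works here.
+config = ll.DatasetConfig(
+  namespace='local',
+  name='sharded_jsonl_example',
+  source=ll.JSONSource(filepaths=['s3://my-bucket/data/chunk-*.jsonl']))
+dataset = ll.create_dataset(config)
+```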
+ +If the format is JSON, we expect the dataset to be an array of objects: + +```json +[ + {"id": 1, "text": "hello world"}, + {"id": 2, "text": "goodbye world"} +] +``` + +If the format is JSONL, we expect each line to be a JSON object: + +```json +{"id": 1, "text": "hello world"} +{"id": 2, "text": "goodbye world"} +``` + +```python +config = ll.DatasetConfig( + namespace='local', + name='news_headlines', + source=ll.JSONSource(filepaths=[ + 'https://storage.googleapis.com/lilac-data/datasets/langsmith-finetuning-rag/rag.jsonl' + ])) +dataset = ll.create_dataset(config) +``` + +#### Pandas + +```python +df = pd.DataFrame({'test': ['a', 'b', 'c']}) +config = ll.DatasetConfig(namespace='local', name='the_movies_dataset2', source=ll.PandasSource(df)) +dataset = ll.create_dataset(config) +``` + +For details on all the source loaders, see [](#lilac.sources). For details on the dataset config, +see [](#lilac.DatasetConfig). + ### Loading from lilac.yml When you start a webserver, Lilac will automatically create a project for you in the given project @@ -99,24 +210,3 @@ Or from the CLI: ```sh lilac load --project_dir=~/my_lilac ``` - -### Loading an individual dataset - -This example loads the `glue` dataset with the `ax` config from HuggingFace: - -```python -# Set the global project directory to where project files will be stored. -ll.set_project_dir('~/my_project') - -config = ll.DatasetConfig( - namespace='local', - name='glue', - source=ll.HuggingFaceSource(dataset_name='glue', config_name='ax')) - -# NOTE: If you don't want to set a global project directory, you can pass the `project_dir` to `create_dataset` as the second argument. -dataset = ll.create_dataset(config) -``` - -For details on all the source loaders, see [](#lilac.sources). - -For details on the dataset config, see [](#lilac.DatasetConfig). diff --git a/lilac.yml b/lilac.yml deleted file mode 100644 index f48f8a553..000000000 --- a/lilac.yml +++ /dev/null @@ -1,4 +0,0 @@ -# Lilac project config. -# See https://lilacml.com/api_reference/index.html#lilac.Config for details. - -{} diff --git a/lilac/load_dataset.py b/lilac/load_dataset.py index 842b031e9..3d9624303 100644 --- a/lilac/load_dataset.py +++ b/lilac/load_dataset.py @@ -26,13 +26,15 @@ def create_dataset(config: DatasetConfig, - project_dir: Optional[Union[str, pathlib.Path]] = None) -> Dataset: + project_dir: Optional[Union[str, pathlib.Path]] = None, + overwrite: bool = False) -> Dataset: """Load a dataset from a given source configuration. Args: config: The dataset configuration to load. project_dir: The path to the project directory for where to create the dataset. If not defined, uses the project directory from `LILAC_PROJECT_DIR` or [deprecated] `LILAC_DATA_PATH`. + overwrite: Whether to overwrite the dataset if it already exists. """ project_dir = project_dir or get_project_dir() if not project_dir: @@ -40,7 +42,7 @@ def create_dataset(config: DatasetConfig, 'globally with `set_project_dir(path)`') # Update the config before processing the source. 
- add_project_dataset_config(config, project_dir) + add_project_dataset_config(config, project_dir, overwrite) process_source(project_dir, config) return get_dataset(config.namespace, config.name, project_dir) diff --git a/lilac/project.py b/lilac/project.py index c72027f57..a43f126f9 100644 --- a/lilac/project.py +++ b/lilac/project.py @@ -33,13 +33,15 @@ def init(project_dir: Optional[Union[str, pathlib.Path]] = None) -> None: def add_project_dataset_config(dataset_config: DatasetConfig, - project_dir: Optional[Union[str, pathlib.Path]] = None) -> None: + project_dir: Optional[Union[str, pathlib.Path]] = None, + overwrite: bool = False) -> None: """Add a dataset to the project config. Args: dataset_config: The dataset configuration to load. project_dir: The path to the project directory for where to create the dataset. If not defined, uses the project directory from `LILAC_PROJECT_DIR` or [deprecated] `LILAC_DATA_PATH`. + overwrite: Whether to overwrite the dataset if it already exists. """ project_dir = project_dir or get_project_dir() with PROJECT_CONFIG_LOCK: @@ -47,10 +49,13 @@ def add_project_dataset_config(dataset_config: DatasetConfig, existing_dataset_config = get_dataset_config(config, dataset_config.namespace, dataset_config.name) if existing_dataset_config is not None: - raise ValueError( - f'{dataset_config} has already been added. You can delete it with: \n\n' - f'dataset = get_dataset("{dataset_config.namespace}", "{dataset_config.name}")\n' - 'dataset.delete()') + if overwrite: + config.datasets.remove(existing_dataset_config) + else: + raise ValueError( + f'{dataset_config} has already been added. You can delete it with: \n\n' + f'dataset = get_dataset("{dataset_config.namespace}", "{dataset_config.name}")\n' + 'dataset.delete()') config.datasets.append(dataset_config) write_project_config(project_dir, config) diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py index f7c691303..be50719ef 100644 --- a/lilac/router_dataset.py +++ b/lilac/router_dataset.py @@ -2,7 +2,6 @@ import os from copy import copy from typing import Annotated, Any, Literal, Optional, Sequence, Union, cast -from urllib.parse import unquote from fastapi import APIRouter, HTTPException, Response from fastapi.params import Depends diff --git a/lilac/sources/csv_source.py b/lilac/sources/csv_source.py index 11e98b6ee..e10112f59 100644 --- a/lilac/sources/csv_source.py +++ b/lilac/sources/csv_source.py @@ -9,7 +9,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import download_http_files -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup LINE_NUMBER_COLUMN = '__line_number__' @@ -45,14 +45,14 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. - s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. # NOTE: We turn off the parallel reader because of https://github.com/lilacai/lilac/issues/373. 
self._con.execute(f""" CREATE SEQUENCE serial START 1; CREATE VIEW t as (SELECT nextval('serial') as "{LINE_NUMBER_COLUMN}", * FROM read_csv_auto( - {s3_filepaths}, + {duckdb_paths}, SAMPLE_SIZE=500000, HEADER={self.header}, {f'NAMES={self.names},' if self.names else ''} diff --git a/lilac/sources/duckdb_utils.py b/lilac/sources/duckdb_utils.py index 9717df8ed..5c7138827 100644 --- a/lilac/sources/duckdb_utils.py +++ b/lilac/sources/duckdb_utils.py @@ -1,30 +1,46 @@ """Utils for duckdb.""" -import os + +import urllib.parse import duckdb -from ..env import env, get_project_dir +from ..env import env def duckdb_setup(con: duckdb.DuckDBPyConnection) -> None: - """Setup DuckDB. This includes setting up the extensions directory and GCS access.""" - con.execute(f""" - SET extension_directory='{os.path.join(get_project_dir(), '.duckdb')}'; + """Setup DuckDB. This includes setting up performance optimizations.""" + con.execute(""" + SET enable_http_metadata_cache=true; + SET enable_object_cache=true; """) - region = env('GCS_REGION') or env('S3_REGION') - if region: - con.execute(f"SET s3_region='{region}") - - access_key = env('GCS_ACCESS_KEY') or env('S3_ACCESS_KEY') - if access_key: - con.execute(f"SET s3_access_key_id='{access_key}") - - secret_key = env('GCS_SECRET_KEY') or env('S3_SECRET_KEY') - if secret_key: - con.execute(f"SET s3_secret_access_key='{secret_key}'") - gcs_endpoint = 'storage.googleapis.com' - endpoint = env('S3_ENDPOINT') or (gcs_endpoint if env('GCS_REGION') else None) - if endpoint: - con.execute(f"SET s3_endpoint='{endpoint}'") +def convert_path_to_duckdb(filepath: str) -> str: + """Convert a filepath to a duckdb filepath.""" + scheme = urllib.parse.urlparse(filepath).scheme + options: dict[str, str] = {} + if scheme == '': + return filepath + elif scheme == 'gs': + options['s3_endpoint'] = 'storage.googleapis.com' + if env('GCS_REGION'): + options['s3_region'] = env('GCS_REGION') + if env('GCS_ACCESS_KEY'): + options['s3_access_key_id'] = env('GCS_ACCESS_KEY') + if env('GCS_SECRET_KEY'): + options['s3_secret_access_key'] = env('GCS_SECRET_KEY') + filepath = filepath.replace('gs://', 's3://') + elif scheme == 's3': + if env('S3_ENDPOINT'): + options['s3_endpoint'] = env('S3_ENDPOINT') + if env('S3_REGION'): + options['s3_region'] = env('S3_REGION') + if env('S3_ACCESS_KEY'): + options['s3_access_key_id'] = env('S3_ACCESS_KEY') + if env('S3_SECRET_KEY'): + options['s3_secret_access_key'] = env('S3_SECRET_KEY') + else: + raise ValueError(f'Unsupported scheme: {scheme}') + if options: + return f'{filepath}?{urllib.parse.urlencode(options, safe="+/")}' + return filepath diff --git a/lilac/sources/huggingface_source.py b/lilac/sources/huggingface_source.py index af2a77977..1be3d7a31 100644 --- a/lilac/sources/huggingface_source.py +++ b/lilac/sources/huggingface_source.py @@ -136,6 +136,11 @@ class HuggingFaceSource(Source): title='Sample size', description='Number of rows to sample from the dataset, for each split.', default=None) + token: Optional[str] = PydanticField( + title='Huggingface token', + description='Huggingface token for private datasets.', + default=None, + exclude=True) revision: Optional[str] = PydanticField(title='Dataset revision', default=None) load_from_disk: Optional[bool] = PydanticField( description='Load from local disk instead of the hub.', default=False) @@ -153,7 +158,8 @@ def setup(self) -> None: self.dataset_name, self.config_name, num_proc=multiprocessing.cpu_count(), - ignore_verifications=True) + verification_mode='no_checks', + 
token=self.token) self._dataset_dict = hf_dataset_dict self._schema_info = hf_schema_to_schema(self._dataset_dict, self.split, self.sample_size) diff --git a/lilac/sources/json_source.py b/lilac/sources/json_source.py index 7a5d51184..6a4f6c9ac 100644 --- a/lilac/sources/json_source.py +++ b/lilac/sources/json_source.py @@ -9,7 +9,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import download_http_files -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup class JSONSource(Source): @@ -40,14 +40,11 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. - s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM read_json_auto( - {s3_filepaths}, - IGNORE_ERRORS=true - )); + CREATE VIEW t as (SELECT * FROM read_json_auto({duckdb_paths}, IGNORE_ERRORS=true)); """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() diff --git a/lilac/sources/parquet_source.py b/lilac/sources/parquet_source.py index d523e18a1..98fc7693b 100644 --- a/lilac/sources/parquet_source.py +++ b/lilac/sources/parquet_source.py @@ -1,18 +1,19 @@ """Parquet source.""" +import random from typing import ClassVar, Iterable, Optional, cast import duckdb import pyarrow as pa -from pydantic import Field, field_validator +from pydantic import Field, ValidationInfo, field_validator from typing_extensions import override -from ..schema import Item, arrow_schema_to_schema +from ..schema import Item, Schema, arrow_schema_to_schema from ..source import Source, SourceSchema -from ..sources.duckdb_utils import duckdb_setup +from ..sources.duckdb_utils import convert_path_to_duckdb, duckdb_setup from ..utils import download_http_files # Number of rows to read per batch. 
-ROWS_PER_BATCH_READ = 10_000
+ROWS_PER_BATCH_READ = 50_000


 class ParquetSource(Source):
@@ -27,11 +28,16 @@ class ParquetSource(Source):
   filepaths: list[str] = Field(
     description=
     'A list of paths to parquet files which live locally or remotely on GCS, S3, or Hadoop.')
+  seed: Optional[int] = Field(description='Random seed for sampling', default=None)
   sample_size: Optional[int] = Field(
     title='Sample size', description='Number of rows to sample from the dataset', default=None)
+  approximate_shuffle: bool = Field(
+    default=False,
+    description='If true, the reader will read a fraction of rows from each shard, '
+    'avoiding a pass over the entire dataset.')
   _source_schema: Optional[SourceSchema] = None
-  _reader: Optional[pa.RecordBatchReader] = None
+  _readers: list[pa.RecordBatchReader] = []
   _con: Optional[duckdb.DuckDBPyConnection] = None

   @field_validator('filepaths')
@@ -50,6 +56,43 @@ def validate_sample_size(cls, sample_size: int) -> int:
       raise ValueError('sample_size must be greater than 0.')
     return sample_size

+  @field_validator('approximate_shuffle')
+  @classmethod
+  def validate_approximate_shuffle(cls, approximate_shuffle: bool, info: ValidationInfo) -> bool:
+    """Validate shuffle before sampling."""
+    if approximate_shuffle and not info.data['sample_size']:
+      raise ValueError('`approximate_shuffle` requires `sample_size` to be set.')
+    return approximate_shuffle
+
+  def _setup_sampling(self, duckdb_paths: list[str]) -> Schema:
+    assert self._con, 'setup() must be called first.'
+    if self.approximate_shuffle:
+      assert self.sample_size, 'approximate_shuffle requires sample_size to be set.'
+      # Find each individual file.
+      glob_rows: list[tuple[str]] = self._con.execute(
+        f'SELECT * FROM GLOB({duckdb_paths})').fetchall()
+      duckdb_files: list[str] = list(set([row[0] for row in glob_rows]))
+      batch_size = max(1, min(self.sample_size // len(duckdb_files), ROWS_PER_BATCH_READ))
+      for duckdb_file in duckdb_files:
+        # Since we are not fetching all the results immediately, we need a separate cursor
+        # for each file to avoid each cursor overwriting the previous one.
+        con = self._con.cursor()
+        duckdb_setup(con)
+        res = con.execute(f"""SELECT * FROM read_parquet('{duckdb_file}')""")
+        self._readers.append(res.fetch_record_batch(rows_per_batch=batch_size))
+    else:
+      sample_suffix = ''
+      if self.sample_size:
+        sample_suffix = f'USING SAMPLE {self.sample_size}'
+        if self.seed is not None:
+          sample_suffix += f' (reservoir, {self.seed})'
+      res = self._con.execute(f"""SELECT * FROM read_parquet({duckdb_paths}) {sample_suffix}""")
+      batch_size = ROWS_PER_BATCH_READ
+      if self.sample_size:
+        batch_size = min(self.sample_size, ROWS_PER_BATCH_READ)
+      self._readers.append(res.fetch_record_batch(rows_per_batch=batch_size))
+    return arrow_schema_to_schema(self._readers[0].schema)
+
   @override
   def setup(self) -> None:
     filepaths = download_http_files(self.filepaths)
@@ -57,19 +100,13 @@ def setup(self) -> None:
     duckdb_setup(self._con)

     # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html.
-    s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths]
-
-    # NOTE: We use duckdb here to increase parallelism for multiple files.
- sample_suffix = f'USING SAMPLE {self.sample_size}' if self.sample_size else '' - self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM read_parquet({s3_filepaths}) {sample_suffix}); - """) - res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] + res = self._con.execute(f'SELECT COUNT(*) FROM read_parquet({duckdb_paths})').fetchone() num_items = cast(tuple[int], res)[0] - self._reader = self._con.execute('SELECT * from t').fetch_record_batch( - rows_per_batch=ROWS_PER_BATCH_READ) - # Create the source schema in prepare to share it between process and source_schema. - schema = arrow_schema_to_schema(self._reader.schema) + if self.sample_size: + self.sample_size = min(self.sample_size, num_items) + num_items = self.sample_size + schema = self._setup_sampling(duckdb_paths) self._source_schema = SourceSchema(fields=schema.fields, num_items=num_items) @override @@ -81,10 +118,33 @@ def source_schema(self) -> SourceSchema: @override def process(self) -> Iterable[Item]: """Process the source.""" - assert self._reader and self._con, 'setup() must be called first.' - - for batch in self._reader: - yield from batch.to_pylist() + assert self._con, 'setup() must be called first.' + + items_yielded = 0 + done = False + + if self.seed is not None: + random.seed(self.seed) + + while not done: + index = random.randint(0, len(self._readers) - 1) + reader = self._readers[index] + batch = None + try: + batch = reader.read_next_batch() + except StopIteration: + reader.close() + del self._readers[index] + if not self._readers: + done = True + break + continue + items = batch.to_pylist() + for item in items: + yield item + items_yielded += 1 + if self.sample_size and items_yielded == self.sample_size: + done = True + break - self._reader.close() self._con.close() diff --git a/lilac/sources/parquet_source_test.py b/lilac/sources/parquet_source_test.py index ef5cdebd8..29b5520bb 100644 --- a/lilac/sources/parquet_source_test.py +++ b/lilac/sources/parquet_source_test.py @@ -10,6 +10,7 @@ from ..schema import schema from ..source import SourceSchema +from ..utils import chunks from .parquet_source import ParquetSource @@ -41,26 +42,138 @@ def test_simple_rows(tmp_path: pathlib.Path) -> None: assert items == [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] -def test_sampling(tmp_path: pathlib.Path) -> None: - table = pa.Table.from_pylist([{ - 'name': 'a', - 'age': 1 - }, { - 'name': 'b', - 'age': 2 - }, { - 'name': 'c', - 'age': 3 - }]) +def test_single_shard_with_sampling(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + table = pa.Table.from_pylist(source_items) out_file = os.path.join(tmp_path, 'test.parquet') pq.write_table(table, out_file) - for sample_size in range(1, 4): + # Test sampling with different sample sizes, including sample size > num_items. 
+ for sample_size in range(1, 5): source = ParquetSource(filepaths=[out_file], sample_size=sample_size) source.setup() items = list(source.process()) - assert len(items) == sample_size + assert len(items) == min(sample_size, len(source_items)) + + +def test_single_shard_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + table = pa.Table.from_pylist(source_items) + + out_file = os.path.join(tmp_path, 'test.parquet') + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. + for sample_size in range(1, 5): + source = ParquetSource(filepaths=[out_file], sample_size=sample_size, approximate_shuffle=True) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_multi_shard(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')]) + source.setup() + items = list(source.process()) + assert items == source_items + + +def test_multi_shard_sample(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. + for sample_size in range(1, 5): + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')], sample_size=sample_size) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_multi_shard_approx_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. 
+ for sample_size in range(1, 5): + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], + approximate_shuffle=True, + sample_size=sample_size) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_uniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_nonuniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + shard_sizes = [49, 1, 40, 10] + for i, shard_size in enumerate(shard_sizes): + chunk = source_items[:shard_size] + source_items = source_items[shard_size:] + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_sampling_with_seed(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')], sample_size=20, seed=42) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_approx_shuffle_with_seed(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20, seed=42) + source.setup() + items = list(source.process()) + assert len(items) == 20 def test_validation() -> None: diff --git a/lilac/sources/sqlite_source.py b/lilac/sources/sqlite_source.py index 9503c1591..6983c14b0 100644 --- a/lilac/sources/sqlite_source.py +++ b/lilac/sources/sqlite_source.py @@ -13,7 +13,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import file_exists -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup router = APIRouter() @@ -50,10 +50,9 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. 
- db_file = self.db_file.replace('gs://', 's3://') - + duckdb_path = convert_path_to_duckdb(self.db_file) self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM sqlite_scan('{db_file}', '{self.table}')); + CREATE VIEW t as (SELECT * FROM sqlite_scan('{duckdb_path}', '{self.table}')); """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() diff --git a/poetry.lock b/poetry.lock index 36902baad..2d299b69c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -708,7 +708,6 @@ files = [ {file = "contourpy-1.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18a64814ae7bce73925131381603fff0116e2df25230dfc80d6d690aa6e20b37"}, {file = "contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90c81f22b4f572f8a2110b0b741bb64e5a6427e0a198b2cdc1fbaf85f352a3aa"}, {file = "contourpy-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53cc3a40635abedbec7f1bde60f8c189c49e84ac180c665f2cd7c162cc454baa"}, - {file = "contourpy-1.1.0-cp310-cp310-win32.whl", hash = "sha256:9b2dd2ca3ac561aceef4c7c13ba654aaa404cf885b187427760d7f7d4c57cff8"}, {file = "contourpy-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:1f795597073b09d631782e7245016a4323cf1cf0b4e06eef7ea6627e06a37ff2"}, {file = "contourpy-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0b7b04ed0961647691cfe5d82115dd072af7ce8846d31a5fac6c142dcce8b882"}, {file = "contourpy-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27bc79200c742f9746d7dd51a734ee326a292d77e7d94c8af6e08d1e6c15d545"}, @@ -717,7 +716,6 @@ files = [ {file = "contourpy-1.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5cec36c5090e75a9ac9dbd0ff4a8cf7cecd60f1b6dc23a374c7d980a1cd710e"}, {file = "contourpy-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f0cbd657e9bde94cd0e33aa7df94fb73c1ab7799378d3b3f902eb8eb2e04a3a"}, {file = "contourpy-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:181cbace49874f4358e2929aaf7ba84006acb76694102e88dd15af861996c16e"}, - {file = "contourpy-1.1.0-cp311-cp311-win32.whl", hash = "sha256:edb989d31065b1acef3828a3688f88b2abb799a7db891c9e282df5ec7e46221b"}, {file = "contourpy-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb3b7d9e6243bfa1efb93ccfe64ec610d85cfe5aec2c25f97fbbd2e58b531256"}, {file = "contourpy-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bcb41692aa09aeb19c7c213411854402f29f6613845ad2453d30bf421fe68fed"}, {file = "contourpy-1.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5d123a5bc63cd34c27ff9c7ac1cd978909e9c71da12e05be0231c608048bb2ae"}, @@ -726,7 +724,6 @@ files = [ {file = "contourpy-1.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:317267d915490d1e84577924bd61ba71bf8681a30e0d6c545f577363157e5e94"}, {file = "contourpy-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d551f3a442655f3dcc1285723f9acd646ca5858834efeab4598d706206b09c9f"}, {file = "contourpy-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e7a117ce7df5a938fe035cad481b0189049e8d92433b4b33aa7fc609344aafa1"}, - {file = "contourpy-1.1.0-cp38-cp38-win32.whl", hash = "sha256:108dfb5b3e731046a96c60bdc46a1a0ebee0760418951abecbe0fc07b5b93b27"}, {file = "contourpy-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:d4f26b25b4f86087e7d75e63212756c38546e70f2a92d2be44f80114826e1cd4"}, {file = "contourpy-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc00bb4225d57bff7ebb634646c0ee2a1298402ec10a5fe7af79df9a51c1bfd9"}, {file = 
"contourpy-1.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:189ceb1525eb0655ab8487a9a9c41f42a73ba52d6789754788d1883fb06b2d8a"}, @@ -735,7 +732,6 @@ files = [ {file = "contourpy-1.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:143dde50520a9f90e4a2703f367cf8ec96a73042b72e68fcd184e1279962eb6f"}, {file = "contourpy-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e94bef2580e25b5fdb183bf98a2faa2adc5b638736b2c0a4da98691da641316a"}, {file = "contourpy-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ed614aea8462735e7d70141374bd7650afd1c3f3cb0c2dbbcbe44e14331bf002"}, - {file = "contourpy-1.1.0-cp39-cp39-win32.whl", hash = "sha256:71551f9520f008b2950bef5f16b0e3587506ef4f23c734b71ffb7b89f8721999"}, {file = "contourpy-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:438ba416d02f82b692e371858143970ed2eb6337d9cdbbede0d8ad9f3d7dd17d"}, {file = "contourpy-1.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a698c6a7a432789e587168573a864a7ea374c6be8d4f31f9d87c001d5a843493"}, {file = "contourpy-1.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:397b0ac8a12880412da3551a8cb5a187d3298a72802b45a3bd1805e204ad8439"}, @@ -3181,16 +3177,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -5017,7 +5003,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -5025,15 +5010,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -5050,7 +5028,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -5058,7 +5035,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -5486,9 +5462,6 @@ files = [ {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"}, {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"}, {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"}, {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = 
"sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"}, @@ -5497,9 +5470,6 @@ files = [ {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"}, {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"}, {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"}, {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"}, @@ -5507,9 +5477,6 @@ files = [ {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"}, {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"}, {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"}, {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"}, @@ -5519,9 +5486,6 @@ files = [ {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"}, {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"}, {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"}, {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"}, @@ -5531,9 +5495,6 @@ files = [ {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"}, {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"}, {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"}, {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"}, @@ -5570,11 +5531,6 @@ files = [ {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"}, {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"}, {file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"}, - {file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"}, {file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"}, @@ -7523,4 +7479,4 @@ text-stats = ["spacy", "textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "f6939bbf4378cd9e40b94975ed45c425fcbf21365e010fa31b2d4ed56cb3ed27" +content-hash = "09538a7af9d0fc67c0cbe31bbb5f64880a53f9d78d0724cea7d1d1d305e177be" diff --git a/pyproject.toml b/pyproject.toml index 4fc0e8c0e..53332327a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ distributed = "^2023.3.2.1" duckdb = "^0.9.0" fastapi = "^0.103.1" fsspec = "^2023.9.2" -gcsfs = "^2023.4.0" +gcsfs = "^2023.9.2" google-cloud-storage = "^2.5.0" gunicorn = "^21.2.0" hnswlib = "^0.7.0" # Fast KNN vector store.