diff --git a/docs/datasets/dataset_load.md b/docs/datasets/dataset_load.md
index 9769d1e44..8c5b4344e 100644
--- a/docs/datasets/dataset_load.md
+++ b/docs/datasets/dataset_load.md
@@ -63,6 +63,117 @@ You will be redirected to the dataset view once your data is loaded.

 ## From Python

+### Creating a dataset
+
+You can create a dataset from Python using [](#lilac.create_dataset). Lilac supports a variety of
+data sources, including CSV, JSON, HuggingFace datasets, Parquet, Pandas and more. See
+[](#lilac.sources) for details on available sources. All the file-based readers support reading
+from local files, S3 (`s3://...`), GCS (`gs://...`) and HTTP(S) URLs.
+
+Before we load any dataset, we should set the project directory, which will be used to store all
+the datasets we import. If not set, it defaults to the current working directory.
+
+```python
+import lilac as ll
+ll.set_project_dir('~/my_project')
+```
+
+#### HuggingFace
+
+You can load any HuggingFace dataset by passing the dataset name and config name. We use the HF
+dataset loader, which will fetch and cache the dataset in your HF cache dir. Then Lilac will convert
+that to our internal format and store it in the Lilac project dir. To read private datasets, either
+log in via the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/quick-start#login) or
+provide a `token` to the `HuggingFaceSource`.
+
+```python
+config = ll.DatasetConfig(
+  namespace='local',
+  name='glue',
+  source=ll.HuggingFaceSource(dataset_name='glue', config_name='ax'))
+# NOTE: You can pass a `project_dir` to `create_dataset` as the second argument.
+dataset = ll.create_dataset(config)
+```
+
+#### CSV
+
+The CSV reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can use
+a glob pattern to load multiple files.
+
+```python
+url = 'https://storage.googleapis.com/lilac-data/datasets/the_movies_dataset/the_movies_dataset.csv'
+config = ll.DatasetConfig(
+  namespace='local', name='the_movies_dataset', source=ll.CSVSource(filepaths=[url]))
+dataset = ll.create_dataset(config)
+```
+
+#### Parquet
+
+The Parquet reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can
+use a glob pattern to load multiple files.
+
+**Sampling**
+
+The `ParquetSource` takes a few optional arguments related to sampling:
+
+- `sample_size`, the number of rows to sample.
+- `approximate_shuffle`, defaulting to `False`. When `False`, we take an entire pass over the
+  dataset with reservoir sampling. When `True`, we read a fraction of rows from the start of each
+  shard, to avoid shard skew, without doing a full pass over the entire dataset. This is useful when
+  your dataset is very large and consists of a large number of shards.
+- `seed`, the random seed to use for sampling.
+
+```python
+source = ll.ParquetSource(
+  filepaths=['s3://lilac-public-data/test-*.parquet'],
+  sample_size=100,
+  approximate_shuffle=True)
+config = ll.DatasetConfig(namespace='local', name='parquet-test', source=source)
+dataset = ll.create_dataset(config)
+```
+
+#### JSON
+
+The JSON reader can read from local files, S3, GCS and HTTP. If your dataset is sharded, you can use
+a glob pattern to load multiple files. The reader supports both JSON and JSONL files.
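+
+For example, a sharded JSONL dataset can be loaded with a single glob pattern. The bucket and file
+names below are placeholders; substitute your own paths:
+
+```python
+# Hypothetical sharded JSONL files on S3; any local or remote glob the reader supports works here.
+config = ll.DatasetConfig(
+  namespace='local',
+  name='sharded_jsonl_example',
+  source=ll.JSONSource(filepaths=['s3://my-bucket/data/chunk-*.jsonl']))
+dataset = ll.create_dataset(config)
+```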
+ +If the format is JSON, we expect the dataset to be an array of objects: + +```json +[ + {"id": 1, "text": "hello world"}, + {"id": 2, "text": "goodbye world"} +] +``` + +If the format is JSONL, we expect each line to be a JSON object: + +```json +{"id": 1, "text": "hello world"} +{"id": 2, "text": "goodbye world"} +``` + +```python +config = ll.DatasetConfig( + namespace='local', + name='news_headlines', + source=ll.JSONSource(filepaths=[ + 'https://storage.googleapis.com/lilac-data/datasets/langsmith-finetuning-rag/rag.jsonl' + ])) +dataset = ll.create_dataset(config) +``` + +#### Pandas + +```python +df = pd.DataFrame({'test': ['a', 'b', 'c']}) +config = ll.DatasetConfig(namespace='local', name='the_movies_dataset2', source=ll.PandasSource(df)) +dataset = ll.create_dataset(config) +``` + +For details on all the source loaders, see [](#lilac.sources). For details on the dataset config, +see [](#lilac.DatasetConfig). + ### Loading from lilac.yml When you start a webserver, Lilac will automatically create a project for you in the given project @@ -99,24 +210,3 @@ Or from the CLI: ```sh lilac load --project_dir=~/my_lilac ``` - -### Loading an individual dataset - -This example loads the `glue` dataset with the `ax` config from HuggingFace: - -```python -# Set the global project directory to where project files will be stored. -ll.set_project_dir('~/my_project') - -config = ll.DatasetConfig( - namespace='local', - name='glue', - source=ll.HuggingFaceSource(dataset_name='glue', config_name='ax')) - -# NOTE: If you don't want to set a global project directory, you can pass the `project_dir` to `create_dataset` as the second argument. -dataset = ll.create_dataset(config) -``` - -For details on all the source loaders, see [](#lilac.sources). - -For details on the dataset config, see [](#lilac.DatasetConfig). diff --git a/lilac.yml b/lilac.yml deleted file mode 100644 index f48f8a553..000000000 --- a/lilac.yml +++ /dev/null @@ -1,4 +0,0 @@ -# Lilac project config. -# See https://lilacml.com/api_reference/index.html#lilac.Config for details. - -{} diff --git a/lilac/load_dataset.py b/lilac/load_dataset.py index 842b031e9..3d9624303 100644 --- a/lilac/load_dataset.py +++ b/lilac/load_dataset.py @@ -26,13 +26,15 @@ def create_dataset(config: DatasetConfig, - project_dir: Optional[Union[str, pathlib.Path]] = None) -> Dataset: + project_dir: Optional[Union[str, pathlib.Path]] = None, + overwrite: bool = False) -> Dataset: """Load a dataset from a given source configuration. Args: config: The dataset configuration to load. project_dir: The path to the project directory for where to create the dataset. If not defined, uses the project directory from `LILAC_PROJECT_DIR` or [deprecated] `LILAC_DATA_PATH`. + overwrite: Whether to overwrite the dataset if it already exists. """ project_dir = project_dir or get_project_dir() if not project_dir: @@ -40,7 +42,7 @@ def create_dataset(config: DatasetConfig, 'globally with `set_project_dir(path)`') # Update the config before processing the source. 
- add_project_dataset_config(config, project_dir) + add_project_dataset_config(config, project_dir, overwrite) process_source(project_dir, config) return get_dataset(config.namespace, config.name, project_dir) diff --git a/lilac/project.py b/lilac/project.py index c72027f57..a43f126f9 100644 --- a/lilac/project.py +++ b/lilac/project.py @@ -33,13 +33,15 @@ def init(project_dir: Optional[Union[str, pathlib.Path]] = None) -> None: def add_project_dataset_config(dataset_config: DatasetConfig, - project_dir: Optional[Union[str, pathlib.Path]] = None) -> None: + project_dir: Optional[Union[str, pathlib.Path]] = None, + overwrite: bool = False) -> None: """Add a dataset to the project config. Args: dataset_config: The dataset configuration to load. project_dir: The path to the project directory for where to create the dataset. If not defined, uses the project directory from `LILAC_PROJECT_DIR` or [deprecated] `LILAC_DATA_PATH`. + overwrite: Whether to overwrite the dataset if it already exists. """ project_dir = project_dir or get_project_dir() with PROJECT_CONFIG_LOCK: @@ -47,10 +49,13 @@ def add_project_dataset_config(dataset_config: DatasetConfig, existing_dataset_config = get_dataset_config(config, dataset_config.namespace, dataset_config.name) if existing_dataset_config is not None: - raise ValueError( - f'{dataset_config} has already been added. You can delete it with: \n\n' - f'dataset = get_dataset("{dataset_config.namespace}", "{dataset_config.name}")\n' - 'dataset.delete()') + if overwrite: + config.datasets.remove(existing_dataset_config) + else: + raise ValueError( + f'{dataset_config} has already been added. You can delete it with: \n\n' + f'dataset = get_dataset("{dataset_config.namespace}", "{dataset_config.name}")\n' + 'dataset.delete()') config.datasets.append(dataset_config) write_project_config(project_dir, config) diff --git a/lilac/router_dataset.py b/lilac/router_dataset.py index f7c691303..be50719ef 100644 --- a/lilac/router_dataset.py +++ b/lilac/router_dataset.py @@ -2,7 +2,6 @@ import os from copy import copy from typing import Annotated, Any, Literal, Optional, Sequence, Union, cast -from urllib.parse import unquote from fastapi import APIRouter, HTTPException, Response from fastapi.params import Depends diff --git a/lilac/sources/csv_source.py b/lilac/sources/csv_source.py index 11e98b6ee..e10112f59 100644 --- a/lilac/sources/csv_source.py +++ b/lilac/sources/csv_source.py @@ -9,7 +9,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import download_http_files -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup LINE_NUMBER_COLUMN = '__line_number__' @@ -45,14 +45,14 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. - s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. # NOTE: We turn off the parallel reader because of https://github.com/lilacai/lilac/issues/373. 
self._con.execute(f""" CREATE SEQUENCE serial START 1; CREATE VIEW t as (SELECT nextval('serial') as "{LINE_NUMBER_COLUMN}", * FROM read_csv_auto( - {s3_filepaths}, + {duckdb_paths}, SAMPLE_SIZE=500000, HEADER={self.header}, {f'NAMES={self.names},' if self.names else ''} diff --git a/lilac/sources/duckdb_utils.py b/lilac/sources/duckdb_utils.py index 9717df8ed..5c7138827 100644 --- a/lilac/sources/duckdb_utils.py +++ b/lilac/sources/duckdb_utils.py @@ -1,30 +1,46 @@ """Utils for duckdb.""" -import os + +import urllib.parse import duckdb -from ..env import env, get_project_dir +from ..env import env def duckdb_setup(con: duckdb.DuckDBPyConnection) -> None: - """Setup DuckDB. This includes setting up the extensions directory and GCS access.""" - con.execute(f""" - SET extension_directory='{os.path.join(get_project_dir(), '.duckdb')}'; + """Setup DuckDB. This includes setting up performance optimizations.""" + con.execute(""" + SET enable_http_metadata_cache=true; + SET enable_object_cache=true; """) - region = env('GCS_REGION') or env('S3_REGION') - if region: - con.execute(f"SET s3_region='{region}") - - access_key = env('GCS_ACCESS_KEY') or env('S3_ACCESS_KEY') - if access_key: - con.execute(f"SET s3_access_key_id='{access_key}") - - secret_key = env('GCS_SECRET_KEY') or env('S3_SECRET_KEY') - if secret_key: - con.execute(f"SET s3_secret_access_key='{secret_key}'") - gcs_endpoint = 'storage.googleapis.com' - endpoint = env('S3_ENDPOINT') or (gcs_endpoint if env('GCS_REGION') else None) - if endpoint: - con.execute(f"SET s3_endpoint='{endpoint}'") +def convert_path_to_duckdb(filepath: str) -> str: + """Convert a filepath to a duckdb filepath.""" + scheme = urllib.parse.urlparse(filepath).scheme + options: dict[str, str] = {} + if scheme == '': + return filepath + elif scheme == 'gs': + options['s3_endpoint'] = 'storage.googleapis.com' + if env('GCS_REGION'): + options['s3_region'] = env('GCS_REGION') + if env('GCS_ACCESS_KEY'): + options['s3_access_key_id'] = env('GCS_ACCESS_KEY') + if env('GCS_SECRET_KEY'): + options['s3_secret_access_key'] = env('GCS_SECRET_KEY') + filepath = filepath.replace('gs://', 's3://') + elif scheme == 's3': + if env('S3_ENDPOINT'): + options['s3_endpoint'] = env('S3_ENDPOINT') + if env('S3_REGION'): + options['s3_region'] = env('S3_REGION') + if env('S3_ACCESS_KEY'): + options['s3_access_key_id'] = env('S3_ACCESS_KEY') + if env('S3_SECRET_KEY'): + options['s3_secret_access_key'] = env('S3_SECRET_KEY') + else: + raise ValueError(f'Unsupported scheme: {scheme}') + if options: + return f'{filepath}?{urllib.parse.urlencode(options, safe="+/")}' + return filepath diff --git a/lilac/sources/huggingface_source.py b/lilac/sources/huggingface_source.py index af2a77977..1be3d7a31 100644 --- a/lilac/sources/huggingface_source.py +++ b/lilac/sources/huggingface_source.py @@ -136,6 +136,11 @@ class HuggingFaceSource(Source): title='Sample size', description='Number of rows to sample from the dataset, for each split.', default=None) + token: Optional[str] = PydanticField( + title='Huggingface token', + description='Huggingface token for private datasets.', + default=None, + exclude=True) revision: Optional[str] = PydanticField(title='Dataset revision', default=None) load_from_disk: Optional[bool] = PydanticField( description='Load from local disk instead of the hub.', default=False) @@ -153,7 +158,8 @@ def setup(self) -> None: self.dataset_name, self.config_name, num_proc=multiprocessing.cpu_count(), - ignore_verifications=True) + verification_mode='no_checks', + 
token=self.token) self._dataset_dict = hf_dataset_dict self._schema_info = hf_schema_to_schema(self._dataset_dict, self.split, self.sample_size) diff --git a/lilac/sources/json_source.py b/lilac/sources/json_source.py index 7a5d51184..6a4f6c9ac 100644 --- a/lilac/sources/json_source.py +++ b/lilac/sources/json_source.py @@ -9,7 +9,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import download_http_files -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup class JSONSource(Source): @@ -40,14 +40,11 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. - s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths] + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] # NOTE: We use duckdb here to increase parallelism for multiple files. self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM read_json_auto( - {s3_filepaths}, - IGNORE_ERRORS=true - )); + CREATE VIEW t as (SELECT * FROM read_json_auto({duckdb_paths}, IGNORE_ERRORS=true)); """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() diff --git a/lilac/sources/parquet_source.py b/lilac/sources/parquet_source.py index d523e18a1..98fc7693b 100644 --- a/lilac/sources/parquet_source.py +++ b/lilac/sources/parquet_source.py @@ -1,18 +1,19 @@ """Parquet source.""" +import random from typing import ClassVar, Iterable, Optional, cast import duckdb import pyarrow as pa -from pydantic import Field, field_validator +from pydantic import Field, ValidationInfo, field_validator from typing_extensions import override -from ..schema import Item, arrow_schema_to_schema +from ..schema import Item, Schema, arrow_schema_to_schema from ..source import Source, SourceSchema -from ..sources.duckdb_utils import duckdb_setup +from ..sources.duckdb_utils import convert_path_to_duckdb, duckdb_setup from ..utils import download_http_files # Number of rows to read per batch. 
-ROWS_PER_BATCH_READ = 10_000
+ROWS_PER_BATCH_READ = 50_000


 class ParquetSource(Source):
@@ -27,11 +28,16 @@ class ParquetSource(Source):
   filepaths: list[str] = Field(
     description=
     'A list of paths to parquet files which live locally or remotely on GCS, S3, or Hadoop.')
+  seed: Optional[int] = Field(description='Random seed for sampling', default=None)
   sample_size: Optional[int] = Field(
     title='Sample size', description='Number of rows to sample from the dataset', default=None)
+  approximate_shuffle: bool = Field(
+    default=False,
+    description='If true, the reader will read a fraction of rows from each shard, '
+    'avoiding a pass over the entire dataset.')
   _source_schema: Optional[SourceSchema] = None
-  _reader: Optional[pa.RecordBatchReader] = None
+  _readers: list[pa.RecordBatchReader] = []
   _con: Optional[duckdb.DuckDBPyConnection] = None

   @field_validator('filepaths')
@@ -50,6 +56,43 @@ def validate_sample_size(cls, sample_size: int) -> int:
       raise ValueError('sample_size must be greater than 0.')
     return sample_size

+  @field_validator('approximate_shuffle')
+  @classmethod
+  def validate_approximate_shuffle(cls, approximate_shuffle: bool, info: ValidationInfo) -> bool:
+    """Validate shuffle before sampling."""
+    if approximate_shuffle and not info.data['sample_size']:
+      raise ValueError('`approximate_shuffle` requires `sample_size` to be set.')
+    return approximate_shuffle
+
+  def _setup_sampling(self, duckdb_paths: list[str]) -> Schema:
+    assert self._con, 'setup() must be called first.'
+    if self.approximate_shuffle:
+      assert self.sample_size, 'approximate_shuffle requires sample_size to be set.'
+      # Find each individual file.
+      glob_rows: list[tuple[str]] = self._con.execute(
+        f'SELECT * FROM GLOB({duckdb_paths})').fetchall()
+      duckdb_files: list[str] = list(set([row[0] for row in glob_rows]))
+      batch_size = max(1, min(self.sample_size // len(duckdb_files), ROWS_PER_BATCH_READ))
+      for duckdb_file in duckdb_files:
+        # Since we are not fetching all the results immediately, we need a separate cursor
+        # for each file to avoid each cursor overwriting the previous one.
+        con = self._con.cursor()
+        duckdb_setup(con)
+        res = con.execute(f"""SELECT * FROM read_parquet('{duckdb_file}')""")
+        self._readers.append(res.fetch_record_batch(rows_per_batch=batch_size))
+    else:
+      sample_suffix = ''
+      if self.sample_size:
+        sample_suffix = f'USING SAMPLE {self.sample_size}'
+        if self.seed is not None:
+          sample_suffix += f' (reservoir, {self.seed})'
+      res = self._con.execute(f"""SELECT * FROM read_parquet({duckdb_paths}) {sample_suffix}""")
+      batch_size = ROWS_PER_BATCH_READ
+      if self.sample_size:
+        batch_size = min(self.sample_size, ROWS_PER_BATCH_READ)
+      self._readers.append(res.fetch_record_batch(rows_per_batch=batch_size))
+    return arrow_schema_to_schema(self._readers[0].schema)
+
   @override
   def setup(self) -> None:
     filepaths = download_http_files(self.filepaths)
@@ -57,19 +100,13 @@ def setup(self) -> None:
     duckdb_setup(self._con)

     # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html.
-    s3_filepaths = [path.replace('gs://', 's3://') for path in filepaths]
-
-    # NOTE: We use duckdb here to increase parallelism for multiple files.
- sample_suffix = f'USING SAMPLE {self.sample_size}' if self.sample_size else '' - self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM read_parquet({s3_filepaths}) {sample_suffix}); - """) - res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() + duckdb_paths = [convert_path_to_duckdb(path) for path in filepaths] + res = self._con.execute(f'SELECT COUNT(*) FROM read_parquet({duckdb_paths})').fetchone() num_items = cast(tuple[int], res)[0] - self._reader = self._con.execute('SELECT * from t').fetch_record_batch( - rows_per_batch=ROWS_PER_BATCH_READ) - # Create the source schema in prepare to share it between process and source_schema. - schema = arrow_schema_to_schema(self._reader.schema) + if self.sample_size: + self.sample_size = min(self.sample_size, num_items) + num_items = self.sample_size + schema = self._setup_sampling(duckdb_paths) self._source_schema = SourceSchema(fields=schema.fields, num_items=num_items) @override @@ -81,10 +118,33 @@ def source_schema(self) -> SourceSchema: @override def process(self) -> Iterable[Item]: """Process the source.""" - assert self._reader and self._con, 'setup() must be called first.' - - for batch in self._reader: - yield from batch.to_pylist() + assert self._con, 'setup() must be called first.' + + items_yielded = 0 + done = False + + if self.seed is not None: + random.seed(self.seed) + + while not done: + index = random.randint(0, len(self._readers) - 1) + reader = self._readers[index] + batch = None + try: + batch = reader.read_next_batch() + except StopIteration: + reader.close() + del self._readers[index] + if not self._readers: + done = True + break + continue + items = batch.to_pylist() + for item in items: + yield item + items_yielded += 1 + if self.sample_size and items_yielded == self.sample_size: + done = True + break - self._reader.close() self._con.close() diff --git a/lilac/sources/parquet_source_test.py b/lilac/sources/parquet_source_test.py index ef5cdebd8..29b5520bb 100644 --- a/lilac/sources/parquet_source_test.py +++ b/lilac/sources/parquet_source_test.py @@ -10,6 +10,7 @@ from ..schema import schema from ..source import SourceSchema +from ..utils import chunks from .parquet_source import ParquetSource @@ -41,26 +42,138 @@ def test_simple_rows(tmp_path: pathlib.Path) -> None: assert items == [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] -def test_sampling(tmp_path: pathlib.Path) -> None: - table = pa.Table.from_pylist([{ - 'name': 'a', - 'age': 1 - }, { - 'name': 'b', - 'age': 2 - }, { - 'name': 'c', - 'age': 3 - }]) +def test_single_shard_with_sampling(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + table = pa.Table.from_pylist(source_items) out_file = os.path.join(tmp_path, 'test.parquet') pq.write_table(table, out_file) - for sample_size in range(1, 4): + # Test sampling with different sample sizes, including sample size > num_items. 
+ for sample_size in range(1, 5): source = ParquetSource(filepaths=[out_file], sample_size=sample_size) source.setup() items = list(source.process()) - assert len(items) == sample_size + assert len(items) == min(sample_size, len(source_items)) + + +def test_single_shard_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + table = pa.Table.from_pylist(source_items) + + out_file = os.path.join(tmp_path, 'test.parquet') + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. + for sample_size in range(1, 5): + source = ParquetSource(filepaths=[out_file], sample_size=sample_size, approximate_shuffle=True) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_multi_shard(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')]) + source.setup() + items = list(source.process()) + assert items == source_items + + +def test_multi_shard_sample(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. + for sample_size in range(1, 5): + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')], sample_size=sample_size) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_multi_shard_approx_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'name': 'a', 'age': 1}, {'name': 'b', 'age': 2}, {'name': 'c', 'age': 3}] + for i, item in enumerate(source_items): + table = pa.Table.from_pylist([item]) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + # Test sampling with different sample sizes, including sample size > num_items. 
+ for sample_size in range(1, 5): + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], + approximate_shuffle=True, + sample_size=sample_size) + source.setup() + items = list(source.process()) + assert len(items) == min(sample_size, len(source_items)) + + +def test_uniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_nonuniform_shards_approximate_shuffle(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + shard_sizes = [49, 1, 40, 10] + for i, shard_size in enumerate(shard_sizes): + chunk = source_items[:shard_size] + source_items = source_items[shard_size:] + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_sampling_with_seed(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource(filepaths=[str(tmp_path / 'test-*.parquet')], sample_size=20, seed=42) + source.setup() + items = list(source.process()) + assert len(items) == 20 + + +def test_approx_shuffle_with_seed(tmp_path: pathlib.Path) -> None: + source_items = [{'index': i} for i in range(100)] + for i, chunk in enumerate(chunks(source_items, 10)): + table = pa.Table.from_pylist(chunk) + out_file = tmp_path / f'test-{i}.parquet' + pq.write_table(table, out_file) + + source = ParquetSource( + filepaths=[str(tmp_path / 'test-*.parquet')], approximate_shuffle=True, sample_size=20, seed=42) + source.setup() + items = list(source.process()) + assert len(items) == 20 def test_validation() -> None: diff --git a/lilac/sources/sqlite_source.py b/lilac/sources/sqlite_source.py index 9503c1591..6983c14b0 100644 --- a/lilac/sources/sqlite_source.py +++ b/lilac/sources/sqlite_source.py @@ -13,7 +13,7 @@ from ..schema import Item, arrow_schema_to_schema from ..source import Source, SourceSchema from ..utils import file_exists -from .duckdb_utils import duckdb_setup +from .duckdb_utils import convert_path_to_duckdb, duckdb_setup router = APIRouter() @@ -50,10 +50,9 @@ def setup(self) -> None: duckdb_setup(self._con) # DuckDB expects s3 protocol: https://duckdb.org/docs/guides/import/s3_import.html. 
- db_file = self.db_file.replace('gs://', 's3://') - + duckdb_path = convert_path_to_duckdb(self.db_file) self._con.execute(f""" - CREATE VIEW t as (SELECT * FROM sqlite_scan('{db_file}', '{self.table}')); + CREATE VIEW t as (SELECT * FROM sqlite_scan('{duckdb_path}', '{self.table}')); """) res = self._con.execute('SELECT COUNT(*) FROM t').fetchone() diff --git a/poetry.lock b/poetry.lock index 36902baad..2d299b69c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -708,7 +708,6 @@ files = [ {file = "contourpy-1.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18a64814ae7bce73925131381603fff0116e2df25230dfc80d6d690aa6e20b37"}, {file = "contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90c81f22b4f572f8a2110b0b741bb64e5a6427e0a198b2cdc1fbaf85f352a3aa"}, {file = "contourpy-1.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53cc3a40635abedbec7f1bde60f8c189c49e84ac180c665f2cd7c162cc454baa"}, - {file = "contourpy-1.1.0-cp310-cp310-win32.whl", hash = "sha256:9b2dd2ca3ac561aceef4c7c13ba654aaa404cf885b187427760d7f7d4c57cff8"}, {file = "contourpy-1.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:1f795597073b09d631782e7245016a4323cf1cf0b4e06eef7ea6627e06a37ff2"}, {file = "contourpy-1.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0b7b04ed0961647691cfe5d82115dd072af7ce8846d31a5fac6c142dcce8b882"}, {file = "contourpy-1.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:27bc79200c742f9746d7dd51a734ee326a292d77e7d94c8af6e08d1e6c15d545"}, @@ -717,7 +716,6 @@ files = [ {file = "contourpy-1.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5cec36c5090e75a9ac9dbd0ff4a8cf7cecd60f1b6dc23a374c7d980a1cd710e"}, {file = "contourpy-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f0cbd657e9bde94cd0e33aa7df94fb73c1ab7799378d3b3f902eb8eb2e04a3a"}, {file = "contourpy-1.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:181cbace49874f4358e2929aaf7ba84006acb76694102e88dd15af861996c16e"}, - {file = "contourpy-1.1.0-cp311-cp311-win32.whl", hash = "sha256:edb989d31065b1acef3828a3688f88b2abb799a7db891c9e282df5ec7e46221b"}, {file = "contourpy-1.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:fb3b7d9e6243bfa1efb93ccfe64ec610d85cfe5aec2c25f97fbbd2e58b531256"}, {file = "contourpy-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bcb41692aa09aeb19c7c213411854402f29f6613845ad2453d30bf421fe68fed"}, {file = "contourpy-1.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5d123a5bc63cd34c27ff9c7ac1cd978909e9c71da12e05be0231c608048bb2ae"}, @@ -726,7 +724,6 @@ files = [ {file = "contourpy-1.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:317267d915490d1e84577924bd61ba71bf8681a30e0d6c545f577363157e5e94"}, {file = "contourpy-1.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d551f3a442655f3dcc1285723f9acd646ca5858834efeab4598d706206b09c9f"}, {file = "contourpy-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e7a117ce7df5a938fe035cad481b0189049e8d92433b4b33aa7fc609344aafa1"}, - {file = "contourpy-1.1.0-cp38-cp38-win32.whl", hash = "sha256:108dfb5b3e731046a96c60bdc46a1a0ebee0760418951abecbe0fc07b5b93b27"}, {file = "contourpy-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:d4f26b25b4f86087e7d75e63212756c38546e70f2a92d2be44f80114826e1cd4"}, {file = "contourpy-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc00bb4225d57bff7ebb634646c0ee2a1298402ec10a5fe7af79df9a51c1bfd9"}, {file = 
"contourpy-1.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:189ceb1525eb0655ab8487a9a9c41f42a73ba52d6789754788d1883fb06b2d8a"}, @@ -735,7 +732,6 @@ files = [ {file = "contourpy-1.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:143dde50520a9f90e4a2703f367cf8ec96a73042b72e68fcd184e1279962eb6f"}, {file = "contourpy-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e94bef2580e25b5fdb183bf98a2faa2adc5b638736b2c0a4da98691da641316a"}, {file = "contourpy-1.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ed614aea8462735e7d70141374bd7650afd1c3f3cb0c2dbbcbe44e14331bf002"}, - {file = "contourpy-1.1.0-cp39-cp39-win32.whl", hash = "sha256:71551f9520f008b2950bef5f16b0e3587506ef4f23c734b71ffb7b89f8721999"}, {file = "contourpy-1.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:438ba416d02f82b692e371858143970ed2eb6337d9cdbbede0d8ad9f3d7dd17d"}, {file = "contourpy-1.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a698c6a7a432789e587168573a864a7ea374c6be8d4f31f9d87c001d5a843493"}, {file = "contourpy-1.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:397b0ac8a12880412da3551a8cb5a187d3298a72802b45a3bd1805e204ad8439"}, @@ -3181,16 +3177,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -5017,7 +5003,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -5025,15 +5010,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -5050,7 +5028,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -5058,7 +5035,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -5486,9 +5462,6 @@ files = [ {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"}, {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"}, {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"}, {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"}, {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = 
"sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"}, @@ -5497,9 +5470,6 @@ files = [ {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"}, {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"}, {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"}, {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"}, {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"}, @@ -5507,9 +5477,6 @@ files = [ {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"}, {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"}, {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"}, {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"}, {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"}, @@ -5519,9 +5486,6 @@ files = [ {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"}, {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"}, {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"}, {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"}, {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"}, @@ -5531,9 +5495,6 @@ files = [ {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"}, {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"}, {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"}, {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"}, {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"}, @@ -5570,11 +5531,6 @@ files = [ {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f66eddfda9d45dd6cadcd706b65669ce1df84b8549875691b1f403730bdef217"}, {file = "scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6448c37741145b241eeac617028ba6ec2119e1339b1385c9720dae31367f2be"}, {file = "scikit_learn-1.3.1-cp311-cp311-win_amd64.whl", hash = "sha256:c413c2c850241998168bbb3bd1bb59ff03b1195a53864f0b80ab092071af6028"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ef540e09873e31569bc8b02c8a9f745ee04d8e1263255a15c9969f6f5caa627f"}, - {file = "scikit_learn-1.3.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:9147a3a4df4d401e618713880be023e36109c85d8569b3bf5377e6cd3fecdeac"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2cd3634695ad192bf71645702b3df498bd1e246fc2d529effdb45a06ab028b4"}, - {file = "scikit_learn-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c275a06c5190c5ce00af0acbb61c06374087949f643ef32d355ece12c4db043"}, - {file = "scikit_learn-1.3.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:0e1aa8f206d0de814b81b41d60c1ce31f7f2c7354597af38fae46d9c47c45122"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:52b77cc08bd555969ec5150788ed50276f5ef83abb72e6f469c5b91a0009bbca"}, {file = "scikit_learn-1.3.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a683394bc3f80b7c312c27f9b14ebea7766b1f0a34faf1a2e9158d80e860ec26"}, {file = "scikit_learn-1.3.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15d964d9eb181c79c190d3dbc2fff7338786bf017e9039571418a1d53dab236"}, @@ -7523,4 +7479,4 @@ text-stats = ["spacy", "textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "f6939bbf4378cd9e40b94975ed45c425fcbf21365e010fa31b2d4ed56cb3ed27" +content-hash = "09538a7af9d0fc67c0cbe31bbb5f64880a53f9d78d0724cea7d1d1d305e177be" diff --git a/pyproject.toml b/pyproject.toml index 4fc0e8c0e..53332327a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ distributed = "^2023.3.2.1" duckdb = "^0.9.0" fastapi = "^0.103.1" fsspec = "^2023.9.2" -gcsfs = "^2023.4.0" +gcsfs = "^2023.9.2" google-cloud-storage = "^2.5.0" gunicorn = "^21.2.0" hnswlib = "^0.7.0" # Fast KNN vector store.