diff --git a/lilac/data/clustering.py b/lilac/data/clustering.py index 6ee05569..dff1f866 100644 --- a/lilac/data/clustering.py +++ b/lilac/data/clustering.py @@ -109,6 +109,13 @@ def cluster_impl( else: raise ValueError('input must be provided.') + if use_garden and skip_noisy_assignment: + raise ValueError( + '`use_garden` and `skip_noisy_assignment` cannot both be True. ' + 'The garden implementation is heavily optimized and will always ' + 'assign noisy points to the nearest cluster.' + ) + # Extract the text from the input path into a temporary column. TEXT_COLUMN = 'text' temp_text_path = (*cluster_output_path, TEXT_COLUMN) diff --git a/lilac/router_dataset_signals.py b/lilac/router_dataset_signals.py index 4c4f4d8d..da7e27c8 100644 --- a/lilac/router_dataset_signals.py +++ b/lilac/router_dataset_signals.py @@ -91,6 +91,11 @@ class ClusterOptions(BaseModel): use_garden: bool = PydanticField( default=False, description='Accelerate computation by running remotely on Lilac Garden.' ) + skip_noisy_assignment: bool = PydanticField( + default=False, + description='Skip assignment of noisy points to the nearest cluster to speed up clustering.', + ) + overwrite: bool = False @@ -145,6 +150,7 @@ def run() -> None: use_garden=options.use_garden, overwrite=options.overwrite, task_id=task_id, + skip_noisy_assignment=options.skip_noisy_assignment, ) launch_task(task_id, run) diff --git a/web/blueprint/src/lib/components/ComputeClusterModal.svelte b/web/blueprint/src/lib/components/ComputeClusterModal.svelte index 79d553d3..69fc0dcf 100644 --- a/web/blueprint/src/lib/components/ComputeClusterModal.svelte +++ b/web/blueprint/src/lib/components/ComputeClusterModal.svelte @@ -7,6 +7,7 @@ input: Path; output_path?: Path; use_garden?: boolean; + skip_noisy_assignment?: boolean; overwrite?: boolean; }; @@ -101,7 +102,8 @@ use_garden: options.use_garden, output_path: outputColumn, input_selector: selectedFormatSelector, - overwrite: options.overwrite + overwrite: 
options.overwrite, + skip_noisy_assignment: options.skip_noisy_assignment } ]); close(); @@ -173,6 +175,20 @@ {/if} + +
+
Skip noisy assignment
+
+ Skip assignment of noisy points to the nearest cluster to speed up clustering. +
+ +
+
Overwrite
diff --git a/web/lib/fastapi_client/models/ClusterOptions.ts b/web/lib/fastapi_client/models/ClusterOptions.ts index a5b200fe..c7471b9f 100644 --- a/web/lib/fastapi_client/models/ClusterOptions.ts +++ b/web/lib/fastapi_client/models/ClusterOptions.ts @@ -14,6 +14,10 @@ export type ClusterOptions = { * Accelerate computation by running remotely on Lilac Garden. */ use_garden?: boolean; + /** + * Skip assignment of noisy points to the nearest cluster to speed up clustering. + */ + skip_noisy_assignment?: boolean; overwrite?: boolean; };