Improve the schema view to be less overwhelming. (#726)

Use the signal titles and concept names instead of the paths for signal roots. There is definitely a lot more we can do, but this is a step in the right direction. After: <img width="435" alt="image" src="https://github.com/lilacai/lilac/assets/1100749/507abdd7-6dc6-4400-9e8f-afc86dcaef04"> Hover chips show the embeddings & signal metadata: <img width="430" alt="image" src="https://github.com/lilacai/lilac/assets/1100749/b811c990-0fa5-4f99-89c1-634c62f8aa08"> Before: <img width="435" alt="image" src="https://github.com/lilacai/lilac/assets/1100749/965d0f0f-7680-4d94-9f63-a14dc3f0362b">
databricks · Sep 29, 2023 · 5fb2101 · 5fb2101
1 parent 067417a
commit 5fb2101
Show file tree

Hide file tree

Showing 11 changed files with 94 additions and 50 deletions.
diff --git a/docs/concepts/concept_metrics.md b/docs/concepts/concept_metrics.md
@@ -22,7 +22,7 @@ The human readable scores are based on this score:
 - `Great`: 0.95 to 1.0
 - `Very good`: 0.9 to 0.95
 - `Good`: 0.8 to 0.9
-- `OK`: 0.5 to 0.9
+- `OK`: 0.5 to 0.8
 - `Not good`: 0.0 to 0.5
 
 ## From the UI

diff --git a/lilac/data/dataset_compute_signal_test.py b/lilac/data/dataset_compute_signal_test.py
@@ -616,7 +616,7 @@ def test_concept_signal_with_select_groups(make_test_data: TestDataMaker) -> Non
     embedding='test_embedding',
     path='text')
 
-  concept_key = 'test_namespace/test_concept/test_embedding/v1'
+  concept_key = 'test_namespace/test_concept/test_embedding'
   result = dataset.select_groups(f'text.{concept_key}.*.score')
   assert result.counts == [('Not in concept', 2), ('In concept', 1)]
 

diff --git a/lilac/data/dataset_duckdb.py b/lilac/data/dataset_duckdb.py
@@ -416,8 +416,6 @@ def _compute_signal_items(self,
                             data_schema: Schema,
                             num_items: int,
                             task_step_id: Optional[TaskStepId] = None) -> Iterable[Item]:
-    signal.setup()
-
     source_path = normalize_path(path)
 
     source_values = self._select_iterable_values(source_path, data_schema)
@@ -479,6 +477,7 @@ def compute_signal(self,
       # Make a dummy task step so we report progress via tqdm.
       task_step_id = ('', 0)
 
+    signal.setup()
     output_items = self._compute_signal_items(signal, path, manifest.data_schema,
                                               manifest.num_items, task_step_id)
 
@@ -498,6 +497,7 @@ def compute_signal(self,
 
     signal_schema = create_signal_schema(signal, source_path, manifest.data_schema)
 
+    print('signal', signal.model_dump())
     # Add progress.
     if task_step_id is not None:
       output_items = progress(
@@ -546,6 +546,7 @@ def compute_embedding(self,
 
     signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
 
+    signal.setup()
     output_items = self._compute_signal_items(signal, path, manifest.data_schema,
                                               manifest.num_items, task_step_id)
 

diff --git a/lilac/data/dataset_select_rows_schema_test.py b/lilac/data/dataset_select_rows_schema_test.py
@@ -439,7 +439,7 @@ def test_search_concept_schema(make_test_data: TestDataMaker) -> None:
                   )
                 })
             ]),
-          'test_namespace/test_concept/labels': field(
+          'test_namespace/test_concept/labels/preview': field(
             fields=[field('string_span', fields={
               'label': 'boolean',
               'draft': 'string'

diff --git a/lilac/data/dataset_select_rows_search_test.py b/lilac/data/dataset_select_rows_search_test.py
@@ -236,15 +236,15 @@ def test_concept_search(make_test_data: TestDataMaker, mocker: MockerFixture) ->
       'text': enriched_item(
         'hello world2.', {
           expected_signal_udf.key(): [lilac_span(0, 13, {'score': approx(0.75, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 13, {'label': True})]
+          'test_namespace/test_concept/labels/preview': [lilac_span(0, 13, {'label': True})]
         })
     },
     {
       ROWID: '1',
       'text': enriched_item(
         'hello world.', {
           expected_signal_udf.key(): [lilac_span(0, 12, {'score': approx(0.25, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 12, {'label': False})]
+          'test_namespace/test_concept/labels/preview': [lilac_span(0, 12, {'label': False})]
         })
     },
   ]
@@ -291,17 +291,17 @@ def test_concept_search_without_rowid(make_test_data: TestDataMaker) -> None:
     # Results are sorted by score desc.
     {
       'text': 'hello world2.',
-      'text.test_namespace/test_concept/test_embedding': [
+      'text.test_namespace/test_concept/test_embedding/preview': [
         lilac_span(0, 13, {'score': approx(0.75, abs=0.25)})
       ],
-      'text.test_namespace/test_concept/labels': [lilac_span(0, 13, {'label': True})]
+      'text.test_namespace/test_concept/labels/preview': [lilac_span(0, 13, {'label': True})]
     },
     {
       'text': 'hello world.',
-      'text.test_namespace/test_concept/test_embedding': [
+      'text.test_namespace/test_concept/test_embedding/preview': [
         lilac_span(0, 12, {'score': approx(0.25, abs=0.25)})
       ],
-      'text.test_namespace/test_concept/labels': [lilac_span(0, 12, {'label': False})]
+      'text.test_namespace/test_concept/labels/preview': [lilac_span(0, 12, {'label': False})]
     },
   ]
 
@@ -321,17 +321,17 @@ def test_concept_search_without_rowid(make_test_data: TestDataMaker) -> None:
     {
       'text': enriched_item(
         'hello world2.', {
-          'test_namespace/test_concept/test_embedding':
+          'test_namespace/test_concept/test_embedding/preview':
             [lilac_span(0, 13, {'score': approx(0.75, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 13, {'label': True})]
+          'test_namespace/test_concept/labels/preview': [lilac_span(0, 13, {'label': True})]
         })
     },
     {
       'text': enriched_item(
         'hello world.', {
-          'test_namespace/test_concept/test_embedding':
+          'test_namespace/test_concept/test_embedding/preview':
             [lilac_span(0, 12, {'score': approx(0.25, abs=0.25)})],
-          'test_namespace/test_concept/labels': [lilac_span(0, 12, {'label': False})]
+          'test_namespace/test_concept/labels/preview': [lilac_span(0, 12, {'label': False})]
         })
     },
   ]
@@ -372,17 +372,17 @@ def test_concept_search_sort_by_rowid(make_test_data: TestDataMaker) -> None:
     # Results are sorted by rowid.
     {
       'text': 'hello world.',
-      'text.test_namespace/test_concept/test_embedding': [
+      'text.test_namespace/test_concept/test_embedding/preview': [
         lilac_span(0, 12, {'score': approx(0.25, abs=0.25)})
       ],
-      'text.test_namespace/test_concept/labels': [lilac_span(0, 12, {'label': False})]
+      'text.test_namespace/test_concept/labels/preview': [lilac_span(0, 12, {'label': False})]
     },
     {
       'text': 'hello world2.',
-      'text.test_namespace/test_concept/test_embedding': [
+      'text.test_namespace/test_concept/test_embedding/preview': [
         lilac_span(0, 13, {'score': approx(0.75, abs=0.25)})
       ],
-      'text.test_namespace/test_concept/labels': [lilac_span(0, 13, {'label': True})]
+      'text.test_namespace/test_concept/labels/preview': [lilac_span(0, 13, {'label': True})]
     }
   ]
 

diff --git a/lilac/signals/concept_labels.py b/lilac/signals/concept_labels.py
@@ -17,6 +17,8 @@ class ConceptLabelsSignal(TextSignal):
 
   namespace: str
   concept_name: str
+  # This will get filled out during setup.
+  version: Optional[int] = None
 
   # The draft version of the concept to use. If not provided, the latest version is used.
   draft: str = DRAFT_MAIN
@@ -28,6 +30,12 @@ class ConceptLabelsSignal(TextSignal):
   def fields(self) -> Field:
     return field(fields=[field('string_span', fields={'label': 'boolean', 'draft': 'string'})])
 
+  @override
+  def setup(self) -> None:
+    concept = self._concept_db.get(self.namespace, self.concept_name, self._user)
+    if concept:
+      self.version = concept.version
+
   @override
   def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
     concept = self._concept_db.get(self.namespace, self.concept_name, self._user)
@@ -74,11 +82,5 @@ def set_user(self, user: Optional[UserInfo]) -> None:
 
   @override
   def key(self, is_computed_signal: Optional[bool] = False) -> str:
-    version = ''
-    if is_computed_signal:
-      concept = self._concept_db.get(self.namespace, self.concept_name)
-      if not concept:
-        raise ValueError(f'Concept "{self.namespace}/{self.concept_name}" does not exist.')
-      version = f'/v{concept.version}'
-
-    return f'{self.namespace}/{self.concept_name}/labels{version}'
+    suffix = '/preview' if not is_computed_signal else ''
+    return f'{self.namespace}/{self.concept_name}/labels{suffix}'
diff --git a/lilac/signals/concept_labels_test.py b/lilac/signals/concept_labels_test.py
@@ -114,7 +114,7 @@ def test_concept_labels_key(concept_db_cls: Type[ConceptDB]) -> None:
   concept_db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
 
   signal = ConceptLabelsSignal(namespace='test', concept_name='test_concept')
-  assert signal.key() == 'test/test_concept/labels'
+  assert signal.key() == 'test/test_concept/labels/preview'
 
 
 @pytest.mark.parametrize('concept_db_cls', ALL_CONCEPT_DBS)
@@ -125,4 +125,4 @@ def test_concept_labels_compute_signal_key(concept_db_cls: Type[ConceptDB]) -> N
   concept_db.create(namespace=namespace, name=concept_name, type=SignalInputType.TEXT)
 
   signal = ConceptLabelsSignal(namespace='test', concept_name='test_concept')
-  assert signal.key(is_computed_signal=True) == 'test/test_concept/labels/v0'
+  assert signal.key(is_computed_signal=True) == 'test/test_concept/labels'
diff --git a/lilac/signals/concept_scorer.py b/lilac/signals/concept_scorer.py
@@ -23,6 +23,8 @@ class ConceptSignal(VectorSignal):
 
   namespace: str
   concept_name: str
+  # This will get filled out during setup.
+  version: Optional[int] = None
 
   # The draft version of the concept to use. If not provided, the latest version is used.
   draft: str = DRAFT_MAIN
@@ -67,6 +69,11 @@ def _compute_span_vector_batch(self, span_vectors: Iterable[SpanVector],
     scores = concept_model.score_embeddings(self.draft, np.array(vectors)).tolist()
     return [lilac_span(start, end, {'score': score}) for score, (start, end) in zip(scores, spans)]
 
+  @override
+  def setup(self) -> None:
+    concept_model = self._get_concept_model()
+    self.version = concept_model.version
+
   @override
   def compute(self, examples: Iterable[RichData]) -> Iterable[Optional[Item]]:
     """Get the scores for the provided examples."""
@@ -94,7 +101,7 @@ def vector_compute_topk(
 
   @override
   def key(self, is_computed_signal: Optional[bool] = False) -> str:
+    suffix = '/preview' if not is_computed_signal else ''
     # NOTE: The embedding is a value so already exists in the path structure. This means we do not
     # need to provide the name as part of the key, which still guarantees uniqueness.
-    version = f'/v{self._get_concept_model().version}' if is_computed_signal else ''
-    return f'{self.namespace}/{self.concept_name}/{self.embedding}{version}'
+    return f'{self.namespace}/{self.concept_name}/{self.embedding}{suffix}'
diff --git a/lilac/signals/concept_scorer_test.py b/lilac/signals/concept_scorer_test.py
@@ -256,10 +256,10 @@ def test_concept_model_draft(concept_db_cls: Type[ConceptDB], model_db_cls: Type
   assert draft_scores != scores
 
 
-def test_concept_score_key() -> None:
+def test_concept_score_preview_key() -> None:
   signal = ConceptSignal(
     namespace='test', concept_name='test_concept', embedding=TestEmbedding.name)
-  assert signal.key() == 'test/test_concept/test_embedding'
+  assert signal.key() == 'test/test_concept/test_embedding/preview'
 
 
 @pytest.mark.parametrize('concept_db_cls', ALL_CONCEPT_DBS)
@@ -271,4 +271,4 @@ def test_concept_score_compute_signal_key(concept_db_cls: Type[ConceptDB]) -> No
 
   signal = ConceptSignal(
     namespace='test', concept_name='test_concept', embedding=TestEmbedding.name)
-  assert signal.key(is_computed_signal=True) == 'test/test_concept/test_embedding/v0'
+  assert signal.key(is_computed_signal=True) == 'test/test_concept/test_embedding'
diff --git a/web/blueprint/src/lib/components/schemaView/SchemaField.svelte b/web/blueprint/src/lib/components/schemaView/SchemaField.svelte
@@ -4,6 +4,7 @@
   import {DTYPE_TO_ICON, getSearches, isPreviewSignal} from '$lib/view_utils';
 
   import {computeSignalMutation} from '$lib/queries/datasetQueries';
+  import {querySignals} from '$lib/queries/signalQueries';
   import {
     PATH_WILDCARD,
     VALUE_KEY,
@@ -14,6 +15,7 @@
     isSortableField,
     pathIsEqual,
     serializePath,
+    type ConceptSignal,
     type LilacField,
     type LilacSchema,
     type TextEmbeddingSignal
@@ -49,8 +51,37 @@
 
   $: expandedStats = $datasetViewStore.expandedStats[serializePath(path)] || false;
 
+  const signals = querySignals();
+
   $: isRepeatedField = path.at(-1) === PATH_WILDCARD ? true : false;
-  $: fieldName = isRepeatedField ? path.at(-2) : path.at(-1);
+  $: fieldTitle = isRepeatedField ? path.at(-2) : path.at(-1);
+  let fieldHoverDetails: string | null = null;
+  // If the field is a signal root, use the signal name to define the title.
+  $: {
+    if (field.signal && $signals.data != null) {
+      if (field.signal.signal_name === 'concept_score') {
+        const conceptSignal = field.signal as ConceptSignal;
+        fieldTitle = `${conceptSignal.concept_name}`;
+        fieldHoverDetails =
+          `Concept '${conceptSignal.namespace}/${conceptSignal.concept_name}' arguments:\n\n` +
+          `embedding: '${conceptSignal.embedding}'` +
+          (conceptSignal.version != null ? `\nversion: ${conceptSignal.version}` : '');
+      } else {
+        const signalInfo = $signals.data.find(s => s.name === field.signal?.signal_name);
+        if (signalInfo != null) {
+          fieldTitle = signalInfo.json_schema.title;
+
+          const argumentDetails = Object.entries(field.signal || {})
+            .filter(([arg, _]) => arg != 'signal_name')
+            .map(([k, v]) => `${k}: ${v}`);
+
+          if (argumentDetails.length > 0) {
+            fieldHoverDetails = `Signal '${signalInfo.name}' arguments: \n\n${argumentDetails}`;
+          }
+        }
+      }
+    }
+  }
 
   $: children = childDisplayFields(field);
 
@@ -139,22 +170,24 @@
         <span class="font-mono">{'{}'}</span>
       {/if}
     </div>
-    <button
-      class="ml-2 grow cursor-pointer truncate whitespace-nowrap text-left text-gray-900"
-      class:cursor-default={!isExpandable}
-      disabled={!isExpandable}
-      on:click={() => {
-        if (isExpandable) {
-          if (expandedStats) {
-            datasetViewStore.removeExpandedColumn(path);
-          } else {
-            datasetViewStore.addExpandedColumn(path);
+    <div class="grow" use:hoverTooltip={{text: fieldHoverDetails || ''}}>
+      <button
+        class="ml-2 w-full cursor-pointer truncate whitespace-nowrap text-left text-gray-900"
+        class:cursor-default={!isExpandable}
+        disabled={!isExpandable}
+        on:click={() => {
+          if (isExpandable) {
+            if (expandedStats) {
+              datasetViewStore.removeExpandedColumn(path);
+            } else {
+              datasetViewStore.addExpandedColumn(path);
+            }
           }
-        }
-      }}
-    >
-      {fieldName}
-    </button>
+        }}
+      >
+        {fieldTitle}
+      </button>
+    </div>
     {#if isSortedBy}
       <RemovableTag
         interactive

diff --git a/web/lib/fastapi_client/models/ConceptSignal.ts b/web/lib/fastapi_client/models/ConceptSignal.ts
@@ -14,6 +14,7 @@ export type ConceptSignal = {
     embedding: 'cohere' | 'sbert' | 'openai' | 'palm' | 'gte-small' | 'gte-base';
     namespace: string;
     concept_name: string;
+    version?: (number | null);
     draft?: string;
 };