From afc5b626ee2fe0903342b8029da79116760fed71 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Wed, 24 Jan 2024 15:56:22 -0500
Subject: [PATCH] Add more demo datasets. (#1120)

We are now officially in pretty good shape:
https://huggingface.co/spaces/lilacai/lilac

I fixed some subtle design issues along the way (make the button look like a
button, fix some alignment, fix some centering).

NOTE: I had to change the OpenAI JSON formatter. The official format has
"messages", but sometimes users use "conversation", so I made them two formats
(a small sketch of the two record shapes is appended after the diffs below).
We should probably generalize this a bit, but I didn't feel like doing it now.
---
 lilac/formats/default_formats.py          |   3 +-
 lilac/formats/openai_json.py              |  65 ++++++++++-
 lilac/formats/openai_json_test.py         |  29 ++++-
 lilac_hf_space.yml                        | 102 ++++++++++++++----
 .../datasetView/DatasetPivotResult.svelte |   6 +-
 .../datasetView/DatasetPivotViewer.svelte |  12 ++-
 6 files changed, 181 insertions(+), 36 deletions(-)

diff --git a/lilac/formats/default_formats.py b/lilac/formats/default_formats.py
index bee035071..f3b68185a 100644
--- a/lilac/formats/default_formats.py
+++ b/lilac/formats/default_formats.py
@@ -1,7 +1,7 @@
 """Registers all available dataset formats."""
 
 from ..dataset_format import register_dataset_format
-from .openai_json import OpenAIJSON
+from .openai_json import OpenAIConversationJSON, OpenAIJSON
 from .openchat import OpenChat
 from .sharegpt import ShareGPT
 
@@ -11,3 +11,4 @@ def register_default_formats() -> None:
   register_dataset_format(ShareGPT)
   register_dataset_format(OpenChat)
   register_dataset_format(OpenAIJSON)
+  register_dataset_format(OpenAIConversationJSON)
diff --git a/lilac/formats/openai_json.py b/lilac/formats/openai_json.py
index 4bea06f34..3b861daec 100644
--- a/lilac/formats/openai_json.py
+++ b/lilac/formats/openai_json.py
@@ -9,7 +9,7 @@
 def _openai_selector(item: Item, conv_role: str) -> str:
   """Selector for OpenAI JSON Formatted conversations."""
   # TODO(nsthorat): Make this return an array, and not pre-join with newlines.
-  values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
+  values = [conv['content'] for conv in item['messages'] if conv['role'] == conv_role]
   # Get the __value__ key version of text if it's enriched.
   values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
   return '\n'.join(values)
@@ -30,14 +30,12 @@ class OpenAIJSON(DatasetFormat):
   """OpenAI JSON format.
 
   Taken from: https://platform.openai.com/docs/api-reference/chat
-
-  Note that here "messages" is "conversation" for support with common datasets.
   """
 
   name: ClassVar[str] = 'openai_json'
   data_schema: Schema = schema(
     {
-      'conversation': [
+      'messages': [
         {
           'role': 'string',
           'content': 'string',
@@ -47,7 +45,7 @@
   )
 
   title_slots: list[tuple[PathTuple, PathTuple]] = [
-    (('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
+    (('messages', PATH_WILDCARD, 'content'), ('messages', PATH_WILDCARD, 'role'))
   ]
 
   user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR
@@ -56,3 +54,60 @@
   input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
     selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR]
   }
+
+
+# TODO(nsthorat): Generalize this code so that 'conversations' or 'messages' can be the root field.
+# This is the only difference that forces us to make a new format.
+
+
+def _openai_conversation_selector(item: Item, conv_role: str) -> str:
+  """Selector for OpenAI JSON Formatted conversations."""
+  # TODO(nsthorat): Make this return an array, and not pre-join with newlines.
+  values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
+  # Get the __value__ key version of text if it's enriched.
+  values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
+  return '\n'.join(values)
+
+
+_USER_CONVERSATION_SELECTOR = DatasetFormatInputSelector(
+  name='user',
+  selector=lambda item: _openai_conversation_selector(item, 'user'),
+)
+
+_ASSISTANT_CONVERSATION_SELECTOR = DatasetFormatInputSelector(
+  name='assistant',
+  selector=lambda item: _openai_conversation_selector(item, 'assistant'),
+)
+
+
+class OpenAIConversationJSON(DatasetFormat):
+  """OpenAI JSON format.
+
+  Taken from: https://platform.openai.com/docs/api-reference/chat
+
+  Note that the root field here is "conversation" rather than "messages", as in many datasets.
+  """
+
+  name: ClassVar[str] = 'openai_conversation_json'
+  data_schema: Schema = schema(
+    {
+      'conversation': [
+        {
+          'role': 'string',
+          'content': 'string',
+        }
+      ],
+    },
+  )
+
+  title_slots: list[tuple[PathTuple, PathTuple]] = [
+    (('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
+  ]
+
+  user: ClassVar[DatasetFormatInputSelector] = _USER_CONVERSATION_SELECTOR
+  assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_CONVERSATION_SELECTOR
+
+  input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
+    selector.name: selector
+    for selector in [_USER_CONVERSATION_SELECTOR, _ASSISTANT_CONVERSATION_SELECTOR]
+  }
diff --git a/lilac/formats/openai_json_test.py b/lilac/formats/openai_json_test.py
index abd39b163..4c6b263b5 100644
--- a/lilac/formats/openai_json_test.py
+++ b/lilac/formats/openai_json_test.py
@@ -2,10 +2,33 @@
 
 from ..data.dataset_test_utils import TestDataMaker
 
-from .openai_json import OpenAIJSON
+from .openai_json import OpenAIConversationJSON, OpenAIJSON
 
 
-def test_infer_open_chat(make_test_data: TestDataMaker) -> None:
+def test_infer_openai_json(make_test_data: TestDataMaker) -> None:
+  dataset = make_test_data(
+    [
+      {
+        'messages': [
+          {'role': 'user', 'content': 'Hello'},
+          {'role': 'assistant', 'content': 'Hi'},
+          {'role': 'user', 'content': 'How are you today?'},
+          {'role': 'assistant', 'content': "I'm fine."},
+        ],
+      },
+      {
+        'messages': [
+          {'role': 'user', 'content': 'Who are you?'},
+          {'role': 'assistant', 'content': "I'm OpenChat."},
+        ],
+      },
+    ]
+  )
+
+  assert dataset.manifest().dataset_format == OpenAIJSON()
+
+
+def test_infer_openai_conversation_json(make_test_data: TestDataMaker) -> None:
   dataset = make_test_data(
     [
       {
@@ -25,4 +48,4 @@
     ]
   )
 
-  assert dataset.manifest().dataset_format == OpenAIJSON()
+  assert dataset.manifest().dataset_format == OpenAIConversationJSON()
diff --git a/lilac_hf_space.yml b/lilac_hf_space.yml
index f2b0a8a03..a6db68735 100644
--- a/lilac_hf_space.yml
+++ b/lilac_hf_space.yml
@@ -105,7 +105,7 @@ datasets:
             - '*'
             - content
 
-  - name: 'OpenOrca'
+  - name: OpenOrca
     namespace: lilac
     settings:
       tags: [datasets]
@@ -123,21 +123,42 @@
           - question
 
   - namespace: lilac
-    name: mosaic-instruct-v3
+    name: SlimOrca
     source:
-      dataset_name: mosaicml/instruct-v3
+      dataset_name: Open-Orca/SlimOrca
+      source_name: huggingface
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - - conversations
+            - '*'
+            - value
+
+  - namespace: lilac
+    name: UltraChat-200k
+    source:
+      dataset_name: HuggingFaceH4/ultrachat_200k
+      source_name: huggingface
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - - messages
+            - '*'
+            - content
+
+  - namespace: lilac
+    name: roblox_luau_corpus
+    source:
+      dataset_name: Roblox/luau_corpus
       source_name: huggingface
     settings:
       tags: [datasets]
       ui:
         media_paths:
           - prompt
-          - response
-      markdown_paths: []
-      embeddings:
-        - embedding: gte-small
-          path:
-            - prompt
+          - completion
 
   ## Eval datasets
   - namespace: lilac
@@ -303,11 +324,10 @@
           - option2
           - answer
 
-  ## Old datasets to be cleaned up.
   - name: databricks-dolly-15k-curated-en
     namespace: lilac
     settings:
-      tags: [machine-learning]
+      tags: [datasets]
       ui:
         media_paths:
           - original-instruction
@@ -336,21 +356,35 @@
           path:
             - original-response
 
+  - namespace: lilac
+    name: mosaic-instruct-v3
+    source:
+      dataset_name: mosaicml/instruct-v3
+      source_name: huggingface
+    settings:
+      tags: [datasets]
+      ui:
+        media_paths:
+          - prompt
+          - response
+      embeddings:
+        - embedding: gte-small
+          path:
+            - prompt
+
   - namespace: lilac
     name: dolphin
-    tags: [datasets]
     source:
       dataset_name: cognitivecomputations/dolphin
       config_name: flan1m-alpaca-uncensored
       source_name: huggingface
     settings:
+      tags: [datasets]
       ui:
         media_paths:
           - instruction
           - input
           - output
-        - - input__cluster
-          - text
     signals:
       - signal_name: text_statistics
         path:
@@ -389,7 +423,7 @@ clusters:
   - dataset_namespace: lilac
     dataset_name: lmsys-chat-1m
     input_selector:
-      format: openai_json
+      format: openai_conversation_json
       selector: user
     output_path:
       - conversation__clusters
@@ -399,6 +433,14 @@
     input_path:
       - question
 
+  - dataset_namespace: lilac
+    dataset_name: SlimOrca
+    input_selector:
+      format: sharegpt
+      selector: human
+    output_path:
+      - conversation__clusters
+
   - dataset_namespace: lilac
     dataset_name: databricks-dolly-15k-curated-en
     input_path:
@@ -409,6 +451,30 @@
     input_path:
       - prompt
 
+  - dataset_namespace: lilac
+    dataset_name: dolphin
+    input_path:
+      - input
+
+  - dataset_namespace: lilac
+    dataset_name: UltraChat-200k
+    input_selector:
+      format: openai_json
+      selector: user
+    output_path:
+      - messages__clusters
+
+  # NOTE: We cluster both prompt and completion because the roblox prompts are just small sections
+  # of code which are not extremely descriptive.
+  - dataset_namespace: lilac
+    dataset_name: roblox_luau_corpus
+    input_path:
+      - prompt
+  - dataset_namespace: lilac
+    dataset_name: roblox_luau_corpus
+    input_path:
+      - completion
+
   ## Eval datasets
   - dataset_namespace: lilac
     dataset_name: MMLU
@@ -464,12 +530,6 @@
     input_path:
       - sentence
 
-  ## Other datasets
-  - dataset_namespace: lilac
-    dataset_name: dolphin
-    input_path:
-      - input
-
 concept_model_cache_embeddings:
   - gte-small
   - gte-base
diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte
index 99bb13508..5f93154ad 100644
--- a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte
+++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte
@@ -102,7 +102,7 @@
  [hunk body lost in extraction: the Svelte markup was stripped. Per the commit message this is a styling/alignment fix; the surviving fragment is the "{groupPercentage}%" label.]
diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte
index 1f31a0978..04c24b8f2 100644
--- a/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte
+++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotViewer.svelte
@@ -219,7 +219,7 @@
Select a field to explore.
{:else if outerLeafPath == null}
Select a field to group by.
-  {:else if groups == null}
+  {:else if groups == null || numRowsInQuery == null}
   {:else}
@@ -235,8 +235,10 @@ })}
  [hunk body largely lost in extraction: the Svelte markup was stripped. The surviving fragment is the group text-highlight block — {#each group.textHighlights as highlight} {#if highlight.isBold}{highlight.text} {:else}{highlight.text}{/if} — around which the wrapping markup changes.]
@@ -250,7 +252,9 @@
  [hunk body largely lost in extraction: the Svelte markup was stripped. Surviving fragments are the "{group.count.toLocaleString()} rows" label and the {#if outerLeafPath && innerLeafPath && numRowsInQuery} block that follows.]
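
For reference, here is a minimal, illustrative sketch of the two record shapes the formats above accept and of the role-based selection their user/assistant selectors perform. The sample items are lifted from the test file in this patch; the `join_turns` helper is hypothetical (it is not part of the lilac API) and only mirrors what `_openai_selector` and `_openai_conversation_selector` do — pick one role's turns and pre-join them with newlines.

```python
# Illustrative only: sample items mirroring the shapes in openai_json_test.py.
# `join_turns` is a hypothetical stand-in for _openai_selector /
# _openai_conversation_selector; it is not part of the lilac API.

# Shape accepted by OpenAIJSON (name: 'openai_json'): rooted at "messages",
# matching the official OpenAI chat format.
messages_item = {
  'messages': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi'},
    {'role': 'user', 'content': 'How are you today?'},
    {'role': 'assistant', 'content': "I'm fine."},
  ],
}

# Shape accepted by OpenAIConversationJSON (name: 'openai_conversation_json'):
# the same turn structure, but rooted at "conversation", as seen in many
# public datasets.
conversation_item = {
  'conversation': [
    {'role': 'user', 'content': 'Who are you?'},
    {'role': 'assistant', 'content': "I'm OpenChat."},
  ],
}


def join_turns(item: dict, root_field: str, role: str) -> str:
  """Join the contents of all turns with the given role, newline-separated."""
  return '\n'.join(turn['content'] for turn in item[root_field] if turn['role'] == role)


if __name__ == '__main__':
  # The two formats' "user" selectors reduce to the same operation; only the
  # root field name differs.
  print(join_turns(messages_item, 'messages', 'user'))          # Hello / How are you today?
  print(join_turns(conversation_item, 'conversation', 'user'))  # Who are you?
```

This user-side join is also what the cluster config above points at: the `input_selector` entries with `format: openai_json` / `format: openai_conversation_json` and `selector: user` in lilac_hf_space.yml cluster exactly this text.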