Add more demo datasets. (#1120)
We are now officially in pretty good shape:
https://huggingface.co/spaces/lilacai/lilac

I fixed some subtle design stuff along the way (made the button look
like a button, fixed some alignment, fixed some centering).

NOTE: I had to change the OpenAI JSON formatter. The official format uses
"messages", but users sometimes use "conversation", so I made them two
separate formats. We should probably generalize this a bit, but I didn't
feel like doing it now.
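
For reference, here is a minimal sketch of the two record shapes the formats now accept (field names are taken from the diff below; the example values are made up):

# OpenAIJSON: the official OpenAI chat format, rooted at 'messages'.
openai_json_record = {
  'messages': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi'},
  ],
}

# OpenAIConversationJSON: the same message shape, rooted at 'conversation'.
openai_conversation_json_record = {
  'conversation': [
    {'role': 'user', 'content': 'Hello'},
    {'role': 'assistant', 'content': 'Hi'},
  ],
}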
nsthorat authored Jan 24, 2024
1 parent 0ce0a4c commit afc5b62
Showing 6 changed files with 181 additions and 36 deletions.
3 changes: 2 additions & 1 deletion lilac/formats/default_formats.py
@@ -1,7 +1,7 @@
"""Registers all available dataset formats."""

from ..dataset_format import register_dataset_format
from .openai_json import OpenAIJSON
from .openai_json import OpenAIConversationJSON, OpenAIJSON
from .openchat import OpenChat
from .sharegpt import ShareGPT

@@ -11,3 +11,4 @@ def register_default_formats() -> None:
register_dataset_format(ShareGPT)
register_dataset_format(OpenChat)
register_dataset_format(OpenAIJSON)
register_dataset_format(OpenAIConversationJSON)
65 changes: 60 additions & 5 deletions lilac/formats/openai_json.py
@@ -9,7 +9,7 @@
def _openai_selector(item: Item, conv_role: str) -> str:
"""Selector for OpenAI JSON Formatted conversations."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
values = [conv['content'] for conv in item['messages'] if conv['role'] == conv_role]
# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
return '\n'.join(values)
@@ -30,14 +30,12 @@ class OpenAIJSON(DatasetFormat):
"""OpenAI JSON format.
Taken from: https://platform.openai.com/docs/api-reference/chat
Note that here "messages" is "conversation" for support with common datasets.
"""

name: ClassVar[str] = 'openai_json'
data_schema: Schema = schema(
{
'conversation': [
'messages': [
{
'role': 'string',
'content': 'string',
@@ -47,7 +45,7 @@ class OpenAIJSON(DatasetFormat):
)

title_slots: list[tuple[PathTuple, PathTuple]] = [
(('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
(('messages', PATH_WILDCARD, 'content'), ('messages', PATH_WILDCARD, 'role'))
]

user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR
@@ -56,3 +54,60 @@ class OpenAIJSON(DatasetFormat):
input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR]
}


# TODO(nsthorat): Generalize this code so that 'conversations' or 'messages' can be the root field.
# This is the only difference that forces us to make a new format.


def _openai_conversation_selector(item: Item, conv_role: str) -> str:
"""Selector for OpenAI JSON Formatted conversations."""
# TODO(nsthorat): Make this return an array, and not pre-join with newlines.
values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role]
# Get the __value__ key version of text if it's enriched.
values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values]
return '\n'.join(values)


_USER_CONVERSATION_SELECTOR = DatasetFormatInputSelector(
name='user',
selector=lambda item: _openai_conversation_selector(item, 'user'),
)

_ASSISTANT_CONVERSATION_SELECTOR = DatasetFormatInputSelector(
name='assistant',
selector=lambda item: _openai_conversation_selector(item, 'assistant'),
)


class OpenAIConversationJSON(DatasetFormat):
"""OpenAI JSON format.
Taken from: https://platform.openai.com/docs/api-reference/chat
Note that here "messages" is "conversation" for support with common datasets.
"""

name: ClassVar[str] = 'openai_conversation_json'
data_schema: Schema = schema(
{
'conversation': [
{
'role': 'string',
'content': 'string',
}
],
},
)

title_slots: list[tuple[PathTuple, PathTuple]] = [
(('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role'))
]

user: ClassVar[DatasetFormatInputSelector] = _USER_CONVERSATION_SELECTOR
assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_CONVERSATION_SELECTOR

input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = {
selector.name: selector
for selector in [_USER_CONVERSATION_SELECTOR, _ASSISTANT_CONVERSATION_SELECTOR]
}
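
The TODO above already hints at the eventual fix: parameterize the selectors by the root field instead of duplicating them per format. A rough, hypothetical sketch (not part of this commit; names are illustrative, and it omits the VALUE_KEY handling for enriched text):

from typing import Callable

def make_role_selector(root_key: str, conv_role: str) -> Callable[[dict], str]:
  """Join the 'content' of every turn with role `conv_role` under `root_key`."""
  def _select(item: dict) -> str:
    values = [turn['content'] for turn in item[root_key] if turn['role'] == conv_role]
    return '\n'.join(values)
  return _select

# With a factory like this, both formats could share one implementation:
messages_user = make_role_selector('messages', 'user')
conversation_user = make_role_selector('conversation', 'user')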
29 changes: 26 additions & 3 deletions lilac/formats/openai_json_test.py
@@ -2,10 +2,33 @@


from ..data.dataset_test_utils import TestDataMaker
from .openai_json import OpenAIJSON
from .openai_json import OpenAIConversationJSON, OpenAIJSON


def test_infer_open_chat(make_test_data: TestDataMaker) -> None:
def test_infer_openai_json(make_test_data: TestDataMaker) -> None:
dataset = make_test_data(
[
{
'messages': [
{'role': 'user', 'content': 'Hello'},
{'role': 'assistant', 'content': 'Hi'},
{'role': 'user', 'content': 'How are you today?'},
{'role': 'assistant', 'content': "I'm fine."},
],
},
{
'messages': [
{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant', 'content': "I'm OpenChat."},
],
},
]
)

assert dataset.manifest().dataset_format == OpenAIJSON()


def test_infer_openai_conversation_json(make_test_data: TestDataMaker) -> None:
dataset = make_test_data(
[
{
@@ -25,4 +48,4 @@ def test_infer_open_chat(make_test_data: TestDataMaker) -> None:
]
)

assert dataset.manifest().dataset_format == OpenAIJSON()
assert dataset.manifest().dataset_format == OpenAIConversationJSON()
102 changes: 81 additions & 21 deletions lilac_hf_space.yml
@@ -105,7 +105,7 @@ datasets:
- '*'
- content

- name: 'OpenOrca'
- name: OpenOrca
namespace: lilac
settings:
tags: [datasets]
@@ -123,21 +123,42 @@ datasets:
- question

- namespace: lilac
name: mosaic-instruct-v3
name: SlimOrca
source:
dataset_name: mosaicml/instruct-v3
dataset_name: Open-Orca/SlimOrca
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - conversations
- '*'
- value

- namespace: lilac
name: UltraChat-200k
source:
dataset_name: HuggingFaceH4/ultrachat_200k
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- - messages
- '*'
- content

- namespace: lilac
name: roblox_luau_corpus
source:
dataset_name: Roblox/luau_corpus
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- prompt
- response
markdown_paths: []
embeddings:
- embedding: gte-small
path:
- prompt
- completion

## Eval datasets
- namespace: lilac
@@ -303,11 +324,10 @@ datasets:
- option2
- answer

## Old datasets to be cleaned up.
- name: databricks-dolly-15k-curated-en
namespace: lilac
settings:
tags: [machine-learning]
tags: [datasets]
ui:
media_paths:
- original-instruction
@@ -336,21 +356,35 @@ datasets:
path:
- original-response

- namespace: lilac
name: mosaic-instruct-v3
source:
dataset_name: mosaicml/instruct-v3
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- prompt
- response
embeddings:
- embedding: gte-small
path:
- prompt

- namespace: lilac
name: dolphin
tags: [datasets]
source:
dataset_name: cognitivecomputations/dolphin
config_name: flan1m-alpaca-uncensored
source_name: huggingface
settings:
tags: [datasets]
ui:
media_paths:
- instruction
- input
- output
- - input__cluster
- text

signals:
- signal_name: text_statistics
@@ -389,7 +423,7 @@ clusters:
- dataset_namespace: lilac
dataset_name: lmsys-chat-1m
input_selector:
format: openai_json
format: openai_conversation_json
selector: user
output_path:
- conversation__clusters
@@ -399,6 +433,14 @@ clusters:
input_path:
- question

- dataset_namespace: lilac
dataset_name: SlimOrca
input_selector:
format: sharegpt
selector: human
output_path:
- conversation__clusters

- dataset_namespace: lilac
dataset_name: databricks-dolly-15k-curated-en
input_path:
@@ -409,6 +451,30 @@ clusters:
input_path:
- prompt

- dataset_namespace: lilac
dataset_name: dolphin
input_path:
- input

- dataset_namespace: lilac
dataset_name: UltraChat-200k
input_selector:
format: openai_json
selector: user
output_path:
- messages__clusters

# NOTE: We cluster both prompt and completion because the roblox prompts are just small sections
# of code which are not extremely descriptive.
- dataset_namespace: lilac
dataset_name: roblox_luau_corpus
input_path:
- prompt
- dataset_namespace: lilac
dataset_name: roblox_luau_corpus
input_path:
- completion

## Eval datasets
- dataset_namespace: lilac
dataset_name: MMLU
Expand Down Expand Up @@ -464,12 +530,6 @@ clusters:
input_path:
- sentence

## Other datasets
- dataset_namespace: lilac
dataset_name: dolphin
input_path:
- input

concept_model_cache_embeddings:
- gte-small
- gte-base
(diff for a web UI component; the filename is not shown in this view)
@@ -102,7 +102,7 @@
<div class="flex flex-row gap-x-2 font-light leading-none text-neutral-600">
<div class="leading-2 text-lg">
<div class="flex flex-col py-2">
<div class="leading-2 flex flex-row gap-x-1 text-xl text-neutral-800">
<div class="leading-2 flex flex-row items-center gap-x-1 text-xl text-neutral-800">
{groupPercentage}%
<div
use:hoverTooltip={{
@@ -119,7 +119,9 @@
</div>
</div>
</div>
<a class="flex flex-row" href={groupLink}> <button>Explore</button></a>
<a class="flex flex-row" href={groupLink}>
<button class="border border-neutral-300">Explore</button></a
>
</div>
</div>
</Carousel>
(diff for a second web UI component; filename likewise not shown)
@@ -219,7 +219,7 @@
<div class="mx-20 mt-8 w-full text-lg text-gray-600">Select a field to explore.</div>
{:else if outerLeafPath == null}
<div class="mx-20 mt-8 w-full text-lg text-gray-600">Select a field to group by.</div>
{:else if groups == null}
{:else if groups == null || numRowsInQuery == null}
<SkeletonText />
{:else}
<div class="flex w-full flex-col gap-y-10">
@@ -235,8 +235,10 @@
})}

<div class="flex w-full flex-row">
<div class="flex w-48 flex-col items-center justify-center gap-y-4 py-1">
<div class="mx-2 whitespace-break-spaces text-center text-2xl">
<div class="mb-4 flex w-48 flex-col items-center justify-between gap-y-4 p-6">
<div
class="mx-2 h-16 whitespace-break-spaces py-0.5 text-center text-2xl leading-7 tracking-tight"
>
{#each group.textHighlights as highlight}
{#if highlight.isBold}<span class="font-bold">{highlight.text}</span>
{:else}<span>{highlight.text}</span>{/if}
@@ -250,7 +252,9 @@
{group.count.toLocaleString()} rows
</span>
</div>
<a class="flex flex-row" href={groupLink}> <button>Explore</button></a>
<a class="flex flex-row" href={groupLink}>
<button class="border border-neutral-300">Explore</button></a
>
</div>

{#if outerLeafPath && innerLeafPath && numRowsInQuery}
