diff --git a/lilac/formats/default_formats.py b/lilac/formats/default_formats.py index bee035071..f3b68185a 100644 --- a/lilac/formats/default_formats.py +++ b/lilac/formats/default_formats.py @@ -1,7 +1,7 @@ """Registers all available dataset formats.""" from ..dataset_format import register_dataset_format -from .openai_json import OpenAIJSON +from .openai_json import OpenAIConversationJSON, OpenAIJSON from .openchat import OpenChat from .sharegpt import ShareGPT @@ -11,3 +11,4 @@ def register_default_formats() -> None: register_dataset_format(ShareGPT) register_dataset_format(OpenChat) register_dataset_format(OpenAIJSON) + register_dataset_format(OpenAIConversationJSON) diff --git a/lilac/formats/openai_json.py b/lilac/formats/openai_json.py index 4bea06f34..3b861daec 100644 --- a/lilac/formats/openai_json.py +++ b/lilac/formats/openai_json.py @@ -9,7 +9,7 @@ def _openai_selector(item: Item, conv_role: str) -> str: """Selector for OpenAI JSON Formatted conversations.""" # TODO(nsthorat): Make this return an array, and not pre-join with newlines. - values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role] + values = [conv['content'] for conv in item['messages'] if conv['role'] == conv_role] # Get the __value__ key version of text if it's enriched. values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values] return '\n'.join(values) @@ -30,14 +30,12 @@ class OpenAIJSON(DatasetFormat): """OpenAI JSON format. Taken from: https://platform.openai.com/docs/api-reference/chat - - Note that here "messages" is "conversation" for support with common datasets. 
""" name: ClassVar[str] = 'openai_json' data_schema: Schema = schema( { - 'conversation': [ + 'messages': [ { 'role': 'string', 'content': 'string', @@ -47,7 +45,7 @@ class OpenAIJSON(DatasetFormat): ) title_slots: list[tuple[PathTuple, PathTuple]] = [ - (('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role')) + (('messages', PATH_WILDCARD, 'content'), ('messages', PATH_WILDCARD, 'role')) ] user: ClassVar[DatasetFormatInputSelector] = _USER_SELECTOR @@ -56,3 +54,60 @@ class OpenAIJSON(DatasetFormat): input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = { selector.name: selector for selector in [_USER_SELECTOR, _ASSISTANT_SELECTOR] } + + +# TODO(nsthorat): Generalize this code so that 'conversation' or 'messages' can be the root field. +# This is the only difference that forces us to make a new format. + + +def _openai_conversation_selector(item: Item, conv_role: str) -> str: + """Selector for OpenAI JSON conversations stored under the 'conversation' field.""" + # TODO(nsthorat): Make this return an array, and not pre-join with newlines. + values = [conv['content'] for conv in item['conversation'] if conv['role'] == conv_role] + # Get the __value__ key version of text if it's enriched. + values = [value if isinstance(value, str) else value.get(VALUE_KEY) for value in values] + return '\n'.join(values) + + +_USER_CONVERSATION_SELECTOR = DatasetFormatInputSelector( + name='user', + selector=lambda item: _openai_conversation_selector(item, 'user'), +) + +_ASSISTANT_CONVERSATION_SELECTOR = DatasetFormatInputSelector( + name='assistant', + selector=lambda item: _openai_conversation_selector(item, 'assistant'), +) + + +class OpenAIConversationJSON(DatasetFormat): + """OpenAI JSON format. + + Taken from: https://platform.openai.com/docs/api-reference/chat + + Note that the root field is "conversation", not "messages", to support common datasets. 
+ """ + + name: ClassVar[str] = 'openai_conversation_json' + data_schema: Schema = schema( + { + 'conversation': [ + { + 'role': 'string', + 'content': 'string', + } + ], + }, + ) + + title_slots: list[tuple[PathTuple, PathTuple]] = [ + (('conversation', PATH_WILDCARD, 'content'), ('conversation', PATH_WILDCARD, 'role')) + ] + + user: ClassVar[DatasetFormatInputSelector] = _USER_CONVERSATION_SELECTOR + assistant: ClassVar[DatasetFormatInputSelector] = _ASSISTANT_CONVERSATION_SELECTOR + + input_selectors: ClassVar[dict[str, DatasetFormatInputSelector]] = { + selector.name: selector + for selector in [_USER_CONVERSATION_SELECTOR, _ASSISTANT_CONVERSATION_SELECTOR] + } diff --git a/lilac/formats/openai_json_test.py b/lilac/formats/openai_json_test.py index abd39b163..4c6b263b5 100644 --- a/lilac/formats/openai_json_test.py +++ b/lilac/formats/openai_json_test.py @@ -2,10 +2,33 @@ from ..data.dataset_test_utils import TestDataMaker -from .openai_json import OpenAIJSON +from .openai_json import OpenAIConversationJSON, OpenAIJSON -def test_infer_open_chat(make_test_data: TestDataMaker) -> None: +def test_infer_openai_json(make_test_data: TestDataMaker) -> None: + dataset = make_test_data( + [ + { + 'messages': [ + {'role': 'user', 'content': 'Hello'}, + {'role': 'assistant', 'content': 'Hi'}, + {'role': 'user', 'content': 'How are you today?'}, + {'role': 'assistant', 'content': "I'm fine."}, + ], + }, + { + 'messages': [ + {'role': 'user', 'content': 'Who are you?'}, + {'role': 'assistant', 'content': "I'm OpenChat."}, + ], + }, + ] + ) + + assert dataset.manifest().dataset_format == OpenAIJSON() + + +def test_infer_openai_conversation_json(make_test_data: TestDataMaker) -> None: dataset = make_test_data( [ { @@ -25,4 +48,4 @@ def test_infer_open_chat(make_test_data: TestDataMaker) -> None: ] ) - assert dataset.manifest().dataset_format == OpenAIJSON() + assert dataset.manifest().dataset_format == OpenAIConversationJSON() diff --git a/lilac_hf_space.yml 
b/lilac_hf_space.yml index f2b0a8a03..a6db68735 100644 --- a/lilac_hf_space.yml +++ b/lilac_hf_space.yml @@ -105,7 +105,7 @@ datasets: - '*' - content - - name: 'OpenOrca' + - name: OpenOrca namespace: lilac settings: tags: [datasets] @@ -123,21 +123,42 @@ datasets: - question - namespace: lilac - name: mosaic-instruct-v3 + name: SlimOrca source: - dataset_name: mosaicml/instruct-v3 + dataset_name: Open-Orca/SlimOrca + source_name: huggingface + settings: + tags: [datasets] + ui: + media_paths: + - - conversations + - '*' + - value + + - namespace: lilac + name: UltraChat-200k + source: + dataset_name: HuggingFaceH4/ultrachat_200k + source_name: huggingface + settings: + tags: [datasets] + ui: + media_paths: + - - messages + - '*' + - content + + - namespace: lilac + name: roblox_luau_corpus + source: + dataset_name: Roblox/luau_corpus source_name: huggingface settings: tags: [datasets] ui: media_paths: - prompt - - response - markdown_paths: [] - embeddings: - - embedding: gte-small - path: - - prompt + - completion ## Eval datasets - namespace: lilac @@ -303,11 +324,10 @@ datasets: - option2 - answer - ## Old datasets to be cleaned up. 
- name: databricks-dolly-15k-curated-en namespace: lilac settings: - tags: [machine-learning] + tags: [datasets] ui: media_paths: - original-instruction @@ -336,21 +356,35 @@ datasets: path: - original-response + - namespace: lilac + name: mosaic-instruct-v3 + source: + dataset_name: mosaicml/instruct-v3 + source_name: huggingface + settings: + tags: [datasets] + ui: + media_paths: + - prompt + - response + embeddings: + - embedding: gte-small + path: + - prompt + - namespace: lilac name: dolphin - tags: [datasets] source: dataset_name: cognitivecomputations/dolphin config_name: flan1m-alpaca-uncensored source_name: huggingface settings: + tags: [datasets] ui: media_paths: - instruction - input - output - - - input__cluster - - text signals: - signal_name: text_statistics @@ -389,7 +423,7 @@ clusters: - dataset_namespace: lilac dataset_name: lmsys-chat-1m input_selector: - format: openai_json + format: openai_conversation_json selector: user output_path: - conversation__clusters @@ -399,6 +433,14 @@ clusters: input_path: - question + - dataset_namespace: lilac + dataset_name: SlimOrca + input_selector: + format: sharegpt + selector: human + output_path: + - conversation__clusters + - dataset_namespace: lilac dataset_name: databricks-dolly-15k-curated-en input_path: @@ -409,6 +451,30 @@ clusters: input_path: - prompt + - dataset_namespace: lilac + dataset_name: dolphin + input_path: + - input + + - dataset_namespace: lilac + dataset_name: UltraChat-200k + input_selector: + format: openai_json + selector: user + output_path: + - messages__clusters + + # NOTE: We cluster both prompt and completion because the roblox prompts are just small sections + # of code which are not extremely descriptive. 
+ - dataset_namespace: lilac + dataset_name: roblox_luau_corpus + input_path: + - prompt + - dataset_namespace: lilac + dataset_name: roblox_luau_corpus + input_path: + - completion + ## Eval datasets - dataset_namespace: lilac dataset_name: MMLU @@ -464,12 +530,6 @@ clusters: input_path: - sentence - ## Other datasets - - dataset_namespace: lilac - dataset_name: dolphin - input_path: - - input - concept_model_cache_embeddings: - gte-small - gte-base diff --git a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte index 99bb13508..5f93154ad 100644 --- a/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte +++ b/web/blueprint/src/lib/components/datasetView/DatasetPivotResult.svelte @@ -102,7 +102,7 @@