From 65513c601667fd532a920d7a0b63074eca7ccae5 Mon Sep 17 00:00:00 2001 From: Ryan Mullins Date: Mon, 24 Jun 2024 14:23:06 +0000 Subject: [PATCH] LLM Comparator: Docs and licensing for OSS release. PiperOrigin-RevId: 646085859 --- python/README.md | 176 +++++++++++- python/notebooks/basic_demo.ipynb | 264 ++++++++++++++++++ .../run_scripts_for_llm_comparator.ipynb | 157 ----------- python/pyproject.toml | 5 +- python/src/llm_comparator/_colab.py | 14 + python/src/llm_comparator/_logging.py | 14 + python/src/llm_comparator/comparison.py | 16 +- python/src/llm_comparator/llm_judge_runner.py | 14 + python/src/llm_comparator/model_helper.py | 16 +- python/src/llm_comparator/prompt_templates.py | 14 + .../rationale_bullet_generator.py | 14 + .../rationale_cluster_generator.py | 30 +- python/src/llm_comparator/types.py | 43 +-- python/src/llm_comparator/utils.py | 16 +- python/src/llm_comparator/utils_test.py | 16 +- 15 files changed, 623 insertions(+), 186 deletions(-) create mode 100644 python/notebooks/basic_demo.ipynb delete mode 100644 python/notebooks/run_scripts_for_llm_comparator.ipynb diff --git a/python/README.md b/python/README.md index c4565e6..6a595ba 100644 --- a/python/README.md +++ b/python/README.md @@ -1 +1,175 @@ -# LLM Comparator Python Scripts +# LLM Comparator: Python Library + +The LLM Comparator Python Library provides a simple API for configuring and +running **comparative evaluations** of models, and generating the JSON files +required to analyze these models side-by-side in the +[LLM Comparator app][llm-comparator-app]. + +## Installation + +```sh +# [OPTIONAL] We recommend using a Python virtual environemnt. +python3 -m venv ~/.venv/llm_comparator +source ~/.venv/llm_comparator/bin/activate +``` + +You can install this library from the PyPI registry. This workflow is +recommended for most usage. + +```sh +pip install llm_comparator +``` + +Or by cloning the repo from GitHub and installing from source. 
This workflow is
+recommended for contributors fixing bugs or adding new features to the library.
+
+```sh
+git clone https://github.com/PAIR-code/llm-comparator.git
+cd llm-comparator/python
+pip install -e .
+```
+
+## Core Concepts
+
+The primary entrypoint to the LLM Comparator Python Library is the
+`llm_comparator.comparison` module. This module provides a `run()` function that
+coordinates the three phases of comparative evaluation: judging, bulletizing,
+and clustering. The library provides modules for each of these phases, as well
+as wrappers for interacting with LLMs.
+
+### Model Helpers
+
+The `llm_comparator.model_helper` module is used to initialize API wrappers to
+interface with LLMs. Broadly, we support two kinds of models: generation models,
+which should be text-to-text language models, and embedding models.
+
+We provide two concrete implementations of wrappers that interact with
+the [Google Vertex AI API][vertex-api]:
+
+* `VertexGenerationModelHelper` supports any
+  [generative language model][model-garden-gen] available in the Model Garden.
+  By default we use `gemini-pro`, but alternatives can be configured with the
+  `model_name=` constructor parameter.
+* `VertexEmbeddingModelHelper` supports any
+  [text embedding model][model-garden-emb] available in the Model Garden.
+  By default we use `textembedding-gecko@003`, but alternatives can be
+  configured with the `model_name=` constructor parameter.
+
+Additional wrapper classes can be implemented by subclassing
+`GenerationModelHelper` and `EmbeddingModelHelper` as necessary. We welcome
+contributions of these classes to the library via
+[Pull Requests][llm-comparator-prs].
+
+### Judges
+
+The "judge" is the model responsible for actually doing the comparison between
+the two model responses.
+
+This functionality is encapsulated in the `LLMJudgeRunner` from the
+`llm_comparator.llm_judge_runner` module. 
It requires a generator model that
+conforms to the `GenerationModelHelper` protocol in order to call an LLM to
+generate their judgements of which of the two model responses is better for
+every prompt.
+
+We provide a default judge prompt in
+`llm_comparator.llm_judge_runner.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE`, and you can
+use the `llm_judge_prompt_template=` parameter to provide a custom prompt that
+may better suit your needs at initialization time. Prompts should require the
+judge to phrase its responses in a simple XML format that includes the `verdict`
+and an `explanation`, to enable downstream processing by the bulletizer and
+clusterer. Note that if you do provide a custom prompt, you can use the
+`rating_to_score_map=` parameter to ensure judgement `verdict`s can be mapped to
+numeric values.
+
+```xml
+<result>
+  <explanation>YOUR EXPLANATION GOES HERE.</explanation>
+  <verdict>A is slightly better</verdict>
+</result>
+```
+The same judge is run multiple times during each comparative analysis to get a
+diversity of ratings. This can be configured via the `num_repeats=` key of the
+optional `judge_opts=` dictionary passed to `llm_comparator.comparison.run()`.
+By default, we call the judge 6 times.
+
+### Bulletizers
+
+A "bulletizer" condenses the results provided by the judge into a set of bullets
+to make them easier to understand and consume in the UI.
+
+This functionality is encapsulated in the `RationaleBulletGenerator` from the
+`llm_comparator.rationale_bullet_generator` module. It requires a generator
+model that conforms to the `GenerationModelHelper` protocol in order to call an
+LLM to generate the bulleted summaries.
+
+The primary configuration is the `win_rate_threshold=` parameter, which can be
+configured per-comparative evaluation run via the `bulletizer_opts=` dictionary
+passed to `llm_comparator.comparison.run()`. By default, the threshold is set to
+`0.25` based on the default scoring range of [-1.5, 1.5]. 
+ +### Clusterers + +A "clusterer" takes the bullets, embeds them, groups them into clusters based on +embedding similarity, and generates a label for those clusters. + +This functionality is encapsulated in the `RationaleClusterGenerator` from the +`llm_comparator.rationale_cluster_generator` module. It requires a generator +model that conforms to the `GenerationModelHelper` protocol and an embedder that +conforms to the `EmbeddingModelHelper` protocol in order to call LLMs to +generate the clustered summary of rationale bullets. + +The primary configuration is the `num_clusters=` parameter, which can be +configured per-comparative evaluation run via the `clusterer_opts=` dictionary +passed to `llm_comparator.comparison.run()`. By default, a clusterer creates 10 +clusters from the bullets. + +## Basic Usage + +We have prepared a Python notebook that you can +[open in Google Colab][llm-comparator-colab] to follow along interactively. It +uses Google's Vertex AI APIs to call models, and requires you to +[authenticate][vertex-auth] in order to follow along. + +The following pseudo-script is the minimal distillation of the notebook linked +above, without any AI platform-specific guidance. + +```python +from llm_comparator import comparison +from llm_comparator import model_helper +from llm_comparator import llm_judge_runner +from llm_comparator import rationale_bullet_generator +from llm_comparator import rationale_cluster_generator + +inputs = [ + # Provide your inputs here. + # They must conform to llm_comparator.types.LLMJudgeInput +] + +# Initialize the models-calling classes. +generator = # Initialize a model_helper.GenerationModelHelper() subclass +embedder = # Initialize a model_helper.EmbeddingModelHelper() subclass + +# Initialize the instances that run work on the models. 
+judge = llm_judge_runner.LLMJudgeRunner(generator) +bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator) +clusterer = rationale_cluster_generator.RationaleClusterGenerator( + generator, embedder +) + +# Configure and run the comparative evaluation. +comparison_result = comparison.run(inputs, judge, bulletizer, clusterer) + +# Write the results to a JSON file that can be loaded in +# https://pair-code.github.io/llm-comparator +file_path = "path/to/file.json" +comparison.write(comparison_result, file_path) +``` + + +[llm-comparator-app]: https://pair-code.github.io/llm-comparator +[llm-comparator-colab]: https://colab.research.google.com/github/PAIR-code/llm-comparator/blob/main/python/notebooks/basic_demo.ipynb +[llm-comparator-prs]: https://github.com/PAIR-code/llm-comparator/pulls +[model-garden-emb]: https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22EMBEDDING%22,%22LANGUAGE%22%5D),%22s%22:%22%22)) +[model-garden-gen]: https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22GENERATION%22,%22LANGUAGE%22%5D),%22s%22:%22%22)) +[vertex-api]: https://cloud.google.com/vertex-ai/docs/reference +[vertex-auth]: https://cloud.google.com/vertex-ai/docs/authentication diff --git a/python/notebooks/basic_demo.ipynb b/python/notebooks/basic_demo.ipynb new file mode 100644 index 0000000..a9060d2 --- /dev/null +++ b/python/notebooks/basic_demo.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "metadata": { + "id": "TMG3IgcdtzdI" + }, + "cell_type": "markdown", + "source": [ + "# LLM Comparator: Running Comparative Evaluations with Google Vertex AI" + ] + }, + { + "metadata": { + "id": "Lu90H8Os4UFZ" + }, + "cell_type": "code", + "source": [ + "#@title Licensed under the Apache License, Version 2.0\n", + "# Copyright 2024 Google LLC\n", + 
"#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kunEkAgFB9Yt" + }, + "outputs": [], + "source": [ + "#@title Install the LLM Comparator package\n", + "! pip install llm_comparator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QZlVpN83nJBv" + }, + "outputs": [], + "source": [ + "#@title Import relevant packages\n", + "import vertexai\n", + "from google.colab import auth\n", + "\n", + "# The comparison library provides the primary API for running Comparative\n", + "# Evaluations and generating the JSON files required by the LLM Comparator web\n", + "# app.\n", + "from llm_comparator import comparison\n", + "\n", + "# The model_helper library is used to initialize API wrapper to interface with\n", + "# models. For this demo we focus on models served by Google Vertex AI, but you\n", + "# can extend the llm_comparator.model_helper.GenerationModelHelper and\n", + "# llm_comparator.model_helper.EmbeddingModelHelper classes to work with other\n", + "# providers or models you host yourself.\n", + "from llm_comparator import model_helper\n", + "\n", + "# The following libraries contain wrappers that implement the core functionality\n", + "# of the Comparative Evaluation workflow. 
More on these below.\n", + "from llm_comparator import llm_judge_runner\n", + "from llm_comparator import rationale_bullet_generator\n", + "from llm_comparator import rationale_cluster_generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3ygUJPVxDMB7" + }, + "outputs": [], + "source": [ + "#@title Setup and authenticate with Google Vertex AI.\n", + "PROJECT_ID = 'your_project_id' #@param {type: \"string\"}\n", + "REGION = 'us-central1' #@param {type: \"string\"}\n", + "\n", + "auth.authenticate_user()\n", + "! gcloud config set project {PROJECT_ID}\n", + "vertexai.init(project=PROJECT_ID, location=REGION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JBGp4LiVCO00" + }, + "outputs": [], + "source": [ + "#@title Prepare Your Inputs\n", + "\n", + "# See llm_comparator.llm_judge_runner.LLMJudgeInput for the required input type.\n", + "llm_judge_inputs = [\n", + " {'prompt': 'how are you?', 'response_a': 'good', 'response_b': 'bad'},\n", + " {'prompt': 'hello?', 'response_a': 'hello', 'response_b': 'hi'},\n", + " {'prompt': 'what is the capital of korea?', 'response_a': 'Seoul', 'response_b': 'Vancouver'}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xy6GxkgJCD-M" + }, + "outputs": [], + "source": [ + "#@title Initialize models used in the LLM Comparator evaluation.\n", + "\n", + "# The generator model can be any Text-to-Text LLM provided by Vertex AI. This\n", + "# model will be asked to do a series of tasks---judge, bulletize, and cluster---\n", + "# and it is often beneficial to use a larger model for this reason.\n", + "#\n", + "# We default to 'gemini-pro' but you can change this with the `model_name=`\n", + "# param. 
For a full list of models available via the Model Garden, check out\n", + "# https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22GENERATION%22,%22LANGUAGE%22%5D),%22s%22:%22%22)).\n", + "#\n", + "# Since we're using Gemini Pro, a very competent and flexible foundation model,\n", + "# we are sharing the same generator across all downstream tasks. However, you\n", + "# could use different models for each task if desired.\n", + "generator = model_helper.VertexGenerationModelHelper()\n", + "\n", + "# The embedding model can be any text embedder provided by Vertex AI. We default\n", + "# to 'textembedding-gecko@003' but you can change this with the `model_name=`\n", + "# param. For a full list of models available via the Model Garden, check out\n", + "# https://console.cloud.google.com/vertex-ai/model-garden?pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%22supportedTasks%22,%22inputTypes%22%5D,%22o%22:%5B%22EMBEDDING%22,%22LANGUAGE%22%5D),%22s%22:%22%22))\n", + "embedder = model_helper.VertexEmbeddingModelHelper()\n", + "\n", + "# The following models do the core work of a Comparative Evaluation: judge,\n", + "# bulletize, and cluster. Each class provides a `.run()` function, and the\n", + "# `llm_comparator.comparison.run()` API orchestrates configuring and calling\n", + "# these APIs on the instances you pass in. More on how to configure these below.\n", + "\n", + "# The `judge` is the model responsible for actually doing the comparison between\n", + "# the two models. 
The same judge is run multiple times to get a diversity of\n", + "# perspectives, more on how to configure this below.\n", + "#\n", + "# A judge must phrase its responses in a simple XML format that includes the\n", + "# verdict and an explanation of the results, to enable downstream processing by\n", + "# the bulletizer and clusterer.\n", + "#\n", + "# \u003cresult\u003e\n", + "# \u003cexplanation\u003eYOUR EXPLANATION GOES HERE.\u003c/explanation\u003e\n", + "# \u003cverdict\u003eA is slightly better\u003c/verdict\u003e\n", + "# \u003c/result\u003e\n", + "#\n", + "# We provide a default \"judge\" prompt in\n", + "# llm_comparator.llm_judge_runner.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE, and you can\n", + "# use the `llm_judge_prompt_template=` parameter to provide a custom prompt that\n", + "# may better suit your needs.\n", + "judge = llm_judge_runner.LLMJudgeRunner(generator)\n", + "\n", + "# The `bulletizer` condenses the results provided by the judge into a set of\n", + "# bullets to make them easier to understand and consume in the UI.\n", + "bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)\n", + "\n", + "# The `clusterer` takes the bullets, embeds them, groups them into clusters\n", + "# based on embedding similarity, and generates a label for those clusters.\n", + "clusterer = rationale_cluster_generator.RationaleClusterGenerator(\n", + " generator, embedder\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QUU3V63vVbvS" + }, + "outputs": [], + "source": [ + "#@title Run the Comparative Evaluation.\n", + "\n", + "# The comparison.run() function is the primary interface for running a\n", + "# Comparative Evaluation. It take your prepared inputs, a judge, a buletizer,\n", + "# and a clusterer and returns a Python dictioary in the required format for use\n", + "# in the LLM Comparator web app. 
You can inspect this dictionary in Python if\n", + "# you like, but it's more useful once written to a file.\n", + "#\n", + "# The example below is basic, but you can use the judge_opts=, bulletizer_opts=,\n", + "# and/or clusterer_opts= parameters (all of which are optional dictionaries that\n", + "# are converted to keyword options) to further customize the behaviors. See the\n", + "# Docsrtrings for more.\n", + "comparison_result = comparison.run(\n", + " llm_judge_inputs,\n", + " judge,\n", + " bulletizer,\n", + " clusterer,\n", + ")" + ] + }, + { + "metadata": { + "id": "hViMkxUGhnTA" + }, + "cell_type": "code", + "source": [ + "#@title [Optional] Save the results to a file.\n", + "file_path = 'json_for_llm_comparator.json' # @param {type: \"string\"}\n", + "comparison.write(comparison_result, file_path)" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "id": "7W4B1uzqvJZ-" + }, + "cell_type": "code", + "source": [ + "#@title [Optional] View the results in the app in Colab.\n", + "comparison.show_in_colab(file_path)" + ], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "colab": { + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "private_outputs": true, + "provenance": [ + { + "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:comparator12-2::citc", + "timestamp": 1718335330333 + }, + { + "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:fig-export-comparator12-2-3818-change-2::citc", + "timestamp": 1718334507750 + }, + { + "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:fig-export-comparator12-2-3818-change-2::citc", + "timestamp": 1718215678229 + } + ] + }, + "kernelspec": { + "display_name": "Python 3", + "name": 
"python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/notebooks/run_scripts_for_llm_comparator.ipynb b/python/notebooks/run_scripts_for_llm_comparator.ipynb deleted file mode 100644 index 6f53b90..0000000 --- a/python/notebooks/run_scripts_for_llm_comparator.ipynb +++ /dev/null @@ -1,157 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kunEkAgFB9Yt" - }, - "outputs": [], - "source": [ - "! pip install /content/llm_comparator-0.1-py3-none-any.whl" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QZlVpN83nJBv" - }, - "outputs": [], - "source": [ - "import vertexai\n", - "from google.colab import auth\n", - "\n", - "from llm_comparator import comparison\n", - "from llm_comparator import model_helper\n", - "from llm_comparator import llm_judge_runner\n", - "from llm_comparator import rationale_bullet_generator\n", - "from llm_comparator import rationale_cluster_generator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3ygUJPVxDMB7" - }, - "outputs": [], - "source": [ - "#@title Setup for using Vertex AI.\n", - "auth.authenticate_user()\n", - "\n", - "PROJECT_ID = 'pair-experimental' #@param {type: \"string\"}\n", - "REGION = 'us-central1' #@param {type: \"string\"}\n", - "\n", - "! 
gcloud config set project {PROJECT_ID}\n", - "\n", - "vertexai.init(project=PROJECT_ID, location=REGION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JBGp4LiVCO00" - }, - "outputs": [], - "source": [ - "llm_judge_inputs = [\n", - " {'prompt': 'how are you?', 'response_a': 'good', 'response_b': 'bad'},\n", - " {'prompt': 'hello?', 'response_a': 'hello', 'response_b': 'hi'},\n", - " {'prompt': 'what is the capital of korea?', 'response_a': 'Seoul', 'response_b': 'Vancouver'}\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Xy6GxkgJCD-M" - }, - "outputs": [], - "source": [ - "#@title Initialize models used in the LLM Comparator evaluation.\n", - "generator = model_helper.VertexGenerationModelHelper()\n", - "embedder = model_helper.VertexEmbeddingModelHelper()\n", - "judge = llm_judge_runner.LLMJudgeRunner(generator)\n", - "bulletizer = rationale_bullet_generator.RationaleBulletGenerator(generator)\n", - "clusterer = rationale_cluster_generator.RationaleClusterGenerator(\n", - " generator, embedder\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QUU3V63vVbvS" - }, - "outputs": [], - "source": [ - "#@title Run the LLM Comparator evauation.\n", - "comparison_result = comparison.run(\n", - " llm_judge_inputs,\n", - " judge,\n", - " bulletizer,\n", - " clusterer,\n", - ")" - ] - }, - { - "metadata": { - "id": "hViMkxUGhnTA" - }, - "cell_type": "code", - "source": [ - "file_path = 'json_for_llm_comparator.json' # @param {type: \"string\"}\n", - "comparison.write(comparison_result, file_path)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "id": "7W4B1uzqvJZ-" - }, - "cell_type": "code", - "source": [ - "comparison.show_in_colab(file_path)" - ], - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "colab": { - "last_runtime": { - "build_target": "", - "kind": "local" - }, - 
"private_outputs": true, - "provenance": [ - { - "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:comparator12-2::citc", - "timestamp": 1718335330333 - }, - { - "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:fig-export-comparator12-2-3818-change-2::citc", - "timestamp": 1718334507750 - }, - { - "file_id": "/piper/depot/google3/third_party/javascript/llm_comparator/python/notebooks/run_scripts_for_llm_comparator.ipynb?workspaceId=kahng:fig-export-comparator12-2-3818-change-2::citc", - "timestamp": 1718215678229 - } - ] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/python/pyproject.toml b/python/pyproject.toml index 7d2894b..f798ffc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -13,7 +13,10 @@ readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" dependencies = [ - "tqdm" + "absl-py", + "numpy", + "tqdm", + "vertexai", ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/python/src/llm_comparator/_colab.py b/python/src/llm_comparator/_colab.py index d203521..381542f 100644 --- a/python/src/llm_comparator/_colab.py +++ b/python/src/llm_comparator/_colab.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Provides a constant that incidates whether we are running in Google Colab.""" try: diff --git a/python/src/llm_comparator/_logging.py b/python/src/llm_comparator/_logging.py index 71426e7..25e8d6f 100644 --- a/python/src/llm_comparator/_logging.py +++ b/python/src/llm_comparator/_logging.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Logging utilities.""" import logging diff --git a/python/src/llm_comparator/comparison.py b/python/src/llm_comparator/comparison.py index 392e3d8..6b05ed2 100644 --- a/python/src/llm_comparator/comparison.py +++ b/python/src/llm_comparator/comparison.py @@ -1,4 +1,18 @@ -"""Primary entry point for running evaluations with LLM Comparator.""" +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Entrypoint for running comparative evaluations with LLM Comparator.""" from collections.abc import Sequence import json diff --git a/python/src/llm_comparator/llm_judge_runner.py b/python/src/llm_comparator/llm_judge_runner.py index 3333db6..463901b 100644 --- a/python/src/llm_comparator/llm_judge_runner.py +++ b/python/src/llm_comparator/llm_judge_runner.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Runner for LLM Judge.""" from collections.abc import Sequence diff --git a/python/src/llm_comparator/model_helper.py b/python/src/llm_comparator/model_helper.py index b928615..36b6089 100644 --- a/python/src/llm_comparator/model_helper.py +++ b/python/src/llm_comparator/model_helper.py @@ -1,4 +1,18 @@ -"""Helper classes for calling LLMs.""" +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Classes for calling generating LLMs and embedding models.""" import abc from collections.abc import Iterable, Sequence diff --git a/python/src/llm_comparator/prompt_templates.py b/python/src/llm_comparator/prompt_templates.py index 9570ace..4120ee0 100644 --- a/python/src/llm_comparator/prompt_templates.py +++ b/python/src/llm_comparator/prompt_templates.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Prompt templates for the rationale summary generation script.""" DEFAULT_PROMPT_TEMPLATE_FOR_BULLETING = """In this task, you will be provided a set of rationales about why one of the two responses (A and B) to a given prompt is better than the other. 
diff --git a/python/src/llm_comparator/rationale_bullet_generator.py b/python/src/llm_comparator/rationale_bullet_generator.py index 90d4d21..6d05e03 100644 --- a/python/src/llm_comparator/rationale_bullet_generator.py +++ b/python/src/llm_comparator/rationale_bullet_generator.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Runner for generating rationale bullets for LLM Comparator.""" from collections.abc import Sequence diff --git a/python/src/llm_comparator/rationale_cluster_generator.py b/python/src/llm_comparator/rationale_cluster_generator.py index 1e7a7f8..375f302 100644 --- a/python/src/llm_comparator/rationale_cluster_generator.py +++ b/python/src/llm_comparator/rationale_cluster_generator.py @@ -1,7 +1,21 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== """Runner for creating rationale clusters for LLM Comparator.""" +from collections.abc import Mapping, Sequence import random -from typing import Mapping, Sequence import numpy as np import tqdm.auto @@ -23,7 +37,7 @@ class RationaleClusterGenerator: - """Runner for rationale cluster generation.""" + """Runner for generating labeled clusters using embedding similarity.""" def __init__( self, @@ -216,15 +230,17 @@ def run( Sequence[_RationaleCluster], Sequence[Sequence[_RationaleBulletWithClusterSimilarity]], ]: - """Generating rationale clusters. + """Generate clusters of similar rationale bullets. Args: - rationale_bullets_for_examples: Rationale bullets. - num_clusters: Number of clusters. + rationale_bullets_for_examples: The rationale bullets from a + llm_comparator.rationale_bullet_generator.RationaleBulletGenerator that + will be clustered. + num_clusters: The number of clusters to generate. Returns: - List of clusters and rationale_bullets augmented with similarities to - clusters for dynamic assignments from the client. + A tuple of index-aligned lists of clusters and rationale_bullets augmented + with similarities to clusters for dynamic assignments from the client. """ # Flatten rationales. flattened_rationales = self._flatten_rationales( diff --git a/python/src/llm_comparator/types.py b/python/src/llm_comparator/types.py index 82333c1..ccff075 100644 --- a/python/src/llm_comparator/types.py +++ b/python/src/llm_comparator/types.py @@ -1,4 +1,18 @@ -"""Type classes for LLM Comparator.""" +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Types for LLM Comparator.""" from collections.abc import Mapping from typing import Any, Optional, TypedDict @@ -7,11 +21,11 @@ class IndividualRating(TypedDict): - score: float | None - rating_label: str | None - is_flipped: bool | None - rationale: str | None - custom_fields: JsonDict = {} + score: Optional[float] + rating_label: Optional[str] + is_flipped: Optional[bool] + rationale: Optional[str] + custom_fields: JsonDict class LLMJudgeInput(TypedDict): @@ -22,29 +36,26 @@ class LLMJudgeInput(TypedDict): class LLMJudgeOutput(TypedDict): score: float - individual_rater_scores: list[IndividualRating] = [] + individual_rater_scores: list[IndividualRating] class RationaleBullet(TypedDict): rationale: str -class RationaleBulletWithClusterSimilarity(TypedDict): - rationale: str +class RationaleBulletWithClusterSimilarity(RationaleBullet): similarities: list[float] class Example(TypedDict): input_text: str - tags: list[str] = [] + tags: list[str] output_text_a: str output_text_b: str - score: Optional[float] = None - individual_rater_scores: list[IndividualRating] = [] - rationale_list: list[ - RationaleBullet | RationaleBulletWithClusterSimilarity - ] = [] - custom_fields: JsonDict = {} + score: Optional[float] + individual_rater_scores: list[IndividualRating] + rationale_list: list[RationaleBullet] + custom_fields: JsonDict class RationaleCluster(TypedDict): diff --git a/python/src/llm_comparator/utils.py b/python/src/llm_comparator/utils.py index 8696462..526602f 100644 
--- a/python/src/llm_comparator/utils.py
+++ b/python/src/llm_comparator/utils.py
@@ -1,3 +1,17 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Utils for LLM Comparator scripts."""
 
 from collections.abc import Sequence
@@ -13,7 +27,7 @@
 
 
 def extract_xml_part(raw_output: str, tag_name: str) -> Optional[ET.Element]:
-  """Find parts where is in the XML-formatted output."""
+  """Find parts where <{tag_name}> is in the XML-formatted output."""
   xml_output = re.search(
       rf'<{tag_name}>(.*?)</{tag_name}>', raw_output, flags=re.DOTALL
   )
diff --git a/python/src/llm_comparator/utils_test.py b/python/src/llm_comparator/utils_test.py
index 108c24c..9606fef 100644
--- a/python/src/llm_comparator/utils_test.py
+++ b/python/src/llm_comparator/utils_test.py
@@ -1,4 +1,18 @@
-"""Tests for utils for LLM Comparator scripts."""
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for llm_comparator.utils.""" import unittest import xml.etree.ElementTree as ET