Merge branch 'master' into update-costs-2/24
jwlee64 authored Feb 24, 2025
2 parents 964b8f0 + 0abb752 commit c26a19b
Showing 86 changed files with 3,641 additions and 473 deletions.
90 changes: 88 additions & 2 deletions docs/docs/guides/core-types/models.md
@@ -3,10 +3,10 @@ import TabItem from '@theme/TabItem';

# Models

A `Model` is a combination of data (which can include configuration, trained model weights, or other information) and code that defines how the model operates. By structuring your code to be compatible with this API, you benefit from a structured way to version your application so you can more systematically keep track of your experiments.

<Tabs groupId="programming-language" queryString>
<TabItem value="python" label="Python" default>
A `Model` is a combination of data (which can include configuration, trained model weights, or other information) and code that defines how the model operates. By structuring your code to be compatible with this API, you benefit from a structured way to version your application so you can more systematically keep track of your experiments.

To create a model in Weave, you need the following:

- a class that inherits from `weave.Model`
@@ -76,6 +76,92 @@ import TabItem from '@theme/TabItem';
model.predict('world')
```
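
For reference, a minimal sketch of such a class, with illustrative attribute and method names (not taken from this file):

```python
import weave


class GreetingModel(weave.Model):
    # Configuration is stored as typed attributes and versioned with the model
    prompt_template: str = "Hello, {name}!"

    @weave.op
    def predict(self, name: str) -> str:
        # Any logic can live here: an LLM call, a pipeline, or plain Python
        return self.prompt_template.format(name=name)


# weave.init("my-project")  # start tracking before calling the model
# GreetingModel().predict("world")
```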

## Pairwise evaluation of models

When [scoring](../evaluation/scorers.md) models in a Weave [evaluation](../core-types/evaluations.md), absolute value metrics (e.g. `9/10` for Model A and `8/10` for Model B) are typically harder to assign than relative ones (e.g. Model A performs better than Model B). _Pairwise evaluation_ allows you to compare the outputs of two models by ranking them relative to each other. This approach is particularly useful when you want to determine which model performs better for subjective tasks such as text generation, summarization, or question answering. With pairwise evaluation, you can obtain a relative preference ranking that reveals which model is best for specific inputs.

The following code sample demonstrates how to implement a pairwise evaluation in Weave by creating a [class-based scorer](../evaluation/scorers.md#class-based-scorers) called `PreferenceScorer`. The `PreferenceScorer` compares two models, `ModelA` and `ModelB`, and returns a relative score of the model outputs based on explicit hints in the input text.

```python
import asyncio
from typing import Any

import weave
from weave import Model, Evaluation, Scorer, Dataset
from weave.flow.model import ApplyModelError, apply_model_async


class ModelA(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model A" in input_text:
            return {"response": "This is a great answer from Model A"}
        return {"response": "Meh, whatever"}


class ModelB(Model):
    @weave.op
    def predict(self, input_text: str):
        if "Prefer model B" in input_text:
            return {"response": "This is a thoughtful answer from Model B"}
        return {"response": "I don't know"}


class PreferenceScorer(Scorer):
    other_model: Model

    @weave.op
    async def _get_other_model_output(self, example: dict) -> Any:
        """Get output from the other model for comparison.

        Args:
            example: The input example data to run through the other model
        Returns:
            The output from the other model
        """
        other_model_result = await apply_model_async(
            self.other_model,
            example,
            None,
        )

        if isinstance(other_model_result, ApplyModelError):
            return None

        return other_model_result.model_output

    @weave.op
    async def score(self, output: dict, input_text: str) -> dict:
        """Compare the output of the primary model with the other model.

        Args:
            output (dict): The output from the primary model.
            input_text (str): The input text used to generate the outputs.
        Returns:
            dict: A flat dictionary containing the comparison result and reason.
        """
        other_output = await self._get_other_model_output(
            {"input_text": input_text}
        )
        if other_output is None:
            return {"primary_is_better": False, "reason": "Other model failed"}

        if "Prefer model A" in input_text:
            primary_is_better = True
            reason = "Model A gave a great answer"
        else:
            primary_is_better = False
            reason = "Model B is preferred for this type of question"

        return {"primary_is_better": primary_is_better, "reason": reason}


dataset = Dataset(
    rows=[
        {"input_text": "Prefer model A: Question 1"},  # Model A wins
        {"input_text": "Prefer model A: Question 2"},  # Model A wins
        {"input_text": "Prefer model B: Question 3"},  # Model B wins
        {"input_text": "Prefer model B: Question 4"},  # Model B wins
    ]
)

model_a = ModelA()
model_b = ModelB()
pref_scorer = PreferenceScorer(other_model=model_b)
evaluation = Evaluation(dataset=dataset, scorers=[pref_scorer])
asyncio.run(evaluation.evaluate(model_a))
```
</TabItem>
<TabItem value="typescript" label="TypeScript">
```plaintext
40 changes: 23 additions & 17 deletions docs/docs/guides/integrations/azure.md
@@ -1,30 +1,36 @@
# Microsoft Azure

Weights & Biases integrates with Microsoft Azure OpenAI services, helping teams to manage, debug, and optimize their Azure AI workflows at scale. This guide introduces the W&B integration, what it means for Weave users, its key features, and how to get started.
Weights & Biases (W&B) Weave integrates with Microsoft Azure OpenAI services, helping teams to optimize their Azure AI applications. Using W&B, you can evaluate, monitor, and debug your Azure AI workflows at scale.

:::tip
For the latest tutorials, visit [Weights & Biases on Microsoft Azure](https://wandb.ai/site/partners/azure).
:::

## Key features

- **LLM evaluations**: Evaluate and monitor LLM-powered applications using Weave, optimized for Azure infrastructure.
- **Seamless integration**: Deploy W&B Models on a dedicated Azure tenant with built-in integrations for Azure AI Studio, Azure ML, Azure OpenAI Service, and other Azure AI services.
- **Enhanced performance**: Use Azure’s infrastructure to train and deploy models faster, with auto-scaling clusters and optimized resources.
- **Scalable experiment tracking**: Automatically log hyperparameters, metrics, and artifacts for Azure AI Studio and Azure ML runs.
- **LLM fine-tuning**: Fine-tune models with W&B Models.
- **Central repository for models and datasets**: Manage and version models and datasets with W&B Registry and Azure AI Studio.
- **Collaborative workspaces**: Support teamwork with shared workspaces, experiment commenting, and Microsoft Teams integration.
- **Governance framework**: Ensure security with fine-grained access controls, audit trails, and Microsoft Entra ID integration.

## Getting started

To use W&B with Azure, add the W&B integration via the [Azure Marketplace](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/weightsandbiasesinc1641502883483.weights_biases_for_azure?tab=Overview).
To get started using Azure with Weave, simply decorate the function(s) you want to track with `weave.op`.

For a detailed guide describing how to integrate Azure OpenAI fine-tuning with W&B, see [Integrating Weights & Biases with Azure AI Services](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/weights-and-biases-integration).
```python
@weave.op()
def call_azure_chat(model_id: str, messages: list, max_tokens: int = 1000, temperature: float = 0.5):
    response = client.chat.completions.create(
        model=model_id,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return {"status": "success", "response": response.choices[0].message.content}

```
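
The example above assumes an existing Azure OpenAI `client` and an initialized Weave project. A minimal sketch of that setup and a call to the traced function might look like the following; the project name, API version, deployment name, and environment variable names are placeholders rather than values from this guide:

```python
import os

import weave
from openai import AzureOpenAI

# Start a Weave project so calls to @weave.op functions are traced
weave.init("azure-weave-demo")

# Standard Azure OpenAI client; endpoint, key, and API version come from your Azure resource
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-06-01",
)

result = call_azure_chat(
    "gpt-4o",  # your Azure deployment name
    messages=[{"role": "user", "content": "Summarize what W&B Weave does."}],
)
print(result["response"])
```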

## Learn more

- [Weights & Biases + Microsoft Azure Overview](https://wandb.ai/site/partners/azure)
- [How W&B and Microsoft Azure Are Empowering Enterprises](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/how-weights--biases-and-microsoft-azure-are-empowering-enterprises-to-fine-tune-/4303716)
- [Microsoft Azure OpenAI Service Documentation](https://learn.microsoft.com/en-us/azure/ai-services/openai/)
Learn more about advanced topics for using Azure with Weave in the resources below.

### Use the Azure AI Model Inference API with Weave

Learn how to use the Azure AI Model Inference API with Weave to gain insights into Azure models in [this guide](https://wandb.ai/byyoung3/ML-NEWS2/reports/A-guide-to-using-the-Azure-AI-model-inference-API--Vmlldzo4OTY1MjEy#tutorial:-implementing-azure-ai-model-inference-api-with-w&b-weave-).

### Trace Azure OpenAI models with Weave

Learn how to trace Azure OpenAI models using Weave in [this guide](https://wandb.ai/a-sh0ts/azure-weave-cookbook/reports/How-to-use-Azure-OpenAI-and-Azure-AI-Studio-with-W-B-Weave--Vmlldzo4MTI0NDgy).
18 changes: 14 additions & 4 deletions docs/docs/guides/integrations/bedrock.md
@@ -2,14 +2,12 @@

Weave automatically tracks and logs LLM calls made via Amazon Bedrock, AWS's fully managed service that offers foundation models from leading AI companies through a unified API.

There are multiple ways to log LLM calls to Weave from Amazon Bedrock. You can use `weave.op` to create reusable operations for tracking any calls to a Bedrock model. Optionally, if you're using Anthropic models, you can use Weave’s built-in integration with Anthropic.

:::tip
For the latest tutorials, visit [Weights & Biases on Amazon Web Services](https://wandb.ai/site/partners/aws/).
:::

:::note
Do you want to experiment with Amazon Bedrock models on Weave without any set up? Try the [LLM Playground](../tools/playground.md).
:::

## Traces

Weave will automatically capture traces for Bedrock API calls. You can use the Bedrock client as usual after initializing Weave and patching the client:
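
The client code is collapsed in this diff; as a rough sketch of the pattern described above, assuming the `patch_client` helper from Weave's Bedrock integration (the project name and model ID are placeholders):

```python
import json

import boto3
import weave
from weave.integrations.bedrock.bedrock_sdk import patch_client

weave.init("bedrock-weave-demo")

# Create a Bedrock runtime client and patch it so its calls are traced in Weave
client = boto3.client("bedrock-runtime")
patch_client(client)

# Then use the client as you normally would
response = client.invoke_model(
    modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
    contentType="application/json",
    accept="application/json",
    body=json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 100,
        "messages": [{"role": "user", "content": "What is the capital of France?"}],
    }),
)
print(json.loads(response["body"].read())["content"][0]["text"])
```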
@@ -143,3 +141,15 @@ print(result)
```

This approach allows you to version your experiments and easily track different configurations of your Bedrock-based application.
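
The model definition referenced here is collapsed in the diff; as a rough sketch of the idea, a `weave.Model` can hold the Bedrock settings as versioned attributes while a `@weave.op`-decorated `predict` method makes the call (the class name, model ID, and prompts are illustrative):

```python
import boto3
import weave


class BedrockSummarizer(weave.Model):
    model_id: str = "anthropic.claude-3-5-sonnet-20240620-v1:0"
    max_tokens: int = 256
    system_prompt: str = "Summarize the user's text in one sentence."

    @weave.op
    def predict(self, text: str) -> str:
        client = boto3.client("bedrock-runtime")
        # The Converse API receives the versioned configuration stored on the model
        response = client.converse(
            modelId=self.model_id,
            system=[{"text": self.system_prompt}],
            messages=[{"role": "user", "content": [{"text": text}]}],
            inferenceConfig={"maxTokens": self.max_tokens},
        )
        return response["output"]["message"]["content"][0]["text"]


# weave.init("bedrock-weave-demo")
# print(BedrockSummarizer().predict("Weave traces and evaluates LLM applications."))
```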

## Learn more

Learn more about using Amazon Bedrock with Weave in the resources below.

### Try Bedrock in the Weave Playground

Do you want to experiment with Amazon Bedrock models in the Weave UI without any set up? Try the [LLM Playground](../tools/playground.md).

### Report: Compare LLMs on Bedrock for text summarization with Weave

The [Compare LLMs on Bedrock for text summarization with Weave](https://wandb.ai/byyoung3/ML_NEWS3/reports/Compare-LLMs-on-Amazon-Bedrock-for-text-summarization-with-W-B-Weave--VmlldzoxMDI1MTIzNw) report explains how to use Bedrock in combination with Weave to evaluate and compare LLMs for summarization tasks, code samples included.
@@ -1,21 +1,21 @@
# Google Gemini
# Google

:::tip
For the latest tutorials, visit [Weights & Biases on Google Cloud](https://wandb.ai/site/partners/googlecloud/).
:::

:::note
Do you want to experiment with Google Gemini models on Weave without any set up? Try the [LLM Playground](../tools/playground.md).
Do you want to experiment with Google AI models on Weave without any set up? Try the [LLM Playground](../tools/playground.md).
:::

Google offers two ways of calling Gemini via API:
This page describes how to use W&B Weave with the Google Vertex AI API and the Google Gemini API.

1. Via the [Vertex APIs](https://cloud.google.com/vertex-ai/docs).
2. Via the [Gemini API SDK](https://ai.google.dev/gemini-api/docs/quickstart?lang=python).
You can use Weave to evaluate, monitor, and iterate on your Google GenAI applications. Weave automatically captures traces for the following:

## Tracing
1. [Google Vertex AI API](https://cloud.google.com/vertex-ai/docs), which provides access to Google’s Gemini models and [various partner models](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-partner-models).
2. [Google Gemini API](https://ai.google.dev/gemini-api/docs/quickstart?lang=python), which is accessible via Python SDK, Node.js SDK, Go SDK, and REST.

It’s important to store traces of language model applications in a central location, both during development and in production. These traces can be useful for debugging, and as a dataset that will help you improve your application.
## Get started

Weave will automatically capture traces for the [Gemini API SDK](https://ai.google.dev/gemini-api/docs/quickstart?lang=python). To start tracking, call `weave.init(project_name="<YOUR-WANDB-PROJECT-NAME>")` and use the library as normal.
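
For example, a minimal tracing setup with the `google-generativeai` Python SDK might look like the following; the project name, API key, and model name are placeholders:

```python
import google.generativeai as genai
import weave

weave.init(project_name="google-weave-demo")

genai.configure(api_key="YOUR_GEMINI_API_KEY")
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Write a haiku about tracing LLM calls.")
print(response.text)
```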

@@ -120,3 +120,4 @@ Given a weave reference to any `weave.Model` object, you can spin up a fastapi s
```shell
weave serve weave:///your_entity/project-name/YourModel:<hash>
```

2 changes: 1 addition & 1 deletion docs/docs/guides/integrations/index.md
@@ -14,7 +14,7 @@ LLM providers are the vendors that offer access to large language models for gen
- **[Anthropic](/guides/integrations/anthropic)**
- **[Cerebras](/guides/integrations/cerebras)**
- **[Cohere](/guides/integrations/cohere)**
- **[Google Gemini](/guides/integrations/google-gemini)**
- **[Google](/guides/integrations/google)**
- **[Groq](/guides/integrations/groq)**
- **[Hugging Face Hub](/guides/integrations/huggingface)**
- **[LiteLLM](/guides/integrations/litellm)**
11 changes: 5 additions & 6 deletions docs/docs/guides/tools/playground.md
@@ -2,7 +2,7 @@

> **The LLM Playground is currently in preview.**
Evaluating LLM prompts and responses is challenging. The Weave Playground is designed to simplify the process of iterating on LLM prompts and responses, making it easier to experiment with different models and prompts. With features like prompt editing, message retrying, and model comparison, Playground helps you to quickly test and improve your LLM applications. Playground currently supports OpenAI, Anthropic, Google Gemini, Groq, and Amazon Bedrock models.
Evaluating LLM prompts and responses is challenging. The Weave Playground is designed to simplify the process of iterating on LLM prompts and responses, making it easier to experiment with different models and prompts. With features like prompt editing, message retrying, and model comparison, Playground helps you to quickly test and improve your LLM applications. Playground currently supports models from OpenAI, Anthropic, Google, Groq, Amazon Bedrock, and Microsoft Azure.

## Features

@@ -32,12 +32,11 @@ Before you can use Playground, you must [add provider credentials](#add-provider

### Add provider credentials and information

Playground currently supports OpenAI, Anthropic, Gemini, Groq, Amazon Bedrock, and Azure models.
To use one of the available models, add the appropriate information to your team secrets in W&B settings.
Playground currently supports models from OpenAI, Anthropic, Google, Groq, Amazon Bedrock, and Microsoft Azure. To use one of the available models, add the appropriate information to your team secrets in W&B settings.

- OpenAI: `OPENAI_API_KEY`
- Anthropic: `ANTHROPIC_API_KEY`
- Google Gemini: `GOOGLE_API_KEY`
- Google: `GOOGLE_API_KEY`
- Groq: `GEMMA_API_KEY`
- Amazon Bedrock:
- `AWS_ACCESS_KEY_ID`
@@ -67,7 +66,7 @@ You can switch the LLM using the dropdown menu in the top left. The available mo
- [Amazon Bedrock](#amazon-bedrock)
- [Anthropic](#anthropic)
- [Azure](#azure)
- [Google Gemini](#gemini)
- [Google](#google)
- [Groq](#groq)
- [OpenAI](#openai)
- [X.AI](#xai)
@@ -126,7 +125,7 @@ You can switch the LLM using the dropdown menu in the top left. The available mo
- azure/gpt-4o-mini
- azure/gpt-4o-mini-2024-07-18

### [Google Gemini](../integrations/google-gemini.md)
### [Google](../integrations/google.md)

- gemini/gemini-1.5-flash-001
- gemini/gemini-1.5-flash-002
2 changes: 1 addition & 1 deletion docs/sidebars.ts
@@ -132,7 +132,7 @@ const sidebars: SidebarsConfig = {
"guides/integrations/anthropic",
"guides/integrations/cerebras",
"guides/integrations/cohere",
"guides/integrations/google-gemini",
"guides/integrations/google",
"guides/integrations/groq",
"guides/integrations/huggingface",
"guides/integrations/litellm",
7 changes: 7 additions & 0 deletions tests/integrations/langchain/langchain_test.py
@@ -183,6 +183,7 @@ def assert_correct_calls_for_chain_batch(calls: list[Call]) -> None:
)
def test_simple_chain_batch(
    client: WeaveClient,
    capsys: pytest.CaptureFixture[str],
) -> None:
    from langchain_core.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI
@@ -198,6 +199,12 @@ def test_simple_chain_batch(
    calls = list(client.calls(filter=tsi.CallsFilter(trace_roots_only=True)))
    assert_correct_calls_for_chain_batch(calls)

    log_lines = capsys.readouterr().out

    # one parent call link
    assert log_lines.count("/shawn/test-project/r/call") == 1
    assert "Error in WeaveTracer.on_chain_start callback" not in log_lines


@pytest.mark.skip_clickhouse_client
@pytest.mark.vcr(
79 changes: 79 additions & 0 deletions tests/integrations/pandas-test/test_calls_to_pandas.py
@@ -0,0 +1,79 @@
import pandas as pd
import pytest

import weave


@weave.op
def func(name: str, age: int) -> str:
    return f"Hello, {name}! You are {age} years old."


@weave.op
def raising_func(name: str, age: int) -> str:
    raise ValueError("This is a test error")


@pytest.fixture
def logging_example(client):
    func("Alice", 30)

    with weave.attributes({"tag": "test", "version": "1.0"}):
        func("Bob", 25)

    try:
        raising_func("Claire", 35)
    except:
        pass


def test_calls_to_pandas_basic(logging_example, client):
    calls = client.get_calls()
    df = calls.to_pandas()

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 3  # The three calls we made

    dictified = df.to_dict(orient="records")
    calls_as_dicts = [c.to_dict() for c in calls]

    for d1, d2 in zip(dictified, calls_as_dicts):
        assert d1 == d2


def test_calls_to_pandas_with_limit(logging_example, client):
    calls = client.get_calls(limit=1)
    df = calls.to_pandas()

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 1

    dictified = df.to_dict(orient="records")

    # Maintains insertion order
    d = dictified[0]
    assert d["inputs"]["name"] == "Alice"
    assert d["inputs"]["age"] == 30


@pytest.mark.asyncio
async def test_calls_to_pandas_with_evaluations(client):
    @weave.op
    def model(x: int, y: int) -> int:
        return x + y

    ev = weave.Evaluation(
        dataset=[
            {"x": 1, "y": 2},
            {"x": 3, "y": 4},
            {"x": 5, "y": 6},
        ]
    )
    res = await ev.evaluate(model)

    calls_df = client.get_calls().to_pandas()
    assert len(calls_df) == (
        1  # evaluate
        + 3 * 2  # predict and score + model
        + 1  # summarize
    )
@@ -2,7 +2,7 @@

import pytest

from weave.trace.patcher import SymbolPatcher
from weave.integrations.patcher import SymbolPatcher


def test_symbol_patcher():