Skip to content

Commit

Permalink
Merge branch 'master' into DOCS-1290-follow
Browse files Browse the repository at this point in the history
  • Loading branch information
J2-D2-3PO authored Feb 25, 2025
2 parents 549ade7 + 555d401 commit eb0c049
Show file tree
Hide file tree
Showing 48 changed files with 3,332 additions and 165 deletions.
86 changes: 86 additions & 0 deletions docs/docs/guides/core-types/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,92 @@ A `Model` is a combination of data (which can include configuration, trained mod
model.predict('world')
```

## Pairwise evaluation of models

When [scoring](../evaluation/scorers.md) models in a Weave [evaluation](../core-types/evaluations.md), absolute value metrics (e.g. `9/10` for Model A and `8/10` for Model B) are typically harder to assign than relative ones (e.g. Model A performs better than Model B). _Pairwise evaluation_ allows you to compare the outputs of two models by ranking them relative to each other. This approach is particularly useful when you want to determine which model performs better for subjective tasks such as text generation, summarization, or question answering. With pairwise evaluation, you can obtain a relative preference ranking that reveals which model is best for specific inputs.

The following code sample demonstrates how to implement a pairwise evaluation in Weave by creating a [class-based scorer](../evaluation/scorers.md#class-based-scorers) called `PreferenceScorer`. The `PreferenceScorer` compares two models, `ModelA` and `ModelB`, and returns a relative score of the model outputs based on explicit hints in the input text.

```python
import asyncio
from typing import Any

import weave
from weave import Model, Evaluation, Scorer, Dataset
from weave.flow.model import ApplyModelError, apply_model_async

class ModelA(Model):
    @weave.op
    def predict(self, input_text: str):
        """Return a canned response; gives a strong answer when the input hints at Model A."""
        hinted = "Prefer model A" in input_text
        answer = "This is a great answer from Model A" if hinted else "Meh, whatever"
        return {"response": answer}

class ModelB(Model):
    @weave.op
    def predict(self, input_text: str):
        """Return a canned response; gives a strong answer when the input hints at Model B."""
        hinted = "Prefer model B" in input_text
        answer = "This is a thoughtful answer from Model B" if hinted else "I don't know"
        return {"response": answer}

class PreferenceScorer(Scorer):
    # The competing model whose output is compared against the primary model's.
    # Declared as a field so it can be passed at construction time
    # (e.g. PreferenceScorer(other_model=model_b)).
    other_model: Model

    @weave.op
    async def _get_other_model_output(self, example: dict) -> Any:
        """Get output from the other model for comparison.

        Args:
            example: The input example data to run through the other model.

        Returns:
            The output from the other model, or None if applying the model failed.
        """
        other_model_result = await apply_model_async(
            self.other_model,
            example,
            None,
        )

        # apply_model_async reports failure by returning an error object
        # rather than raising; treat that as "no output to compare against".
        if isinstance(other_model_result, ApplyModelError):
            return None

        return other_model_result.model_output

    @weave.op
    async def score(self, output: dict, input_text: str) -> dict:
        """Compare the output of the primary model with the other model.

        Args:
            output (dict): The output from the primary model.
            input_text (str): The input text used to generate the outputs.

        Returns:
            dict: A flat dictionary containing the comparison result and reason.
        """
        # Fix: the original referenced an undefined name `inputs` here;
        # the parameter is called `input_text`.
        other_output = await self._get_other_model_output(
            {"input_text": input_text}
        )
        if other_output is None:
            return {"primary_is_better": False, "reason": "Other model failed"}

        if "Prefer model A" in input_text:
            primary_is_better = True
            reason = "Model A gave a great answer"
        else:
            primary_is_better = False
            reason = "Model B is preferred for this type of question"

        return {"primary_is_better": primary_is_better, "reason": reason}

dataset = Dataset(
    rows=[
        {"input_text": "Prefer model A: Question 1"},  # Model A wins
        {"input_text": "Prefer model A: Question 2"},  # Model A wins
        {"input_text": "Prefer model B: Question 3"},  # Model B wins
        {"input_text": "Prefer model B: Question 4"},  # Model B wins
    ]
)

model_a = ModelA()
model_b = ModelB()
# Score model_a's outputs relative to model_b's on every dataset row.
pref_scorer = PreferenceScorer(other_model=model_b)
evaluation = Evaluation(dataset=dataset, scorers=[pref_scorer])
# Evaluation.evaluate is a coroutine; calling it bare would never run it.
asyncio.run(evaluation.evaluate(model_a))
```
</TabItem>
<TabItem value="typescript" label="TypeScript">
```plaintext
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ module = "weave_query.*"
ignore_errors = true

[tool.bumpversion]
current_version = "0.51.35-dev0"
current_version = "0.51.36-dev0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
Expand Down
26 changes: 20 additions & 6 deletions tests/integrations/langchain/langchain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
flatten_calls,
op_name_from_ref,
)
from weave.trace.context import call_context
from weave.trace.weave_client import Call, WeaveClient
from weave.trace_server import trace_server_interface as tsi

Expand Down Expand Up @@ -181,9 +182,7 @@ def assert_correct_calls_for_chain_batch(calls: list[Call]) -> None:
allowed_hosts=["api.wandb.ai", "localhost", "trace.wandb.ai"],
before_record_request=filter_body,
)
def test_simple_chain_batch(
client: WeaveClient,
) -> None:
def test_simple_chain_batch(client: WeaveClient) -> None:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

Expand Down Expand Up @@ -253,9 +252,7 @@ def assert_correct_calls_for_chain_batch_from_op(calls: list[Call]) -> None:
allowed_hosts=["api.wandb.ai", "localhost", "trace.wandb.ai"],
before_record_request=filter_body,
)
def test_simple_chain_batch_inside_op(
client: WeaveClient,
) -> None:
def test_simple_chain_batch_inside_op(client: WeaveClient) -> None:
# This test is the same as test_simple_chain_batch, but ensures things work when nested in an op
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
Expand All @@ -271,6 +268,23 @@ def test_simple_chain_batch_inside_op(
def run_batch(batch: list) -> None:
_ = llm_chain.batch(batch)

# assert call stack is properly constructed, during runtime
parent = call_context.get_current_call()
assert parent is not None
assert "run_batch" in parent.op_name
assert parent.parent_id is None
assert len(parent.children()) == 2
for child in parent.children():
assert "langchain.Chain.RunnableSequence" in child.op_name
assert child.parent_id == parent.id

grandchildren = child.children()
assert len(grandchildren) == 2
assert "langchain.Prompt.PromptTemplate" in grandchildren[0].op_name
assert grandchildren[0].parent_id == child.id
assert "langchain.Llm.ChatOpenAI" in grandchildren[1].op_name
assert grandchildren[1].parent_id == child.id

run_batch([{"number": 2}, {"number": 3}])

calls = list(client.calls(filter=tsi.CallsFilter(trace_roots_only=True)))
Expand Down
33 changes: 33 additions & 0 deletions tests/trace/test_call_behaviours.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,36 @@ async def test_async_call_doesnt_print_link_if_failed(client_with_throwing_serve
await afunc()

assert captured.getvalue().count(TRACE_CALL_EMOJI) == 0


def test_nested_calls_print_single_link(client):
    """Nesting three ops records all three calls but prints exactly one trace link,
    and that link points at the root ("outer") call."""

    @weave.op
    def inner(a, b):
        return a + b

    @weave.op
    def middle(a, b):
        return inner(a, b)

    @weave.op
    def outer(a, b):
        return middle(a, b)

    callbacks = [flushing_callback(client)]
    with capture_output(callbacks) as captured:
        outer(1, 2)

    # All three nested calls must have been recorded.
    recorded = list(client.get_calls())
    assert len(recorded) == 3

    # Exactly one trace link (donut emoji) should appear in the output.
    printed = captured.getvalue()
    assert printed.count(TRACE_CALL_EMOJI) == 1

    # The printed link must point at the root ("outer") call.
    call_id = printed.strip("\n").rsplit("/", 1)[1]
    root_call = client.get_call(call_id)
    assert "outer" in root_call.op_name
60 changes: 59 additions & 1 deletion tests/trace/test_serialize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from weave.trace.serialize import dictify, fallback_encode
from pydantic import BaseModel

from weave.trace.serialize import (
dictify,
fallback_encode,
is_pydantic_model_class,
to_json,
)


def test_dictify_simple() -> None:
Expand Down Expand Up @@ -199,3 +206,54 @@ def __init__(self, a: MyClassA) -> None:
"api_key": "REDACTED",
},
}


def test_is_pydantic_model_class() -> None:
    """is_pydantic_model_class must be True only for Pydantic model *classes*.

    It should be False for builtin types, arbitrary values, and — notably —
    for *instances* of Pydantic model classes.
    """
    non_model_candidates = [int, str, list, dict, tuple, set, None, 42, "foo", {}, []]
    for candidate in non_model_candidates:
        assert not is_pydantic_model_class(candidate)

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    instance = CalendarEvent(
        name="Test", date="2024-01-01", participants=["Alice", "Bob"]
    )
    assert not is_pydantic_model_class(instance)
    assert is_pydantic_model_class(CalendarEvent)


def test_to_json_pydantic_class(client) -> None:
    """Serializing a Pydantic model *class* with to_json should yield its JSON schema."""

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    result = to_json(CalendarEvent, "entity/project", client, use_dictify=False)

    expected_properties = {
        "name": {"title": "Name", "type": "string"},
        "date": {"title": "Date", "type": "string"},
        "participants": {
            "items": {"type": "string"},
            "title": "Participants",
            "type": "array",
        },
    }
    assert result == {
        "properties": expected_properties,
        "required": ["name", "date", "participants"],
        "title": "CalendarEvent",
        "type": "object",
    }
2 changes: 1 addition & 1 deletion weave-js/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"eslint-fix": "eslint --fix --ext .js,.jsx,.ts,.tsx src",
"tslint": "tslint --project .",
"tslint-fix": "tslint --fix --project .",
"generate": "graphql-codegen",
"generate": "graphql-codegen --silent",
"generate:watch": "graphql-codegen -w",
"prettier": "prettier --config .prettierrc --check \"src/**/*.ts\" \"src/**/*.tsx\"",
"prettier-fix": "prettier --loglevel warn --config .prettierrc --write \"src/**/*.ts\" \"src/**/*.tsx\"",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@ import {CellValueString} from './CellValueString';

type CellValueProps = {
value: any;
noLink?: boolean;
};

export const CellValue = ({value}: CellValueProps) => {
export const CellValue = ({value, noLink}: CellValueProps) => {
if (value === undefined) {
return null;
}
if (value === null) {
return <ValueViewPrimitive>null</ValueViewPrimitive>;
}
if (isWeaveRef(value) || isArtifactRef(value)) {
return <SmallRef objRef={parseRef(value)} />;
return <SmallRef objRef={parseRef(value)} noLink={noLink} />;
}
if (typeof value === 'boolean') {
return (
Expand Down
Loading

0 comments on commit eb0c049

Please sign in to comment.