Skip to content

Commit

Permalink
Merge branch 'master' into DOCS-1290-follow
Browse files Browse the repository at this point in the history
  • Loading branch information
J2-D2-3PO authored Feb 25, 2025
2 parents 549ade7 + 555d401 commit eb0c049
Show file tree
Hide file tree
Showing 48 changed files with 3,332 additions and 165 deletions.
86 changes: 86 additions & 0 deletions docs/docs/guides/core-types/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,92 @@ A `Model` is a combination of data (which can include configuration, trained mod
model.predict('world')
```

## Pairwise evaluation of models

When [scoring](../evaluation/scorers.md) models in a Weave [evaluation](../core-types/evaluations.md), absolute value metrics (e.g. `9/10` for Model A and `8/10` for Model B) are typically harder to assign than relative ones (e.g. Model A performs better than Model B). _Pairwise evaluation_ allows you to compare the outputs of two models by ranking them relative to each other. This approach is particularly useful when you want to determine which model performs better for subjective tasks such as text generation, summarization, or question answering. With pairwise evaluation, you can obtain a relative preference ranking that reveals which model is best for specific inputs.

The following code sample demonstrates how to implement a pairwise evaluation in Weave by creating a [class-based scorer](../evaluation/scorers.md#class-based-scorers) called `PreferenceScorer`. The `PreferenceScorer` compares two models, `ModelA` and `ModelB`, and returns a relative score of the model outputs based on explicit hints in the input text.

```python
import asyncio
from typing import Any

import weave
from weave import Model, Evaluation, Scorer, Dataset
from weave.flow.model import ApplyModelError, apply_model_async

class ModelA(Model):
    @weave.op
    def predict(self, input_text: str):
        """Return a canned response; gives a strong answer when the input hints at Model A."""
        hinted = "Prefer model A" in input_text
        answer = "This is a great answer from Model A" if hinted else "Meh, whatever"
        return {"response": answer}

class ModelB(Model):
    @weave.op
    def predict(self, input_text: str):
        """Return a canned response; gives a strong answer when the input hints at Model B."""
        hinted = "Prefer model B" in input_text
        answer = "This is a thoughtful answer from Model B" if hinted else "I don't know"
        return {"response": answer}

class PreferenceScorer(Scorer):
    # The competing model whose output is compared against the primary model's.
    # Declared as a field so it can be passed at construction time
    # (e.g. PreferenceScorer(other_model=model_b)).
    other_model: Model

    @weave.op
    async def _get_other_model_output(self, example: dict) -> Any:
        """Get output from the other model for comparison.

        Args:
            example: The input example data to run through the other model.

        Returns:
            The output from the other model, or None if applying the model failed.
        """
        other_model_result = await apply_model_async(
            self.other_model,
            example,
            None,
        )

        # apply_model_async reports failure by returning an error object
        # rather than raising; treat that as "no output to compare against".
        if isinstance(other_model_result, ApplyModelError):
            return None

        return other_model_result.model_output

    @weave.op
    async def score(self, output: dict, input_text: str) -> dict:
        """Compare the output of the primary model with the other model.

        Args:
            output (dict): The output from the primary model.
            input_text (str): The input text used to generate the outputs.

        Returns:
            dict: A flat dictionary containing the comparison result and reason.
        """
        # Fix: the original referenced an undefined name `inputs` here;
        # the parameter is called `input_text`.
        other_output = await self._get_other_model_output(
            {"input_text": input_text}
        )
        if other_output is None:
            return {"primary_is_better": False, "reason": "Other model failed"}

        if "Prefer model A" in input_text:
            primary_is_better = True
            reason = "Model A gave a great answer"
        else:
            primary_is_better = False
            reason = "Model B is preferred for this type of question"

        return {"primary_is_better": primary_is_better, "reason": reason}

dataset = Dataset(
    rows=[
        {"input_text": "Prefer model A: Question 1"},  # Model A wins
        {"input_text": "Prefer model A: Question 2"},  # Model A wins
        {"input_text": "Prefer model B: Question 3"},  # Model B wins
        {"input_text": "Prefer model B: Question 4"},  # Model B wins
    ]
)

model_a = ModelA()
model_b = ModelB()
# Score model_a's outputs relative to model_b's on every dataset row.
pref_scorer = PreferenceScorer(other_model=model_b)
evaluation = Evaluation(dataset=dataset, scorers=[pref_scorer])
# Evaluation.evaluate is a coroutine; calling it bare would never run it.
asyncio.run(evaluation.evaluate(model_a))
```
</TabItem>
<TabItem value="typescript" label="TypeScript">
```plaintext
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ module = "weave_query.*"
ignore_errors = true

[tool.bumpversion]
current_version = "0.51.35-dev0"
current_version = "0.51.36-dev0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
Expand Down
26 changes: 20 additions & 6 deletions tests/integrations/langchain/langchain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
flatten_calls,
op_name_from_ref,
)
from weave.trace.context import call_context
from weave.trace.weave_client import Call, WeaveClient
from weave.trace_server import trace_server_interface as tsi

Expand Down Expand Up @@ -181,9 +182,7 @@ def assert_correct_calls_for_chain_batch(calls: list[Call]) -> None:
allowed_hosts=["api.wandb.ai", "localhost", "trace.wandb.ai"],
before_record_request=filter_body,
)
def test_simple_chain_batch(
client: WeaveClient,
) -> None:
def test_simple_chain_batch(client: WeaveClient) -> None:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

Expand Down Expand Up @@ -253,9 +252,7 @@ def assert_correct_calls_for_chain_batch_from_op(calls: list[Call]) -> None:
allowed_hosts=["api.wandb.ai", "localhost", "trace.wandb.ai"],
before_record_request=filter_body,
)
def test_simple_chain_batch_inside_op(
client: WeaveClient,
) -> None:
def test_simple_chain_batch_inside_op(client: WeaveClient) -> None:
# This test is the same as test_simple_chain_batch, but ensures things work when nested in an op
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
Expand All @@ -271,6 +268,23 @@ def test_simple_chain_batch_inside_op(
def run_batch(batch: list) -> None:
_ = llm_chain.batch(batch)

# assert call stack is properly constructed, during runtime
parent = call_context.get_current_call()
assert parent is not None
assert "run_batch" in parent.op_name
assert parent.parent_id is None
assert len(parent.children()) == 2
for child in parent.children():
assert "langchain.Chain.RunnableSequence" in child.op_name
assert child.parent_id == parent.id

grandchildren = child.children()
assert len(grandchildren) == 2
assert "langchain.Prompt.PromptTemplate" in grandchildren[0].op_name
assert grandchildren[0].parent_id == child.id
assert "langchain.Llm.ChatOpenAI" in grandchildren[1].op_name
assert grandchildren[1].parent_id == child.id

run_batch([{"number": 2}, {"number": 3}])

calls = list(client.calls(filter=tsi.CallsFilter(trace_roots_only=True)))
Expand Down
33 changes: 33 additions & 0 deletions tests/trace/test_call_behaviours.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,36 @@ async def test_async_call_doesnt_print_link_if_failed(client_with_throwing_serve
await afunc()

assert captured.getvalue().count(TRACE_CALL_EMOJI) == 0


def test_nested_calls_print_single_link(client):
    """Nesting three ops records all three calls but prints exactly one trace link,
    and that link points at the root ("outer") call."""

    @weave.op
    def inner(a, b):
        return a + b

    @weave.op
    def middle(a, b):
        return inner(a, b)

    @weave.op
    def outer(a, b):
        return middle(a, b)

    callbacks = [flushing_callback(client)]
    with capture_output(callbacks) as captured:
        outer(1, 2)

    # All three nested calls must have been recorded.
    recorded = list(client.get_calls())
    assert len(recorded) == 3

    # Exactly one trace link (donut emoji) should appear in the output.
    printed = captured.getvalue()
    assert printed.count(TRACE_CALL_EMOJI) == 1

    # The printed link must point at the root ("outer") call.
    call_id = printed.strip("\n").rsplit("/", 1)[1]
    root_call = client.get_call(call_id)
    assert "outer" in root_call.op_name
60 changes: 59 additions & 1 deletion tests/trace/test_serialize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
from weave.trace.serialize import dictify, fallback_encode
from pydantic import BaseModel

from weave.trace.serialize import (
dictify,
fallback_encode,
is_pydantic_model_class,
to_json,
)


def test_dictify_simple() -> None:
Expand Down Expand Up @@ -199,3 +206,54 @@ def __init__(self, a: MyClassA) -> None:
"api_key": "REDACTED",
},
}


def test_is_pydantic_model_class() -> None:
    """is_pydantic_model_class must be True only for Pydantic model *classes*.

    It should be False for builtin types, arbitrary values, and — notably —
    for *instances* of Pydantic model classes.
    """
    non_model_candidates = [int, str, list, dict, tuple, set, None, 42, "foo", {}, []]
    for candidate in non_model_candidates:
        assert not is_pydantic_model_class(candidate)

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    instance = CalendarEvent(
        name="Test", date="2024-01-01", participants=["Alice", "Bob"]
    )
    assert not is_pydantic_model_class(instance)
    assert is_pydantic_model_class(CalendarEvent)


def test_to_json_pydantic_class(client) -> None:
    """Serializing a Pydantic model *class* with to_json should yield its JSON schema."""

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    result = to_json(CalendarEvent, "entity/project", client, use_dictify=False)

    expected_properties = {
        "name": {"title": "Name", "type": "string"},
        "date": {"title": "Date", "type": "string"},
        "participants": {
            "items": {"type": "string"},
            "title": "Participants",
            "type": "array",
        },
    }
    assert result == {
        "properties": expected_properties,
        "required": ["name", "date", "participants"],
        "title": "CalendarEvent",
        "type": "object",
    }
2 changes: 1 addition & 1 deletion weave-js/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"eslint-fix": "eslint --fix --ext .js,.jsx,.ts,.tsx src",
"tslint": "tslint --project .",
"tslint-fix": "tslint --fix --project .",
"generate": "graphql-codegen",
"generate": "graphql-codegen --silent",
"generate:watch": "graphql-codegen -w",
"prettier": "prettier --config .prettierrc --check \"src/**/*.ts\" \"src/**/*.tsx\"",
"prettier-fix": "prettier --loglevel warn --config .prettierrc --write \"src/**/*.ts\" \"src/**/*.tsx\"",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@ import {CellValueString} from './CellValueString';

type CellValueProps = {
value: any;
noLink?: boolean;
};

export const CellValue = ({value}: CellValueProps) => {
export const CellValue = ({value, noLink}: CellValueProps) => {
if (value === undefined) {
return null;
}
if (value === null) {
return <ValueViewPrimitive>null</ValueViewPrimitive>;
}
if (isWeaveRef(value) || isArtifactRef(value)) {
return <SmallRef objRef={parseRef(value)} />;
return <SmallRef objRef={parseRef(value)} noLink={noLink} />;
}
if (typeof value === 'boolean') {
return (
Expand Down
Loading

0 comments on commit eb0c049

Please sign in to comment.