From 67dda58aea2a620f904c5bbc8748a901c31f2a2a Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 14 May 2026 13:08:40 -0500 Subject: [PATCH] fix: Make judge runners non-multi-turn PR #166 moved conversation history into the provider model runners. That broke judges: a Judge shares a single runner across successive evaluate() calls, so each evaluation polluted the next one's history and concurrent evaluations raced on the mutable state. Add a ``multi_turn`` flag (default True) to the OpenAI and LangChain model runners and thread it through ``RunnerFactory.create_model`` and the provider factories. The judge factory now constructs its runner with ``multi_turn=False`` so each evaluation starts from the same baseline. Also fix ``Judge.evaluate_messages`` to preserve message roles when rendering the conversation for the judge, so user and assistant turns remain distinguishable. Co-Authored-By: Claude Opus 4.7 --- .../ldai_langchain/langchain_model_runner.py | 10 ++- .../langchain_runner_factory.py | 7 +- .../tests/test_langchain_provider.py | 35 +++++++++ .../src/ldai_openai/openai_model_runner.py | 4 +- .../src/ldai_openai/openai_runner_factory.py | 9 ++- .../tests/test_openai_provider.py | 59 +++++++++++++++ packages/sdk/server-ai/src/ldai/client.py | 4 +- .../sdk/server-ai/src/ldai/judge/__init__.py | 8 +- .../src/ldai/providers/ai_provider.py | 7 +- .../src/ldai/providers/runner_factory.py | 10 ++- packages/sdk/server-ai/tests/test_judge.py | 74 +++++++++++++++++++ .../server-ai/tests/test_runner_factory.py | 65 +++++++++++++++- 12 files changed, 280 insertions(+), 12 deletions(-) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py index ff97d5b5..bc90f425 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_model_runner.py @@ -25,11 +25,17 @@ class LangChainModelRunner(Runner): :meth:`run`. 
""" - def __init__(self, llm: BaseChatModel, config_messages: Optional[List[LDMessage]] = None): + def __init__( + self, + llm: BaseChatModel, + config_messages: Optional[List[LDMessage]] = None, + multi_turn: bool = True, + ): self._llm = llm self._chat_history = InMemoryChatMessageHistory( messages=cast(List[BaseMessage], convert_messages_to_langchain(config_messages or [])) ) + self._multi_turn = multi_turn def get_llm(self) -> BaseChatModel: """ @@ -61,7 +67,7 @@ async def run( else: result = await self._run_completion(langchain_messages) - if result.metrics.success and result.content: + if result.metrics.success and result.content and self._multi_turn: self._chat_history.add_user_message(input) self._chat_history.add_ai_message(result.content) diff --git a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py index f53f9f00..0a7005a2 100644 --- a/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py +++ b/packages/ai-providers/server-ai-langchain/src/ldai_langchain/langchain_runner_factory.py @@ -61,13 +61,16 @@ def create_agent_graph( ) return LangGraphAgentGraphRunner(graph_def, tools) - def create_model(self, config: AIConfigKind) -> LangChainModelRunner: + def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> LangChainModelRunner: """ Create a configured LangChainModelRunner for the given AI config. :param config: The LaunchDarkly AI configuration + :param multi_turn: When ``True`` (the default) the runner accumulates + successful exchanges into its conversation history. Pass ``False`` to + keep history fixed at the configured baseline across ``run()`` calls. :return: LangChainModelRunner ready to invoke the model """ llm = create_langchain_model(config) config_messages = list(getattr(config, 'messages', None) or []) - return LangChainModelRunner(llm, config_messages) + return LangChainModelRunner(llm, config_messages, multi_turn=multi_turn) diff --git a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py index ec1dd661..0b8c0f14 100644 --- a/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py +++ b/packages/ai-providers/server-ai-langchain/tests/test_langchain_provider.py @@ -281,6 +281,41 @@ async def test_accumulates_history_across_successful_calls(self, mock_llm): assert second_call_messages[1].content == 'First response' assert second_call_messages[2].content == 'Second question' + @pytest.mark.asyncio + async def test_multi_turn_false_does_not_accumulate_history(self, mock_llm): + """When multi_turn=False the runner must not append to history on success.""" + mock_llm.ainvoke = AsyncMock(side_effect=[ + AIMessage(content='First response'), + AIMessage(content='Second response'), + ]) + provider = LangChainModelRunner(mock_llm, multi_turn=False) + baseline_len = len(provider._chat_history.messages) + + await provider.run('First question') + assert len(provider._chat_history.messages) == baseline_len + + await provider.run('Second question') + assert len(provider._chat_history.messages) == baseline_len + + second_call_messages = mock_llm.ainvoke.call_args_list[1][0][0] + assert len(second_call_messages) == 1 + assert second_call_messages[0].content == 'Second question' + + @pytest.mark.asyncio + async def test_multi_turn_default_accumulates_history(self, mock_llm): + """Default 
behavior (multi_turn omitted) still accumulates history (preserves PR #166).""" + mock_llm.ainvoke = AsyncMock(side_effect=[ + AIMessage(content='First response'), + AIMessage(content='Second response'), + ]) + provider = LangChainModelRunner(mock_llm) + baseline_len = len(provider._chat_history.messages) + + await provider.run('First question') + await provider.run('Second question') + + assert len(provider._chat_history.messages) == baseline_len + 4 + @pytest.mark.asyncio async def test_does_not_accumulate_history_on_failed_call(self, mock_llm): """Should not add to history when the call fails.""" diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py index e16f5d41..358f34b3 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_model_runner.py @@ -29,11 +29,13 @@ def __init__( model_name: str, parameters: Dict[str, Any], config_messages: Optional[List[LDMessage]] = None, + multi_turn: bool = True, ): self._client = client self._model_name = model_name self._parameters = parameters self._history: List[LDMessage] = list(config_messages or []) + self._multi_turn = multi_turn async def run( self, @@ -58,7 +60,7 @@ async def run( else: result = await self._run_completion(messages) - if result.metrics.success and result.content: + if result.metrics.success and result.content and self._multi_turn: self._history.append(user_message) self._history.append(LDMessage(role='assistant', content=result.content)) diff --git a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py index 01f8a166..275c6ecc 100644 --- a/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py +++ b/packages/ai-providers/server-ai-openai/src/ldai_openai/openai_runner_factory.py @@ -84,7 +84,7 @@ def create_agent_graph( from ldai_openai.openai_agent_graph_runner import OpenAIAgentGraphRunner return OpenAIAgentGraphRunner(graph_def, tools) - def create_model(self, config: AIConfigKind) -> OpenAIModelRunner: + def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> OpenAIModelRunner: """ Create a configured OpenAIModelRunner for the given AI config. @@ -93,6 +93,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner: needed; all other fields are passed through from the config. :param config: The LaunchDarkly AI configuration + :param multi_turn: When ``True`` (the default) the runner accumulates + successful exchanges into its conversation history. Pass ``False`` to + keep history fixed at the configured baseline across ``run()`` calls. 
:return: OpenAIModelRunner ready to invoke the model """ model_name, parameters = self._extract_model_config(config) @@ -101,7 +104,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner: if tool_defs: parameters['tools'] = normalize_tool_types(tool_defs) config_messages = list(getattr(config, 'messages', None) or []) - return OpenAIModelRunner(self._client, model_name, parameters, config_messages) + return OpenAIModelRunner( + self._client, model_name, parameters, config_messages, multi_turn=multi_turn + ) def get_client(self) -> AsyncOpenAI: """ diff --git a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py index b3bd2286..195149b0 100644 --- a/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py +++ b/packages/ai-providers/server-ai-openai/tests/test_openai_provider.py @@ -234,6 +234,65 @@ def make_response(text: str): {'role': 'user', 'content': 'Second question'}, ] + @pytest.mark.asyncio + async def test_multi_turn_false_does_not_accumulate_history(self, mock_client): + """When multi_turn=False the runner must not append to history on success.""" + def make_response(text: str): + r = MagicMock() + r.context_wrapper = None + r.choices = [MagicMock()] + r.choices[0].message = MagicMock() + r.choices[0].message.content = text + r.usage = None + return r + + mock_client.chat = MagicMock() + mock_client.chat.completions = MagicMock() + mock_client.chat.completions.create = AsyncMock(side_effect=[ + make_response('First response'), + make_response('Second response'), + ]) + + provider = OpenAIModelRunner(mock_client, 'gpt-4o', {}, multi_turn=False) + baseline_len = len(provider._history) + + await provider.run('First question') + assert len(provider._history) == baseline_len + + await provider.run('Second question') + assert len(provider._history) == baseline_len + + # Each call must see only the configured baseline, never the prior turn. 
+ second_call_messages = mock_client.chat.completions.create.call_args_list[1].kwargs['messages'] + assert second_call_messages == [{'role': 'user', 'content': 'Second question'}] + + @pytest.mark.asyncio + async def test_multi_turn_default_accumulates_history(self, mock_client): + """Default behavior (multi_turn omitted) still accumulates history (preserves PR #166).""" + def make_response(text: str): + r = MagicMock() + r.context_wrapper = None + r.choices = [MagicMock()] + r.choices[0].message = MagicMock() + r.choices[0].message.content = text + r.usage = None + return r + + mock_client.chat = MagicMock() + mock_client.chat.completions = MagicMock() + mock_client.chat.completions.create = AsyncMock(side_effect=[ + make_response('First response'), + make_response('Second response'), + ]) + + provider = OpenAIModelRunner(mock_client, 'gpt-4o', {}) + baseline_len = len(provider._history) + + await provider.run('First question') + await provider.run('Second question') + + assert len(provider._history) == baseline_len + 4 + @pytest.mark.asyncio async def test_does_not_accumulate_history_on_failed_call(self, mock_client): """Should not add to history when the call fails.""" diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py index 12d1a960..8940ebda 100644 --- a/packages/sdk/server-ai/src/ldai/client.py +++ b/packages/sdk/server-ai/src/ldai/client.py @@ -339,7 +339,9 @@ def _create_judge_instance( if not judge_config.enabled: return None - provider = RunnerFactory.create_model(judge_config, default_ai_provider) + provider = RunnerFactory.create_model( + judge_config, default_ai_provider, multi_turn=False + ) if not provider: return None diff --git a/packages/sdk/server-ai/src/ldai/judge/__init__.py b/packages/sdk/server-ai/src/ldai/judge/__init__.py index a2927016..6ae7d90d 100644 --- a/packages/sdk/server-ai/src/ldai/judge/__init__.py +++ b/packages/sdk/server-ai/src/ldai/judge/__init__.py @@ -132,13 +132,19 @@ async def evaluate_messages( """ Evaluates an AI response from chat messages and response. + The conversation is rendered for the judge by joining each message as + ``"{role}: {content}"`` on newlines, preserving who said what so the + judge can distinguish user turns from assistant turns. + :param messages: Array of messages representing the conversation history :param response: The runner result to be evaluated :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed. When ``None`` (the default), falls back to ``self.sample_rate``. :return: The result of the judge evaluation. """ - input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' + input_text = ( + '\n'.join(f'{msg.role}: {msg.content}' for msg in messages) if messages else '' + ) output_text = response.content return await self.evaluate(input_text, output_text, sampling_ratio) diff --git a/packages/sdk/server-ai/src/ldai/providers/ai_provider.py b/packages/sdk/server-ai/src/ldai/providers/ai_provider.py index eb3183a3..6bccafe4 100644 --- a/packages/sdk/server-ai/src/ldai/providers/ai_provider.py +++ b/packages/sdk/server-ai/src/ldai/providers/ai_provider.py @@ -15,13 +15,18 @@ class AIProvider(ABC): create_model(), create_agent(), and create_agent_graph(). """ - def create_model(self, config: Any) -> Optional[Any]: + def create_model(self, config: Any, multi_turn: bool = True) -> Optional[Any]: """ Create a configured model executor for the given AI config. Default implementation warns. 
Provider implementations should override this method. :param config: The LaunchDarkly AI configuration + :param multi_turn: When ``True`` (the default) the returned runner should + accumulate conversation history across successful ``run()`` calls. + When ``False`` each invocation starts from the same baseline history, + which is required for callers that share one runner across + independent invocations (e.g. judges). :return: Configured model runner instance, or None if unsupported """ log.warning('create_model not implemented by this provider') diff --git a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py index c1262bfe..6c36a7b6 100644 --- a/packages/sdk/server-ai/src/ldai/providers/runner_factory.py +++ b/packages/sdk/server-ai/src/ldai/providers/runner_factory.py @@ -120,17 +120,25 @@ def _get_providers_to_try( def create_model( config: AIConfigKind, default_ai_provider: Optional[str] = None, + multi_turn: bool = True, ) -> Optional[Runner]: """ Create a model executor for the given AI completion config. :param config: LaunchDarkly AI config (completion or judge) :param default_ai_provider: Optional provider override ('openai', 'langchain', …) + :param multi_turn: When ``True`` (the default) the returned runner appends + each successful exchange to its history so subsequent ``run()`` calls + include the prior conversation. Set ``False`` for callers that share a + single runner across independent invocations (for example, judges) so + each call starts from the same baseline history. :return: Configured Runner ready to invoke the model, or None """ provider_name = config.provider.name.lower() if config.provider else None providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name) - return RunnerFactory._with_fallback(providers, lambda p: p.create_model(config)) + return RunnerFactory._with_fallback( + providers, lambda p: p.create_model(config, multi_turn=multi_turn) + ) @staticmethod def create_agent( diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py index f9c5bf46..b1f88fa2 100644 --- a/packages/sdk/server-ai/tests/test_judge.py +++ b/packages/sdk/server-ai/tests/test_judge.py @@ -1,5 +1,6 @@ """Tests for Judge functionality.""" +from typing import List from unittest.mock import AsyncMock, MagicMock, call, patch import pytest @@ -543,6 +544,79 @@ async def test_evaluate_messages_calls_evaluate( assert result.success is True assert tracker.track_metrics_of_async.called + @pytest.mark.asyncio + async def test_evaluate_messages_preserves_roles_in_input( + self, judge_config_with_key: AIJudgeConfig, mock_runner + ): + """evaluate_messages must forward role-prefixed lines to evaluate().""" + messages = [ + LDMessage(role='user', content='hi'), + LDMessage(role='assistant', content='hello'), + ] + chat_response = RunnerResult(content='reply', metrics=LDAIMetrics(success=True)) + + judge = Judge(judge_config_with_key, mock_runner) + with patch.object(judge, 'evaluate', new=AsyncMock(return_value=JudgeResult(judge_config_key='judge-config'))) as mock_evaluate: + await judge.evaluate_messages(messages, chat_response) + + mock_evaluate.assert_called_once() + args, _ = mock_evaluate.call_args + assert args[0] == 'user: hi\nassistant: hello' + assert args[1] == 'reply' + + +class TestJudgeRunnerNonMultiTurn: + """Successive evaluate() calls must not contaminate each other. 
+
+    The Judge shares one runner across evaluations, so the runner must be
+    stateless across calls; RunnerFactory.create_model(..., multi_turn=False)
+    is what guarantees that at the client layer. These tests verify the Judge
+    itself does not accidentally mutate the runner's history and that two
+    evaluations see the same baseline.
+    """
+
+    @pytest.mark.asyncio
+    async def test_two_evaluations_do_not_contaminate_history(
+        self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker
+    ):
+        """A judge bound to a non-multi-turn runner must see the same baseline on every run."""
+        # Stand up a fake runner that records the history it would expose to the
+        # LLM at the moment run() is called. With multi_turn=False the recorded
+        # baseline should be identical across calls.
+        seen_baselines: List[List[LDMessage]] = []
+
+        class _FakeRunner:
+            def __init__(self):
+                self._history: List[LDMessage] = []
+                self._multi_turn = False
+
+            async def run(self, input, output_type=None):  # type: ignore[no-untyped-def]
+                # Snapshot history as seen at call time.
+                seen_baselines.append(list(self._history))
+                return RunnerResult(
+                    content='ok',
+                    metrics=LDAIMetrics(success=True),
+                    parsed={'score': 0.9, 'reasoning': 'fine'},
+                )
+
+        runner = _FakeRunner()
+
+        async def _await_fn(_metric_fn, fn):
+            return await fn()
+
+        tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
+        judge = Judge(judge_config_with_key, runner)  # type: ignore[arg-type]
+
+        await judge.evaluate('first input', 'first output')
+        await judge.evaluate('second input', 'second output')
+
+        assert len(seen_baselines) == 2
+        # Both runs see the same baseline (empty in this fake; the point is
+        # they're equal, with no contamination from the prior turn).
+        assert seen_baselines[0] == seen_baselines[1]
+        # And the runner's history never grew because multi_turn is False.
+        assert runner._history == []
+
+
 class TestJudgeConfigStripsLegacyMessages:
     """Tests for ``LDAIClient.judge_config()`` legacy-message stripping.
diff --git a/packages/sdk/server-ai/tests/test_runner_factory.py b/packages/sdk/server-ai/tests/test_runner_factory.py
index 2d972ea8..2364f58d 100644
--- a/packages/sdk/server-ai/tests/test_runner_factory.py
+++ b/packages/sdk/server-ai/tests/test_runner_factory.py
@@ -1,9 +1,10 @@
 """Tests for RunnerFactory provider loading and error messages."""
 
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 
+from ldai.providers.ai_provider import AIProvider
 from ldai.providers.runner_factory import RunnerFactory, _PYPI_PACKAGE_NAMES
 
 
@@ -82,3 +83,65 @@ def test_warning_does_not_reference_pip(self):
         RunnerFactory._get_provider_factory('openai')
         warning_text = mock_log.warning.call_args[0][0]
         assert 'pip install' not in warning_text
+
+
+class TestCreateModelMultiTurn:
+    """create_model forwards the multi_turn flag to the provider factory."""
+
+    def _make_config(self, provider_name='openai'):
+        config = MagicMock()
+        config.provider = MagicMock()
+        config.provider.name = provider_name
+        return config
+
+    def test_forwards_multi_turn_false_to_provider(self):
+        sentinel_runner = object()
+        provider_factory = MagicMock(spec=AIProvider)
+        provider_factory.create_model.return_value = sentinel_runner
+
+        with patch.object(
+            RunnerFactory, '_get_provider_factory', return_value=provider_factory
+        ):
+            result = RunnerFactory.create_model(
+                self._make_config(), default_ai_provider='openai', multi_turn=False
+            )
+
+        assert result is sentinel_runner
+        provider_factory.create_model.assert_called_once()
+        _, kwargs = provider_factory.create_model.call_args
+        assert kwargs.get('multi_turn') is False
+
+    def test_defaults_multi_turn_to_true(self):
+        sentinel_runner = object()
+        provider_factory = MagicMock(spec=AIProvider)
+        provider_factory.create_model.return_value = sentinel_runner
+
+        with patch.object(
+            RunnerFactory, '_get_provider_factory', return_value=provider_factory
+        ):
+            RunnerFactory.create_model(self._make_config(), default_ai_provider='openai')
+
+        _, kwargs = provider_factory.create_model.call_args
+        assert kwargs.get('multi_turn') is True
+
+    def test_constructed_runner_has_multi_turn_false_attribute(self):
+        """End-to-end: when multi_turn=False is passed, the constructed runner has _multi_turn == False."""
+        # Use a stub AIProvider that records its kwargs and returns an OpenAIModelRunner-shaped object.
+        class _StubProvider(AIProvider):
+            def __init__(self):
+                self.last_kwargs = None
+
+            def create_model(self, config, multi_turn: bool = True):
+                self.last_kwargs = {'multi_turn': multi_turn}
+                runner = MagicMock()
+                runner._multi_turn = multi_turn
+                return runner
+
+        stub = _StubProvider()
+        with patch.object(RunnerFactory, '_get_provider_factory', return_value=stub):
+            runner = RunnerFactory.create_model(
+                self._make_config(), default_ai_provider='openai', multi_turn=False
+            )
+
+        assert stub.last_kwargs == {'multi_turn': False}
+        assert runner._multi_turn is False
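
A minimal, self-contained sketch of the behavior this patch encodes, for anyone
who wants to reproduce the judge contamination outside the SDK. ``SketchRunner``
is hypothetical and imports nothing from ldai; it only mirrors the
``self._multi_turn`` guard that OpenAIModelRunner and LangChainModelRunner gain
in the diff above.

    # Hypothetical stand-in for the provider model runners; not SDK code.
    from typing import List, Tuple

    class SketchRunner:
        """Appends (role, content) pairs to history only when multi_turn is True."""

        def __init__(self, baseline: List[Tuple[str, str]], multi_turn: bool = True):
            self._history = list(baseline)  # configured baseline messages
            self._multi_turn = multi_turn

        def run(self, user_input: str) -> str:
            # The messages the model would see for this call.
            messages = self._history + [('user', user_input)]
            reply = f'reply to {user_input!r} given {len(messages)} message(s)'
            if self._multi_turn:  # the guard this patch adds to both runners
                self._history.append(('user', user_input))
                self._history.append(('assistant', reply))
            return reply

    # Pre-patch judge behavior: one shared, always-accumulating runner, so a
    # second evaluation would see the first evaluation's turns.
    chatty = SketchRunner(baseline=[], multi_turn=True)
    chatty.run('evaluation 1')
    assert len(chatty._history) == 2  # evaluation 2 is now contaminated

    # Post-patch judge behavior: multi_turn=False keeps every call at baseline.
    judge_like = SketchRunner(baseline=[], multi_turn=False)
    judge_like.run('evaluation 1')
    judge_like.run('evaluation 2')
    assert judge_like._history == []  # both calls saw the same baseline

The same contrast is what the new provider tests assert: baseline + 4 messages
with the default, and an unchanged baseline with ``multi_turn=False``.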