Skip to content

karenina.benchmark.verification.evaluators.trace

trace

Trace analysis components for detecting abstention and sufficiency.

This package provides: - detect_abstention: Detect when models refuse to answer questions - detect_sufficiency: Detect if responses have sufficient information for templates

Functions

detect_abstention

detect_abstention(
    raw_llm_response: str,
    parsing_model: ModelConfig,
    question_text: str,
    prompt_config: PromptConfig | None = None,
) -> tuple[bool, bool, str | None, dict[str, Any]]

Detect if the model refused to answer or abstained from answering.

This function uses an LLM to analyze the response and determine if it contains patterns indicating refusal, abstention, or evasion. Uses retry logic for transient errors (connection issues, rate limits, etc.).

Parameters:

Name Type Description Default
raw_llm_response
str

The raw response text from the answering model

required
parsing_model
ModelConfig

Configuration for the model to use for abstention detection

required
question_text
str

The original question that was asked

required
prompt_config
PromptConfig | None

Optional prompt configuration; when provided, extra user instructions for the abstention-detection task are appended to the assembled prompt

None

Returns:

Type Description
tuple[bool, bool, str | None, dict[str, Any]]

Tuple of (abstention_detected, check_performed, reasoning, usage_metadata):

bool
  • abstention_detected: True if model refused/abstained, False if genuine attempt
bool
  • check_performed: True if check completed successfully, False if check failed
str | None
  • reasoning: The LLM's explanation for its determination (None if check failed)
dict[str, Any]
  • usage_metadata: Token usage metadata from the LLM invocation

Examples:

>>> config = ModelConfig(id="parser", model_provider="openai", ...)
>>> detected, performed, reasoning, metadata = detect_abstention("I cannot answer this", config, "What is X?")
>>> print(detected, performed, reasoning)
True True Response contains explicit refusal pattern
Source code in src/karenina/benchmark/verification/evaluators/trace/abstention.py
def detect_abstention(
    raw_llm_response: str,
    parsing_model: ModelConfig,
    question_text: str,
    prompt_config: PromptConfig | None = None,
) -> tuple[bool, bool, str | None, dict[str, Any]]:
    """
    Detect if the model refused to answer or abstained from answering.

    This function uses an LLM to analyze the response and determine if it contains
    patterns indicating refusal, abstention, or evasion. Uses retry logic for
    transient errors (connection issues, rate limits, etc.).

    Args:
        raw_llm_response: The raw response text from the answering model
        parsing_model: Configuration for the model to use for abstention detection
        question_text: The original question that was asked
        prompt_config: Optional prompt configuration; when provided, its
            instructions for the abstention-detection task are appended to the
            assembled prompt as extra user instructions (None uses defaults)

    Returns:
        Tuple of (abstention_detected, check_performed, reasoning, usage_metadata):
        - abstention_detected: True if model refused/abstained, False if genuine attempt
        - check_performed: True if check completed successfully, False if check failed
        - reasoning: The LLM's explanation for its determination (None if check failed)
        - usage_metadata: Token usage metadata from the LLM invocation

    Examples:
        >>> config = ModelConfig(id="parser", model_provider="openai", ...)
        >>> detected, performed, reasoning, metadata = detect_abstention("I cannot answer this", config, "What is X?")
        >>> print(detected, performed, reasoning)
        True True Response contains explicit refusal pattern
    """

    # Retry on any raised exception; non-retryable errors are caught inside the
    # inner function and returned as a failed check instead of re-raised, so in
    # practice only errors classified by is_retryable_error() trigger a retry.
    @retry(
        retry=retry_if_exception_type(Exception),
        stop=stop_after_attempt(3),  # Try 3 times
        wait=wait_exponential(multiplier=1, min=2, max=10),  # Exponential backoff: 2s, 4s, 8s
        reraise=True,
        before_sleep=partial(log_retry, context="abstention detection"),
    )
    def _detect_with_retry() -> tuple[bool, bool, str | None, dict[str, Any]]:
        """Inner function with retry logic."""
        # Accumulated token usage; returned even on failure paths that occur
        # after the LLM invocation so callers can still account for spend.
        usage_metadata: dict[str, Any] = {}

        try:
            # Create config copy with temperature=0 for consistent detection
            # Note: LangChain adapter respects temperature; Claude SDK adapter ignores it
            detection_config = parsing_model.model_copy(update={"temperature": 0.0})

            # Get LLM via adapter factory
            llm = get_llm(detection_config)

            # Configure for structured output
            structured_llm = llm.with_structured_output(AbstentionResult)

            # Build messages using PromptAssembler (tri-section pattern)
            user_prompt = ABSTENTION_DETECTION_USER.format(question=question_text, response=raw_llm_response)
            assembler = PromptAssembler(
                task=PromptTask.ABSTENTION_DETECTION,
                interface=parsing_model.interface,
                capabilities=PortCapabilities(),
            )
            user_instructions = (
                prompt_config.get_for_task(PromptTask.ABSTENTION_DETECTION.value) if prompt_config else None
            )
            messages = assembler.assemble(
                system_text=ABSTENTION_DETECTION_SYS,
                user_text=user_prompt,
                user_instructions=user_instructions,
            )

            # Invoke with structured output
            response: LLMResponse = structured_llm.invoke(messages)
            usage_metadata = response.usage.to_dict()

            # Extract result from structured output or fall back to manual parsing
            result = extract_judge_result(response, AbstentionResult, "abstention_detected")
            if result is not None:
                logger.debug(f"Abstention check: {result.abstention_detected} - Reasoning: {result.reasoning}")
                return result.abstention_detected, True, result.reasoning, usage_metadata

            # Fallback: manual JSON parsing from content
            # (default False: treat an unparseable verdict as "no abstention")
            return fallback_json_parse(
                response.content, usage_metadata, "abstention_detected", False, "Abstention check (fallback)"
            )

        except json.JSONDecodeError as e:
            # JSON parsing failed - log and treat as check failure
            # (no abstention reported, check_performed=False)
            logger.warning(f"Failed to parse abstention detection response as JSON: {e}")
            return False, False, None, usage_metadata

        except Exception as e:
            # Check if this is a retryable error
            if is_retryable_error(e):
                logger.info(f"Detected retryable error in abstention check: {type(e).__name__}: {e}")
                raise  # Re-raise to trigger retry
            else:
                # Non-retryable error - log and treat as check failure
                logger.warning(f"Abstention detection failed with non-retryable error: {e}")
                return False, False, None, usage_metadata

    try:
        return _detect_with_retry()
    except Exception as e:
        # All retries exhausted or unhandled error
        logger.error(f"Abstention detection failed after all retries: {e}")
        return False, False, None, {}

detect_sufficiency

detect_sufficiency(
    raw_llm_response: str,
    parsing_model: ModelConfig,
    question_text: str,
    template_schema: dict[str, Any],
    prompt_config: PromptConfig | None = None,
) -> tuple[bool, bool, str | None, dict[str, Any]]

Detect if the response contains sufficient information to populate the template schema.

This function uses an LLM to analyze the response against the template schema and determine if all required fields can be populated. Uses retry logic for transient errors (connection issues, rate limits, etc.).

Parameters:

Name Type Description Default
raw_llm_response
str

The raw response text from the answering model

required
parsing_model
ModelConfig

Configuration for the model to use for sufficiency detection

required
question_text
str

The original question that was asked

required
template_schema
dict[str, Any]

The JSON schema of the answer template to populate

required
prompt_config
PromptConfig | None

Optional prompt configuration; when provided, extra user instructions for the sufficiency-detection task are appended to the assembled prompt

None

Returns:

Type Description
tuple[bool, bool, str | None, dict[str, Any]]

Tuple of (sufficient, check_performed, reasoning, usage_metadata):

bool
  • sufficient: True if response has info for all fields, False if information missing
bool
  • check_performed: True if check completed successfully, False if check failed
str | None
  • reasoning: The LLM's explanation for its determination (None if check failed)
dict[str, Any]
  • usage_metadata: Token usage metadata from the LLM invocation

Examples:

>>> config = ModelConfig(id="parser", model_provider="openai", ...)
>>> schema = {"properties": {"answer": {"type": "string"}}}
>>> sufficient, performed, reasoning, metadata = detect_sufficiency(
...     "The answer is 42", config, "What is X?", schema
... )
>>> print(sufficient, performed)
True True
Source code in src/karenina/benchmark/verification/evaluators/trace/sufficiency.py
def detect_sufficiency(
    raw_llm_response: str,
    parsing_model: ModelConfig,
    question_text: str,
    template_schema: dict[str, Any],
    prompt_config: PromptConfig | None = None,
) -> tuple[bool, bool, str | None, dict[str, Any]]:
    """
    Detect if the response contains sufficient information to populate the template schema.

    This function uses an LLM to analyze the response against the template schema and
    determine if all required fields can be populated. Uses retry logic for transient
    errors (connection issues, rate limits, etc.).

    Args:
        raw_llm_response: The raw response text from the answering model
        parsing_model: Configuration for the model to use for sufficiency detection
        question_text: The original question that was asked
        template_schema: The JSON schema of the answer template to populate
        prompt_config: Optional prompt configuration; when provided, its
            instructions for the sufficiency-detection task are appended to the
            assembled prompt as extra user instructions (None uses defaults)

    Returns:
        Tuple of (sufficient, check_performed, reasoning, usage_metadata):
        - sufficient: True if response has info for all fields, False if information missing
        - check_performed: True if check completed successfully, False if check failed
        - reasoning: The LLM's explanation for its determination (None if check failed)
        - usage_metadata: Token usage metadata from the LLM invocation

    Examples:
        >>> config = ModelConfig(id="parser", model_provider="openai", ...)
        >>> schema = {"properties": {"answer": {"type": "string"}}}
        >>> sufficient, performed, reasoning, metadata = detect_sufficiency(
        ...     "The answer is 42", config, "What is X?", schema
        ... )
        >>> print(sufficient, performed)
        True True
    """

    # Retry on any raised exception; non-retryable errors are caught inside the
    # inner function and returned as a failed check instead of re-raised, so in
    # practice only errors classified by is_retryable_error() trigger a retry.
    @retry(
        retry=retry_if_exception_type(Exception),
        stop=stop_after_attempt(3),  # Try 3 times
        wait=wait_exponential(multiplier=1, min=2, max=10),  # Exponential backoff: 2s, 4s, 8s
        reraise=True,
        before_sleep=partial(log_retry, context="sufficiency detection"),
    )
    def _detect_with_retry() -> tuple[bool, bool, str | None, dict[str, Any]]:
        """Inner function with retry logic."""
        # Accumulated token usage; returned even on failure paths that occur
        # after the LLM invocation so callers can still account for spend.
        usage_metadata: dict[str, Any] = {}

        try:
            # Create config copy with temperature=0 for consistent detection
            # Note: LangChain adapter respects temperature; Claude SDK adapter ignores it
            detection_config = parsing_model.model_copy(update={"temperature": 0.0})

            # Get LLM via adapter factory
            llm = get_llm(detection_config)

            # Configure for structured output
            structured_llm = llm.with_structured_output(SufficiencyResult)

            # Convert schema to string for prompt
            schema_str = json.dumps(template_schema, indent=2)

            # Build messages using PromptAssembler (tri-section pattern)
            user_prompt = SUFFICIENCY_DETECTION_USER.format(
                question=question_text,
                response=raw_llm_response,
                schema=schema_str,
            )
            assembler = PromptAssembler(
                task=PromptTask.SUFFICIENCY_DETECTION,
                interface=parsing_model.interface,
                capabilities=PortCapabilities(),
            )
            user_instructions = (
                prompt_config.get_for_task(PromptTask.SUFFICIENCY_DETECTION.value) if prompt_config else None
            )
            messages = assembler.assemble(
                system_text=SUFFICIENCY_DETECTION_SYS,
                user_text=user_prompt,
                user_instructions=user_instructions,
            )

            # Invoke with structured output
            response: LLMResponse = structured_llm.invoke(messages)
            usage_metadata = response.usage.to_dict()

            # Extract result from structured output or fall back to manual parsing
            result = extract_judge_result(response, SufficiencyResult, "sufficient")
            if result is not None:
                logger.debug(f"Sufficiency check: {result.sufficient} - Reasoning: {result.reasoning}")
                return result.sufficient, True, result.reasoning, usage_metadata

            # Fallback: manual JSON parsing from content
            # (default True: unlike the abstention check, failure here assumes sufficiency)
            return fallback_json_parse(
                response.content, usage_metadata, "sufficient", True, "Sufficiency check (fallback)"
            )

        except json.JSONDecodeError as e:
            # JSON parsing failed - log and treat as check failure
            logger.warning(f"Failed to parse sufficiency detection response as JSON: {e}")
            return True, False, None, usage_metadata  # Default to sufficient on failure

        except Exception as e:
            # Check if this is a retryable error
            if is_retryable_error(e):
                logger.info(f"Detected retryable error in sufficiency check: {type(e).__name__}: {e}")
                raise  # Re-raise to trigger retry
            else:
                # Non-retryable error - log and treat as check failure
                logger.warning(f"Sufficiency detection failed with non-retryable error: {e}")
                return True, False, None, usage_metadata  # Default to sufficient on failure

    try:
        return _detect_with_retry()
    except Exception as e:
        # All retries exhausted or unhandled error
        logger.error(f"Sufficiency detection failed after all retries: {e}")
        return True, False, None, {}  # Default to sufficient on failure