Skip to content
Navigation

Ralph iterative refinement loop: Run, Analyze, Learn, Plan, Halt. A 5-phase execution engine that combines scoring, reflection, and re-prompting to iteratively improve agent outputs.

Module Paths

python
from orbiter.eval.ralph.runner import RalphRunner, RalphResult, ExecuteFn, RePlanFn
from orbiter.eval.ralph.config import (
    StopType,
    ValidationConfig,
    ReflectionConfig,
    StopConditionConfig,
    RalphConfig,
    LoopState,
)
from orbiter.eval.ralph.detectors import (
    StopDecision,
    StopDetector,
    MaxIterationDetector,
    TimeoutDetector,
    CostLimitDetector,
    ConsecutiveFailureDetector,
    ScoreThresholdDetector,
    CompositeDetector,
)

StopType

Categorised exit reason for loop termination.

python
class StopType(StrEnum):
    """Categorised exit reason for loop termination."""
    NONE = "none"  # No stop condition triggered
    COMPLETION = "completion"  # Task completed successfully
    MAX_ITERATIONS = "max_iterations"  # Reached maximum iteration count
    TIMEOUT = "timeout"  # Wall-clock time limit exceeded
    MAX_COST = "max_cost"  # Cumulative cost limit reached
    MAX_CONSECUTIVE_FAILURES = "max_consecutive_failures"  # Too many failures in a row
    SCORE_THRESHOLD = "score_threshold"  # Score meets or exceeds threshold
    USER_INTERRUPTED = "user_interrupted"  # User requested interruption
    SYSTEM_ERROR = "system_error"  # Unrecoverable system error

| Value | Description |
| --- | --- |
| `NONE` | No stop condition triggered |
| `COMPLETION` | Task completed successfully |
| `MAX_ITERATIONS` | Reached maximum iteration count |
| `TIMEOUT` | Wall-clock time limit exceeded |
| `MAX_COST` | Cumulative cost limit reached |
| `MAX_CONSECUTIVE_FAILURES` | Too many failures in a row |
| `SCORE_THRESHOLD` | Score meets or exceeds threshold |
| `USER_INTERRUPTED` | User requested interruption |
| `SYSTEM_ERROR` | Unrecoverable system error |

Methods

is_success()

python
def is_success(self) -> bool

Returns True for COMPLETION and SCORE_THRESHOLD.

is_failure()

python
def is_failure(self) -> bool

Returns True for MAX_CONSECUTIVE_FAILURES and SYSTEM_ERROR.


ValidationConfig

Configuration for the Analyze (scoring) phase.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
ValidationConfig(
    enabled: bool = True,
    scorer_names: tuple[str, ...] = (),
    min_score_threshold: float = 0.5,
    parallel: int = 4,
    timeout: float = 0.0,
)

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `enabled` | `bool` | `True` | Whether scoring is enabled |
| `scorer_names` | `tuple[str, ...]` | `()` | Names of scorers to use (empty = use all) |
| `min_score_threshold` | `float` | `0.5` | Below this mean score, reflection is triggered |
| `parallel` | `int` | `4` | Max parallel scorer executions |
| `timeout` | `float` | `0.0` | Scorer timeout (0 = no timeout) |

Raises: ValueError if min_score_threshold is not in [0.0, 1.0].


ReflectionConfig

Configuration for the Learn (reflection) phase.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
ReflectionConfig(
    enabled: bool = True,
    level: str = "medium",
    max_history: int = 50,
)

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `enabled` | `bool` | `True` | Whether reflection is enabled |
| `level` | `str` | `"medium"` | Reflection depth level |
| `max_history` | `int` | `50` | Maximum reflection history entries |

StopConditionConfig

Configuration for the Halt (stop detection) phase.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
StopConditionConfig(
    max_iterations: int = 10,
    timeout: float = 0.0,
    max_cost: float = 0.0,
    max_consecutive_failures: int = 3,
    score_threshold: float = 0.0,
    enable_user_interrupt: bool = False,
)

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `max_iterations` | `int` | `10` | Stop after this many iterations |
| `timeout` | `float` | `0.0` | Wall-clock timeout in seconds (0 = disabled) |
| `max_cost` | `float` | `0.0` | Cumulative cost limit (0 = disabled) |
| `max_consecutive_failures` | `int` | `3` | Stop after N consecutive failures |
| `score_threshold` | `float` | `0.0` | Stop when mean score reaches this (0 = disabled) |
| `enable_user_interrupt` | `bool` | `False` | Allow user-initiated interruption |

Raises: ValueError if max_iterations < 1.


RalphConfig

Unified configuration for the Ralph iterative refinement loop. Aggregates validation, reflection, and stop-condition settings.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
RalphConfig(
    validation: ValidationConfig = ValidationConfig(),
    reflection: ReflectionConfig = ReflectionConfig(),
    stop_condition: StopConditionConfig = StopConditionConfig(),
    metadata: dict[str, Any] = {},
)

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `validation` | `ValidationConfig` | `ValidationConfig()` | Scoring phase configuration |
| `reflection` | `ReflectionConfig` | `ReflectionConfig()` | Reflection phase configuration |
| `stop_condition` | `StopConditionConfig` | `StopConditionConfig()` | Halt phase configuration |
| `metadata` | `dict[str, Any]` | `{}` | Custom metadata |

LoopState

Mutable runtime state for a Ralph loop execution. Tracks iteration count, timing, cost, and aggregated score/reflection history.

Decorator: @dataclass(slots=True)

Constructor

python
LoopState()

Attributes

| Attribute | Type | Default | Description |
| --- | --- | --- | --- |
| `iteration` | `int` | `0` | Current iteration number |
| `start_time` | `float` | `time.monotonic()` | Loop start timestamp |
| `cumulative_cost` | `float` | `0.0` | Total cost across iterations |
| `consecutive_failures` | `int` | `0` | Current consecutive failure streak |
| `successful_steps` | `int` | `0` | Total successful steps |
| `failed_steps` | `int` | `0` | Total failed steps |
| `total_tokens` | `int` | `0` | Total tokens consumed |
| `score_history` | `list[dict[str, float]]` | `[]` | Score snapshots per iteration |
| `reflection_history` | `list[dict[str, Any]]` | `[]` | Reflection summaries per iteration |
| `metadata` | `dict[str, Any]` | `{}` | Custom metadata |

Query Methods

elapsed()

python
def elapsed(self) -> float

Seconds since the loop started.

success_rate()

python
def success_rate(self) -> float

Fraction of successful steps. Returns 0.0 when no steps have been executed.

latest_score()

python
def latest_score(self) -> dict[str, float]

Return the most recent score snapshot, or empty dict.

best_score()

python
def best_score(self, metric: str) -> float

Return the highest value seen for a metric across all iterations.

| Parameter | Type | Description |
| --- | --- | --- |
| `metric` | `str` | Scorer name to look up |

Mutation Methods

record_score()

python
def record_score(self, scores: dict[str, float]) -> None

Append a score snapshot for the current iteration.

record_reflection()

python
def record_reflection(self, reflection: dict[str, Any]) -> None

Append a reflection summary for the current iteration.

record_success()

python
def record_success(self, *, tokens: int = 0, cost: float = 0.0) -> None

Mark the current step as successful. Resets consecutive_failures to 0.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `tokens` | `int` | `0` | Tokens consumed |
| `cost` | `float` | `0.0` | Cost incurred |

record_failure()

python
def record_failure(self, *, cost: float = 0.0) -> None

Mark the current step as failed. Increments consecutive_failures.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `cost` | `float` | `0.0` | Cost incurred |

Serialization

to_dict()

python
def to_dict(self) -> dict[str, Any]

Serialise to a plain dict for checkpointing.

Dunder Methods

| Method | Description |
| --- | --- |
| `__repr__` | `LoopState(iteration=5, success_rate=80.0%, elapsed=12.3s)` |

StopDecision

Outcome of a single detector evaluation.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
StopDecision(
    should_stop: bool,
    stop_type: StopType = StopType.NONE,
    reason: str = "",
    metadata: dict[str, Any] = {},
)
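
| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `should_stop` | `bool` | (required) | Whether the loop should stop |
| `stop_type` | `StopType` | `NONE` | The category of stop condition |
| `reason` | `str` | `""` | Human-readable explanation |
| `metadata` | `dict[str, Any]` | `{}` | Additional decision context |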

Dunder Methods

| Method | Description |
| --- | --- |
| `__bool__` | Returns `should_stop` |

StopDetector (ABC)

Base class for pluggable stop-condition detectors. Each detector examines the current LoopState and the static StopConditionConfig.

Abstract Methods

check()

python
async def check(
    self,
    state: LoopState,
    config: StopConditionConfig,
) -> StopDecision

Return a stop decision for the current loop state.


Built-in Detectors

MaxIterationDetector

Stops when the loop has reached config.max_iterations.

python
MaxIterationDetector()

Triggers StopType.MAX_ITERATIONS.

TimeoutDetector

Stops when elapsed wall-clock time exceeds config.timeout. A timeout of 0.0 (the default) disables this detector.

python
TimeoutDetector()

Triggers StopType.TIMEOUT.

CostLimitDetector

Stops when cumulative cost meets or exceeds config.max_cost. A max_cost of 0.0 (the default) disables this detector.

python
CostLimitDetector()

Triggers StopType.MAX_COST.

ConsecutiveFailureDetector

Stops when consecutive failures reach config.max_consecutive_failures. A value of 0 disables this detector.

python
ConsecutiveFailureDetector()

Triggers StopType.MAX_CONSECUTIVE_FAILURES.

ScoreThresholdDetector

Stops when the mean of the latest score snapshot meets or exceeds config.score_threshold. A score_threshold of 0.0 (the default) disables this detector.

python
ScoreThresholdDetector()

Triggers StopType.SCORE_THRESHOLD.


CompositeDetector

Aggregates multiple detectors and returns the first triggered decision.

Constructor

python
CompositeDetector(detectors: list[StopDetector] | None = None)

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `detectors` | `list[StopDetector] \| None` | `None` | List of detectors to run (empty if None) |

Methods

add()

python
def add(self, detector: StopDetector) -> CompositeDetector

Append a detector. Returns self for chaining.

check()

python
async def check(
    self,
    state: LoopState,
    config: StopConditionConfig,
) -> StopDecision

Iterates through all detectors in order. Returns the first StopDecision where should_stop=True, or a “continue” decision if none trigger.

Dunder Methods

| Method | Description |
| --- | --- |
| `__len__` | Number of detectors |
| `__repr__` | `CompositeDetector(detectors=5)` |

Type Aliases

python
ExecuteFn = Callable[..., Any]  # Async callable: (input: str) -> str
RePlanFn = Callable[..., Any]   # Async callable: (prompt: str) -> str

RalphResult

Final outcome of a Ralph loop execution.

Decorator: @dataclass(frozen=True, slots=True)

Constructor

python
RalphResult(
    output: str,
    stop_type: StopType,
    reason: str,
    iterations: int,
    scores: dict[str, float],
    state: dict[str, Any],
    reflections: list[dict[str, Any]] = [],
)

| Field | Type | Default | Description |
| --- | --- | --- | --- |
| `output` | `str` | (required) | Final output from the last iteration |
| `stop_type` | `StopType` | (required) | Why the loop stopped |
| `reason` | `str` | (required) | Human-readable stop reason |
| `iterations` | `int` | (required) | Total iterations executed |
| `scores` | `dict[str, float]` | (required) | Final scores from the last iteration |
| `state` | `dict[str, Any]` | (required) | Serialized LoopState |
| `reflections` | `list[dict[str, Any]]` | `[]` | Reflection summaries per iteration |

RalphRunner

Implements the 5-phase Ralph iterative refinement loop.

Phases Per Iteration

  1. Run — Execute the agent/task via execute_fn
  2. Analyze — Score the output using configured scorers
  3. Learn — Reflect on failures to extract actionable insights
  4. Plan — Re-prompt by appending reflection suggestions to input
  5. Halt — Check stop conditions; break or continue

Constructor

python
RalphRunner(
    execute_fn: ExecuteFn,
    scorers: list[Scorer],
    *,
    config: RalphConfig | None = None,
    reflector: Reflector | None = None,
    replan_fn: RePlanFn | None = None,
)

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `execute_fn` | `ExecuteFn` | (required) | Async callable to run the task |
| `scorers` | `list[Scorer]` | (required) | Scorers for the Analyze phase |
| `config` | `RalphConfig \| None` | `None` | Loop configuration (defaults to `RalphConfig()`) |
| `reflector` | `Reflector \| None` | `None` | Reflector for the Learn phase |
| `replan_fn` | `RePlanFn \| None` | `None` | Custom re-plan function (unused in current implementation) |

Methods

run()

python
async def run(self, input: str) -> RalphResult

Execute the full Ralph loop on the given input and return the result.

| Parameter | Type | Description |
| --- | --- | --- |
| `input` | `str` | Initial input to the task |

Returns: RalphResult with the final output, stop type, scores, and reflections.

Behavior per iteration:

  1. Run: Calls execute_fn(current_input). On success, records success. On exception, records failure.
  2. Analyze: If validation is enabled and execution succeeded, runs all scorers and records scores.
  3. Learn: If reflection is enabled and either execution failed or mean score < min_score_threshold, calls reflector.reflect().
  4. Plan: If reflection produced suggestions, appends them to the original input as [Previous feedback].
  5. Halt: Runs all stop detectors via CompositeDetector. Stops on the first triggered condition.

Built-in Detectors

The runner automatically creates a CompositeDetector with these detectors (in order):

  1. MaxIterationDetector
  2. TimeoutDetector
  3. CostLimitDetector
  4. ConsecutiveFailureDetector
  5. ScoreThresholdDetector

Dunder Methods

| Method | Description |
| --- | --- |
| `__repr__` | `RalphRunner(scorers=3, config=RalphConfig(...))` |

Example

python
import asyncio
from orbiter.eval import (
    GeneralReflector,
    OutputCorrectnessScorer,
    OutputLengthScorer,
)
from orbiter.eval.ralph.runner import RalphRunner
from orbiter.eval.ralph.config import (
    RalphConfig,
    ValidationConfig,
    ReflectionConfig,
    StopConditionConfig,
)

# Module-level call counter; my_agent bumps it on every invocation.
iteration_count = 0


async def my_agent(input: str) -> str:
    """Toy agent: hedge on the first two calls, answer correctly from the third on."""
    global iteration_count
    iteration_count = iteration_count + 1
    still_unsure = iteration_count < 3
    if still_unsure:
        return "I'm not sure about that."
    return "The capital of France is Paris."

async def my_judge(prompt: str) -> str:
    """Stub judge: ignore the prompt and return a fixed JSON reflection payload."""
    canned_reflection = '''{
        "summary": "The answer was incomplete.",
        "key_findings": ["Missing specific answer"],
        "root_causes": ["Insufficient confidence"],
        "insights": ["Need to be more decisive"],
        "suggestions": ["Provide a direct, specific answer"]
    }'''
    return canned_reflection

async def main():
    """Run the Ralph loop against the toy agent and print the outcome."""
    # Reflection is triggered while the mean score is below 0.8.
    validation_cfg = ValidationConfig(min_score_threshold=0.8)
    reflection_cfg = ReflectionConfig(enabled=True)
    # Halt after 5 iterations, or as soon as the mean score reaches 0.9.
    stop_cfg = StopConditionConfig(max_iterations=5, score_threshold=0.9)
    loop_config = RalphConfig(
        validation=validation_cfg,
        reflection=reflection_cfg,
        stop_condition=stop_cfg,
    )

    scorers = [
        OutputCorrectnessScorer(keywords=["Paris", "capital"]),
        OutputLengthScorer(min_length=10),
    ]
    reflector = GeneralReflector(judge=my_judge)
    ralph = RalphRunner(
        execute_fn=my_agent,
        scorers=scorers,
        config=loop_config,
        reflector=reflector,
    )

    result = await ralph.run("What is the capital of France?")

    # Summary of the final iteration.
    print(f"Output: {result.output}")
    print(f"Stop type: {result.stop_type}")
    print(f"Iterations: {result.iterations}")
    print(f"Scores: {result.scores}")
    print(f"Success: {result.stop_type.is_success()}")

    # Check reflections
    for ref in result.reflections:
        print(f"  Iteration {ref['iteration']}: {ref['summary']}")

asyncio.run(main())