POST /api/v1/evals/pairwise

Create a pairwise eval run

Starts a pairwise comparison between two systems on a dataset. Each case yields a winner (left | right | tie). Used to build preference datasets.

Authentication

Send the header "Authorization: Bearer YOUR_API_KEY" with every request. Generate API keys at /dashboard/api-keys.

Request body (required)

Example

{
  "dataset_id": "00000000-0000-0000-0000-000000000000",
  "left": {},
  "right": {},
  "judge": "gpt-4"
}
Schema
{
  "application/json": {
    "schema": {
      "type": "object",
      "required": [
        "dataset_id",
        "left",
        "right"
      ],
      "properties": {
        "dataset_id": {
          "type": "string",
          "format": "uuid"
        },
        "left": {
          "type": "object",
          "description": "Left contender — model + prompt config."
        },
        "right": {
          "type": "object",
          "description": "Right contender."
        },
        "judge": {
          "type": "string",
          "enum": [
            "gpt-4",
            "claude-3-opus",
            "gemini-pro",
            "human"
          ],
          "description": "Judge model or 'human' for human review queue."
        }
      }
    }
  }
}

Response

200 example

{
  "success": true
}

All status codes

200 — Pairwise eval started.
400 — (no description)
401 — (no description)
403 — Forbidden — insufficient role for this operation.
429 — (no description)

Code samples

cURL

# Start a pairwise eval run by POSTing a JSON body to the pairwise endpoint.
# The bearer token is read from the EVALGUARD_API_KEY environment variable.
curl -X POST \
  https://evalguard.ai/api/v1/evals/pairwise \
  -H "Authorization: Bearer $EVALGUARD_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{ "dataset_id": "00000000-0000-0000-0000-000000000000", "left": {}, "right": {}, "judge": "gpt-4" }'

TypeScript

import { EvalGuard } from "@evalguard/sdk";

// Create a client whose API key comes from the EVALGUARD_API_KEY environment variable.
const client = new EvalGuard({ apiKey: process.env.EVALGUARD_API_KEY });

// POST the pairwise eval request; `left` and `right` are shown as empty objects here.
const response = await client.request({
  method: "POST",
  path: "/api/v1/evals/pairwise",
  body: {
    dataset_id: "00000000-0000-0000-0000-000000000000",
    left: {},
    right: {},
    judge: "gpt-4",
  },
});

// Print whatever the SDK returned for the request.
console.log(response);

Python

from evalguard import EvalGuard
import os

# Create a client whose API key comes from the EVALGUARD_API_KEY environment variable.
client = EvalGuard(api_key=os.environ["EVALGUARD_API_KEY"])

# Request body for the pairwise run; `left` and `right` are shown as empty objects here.
payload = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",
    "left": {},
    "right": {},
    "judge": "gpt-4",
}

# POST the payload to the pairwise endpoint and print whatever the SDK returns.
response = client.request(
    method="POST",
    path="/api/v1/evals/pairwise",
    body=payload,
)
print(response)

Go

package main

import (
	"context"
	"fmt"
	"os"

	"github.com/evalguard/evalguard-go"
)

// main sends one pairwise eval request and prints the response.
// It panics if the request returns an error.
func main() {
	// The API key is read from the EVALGUARD_API_KEY environment variable.
	c := evalguard.NewClient(os.Getenv("EVALGUARD_API_KEY"))

	// Request body; "left" and "right" are shown as empty objects here.
	payload := map[string]any{
		"dataset_id": "00000000-0000-0000-0000-000000000000",
		"left":       map[string]any{},
		"right":      map[string]any{},
		"judge":      "gpt-4",
	}

	result, err := c.Request(context.Background(), "POST", "/api/v1/evals/pairwise", payload)
	if err != nil {
		panic(err)
	}
	fmt.Println(result)
}

Errors

400 · 401 · 403 · 429

Other Evals endpoints