API Reference

Complete reference for all public classes and functions in the Eval AI Library.

Core Functions

evaluate()

Run evaluation on single-turn test cases.

async def evaluate(
    test_cases: list[EvalTestCase],
    metrics: list[MetricPattern],
    verbose: bool = True,
    show_dashboard: bool = False,
    session_name: str | None = None
) -> list[tuple[None, list[TestCaseResult]]]:
Parameter       Type                 Default   Description
test_cases      list[EvalTestCase]   required  Test cases to evaluate
metrics         list[MetricPattern]  required  Metrics to apply
verbose         bool                 True      Show console progress
show_dashboard  bool                 False     Open dashboard in browser
session_name    str | None           None      Name for caching
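
A minimal usage sketch, assuming an async entry point and that AnswerRelevancyMetric takes the (model, threshold) constructor arguments shown for MetricPattern below:

import asyncio

from eval_lib import AnswerRelevancyMetric, EvalTestCase, evaluate

test_case = EvalTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
)
# Constructor arguments assumed to follow MetricPattern.__init__(model, threshold).
metric = AnswerRelevancyMetric(model="gpt-4o", threshold=0.7)

async def main():
    results = await evaluate(test_cases=[test_case], metrics=[metric])
    # Each element is a (None, list[TestCaseResult]) tuple.
    for _, case_results in results:
        for case_result in case_results:
            print(case_result.success, [m.score for m in case_result.metrics_data])

asyncio.run(main())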

evaluate_conversations()

Run evaluation on multi-turn conversational test cases.

async def evaluate_conversations(
    conv_cases: list[ConversationalEvalTestCase],
    metrics: list[ConversationalMetricPattern],
    verbose: bool = True
) -> list[tuple[None, list[ConversationalTestCaseResult]]]:
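
A minimal multi-turn sketch along the same lines, assuming RoleAdherenceMetric follows the ConversationalMetricPattern constructor shown below:

import asyncio

from eval_lib import (
    ConversationalEvalTestCase,
    EvalTestCase,
    RoleAdherenceMetric,
    evaluate_conversations,
)

conv_case = ConversationalEvalTestCase(
    chatbot_role="You are a polite travel assistant.",
    turns=[
        EvalTestCase(input="Can you plan a trip for me?",
                     actual_output="Of course! Where would you like to go?"),
        EvalTestCase(input="Somewhere warm in December.",
                     actual_output="The Canary Islands are a popular December choice."),
    ],
)
metric = RoleAdherenceMetric(model="gpt-4o", threshold=0.7)  # args assumed from ConversationalMetricPattern

async def main():
    results = await evaluate_conversations(conv_cases=[conv_case], metrics=[metric])
    for _, case_results in results:
        for case_result in case_results:
            print(case_result.success, case_result.metrics_data[0].reason)

asyncio.run(main())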

chat_complete()

Direct LLM chat completion.

async def chat_complete(
    llm: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
    messages: list[dict[str, str]],
    temperature: float = 0.0
) -> tuple[str, float | None]:

Returns: (response_text, cost_in_usd)
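
A short sketch of a direct completion call; the model string is only an example, and any of the llm forms listed under LLMDescriptor should work:

import asyncio

from eval_lib import chat_complete  # import path assumed

async def main():
    response_text, cost = await chat_complete(
        llm="openai:gpt-4o",  # an ("openai", "gpt-4o") tuple, LLMDescriptor, or CustomLLMClient also works
        messages=[{"role": "user", "content": "Summarize retrieval-augmented generation in one sentence."}],
        temperature=0.0,
    )
    print(response_text)
    print(f"cost: {cost} USD")

asyncio.run(main())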

get_embeddings()

Generate text embeddings.

async def get_embeddings(
    model: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
    texts: list[str]
) -> tuple[list[list[float]], float | None]:

Returns: (embeddings, cost_in_usd)
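
A short sketch; the embedding model name here is an assumption, substitute any model your provider supports:

import asyncio

from eval_lib import get_embeddings  # import path assumed

async def main():
    embeddings, cost = await get_embeddings(
        model="openai:text-embedding-3-small",  # example model name, not a documented default
        texts=["first document", "second document"],
    )
    # One vector per input text, plus the optional USD cost.
    print(len(embeddings), len(embeddings[0]), cost)

asyncio.run(main())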

score_agg()

Temperature-controlled score aggregation.

def score_agg(
    scores: list[float],
    temperature: float = 0.5,
    penalty: float = 0.1,
    eps_for_neg_p: float = 1e-9
) -> float:
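
A usage sketch; the aggregation formula is internal, so the comment on the temperature knob is an assumption about intent, not documented behavior:

from eval_lib import score_agg  # import path assumed

scores = [0.9, 0.8, 0.3]
print(score_agg(scores))                   # defaults: temperature=0.5, penalty=0.1
print(score_agg(scores, temperature=0.0))  # temperature assumed to control how strongly low scores pull the result down
print(score_agg(scores, temperature=1.0))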

extract_json_block()

Extract JSON from LLM responses (handles markdown code blocks, embedded JSON, etc.).

def extract_json_block(text: str) -> str:
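
A sketch of pulling structured JSON out of a fenced LLM reply and parsing it:

import json

from eval_lib import extract_json_block  # import path assumed

raw = 'Here is the verdict:\n```json\n{"score": 0.8, "reason": "mostly relevant"}\n```'
data = json.loads(extract_json_block(raw))
print(data["score"], data["reason"])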

Test Case Models

EvalTestCase

class EvalTestCase(BaseModel):
    input: str                                      # User query (required)
    actual_output: str                              # AI response (required)
    expected_output: str | None = None              # Reference answer
    retrieval_context: list[str] | None = None      # Retrieved documents
    tools_called: list[str] | None = None           # Tools actually called
    expected_tools: list[str] | None = None         # Expected tool calls
    reasoning: str | None = None                    # Chain of thought
    name: str | None = None                         # Test case label
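
A construction sketch showing the two required fields plus a few optional ones:

from eval_lib import EvalTestCase

case = EvalTestCase(
    input="How do I reset my password?",
    actual_output="Click 'Forgot password' on the login page.",
    expected_output="Use the 'Forgot password' link to receive a reset email.",
    retrieval_context=["Password resets are triggered from the 'Forgot password' link."],
    name="password-reset",
)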

ConversationalEvalTestCase

class ConversationalEvalTestCase(BaseModel):
    turns: list[EvalTestCase]                       # Conversation turns
    chatbot_role: str | None = None                 # System prompt / role
    name: str | None = None                         # Test case label

ToolCall

class ToolCall(BaseModel):
    name: str                                       # Tool name
    description: str | None = None                  # Tool description
    reasoning: str | None = None                    # Why tool was called

Result Models

MetricResult

@dataclass
class MetricResult:
    name: str                               # Metric name
    score: float                            # Score (0.0-1.0)
    threshold: float                        # Pass threshold
    success: bool                           # score >= threshold
    evaluation_cost: float | None           # Cost in USD
    reason: str                             # Human-readable explanation
    evaluation_model: str                   # Model used for evaluation
    evaluation_log: dict | None = None      # Detailed evaluation data

TestCaseResult

@dataclass
class TestCaseResult:
    input: str
    actual_output: str
    expected_output: str | None
    retrieval_context: list[str] | None
    success: bool                           # All metrics passed
    metrics_data: list[MetricResult]
    tools_called: list[str] | None = None
    expected_tools: list[str] | None = None

ConversationalTestCaseResult

@dataclass
class ConversationalTestCaseResult:
    dialogue: list[dict[str, str]]          # role/content pairs
    success: bool                           # All metrics passed
    metrics_data: list[MetricResult]

LLM Types

Provider (Enum)

class Provider(str, Enum):
    OPENAI = "openai"
    AZURE = "azure"
    GOOGLE = "google"
    ANTHROPIC = "anthropic"
    OLLAMA = "ollama"
    CUSTOM = "custom"

LLMDescriptor

class LLMDescriptor:
    provider: Provider
    model: str

Parses from:
- String: "gpt-4o" or "openai:gpt-4o"
- Tuple: ("openai", "gpt-4o")
- LLMDescriptor instance
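
The three accepted spellings side by side; constructing LLMDescriptor with keyword arguments is an assumption based on the attributes listed above:

from eval_lib import LLMDescriptor, Provider  # import paths assumed

llm_as_string = "openai:gpt-4o"        # bare "gpt-4o" also resolves, with the provider inferred
llm_as_tuple = ("openai", "gpt-4o")
llm_as_descriptor = LLMDescriptor(provider=Provider.OPENAI, model="gpt-4o")  # kwargs assumed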

CustomLLMClient (ABC)

class CustomLLMClient(ABC):
    async def chat_complete(self, messages: list, temperature: float) -> tuple[str, float | None]: ...
    async def get_embeddings(self, texts: list, model: str) -> tuple[list[list[float]], float | None]: ...
    def get_model_name(self) -> str: ...
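
A skeleton subclass, assuming the three methods above are the full interface and that returning None for cost is acceptable when pricing is unknown:

from eval_lib import CustomLLMClient  # import path assumed

class MyLocalClient(CustomLLMClient):
    async def chat_complete(self, messages: list, temperature: float) -> tuple[str, float | None]:
        # Call your own model here; return the text and an optional cost in USD (None if unknown).
        return "stub response", None

    async def get_embeddings(self, texts: list, model: str) -> tuple[list[list[float]], float | None]:
        # Return one embedding vector per input text.
        return [[0.0, 0.0, 0.0] for _ in texts], None

    def get_model_name(self) -> str:
        return "my-local-model"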

Base Metric Classes

MetricPattern

Base class for single-turn metrics.

class MetricPattern:
    name: str

    def __init__(self, model: str, threshold: float, verbose: bool = False): ...
    async def evaluate(self, test_case: EvalTestCase) -> dict[str, Any]: ...
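
A hypothetical custom-metric sketch; the keys of the returned dict are not documented here, so the fields below simply mirror MetricResult and are an assumption:

from typing import Any

from eval_lib import EvalTestCase, MetricPattern  # import paths assumed

class ExactMatchMetric(MetricPattern):
    name = "Exact Match"

    async def evaluate(self, test_case: EvalTestCase) -> dict[str, Any]:
        score = 1.0 if test_case.actual_output == test_case.expected_output else 0.0
        return {
            "score": score,                      # key names assumed to mirror MetricResult fields
            "success": score >= self.threshold,  # self.threshold assumed to be stored by __init__
            "reason": "exact string comparison",
        }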

ConversationalMetricPattern

Base class for multi-turn metrics.

class ConversationalMetricPattern:
    name: str

    def __init__(self, model: str, threshold: float, verbose: bool = False): ...
    async def evaluate(self, test_case: ConversationalEvalTestCase) -> dict[str, Any]: ...

All Metrics

RAG Metrics

Class                      Import
AnswerRelevancyMetric      from eval_lib import AnswerRelevancyMetric
AnswerPrecisionMetric      from eval_lib import AnswerPrecisionMetric
FaithfulnessMetric         from eval_lib import FaithfulnessMetric
ContextualRelevancyMetric  from eval_lib import ContextualRelevancyMetric
ContextualPrecisionMetric  from eval_lib import ContextualPrecisionMetric
ContextualRecallMetric     from eval_lib import ContextualRecallMetric
BiasMetric                 from eval_lib import BiasMetric
ToxicityMetric             from eval_lib import ToxicityMetric
RestrictedRefusalMetric    from eval_lib import RestrictedRefusalMetric
CustomEvalMetric           from eval_lib import CustomEvalMetric
GEval                      from eval_lib import GEval

Agent Metrics

Class                     Import
ToolCorrectnessMetric     from eval_lib import ToolCorrectnessMetric
TaskSuccessRateMetric     from eval_lib import TaskSuccessRateMetric
RoleAdherenceMetric       from eval_lib import RoleAdherenceMetric
KnowledgeRetentionMetric  from eval_lib import KnowledgeRetentionMetric
ToolsErrorMetric          from eval_lib import ToolsErrorMetric

Security Metrics

Class                            Import
PromptInjectionDetectionMetric   from eval_lib import PromptInjectionDetectionMetric
JailbreakDetectionMetric         from eval_lib import JailbreakDetectionMetric
PIILeakageMetric                 from eval_lib import PIILeakageMetric
HarmfulContentMetric             from eval_lib import HarmfulContentMetric
PromptInjectionResistanceMetric  from eval_lib import PromptInjectionResistanceMetric
JailbreakResistanceMetric        from eval_lib import JailbreakResistanceMetric
PolicyComplianceMetric           from eval_lib import PolicyComplianceMetric

Data Generation

Class             Import
DatasetGenerator  from eval_lib import DatasetGenerator
DocumentLoader    from eval_lib import DocumentLoader