diff --git a/evals/eval_config.json b/evals/eval_config.json
index c51f9e79..fe50ef52 100644
--- a/evals/eval_config.json
+++ b/evals/eval_config.json
@@ -1,14 +1,15 @@
 {
     "testdata_path": "ground_truth.jsonl",
     "results_dir": "results/experiment",
-    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citation_match"],
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
     "target_url": "http://127.0.0.1:8000/chat",
     "target_parameters": {
         "overrides": {
             "use_advanced_flow": true,
             "top": 3,
             "retrieval_mode": "hybrid",
-            "temperature": 0.3
+            "temperature": 0.3,
+            "seed": 42
         }
     },
     "target_response_answer_jmespath": "message.content",
diff --git a/evals/evaluate.py b/evals/evaluate.py
index 9ec2eb31..c4075479 100644
--- a/evals/evaluate.py
+++ b/evals/evaluate.py
@@ -1,17 +1,46 @@
 import argparse
 import logging
 import os
+import re
 from pathlib import Path
 from typing import Any
 
 import azure.identity
 from dotenv import load_dotenv
 from evaltools.eval.evaluate import run_evaluate_from_config
+from evaltools.eval.evaluate_metrics import register_metric
+from evaltools.eval.evaluate_metrics.base_metric import BaseMetric
 from rich.logging import RichHandler
 
 logger = logging.getLogger("ragapp")
 
 
+class CitationsMatchedMetric(BaseMetric):
+    METRIC_NAME = "citations_matched"
+
+    @classmethod
+    def evaluator_fn(cls, **kwargs):
+        def citations_overlap(*, response, ground_truth, **kwargs):
+            if response is None:
+                logger.warning("Received response of None, can't compute citations_matched metric. Setting to -1.")
+                return {cls.METRIC_NAME: -1}
+            truth_citations = set(re.findall(r"\[(\d+)\]", ground_truth))
+            response_citations = set(re.findall(r"\[(\d+)\]", response))
+            # Compute the fraction of ground truth citations that appear in the response
+            num_citations = len(truth_citations)
+            if num_citations == 0:
+                return {cls.METRIC_NAME: -1}
+            num_matched_citations = len(truth_citations.intersection(response_citations))
+            return {cls.METRIC_NAME: num_matched_citations / num_citations}
+
+        return citations_overlap
+
+    @classmethod
+    def get_aggregate_stats(cls, df):
+        df = df[df[cls.METRIC_NAME] != -1]
+        return {"mean": round(df[cls.METRIC_NAME].mean(), 2)}
+
+
 def get_openai_config() -> dict:
     openai_config: dict[str, Any]
     if os.environ.get("OPENAI_CHAT_HOST") == "azure":
@@ -60,6 +89,7 @@ def get_openai_config() -> dict:
 
     openai_config = get_openai_config()
 
+    register_metric(CitationsMatchedMetric)
     run_evaluate_from_config(
         working_dir=Path(__file__).parent,
         config_path="eval_config.json",
diff --git a/src/backend/fastapi_app/api_models.py b/src/backend/fastapi_app/api_models.py
index 35068a80..61027830 100644
--- a/src/backend/fastapi_app/api_models.py
+++ b/src/backend/fastapi_app/api_models.py
@@ -28,6 +28,7 @@ class ChatRequestOverrides(BaseModel):
     retrieval_mode: RetrievalMode = RetrievalMode.HYBRID
     use_advanced_flow: bool = True
     prompt_template: str | None = None
+    seed: int | None = None
 
 
 class ChatRequestContext(BaseModel):
diff --git a/src/backend/fastapi_app/rag_advanced.py b/src/backend/fastapi_app/rag_advanced.py
index b3a253fe..48ac8a09 100644
--- a/src/backend/fastapi_app/rag_advanced.py
+++ b/src/backend/fastapi_app/rag_advanced.py
@@ -35,7 +35,11 @@ def __init__(
         self.chat_token_limit = get_token_limit(chat_model, default_to_minimum=True)
 
     async def generate_search_query(
-        self, original_user_query: str, past_messages: list[ChatCompletionMessageParam], query_response_token_limit: int
+        self,
+        original_user_query: str,
+        past_messages: list[ChatCompletionMessageParam],
+        query_response_token_limit: int,
+        seed: int | None = None,
     ) -> tuple[list[ChatCompletionMessageParam], Any | str | None, list]:
         """Generate an optimized keyword search query based on the chat history and the last question"""
 
@@ -63,6 +67,7 @@ async def generate_search_query(
             n=1,
             tools=tools,
             tool_choice=tool_choice,
+            seed=seed,
         )
 
         query_text, filters = extract_search_arguments(original_user_query, chat_completion)
@@ -76,6 +81,7 @@ async def prepare_context(
             original_user_query=chat_params.original_user_query,
             past_messages=chat_params.past_messages,
             query_response_token_limit=500,
+            seed=chat_params.seed,
         )
 
         # Retrieve relevant rows from the database with the GPT optimized query
@@ -142,6 +148,7 @@ async def answer(
             max_tokens=chat_params.response_token_limit,
             n=1,
             stream=False,
+            seed=chat_params.seed,
         )
 
         return RetrievalResponse(
diff --git a/src/backend/fastapi_app/rag_base.py b/src/backend/fastapi_app/rag_base.py
index 183647e7..34fba44a 100644
--- a/src/backend/fastapi_app/rag_base.py
+++ b/src/backend/fastapi_app/rag_base.py
@@ -36,6 +36,7 @@ def get_params(self, messages: list[ChatCompletionMessageParam], overrides: Chat
         return ChatParams(
             top=overrides.top,
             temperature=overrides.temperature,
+            seed=overrides.seed,
             retrieval_mode=overrides.retrieval_mode,
             use_advanced_flow=overrides.use_advanced_flow,
             response_token_limit=response_token_limit,
diff --git a/src/backend/fastapi_app/rag_simple.py b/src/backend/fastapi_app/rag_simple.py
index 638f03ea..4bf50d1f 100644
--- a/src/backend/fastapi_app/rag_simple.py
+++ b/src/backend/fastapi_app/rag_simple.py
@@ -90,6 +90,7 @@ async def answer(
             max_tokens=chat_params.response_token_limit,
             n=1,
             stream=False,
+            seed=chat_params.seed,
         )
 
         return RetrievalResponse(
@@ -130,6 +131,7 @@ async def answer_stream(
             max_tokens=chat_params.response_token_limit,
             n=1,
             stream=True,
+            seed=chat_params.seed,
         )
 
         yield RetrievalResponseDelta(
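For a quick sanity check of the overlap logic behind the new `citations_matched` metric, here is a minimal standalone sketch; the helper name and test strings are illustrative (not part of the diff), and the `-1` sentinel mirrors the convention in `CitationsMatchedMetric` above:

```python
import re


def citations_matched(response: str, ground_truth: str) -> float:
    """Fraction of ground-truth citations like [1] that also appear in the response.

    Returns -1 when there are no ground-truth citations to match against,
    mirroring the "not computable" sentinel used by CitationsMatchedMetric.
    """
    truth_citations = set(re.findall(r"\[(\d+)\]", ground_truth))
    response_citations = set(re.findall(r"\[(\d+)\]", response))
    if not truth_citations:
        return -1
    return len(truth_citations & response_citations) / len(truth_citations)


# [1] is matched, [2] is not: 1 of 2 ground-truth citations found
assert citations_matched("See [1] and [3].", "See [1] and [2].") == 0.5
```

The `seed` override threaded through each chat completion call requests best-effort deterministic sampling from the OpenAI API, which makes repeated evaluation runs more comparable; determinism is not guaranteed across backend changes, so outputs can still shift when `system_fingerprint` changes.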