BatchEvalRunner - Running Multiple Evaluations#

The BatchEvalRunner class can be used to run a series of evaluations asynchronously. The number of concurrent async jobs is capped by the workers argument.
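
The full, runnable setup follows in the sections below; as a quick preview, the typical pattern looks like the sketch here (the evaluator, query engine, and question names are placeholders defined later in this notebook).

# Preview sketch only -- evaluators, query engine, and questions are built below.
from llama_index.core.evaluation import BatchEvalRunner

runner = BatchEvalRunner(
    {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
    workers=8,  # number of concurrent async evaluation jobs
)
eval_results = await runner.aevaluate_queries(query_engine, queries=questions)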

Setup#

%pip install llama-index-llms-openai
# attach to the same event-loop
import nest_asyncio

nest_asyncio.apply()
import os
import openai

os.environ["OPENAI_API_KEY"] = "sk-..."
# openai.api_key = os.environ["OPENAI_API_KEY"]
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
)
from llama_index.core.node_parser import SentenceSplitter
import pandas as pd

pd.set_option("display.max_colwidth", 0)

We use GPT-4 here as the evaluation (judge) LLM, while gpt-3.5-turbo powers the query engine being evaluated.

# gpt-4
gpt4 = OpenAI(temperature=0, model="gpt-4")

faithfulness_gpt4 = FaithfulnessEvaluator(llm=gpt4)
relevancy_gpt4 = RelevancyEvaluator(llm=gpt4)
correctness_gpt4 = CorrectnessEvaluator(llm=gpt4)
documents = SimpleDirectoryReader("./test_wiki_data/").load_data()
# create vector index
llm = OpenAI(temperature=0.3, model="gpt-3.5-turbo")
splitter = SentenceSplitter(chunk_size=512)
vector_index = VectorStoreIndex.from_documents(
    documents, transformations=[splitter]
)
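
Before generating questions, we can optionally sanity-check the query engine with a single query (a quick, illustrative check; any question about the loaded documents works).

# Optional sanity check: ask one question to confirm the index responds.
query_engine = vector_index.as_query_engine(llm=llm)
response = query_engine.query("What is this document about?")
print(str(response)[:200])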

Question Generation#

To run evaluations in batch, you can create the runner and then call the .aevaluate_queries() function on a list of queries.

First, we can generate some questions and then run evaluation on them.

!pip install spacy datasets span-marker scikit-learn
from llama_index.core.evaluation import DatasetGenerator

dataset_generator = DatasetGenerator.from_documents(documents, llm=llm)

qas = dataset_generator.generate_dataset_from_nodes(num=3)
/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:187: DeprecationWarning: Call to deprecated class DatasetGenerator. (Deprecated in favor of `RagDatasetGenerator` which should be used instead.)
  return cls(
/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:282: DeprecationWarning: Call to deprecated class QueryResponseDataset. (Deprecated in favor of `LabelledRagDataset` which should be used instead.)
  return QueryResponseDataset(queries=queries, responses=responses_dict)
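
It can be helpful to look at the generated questions before evaluating; qas.questions holds the list of query strings used in the next section.

# Print the generated evaluation questions.
for question in qas.questions:
    print(question)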

Running Batch Evaluation#

Now, we can run our batch evaluation!

from llama_index.core.evaluation import BatchEvalRunner

runner = BatchEvalRunner(
    {"faithfulness": faithfulness_gpt4, "relevancy": relevancy_gpt4},
    workers=8,
)

eval_results = await runner.aevaluate_queries(
    vector_index.as_query_engine(llm=llm), queries=qas.questions
)

# If we had ground-truth answers, we could also include the correctness evaluator like below.
# The correctness evaluator requires an extra keyword argument (the reference answer);
# each such kwarg is passed as a list with one value per query, in the same order as `queries`.

# runner = BatchEvalRunner(
#     {"correctness": correctness_gpt4},
#     workers=8,
# )

# eval_results = await runner.aevaluate_queries(
#     vector_index.as_query_engine(),
#     queries=qas.queries,
#     reference=[qr[1] for qr in qas.qr_pairs],
# )
print(len(qas.qr_pairs))
3

Inspecting Outputs#

print(eval_results.keys())

print(eval_results["correctness"][0].dict().keys())

print(eval_results["correctness"][0].passing)
print(eval_results["correctness"][0].response)
print(eval_results["correctness"][0].contexts)
dict_keys(['correctness'])
dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source'])
False
The context information does not provide any information related to the query. Therefore, I cannot provide an answer based on the given context.
None
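
Since pandas is already imported, one way to inspect per-query results is to flatten them into a DataFrame. The results_to_df helper below is a hypothetical convenience function (not part of the library) built from the fields shown in the dict_keys output above.

# Hypothetical helper: flatten per-query results into a DataFrame for inspection.
def results_to_df(key, eval_results):
    rows = [
        {
            "query": r.query,
            "response": r.response,
            "passing": r.passing,
            "score": r.score,
            "feedback": r.feedback,
        }
        for r in eval_results[key]
    ]
    return pd.DataFrame(rows)

results_to_df("correctness", eval_results)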

Reporting Total Scores#

def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    score = correct / len(results)
    print(f"{key} Score: {score}")
    return score

score = get_eval_results("correctness", eval_results)
correctness Score: 0.0
score = get_eval_results("relevancy", eval_results)
relevancy Score: 0.96
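
To report every metric at once, the same helper can simply be looped over whatever keys eval_results contains.

# Report the aggregate score for every evaluator that was run.
for key in eval_results:
    get_eval_results(key, eval_results)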