You can create your own custom string evaluators by inheriting from the StringEvaluator class and implementing the _evaluate_strings method (and _aevaluate_strings for async support).
In this example, you will create a perplexity evaluator using the HuggingFace Evaluate library. Perplexity is a measure of how well the generated text would be predicted by the model used to compute the metric: it is the exponential of the average negative log-likelihood of the tokens, so lower scores mean the text is more predictable.
from typing import Any, Optional

from evaluate import load
from langchain.evaluation import StringEvaluator


class PerplexityEvaluator(StringEvaluator):
    """Evaluate the perplexity of a predicted string."""

    def __init__(self, model_id: str = "gpt2"):
        self.model_id = model_id
        self.metric_fn = load(
            "perplexity", module_type="metric", model_id=self.model_id, pad_token=0
        )

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        results = self.metric_fn.compute(
            predictions=[prediction], model_id=self.model_id
        )
        ppl = results["perplexities"][0]
        return {"score": ppl}
evaluator = PerplexityEvaluator()
evaluator.evaluate_strings(prediction="The rains in Spain fall mainly on the plain.")
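The call returns a dictionary with the perplexity under the "score" key. As a rough sanity check, a less fluent sentence should usually receive a higher (worse) perplexity; the scrambled prediction below is an illustrative example, not from the original docs:

# Scrambled word order is harder for GPT-2 to predict, so its perplexity should be higher.
fluent = evaluator.evaluate_strings(
    prediction="The rains in Spain fall mainly on the plain."
)
scrambled = evaluator.evaluate_strings(
    prediction="Plain the on mainly fall Spain in rains the."
)
print(fluent["score"], scrambled["score"])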
References:
https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/string/custom/