Source code for mlflow.genai.judges.databricks

from functools import wraps
from typing import Any, Callable, Optional, Union

from mlflow.entities.assessment import Feedback
from mlflow.genai.utils.enum_utils import StrEnum
from mlflow.utils.annotations import experimental

# NB: User-facing name for the is_context_relevant assessment.
_IS_CONTEXT_RELEVANT_ASSESSMENT_NAME = "relevance_to_context"


class CategoricalRating(StrEnum):
    """
    A categorical rating for an assessment.

    Example:

    .. code-block:: python

        from mlflow.genai.judges import CategoricalRating
        from mlflow.entities import Feedback

        # Create feedback with categorical rating
        feedback = Feedback(
            name="my_metric",
            value=CategoricalRating.YES,
            rationale="The metric is passing.",
        )
    """

    YES = "yes"
    NO = "no"
    UNKNOWN = "unknown"

    @classmethod
    def _missing_(cls, value: str):
        value = value.lower()
        for member in cls:
            if member == value:
                return member
        return cls.UNKNOWN

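# A minimal usage sketch for the fallback behaviour of ``_missing_`` above:
# lookups fall back case-insensitively (StrEnum members compare equal to their
# lowercase string values), and unrecognised values map to UNKNOWN.
#
#   CategoricalRating("yes")    # CategoricalRating.YES
#   CategoricalRating("YES")    # CategoricalRating.YES, via _missing_
#   CategoricalRating("maybe")  # CategoricalRating.UNKNOWN
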
def _sanitize_feedback(feedback: Feedback) -> Feedback:
    """Sanitize the feedback object returned by the databricks judges.

    The judge returns a CategoricalRating defined in the databricks-agents package.
    This function converts it to our CategoricalRating definition above.

    Args:
        feedback: The Feedback object to convert.

    Returns:
        The Feedback object with its value converted to our CategoricalRating.
    """
    feedback.value = CategoricalRating(feedback.value.value)
    return feedback


def requires_databricks_agents(func):
    """Decorator to check if the `databricks-agents` package is installed."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            import databricks.agents.evals.judges  # noqa: F401
        except ImportError:
            raise ImportError(
                f"The `databricks-agents` package is required to use "
                f"`mlflow.genai.judges.{func.__name__}`. "
                "Please install it with `pip install databricks-agents`."
            )
        return func(*args, **kwargs)

    return wrapper

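# A minimal sketch of the guard above, assuming `databricks-agents` is NOT
# installed in the current environment (the error text comes from `wrapper`):
#
#   from mlflow.genai.judges import is_safe
#
#   try:
#       is_safe(content="I am a happy person.")
#   except ImportError as e:
#       print(e)
#       # The `databricks-agents` package is required to use
#       # `mlflow.genai.judges.is_safe`. Please install it with
#       # `pip install databricks-agents`.
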
@requires_databricks_agents
def is_context_relevant(*, request: str, context: Any, name: Optional[str] = None) -> Feedback:
    """
    LLM judge determines whether the given context is relevant to the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the relevance to the request.
            Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is relevant to the request.

    Example:

        The following example shows how to evaluate whether a document retrieved by a
        retriever is relevant to the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_relevant

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is the capital of France.",
            )
            print(feedback.value)  # "yes"

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is known for its Eiffel Tower.",
            )
            print(feedback.value)  # "no"
    """
    from databricks.agents.evals.judges import relevance_to_query

    return _sanitize_feedback(
        relevance_to_query(
            request=request,
            response=str(context),
            # NB: User-facing name for the is_context_relevant assessment. This is
            # required since the existing databricks judge is called `relevance_to_query`.
            assessment_name=name or _IS_CONTEXT_RELEVANT_ASSESSMENT_NAME,
        )
    )

@requires_databricks_agents
def is_context_sufficient(
    *,
    request: str,
    context: Any,
    expected_facts: list[str],
    expected_response: Optional[str] = None,
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given context is sufficient to answer the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the sufficiency of.
            Supports any JSON-serializable object.
        expected_facts: A list of expected facts that should be present in the context.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is sufficient to answer the request.

    Example:

        The following example shows how to evaluate whether the documents returned by a
        retriever give sufficient context to answer the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_sufficient

            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
                expected_facts=["Paris is the capital of France."],
            )
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import context_sufficiency

    return _sanitize_feedback(
        context_sufficiency(
            request=request,
            retrieved_context=context,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=name,
        )
    )

@requires_databricks_agents
def is_correct(
    *,
    request: str,
    response: str,
    expected_facts: list[str],
    expected_response: Optional[str] = None,
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is correct for the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        expected_facts: A list of expected facts that should be present in the response.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is correct for the request.
    """
    from databricks.agents.evals.judges import correctness

    return _sanitize_feedback(
        correctness(
            request=request,
            response=response,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=name,
        )
    )

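# A minimal usage sketch for `is_correct` (assumes `databricks-agents` is
# installed and the underlying `correctness` judge is reachable; the printed
# value is illustrative):
#
#   from mlflow.genai.judges import is_correct
#
#   feedback = is_correct(
#       request="What is the capital of France?",
#       response="The capital of France is Paris.",
#       expected_facts=["Paris is the capital of France."],
#   )
#   print(feedback.value)  # "yes"
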
@requires_databricks_agents
def is_grounded(
    *, request: str, response: str, context: Any, name: Optional[str] = None
) -> Feedback:
    """
    LLM judge determines whether the given response is grounded in the given context.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        context: Context to evaluate the response against.
            Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is grounded in the context.

    Example:

        The following example shows how to evaluate whether the response is grounded in
        the context.

        .. code-block:: python

            from mlflow.genai.judges import is_grounded

            feedback = is_grounded(
                request="What is the capital of France?",
                response="Paris",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import groundedness

    return _sanitize_feedback(
        groundedness(
            request=request,
            response=response,
            retrieved_context=context,
            assessment_name=name,
        )
    )

@requires_databricks_agents
def is_safe(*, content: str, name: Optional[str] = None) -> Feedback:
    """
    LLM judge determines whether the given content is safe.

    Args:
        content: Text content to evaluate for safety.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the content is safe.

    Example:

        .. code-block:: python

            from mlflow.genai.judges import is_safe

            feedback = is_safe(content="I am a happy person.")
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import safety

    return _sanitize_feedback(safety(response=content, assessment_name=name))

@requires_databricks_agents
def meets_guidelines(
    *,
    guidelines: Union[str, list[str]],
    context: dict[str, Any],
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given response meets the given guideline(s).

    Args:
        guidelines: A single guideline or a list of guidelines.
        context: Mapping of context to be evaluated against the guidelines. For example,
            pass {"response": "<response text>"} to evaluate whether the response meets
            the given guidelines.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response meets the guideline(s).

    Example:

        The following example shows how to evaluate whether the response meets the given
        guideline(s).

        .. code-block:: python

            from mlflow.genai.judges import meets_guidelines

            feedback = meets_guidelines(
                guidelines="Be polite and respectful.",
                context={"response": "Hello, how are you?"},
            )
            print(feedback.value)  # "yes"

            feedback = meets_guidelines(
                guidelines=["Be polite and respectful.", "Must be in English."],
                context={"response": "Hola, ¿cómo estás?"},
            )
            print(feedback.value)  # "no"
    """
    from databricks.agents.evals.judges import guideline_adherence

    # Ensure guidelines is a list, as the underlying databricks judge only accepts lists.
    if isinstance(guidelines, str):
        guidelines = [guidelines]

    return _sanitize_feedback(
        guideline_adherence(
            guidelines=guidelines,
            guidelines_context=context,
            assessment_name=name,
        )
    )

@experimental
@requires_databricks_agents
def custom_prompt_judge(
    *,
    name: str,
    prompt_template: str,
    numeric_values: Optional[dict[str, Union[int, float]]] = None,
) -> Callable[..., Feedback]:
    """
    Create a custom prompt judge that evaluates inputs using a template.

    Example prompt template:

    .. code-block::

        You will look at the response and determine the formality of the response.

        <request>{{request}}</request>
        <response>{{response}}</response>

        You must choose one of the following categories.

        [[formal]]: The response is very formal.
        [[semi_formal]]: The response is somewhat formal. The response is somewhat formal
        if the response mentions friendship, etc.
        [[not_formal]]: The response is not formal.

    Variable names in the template should be enclosed in double curly braces, e.g.,
    `{{request}}`, `{{response}}`. They should be alphanumeric and can include underscores,
    but should not contain spaces or special characters.

    The prompt template must request choices as outputs, with each choice enclosed in
    square brackets. Choice names should be alphanumeric and can include underscores
    and spaces.

    Args:
        name: Name of the judge, used as the name of the returned
            :py:class:`~mlflow.entities.Feedback` object.
        prompt_template: Template string with {{var_name}} placeholders for variable
            substitution. The template should request one of the choices as output.
        numeric_values: Optional mapping from categorical values to numeric scores.
            Useful if you want to create a custom judge that returns continuous valued
            outputs. Defaults to None.

    Returns:
        A callable that takes keyword arguments mapping to the template variables and
        returns an mlflow :py:class:`~mlflow.entities.Feedback`.
    """
    from databricks.agents.evals.judges import custom_prompt_judge

    return custom_prompt_judge(
        name=name,
        prompt_template=prompt_template,
        numeric_values=numeric_values,
    )
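

# A minimal usage sketch for `custom_prompt_judge` (assumes `databricks-agents`
# is installed; the returned value is expected to be one of the bracketed
# choices, or the mapped score when `numeric_values` is given):
#
#   from mlflow.genai.judges import custom_prompt_judge
#
#   formality_judge = custom_prompt_judge(
#       name="formality",
#       prompt_template=(
#           "You will look at the response and determine the formality of the response.\n\n"
#           "<request>{{request}}</request>\n"
#           "<response>{{response}}</response>\n\n"
#           "You must choose one of the following categories.\n\n"
#           "[[formal]]: The response is very formal.\n"
#           "[[semi_formal]]: The response is somewhat formal.\n"
#           "[[not_formal]]: The response is not formal.\n"
#       ),
#   )
#
#   feedback = formality_judge(request="Hi!", response="Good day to you.")
#   print(feedback.value)  # e.g. "formal"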