Custom Metrics & Visualizations

MLflow's evaluation framework allows you to define custom metrics and create specialized visualizations tailored to your specific business requirements. This capability is essential when standard metrics don't capture your domain's unique success criteria or when you need custom visual analysis for stakeholder communication.

Quick Start: Creating Custom Metrics

Define domain-specific metrics using MLflow's metric builder:

import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mlflow.models import make_metric
from mlflow.metrics.base import MetricValue

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Create evaluation dataset
eval_data = pd.DataFrame(X_test)
eval_data["target"] = y_test


# Define a custom business metric
def business_value_metric(predictions, targets, metrics):
    """
    Custom metric calculating business value impact.
    True Positives = $100 value, False Positives = -$20 cost
    """
    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))
    tn = np.sum((predictions == 0) & (targets == 0))
    fn = np.sum((predictions == 0) & (targets == 1))

    # Business logic: TP worth $100, FP costs $20
    business_value = (tp * 100) - (fp * 20)
    total_possible_value = np.sum(targets == 1) * 100

    return MetricValue(
        scores=[business_value],  # Total business value
        aggregate_results={
            "total_business_value": business_value,
            "value_per_prediction": business_value / len(predictions),
            "value_efficiency": business_value / total_possible_value
            if total_possible_value > 0
            else 0,
        },
    )


# Create the metric
business_metric = make_metric(
    eval_fn=business_value_metric, greater_is_better=True, name="business_value"
)

with mlflow.start_run():
    # Log model and capture its URI from the returned ModelInfo
    model_info = mlflow.sklearn.log_model(model, name="model")
    model_uri = model_info.model_uri

    # Evaluate with custom metric
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[business_metric],
    )

    print(f"Standard Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(
        f"Business Value: ${result.metrics['business_value/total_business_value']:.2f}"
    )
    print(
        f"Value per Prediction: ${result.metrics['business_value/value_per_prediction']:.2f}"
    )

Custom Metric Patterns

Create metrics that translate model performance into business terms:

def create_profit_loss_metric(cost_per_fp=50, revenue_per_tp=200):
    """Calculate profit/loss impact of model predictions."""

    def eval_fn(predictions, targets, metrics):
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))
        tn = np.sum((predictions == 0) & (targets == 0))

        # Calculate financial impact
        revenue = tp * revenue_per_tp
        costs = fp * cost_per_fp
        missed_opportunity = fn * revenue_per_tp
        net_profit = revenue - costs

        return MetricValue(
            aggregate_results={
                "net_profit": net_profit,
                "total_revenue": revenue,
                "total_costs": costs,
                "missed_revenue": missed_opportunity,
                "roi": (net_profit / max(costs, 1)) * 100,
            }
        )

    return make_metric(eval_fn=eval_fn, greater_is_better=True, name="profit_loss")


# Usage
profit_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[profit_metric],
    )

    print(f"Net Profit: ${result.metrics['profit_loss/net_profit']:.2f}")
    print(f"ROI: {result.metrics['profit_loss/roi']:.1f}%")

Custom Visualizations

Generate custom visual analysis beyond standard plots:

import matplotlib.pyplot as plt
import seaborn as sns
import os


def create_business_impact_visualization(eval_df, builtin_metrics, artifacts_dir):
    """Create custom business impact visualization."""

    # Calculate business segments
    eval_df["prediction_confidence"] = np.abs(eval_df["prediction"] - 0.5)
    eval_df["confidence_segment"] = pd.cut(
        eval_df["prediction_confidence"],
        bins=[0, 0.1, 0.3, 0.5],
        labels=["Low", "Medium", "High"],
    )

    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Accuracy by confidence segment
    accuracy_by_segment = eval_df.groupby("confidence_segment").apply(
        lambda x: (x["prediction"] == x["target"]).mean()
    )

    axes[0, 0].bar(
        accuracy_by_segment.index, accuracy_by_segment.values, color="skyblue"
    )
    axes[0, 0].set_title("Accuracy by Confidence Segment")
    axes[0, 0].set_ylabel("Accuracy")
    axes[0, 0].set_ylim(0, 1)

    # 2. Prediction distribution
    axes[0, 1].hist(
        eval_df["prediction"],
        bins=20,
        alpha=0.7,
        label="Predictions",
        color="lightgreen",
    )
    axes[0, 1].axvline(
        eval_df["prediction"].mean(), color="red", linestyle="--", label="Mean"
    )
    axes[0, 1].set_title("Prediction Distribution")
    axes[0, 1].legend()

    # 3. Confusion matrix heatmap
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(eval_df["target"], eval_df["prediction"])
    sns.heatmap(cm, annot=True, fmt="d", ax=axes[1, 0], cmap="Blues")
    axes[1, 0].set_title("Confusion Matrix")
    axes[1, 0].set_xlabel("Predicted")
    axes[1, 0].set_ylabel("Actual")

    # 4. Business value by segment
    def calculate_segment_value(segment_data):
        tp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 1))
        fp = np.sum((segment_data["prediction"] == 1) & (segment_data["target"] == 0))
        return (tp * 100) - (fp * 20)  # Business value calculation

    value_by_segment = eval_df.groupby("confidence_segment").apply(
        calculate_segment_value
    )

    colors = ["lightcoral" if v < 0 else "lightgreen" for v in value_by_segment.values]
    axes[1, 1].bar(value_by_segment.index, value_by_segment.values, color=colors)
    axes[1, 1].set_title("Business Value by Confidence Segment")
    axes[1, 1].set_ylabel("Business Value ($)")
    axes[1, 1].axhline(y=0, color="black", linestyle="-", alpha=0.3)

    plt.tight_layout()

    # Save visualization
    viz_path = os.path.join(artifacts_dir, "business_impact_analysis.png")
    plt.savefig(viz_path, dpi=300, bbox_inches="tight")
    plt.close()

    return {"business_impact_analysis": viz_path}
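
To wire this function into an evaluation run, pass it via the custom_artifacts argument of mlflow.evaluate. MLflow calls each custom artifact function with the evaluation DataFrame, the built-in metrics, and a temporary artifacts directory, then logs the files named in the returned dict. A minimal sketch, reusing model_uri and eval_data from the Quick Start:

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        custom_artifacts=[create_business_impact_visualization],
    )

    # Logged artifacts are available on the result object
    for name, artifact in result.artifacts.items():
        print(f"{name}: {artifact}")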

Advanced Custom Metrics

Create composite metrics that combine multiple evaluation criteria:

def create_composite_business_metric():
    """Composite metric combining accuracy, profit, and risk measures."""

    def eval_fn(predictions, targets, metrics):
        # Get standard accuracy
        accuracy = metrics.get("accuracy_score", 0)

        # Calculate profit (reuse previous logic)
        tp = np.sum((predictions == 1) & (targets == 1))
        fp = np.sum((predictions == 1) & (targets == 0))
        fn = np.sum((predictions == 0) & (targets == 1))

        profit = (tp * 100) - (fp * 20) - (fn * 50)  # Include missed opportunity cost

        # Calculate risk (variance in performance across segments)
        n_segments = 5
        segment_size = len(predictions) // n_segments
        segment_accuracies = []

        for i in range(n_segments):
            start_idx = i * segment_size
            end_idx = (
                start_idx + segment_size if i < n_segments - 1 else len(predictions)
            )

            seg_pred = predictions[start_idx:end_idx]
            seg_target = targets[start_idx:end_idx]
            seg_accuracy = np.mean(seg_pred == seg_target) if len(seg_pred) > 0 else 0
            segment_accuracies.append(seg_accuracy)

        risk_measure = np.std(segment_accuracies)  # Lower std = lower risk

        # Composite score: balance accuracy, profit, and risk
        composite_score = (
            (accuracy * 0.4) + (profit / 1000 * 0.4) + ((1 - risk_measure) * 0.2)
        )

        return MetricValue(
            aggregate_results={
                "composite_score": composite_score,
                "accuracy_component": accuracy,
                "profit_component": profit,
                "risk_component": risk_measure,
                "segment_consistency": 1 - risk_measure,
            }
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="composite_business"
    )


# Usage
composite_metric = create_composite_business_metric()
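
As with the earlier metrics, the factory's return value plugs straight into extra_metrics. A brief usage sketch, reusing model_uri and eval_data from the Quick Start:

with mlflow.start_run():
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="target",
        model_type="classifier",
        extra_metrics=[composite_metric],
    )

    print(f"Composite Score: {result.metrics['composite_business/composite_score']:.3f}")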

Best Practices and Guidelines

Metric Design Principles

Make Metrics Business-Relevant - Translate technical performance into business impact using domain-specific terminology and thresholds that align with actual business objectives.

Ensure Metric Reliability - Handle edge cases like division by zero and empty datasets, validate calculations with known test cases, and include confidence intervals when appropriate (see the defensive sketch after these principles).

Optimize for Interpretability - Provide multiple aggregation levels, include intermediate calculations for transparency, use descriptive naming, and add context through ratio and percentage metrics.

Documentation and Validation - Document metric assumptions and limitations, test with synthetic data where ground truth is known, and provide clear interpretations of metric values.
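
For example, the reliability principle might look like this in practice. A minimal defensive sketch (the helper name and fallback values are illustrative, not an MLflow API):

def safe_ratio(numerator, denominator, default=0.0):
    """Return numerator/denominator, falling back when the denominator is zero."""
    return numerator / denominator if denominator else default


def robust_eval_fn(predictions, targets, metrics):
    # Empty-input guard: return a neutral MetricValue instead of raising
    if len(predictions) == 0:
        return MetricValue(aggregate_results={"value_per_prediction": 0.0})

    tp = np.sum((predictions == 1) & (targets == 1))
    fp = np.sum((predictions == 1) & (targets == 0))
    business_value = (tp * 100) - (fp * 20)

    return MetricValue(
        aggregate_results={
            "value_per_prediction": safe_ratio(business_value, len(predictions)),
        }
    )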

Visualization Best Practices

  • Target Your Audience: Create technical plots for data scientists and executive dashboards for business stakeholders
  • Make It Interactive: Use interactive visualizations for exploration and static ones for reports (see the sketch after this list)
  • Focus on Insights: Highlight key findings and actionable insights rather than just displaying data
  • Consistent Styling: Use consistent color schemes and formatting across all visualizations
  • Performance Considerations: Optimize large visualizations for rendering speed and file size
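
As one way to apply the "Make It Interactive" guideline, here is a hedged sketch of an interactive custom artifact built with Plotly. The function and artifact names are illustrative assumptions, not part of MLflow's API; the function follows the same (eval_df, builtin_metrics, artifacts_dir) contract as the matplotlib example above:

import os

import plotly.express as px


def create_interactive_prediction_histogram(eval_df, builtin_metrics, artifacts_dir):
    """Save an interactive HTML histogram of model predictions."""
    fig = px.histogram(
        eval_df, x="prediction", nbins=20, title="Prediction Distribution"
    )
    html_path = os.path.join(artifacts_dir, "prediction_distribution.html")
    fig.write_html(html_path)  # Self-contained HTML, explorable in a browser
    return {"interactive_prediction_distribution": html_path}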

Integration Example

Here's a comprehensive example combining custom metrics and visualizations:
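
Note that the example references create_threshold_precision_metric, create_performance_breakdown_visualization, and create_interactive_analysis_artifacts, which are not defined in this section; the two visualization helpers would follow the custom-artifact contract shown earlier. So that the snippet can run end to end, here is one hypothetical sketch of the threshold metric (its name, parameter, and fallback logic are assumptions):

def create_threshold_precision_metric(threshold=0.8):
    """Hypothetical sketch: precision on high-confidence positive predictions."""

    def eval_fn(predictions, targets, metrics):
        preds = np.asarray(predictions)
        targs = np.asarray(targets)
        # With hard 0/1 predictions we fall back to overall positive-class
        # precision; with predicted probabilities you would first keep only
        # rows whose probability exceeds `threshold`.
        positives = preds == 1
        n_pos = np.sum(positives)
        precision = np.sum(targs[positives] == 1) / n_pos if n_pos > 0 else 0.0
        return MetricValue(
            aggregate_results={"high_confidence_precision": precision}
        )

    return make_metric(
        eval_fn=eval_fn, greater_is_better=True, name="threshold_precision"
    )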

def comprehensive_custom_evaluation():
    """Complete example of custom metrics and visualizations."""

    # Define multiple custom metrics
    business_metric = create_profit_loss_metric(cost_per_fp=30, revenue_per_tp=150)
    threshold_metric = create_threshold_precision_metric(threshold=0.8)
    composite_metric = create_composite_business_metric()

    # Define custom visualizations
    custom_artifacts = [
        create_business_impact_visualization,
        create_performance_breakdown_visualization,
        create_interactive_analysis_artifacts,
    ]

    with mlflow.start_run(run_name="Comprehensive_Custom_Evaluation"):
        # Evaluate with all custom components
        result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="target",
            model_type="classifier",
            extra_metrics=[business_metric, threshold_metric, composite_metric],
            custom_artifacts=custom_artifacts,
        )

        # Log additional business context
        mlflow.log_params(
            {
                "evaluation_focus": "business_impact",
                "custom_metrics_count": 3,
                "custom_visualizations_count": len(custom_artifacts),
            }
        )

        print("Custom Evaluation Results:")
        print(f"Profit/Loss: ${result.metrics.get('profit_loss/net_profit', 0):.2f}")
        print(
            f"High Confidence Precision: {result.metrics.get('threshold_precision/high_confidence_precision', 0):.3f}"
        )
        print(
            f"Composite Score: {result.metrics.get('composite_business/composite_score', 0):.3f}"
        )

        print("\nCustom Artifacts Created:")
        for name, path in result.artifacts.items():
            if any(
                keyword in name.lower()
                for keyword in ["business", "performance", "interactive"]
            ):
                print(f" {name}: {path}")


# Run comprehensive evaluation
# comprehensive_custom_evaluation()

Conclusion

Custom metrics and visualizations in MLflow enable you to create evaluation frameworks perfectly tailored to your business needs. By defining domain-specific success criteria and creating targeted visual analysis, you can bridge the gap between technical model performance and business value.

Key benefits include:

  • Business Alignment: Metrics that directly reflect business impact and priorities
  • Stakeholder Communication: Visual dashboards that make model performance accessible
  • Domain Expertise: Evaluation criteria that capture industry-specific requirements
  • Decision Support: Clear insights that drive informed model selection and deployment

Whether you're optimizing for profit, minimizing risk, or meeting regulatory requirements, custom metrics and visualizations ensure your model evaluation process aligns with what truly matters for your use case.