
Function Evaluation

Function evaluation allows you to assess Python functions directly without the overhead of logging models to MLflow. This lightweight approach is perfect for rapid prototyping, testing custom prediction logic, and evaluating complex business rules that may not fit the traditional model paradigm.

Quick Start: Evaluating a Simple Function​

The most straightforward function evaluation involves a callable that takes data and returns predictions:

import mlflow
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Generate sample data
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train a model (we'll use this in our function)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


# Define a prediction function
def predict_function(input_data):
    """Custom prediction function that can include business logic."""

    # Get base model predictions
    base_predictions = model.predict(input_data)

    # Add custom business logic
    # Example: Override predictions for specific conditions
    feature_sum = input_data.sum(axis=1)
    high_feature_mask = feature_sum > feature_sum.quantile(0.9)

    # Custom rule: high feature sum values are always class 1
    final_predictions = base_predictions.copy()
    final_predictions[high_feature_mask] = 1

    return final_predictions


# Create evaluation dataset
eval_data = pd.DataFrame(X_test)
eval_data["target"] = y_test

with mlflow.start_run():
    # Evaluate function directly - no model logging needed!
    result = mlflow.evaluate(
        predict_function,  # Function to evaluate
        eval_data,  # Evaluation data
        targets="target",  # Target column
        model_type="classifier",  # Task type
    )

    print(f"Function Accuracy: {result.metrics['accuracy_score']:.3f}")
    print(f"Function F1 Score: {result.metrics['f1_score']:.3f}")

This approach is ideal when:

  • You want to test functions quickly without model persistence
  • Your prediction logic includes complex business rules
  • You're prototyping custom algorithms or ensemble methods
  • You need to evaluate preprocessing + prediction pipelines

Advanced Function Patterns​

Evaluate complete data processing and prediction pipelines:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


def complete_pipeline_function(input_data):
    """Function that includes preprocessing, feature engineering, and prediction."""

    def build_features(features, scaler, pca, fit=False):
        # Preprocessing
        scaled = scaler.fit_transform(features) if fit else scaler.transform(features)

        # Feature engineering
        pca_features = pca.fit_transform(scaled) if fit else pca.transform(scaled)

        # Create additional features
        feature_interactions = pca_features[:, 0] * pca_features[:, 1]
        feature_ratios = pca_features[:, 0] / (pca_features[:, 1] + 1e-8)

        # Combine features
        return np.column_stack(
            [
                pca_features,
                feature_interactions.reshape(-1, 1),
                feature_ratios.reshape(-1, 1),
            ]
        )

    # Fit the transformers and the model on the training data, not on the evaluation input
    scaler = StandardScaler()
    pca = PCA(n_components=10)
    train_features = build_features(X_train, scaler, pca, fit=True)

    model = RandomForestClassifier(n_estimators=50, random_state=42)
    model.fit(train_features, y_train)

    # Apply the same transformations to the evaluation input and predict
    input_features = build_features(np.asarray(input_data), scaler, pca)
    predictions = model.predict(input_features)
    return predictions


with mlflow.start_run(run_name="Complete_Pipeline_Function"):
# Log pipeline configuration
mlflow.log_params(
{
"preprocessing": "StandardScaler + PCA",
"pca_components": 10,
"feature_engineering": "interactions + ratios",
"model": "RandomForest",
}
)

result = mlflow.evaluate(
complete_pipeline_function, eval_data, targets="target", model_type="classifier"
)

print(f"Pipeline Function Performance: {result.metrics['accuracy_score']:.3f}")

Function Testing and Validation​

Test functions with different parameter configurations:

def create_parameterized_function(
    threshold=0.5, use_scaling=True, feature_selection=None
):
    """Factory function that creates prediction functions with different parameters."""

    def parameterized_prediction_function(input_data):
        processed_data = np.asarray(input_data)
        train_data = X_train

        # Optional scaling (fit the scaler on training data, apply to both)
        if use_scaling:
            from sklearn.preprocessing import StandardScaler

            scaler = StandardScaler()
            train_data = scaler.fit_transform(train_data)
            processed_data = scaler.transform(processed_data)

        # Optional feature selection (keep the first n features)
        if feature_selection:
            n_features = min(feature_selection, processed_data.shape[1])
            train_data = train_data[:, :n_features]
            processed_data = processed_data[:, :n_features]

        # Fit the model on the (transformed) training data
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(train_data, y_train)

        # Apply the custom decision threshold to positive-class probabilities
        probabilities = model.predict_proba(processed_data)[:, 1]
        predictions = (probabilities > threshold).astype(int)

        return predictions

    return parameterized_prediction_function


# Test different parameter combinations
parameter_configs = [
    {"threshold": 0.3, "use_scaling": True, "feature_selection": None},
    {"threshold": 0.5, "use_scaling": True, "feature_selection": 10},
    {"threshold": 0.7, "use_scaling": False, "feature_selection": None},
    {"threshold": 0.5, "use_scaling": False, "feature_selection": 15},
]

results = {}

for i, config in enumerate(parameter_configs):
    with mlflow.start_run(run_name=f"Param_Config_{i+1}"):
        # Log configuration
        mlflow.log_params(config)

        # Create and evaluate function
        param_function = create_parameterized_function(**config)

        result = mlflow.evaluate(
            param_function, eval_data, targets="target", model_type="classifier"
        )

        results[f"config_{i+1}"] = {
            "config": config,
            "accuracy": result.metrics["accuracy_score"],
            "f1_score": result.metrics["f1_score"],
        }

        print(f"Config {i+1}: Accuracy = {result.metrics['accuracy_score']:.3f}")

# Find best configuration
best_config = max(results.keys(), key=lambda k: results[k]["accuracy"])
print(f"Best configuration: {results[best_config]['config']}")
print(f"Best accuracy: {results[best_config]['accuracy']:.3f}")

Key Use Cases and Benefits​

Function evaluation in MLflow is particularly valuable for several scenarios:

Rapid Prototyping - Perfect for testing prediction logic immediately without the overhead of model persistence. Developers can iterate quickly on algorithms and see results instantly.

Custom Business Logic - Ideal for evaluating functions that combine machine learning predictions with domain-specific business rules, regulatory requirements, or operational constraints.

Ensemble Methods - Excellent for testing custom ensemble approaches that may not fit standard model frameworks, including dynamic voting strategies and confidence-based combinations.
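As a sketch of what such a function might look like, the snippet below wraps a confidence-weighted blend of two classifiers in a single callable and evaluates it directly; it reuses X_train, y_train, and eval_data from the quick start, and the model choices and weighting rule are illustrative rather than a prescribed approach:

from sklearn.linear_model import LogisticRegression

# Two base models trained on the same split (illustrative choices)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)


def ensemble_function(input_data):
    """Confidence-weighted ensemble: blend positive-class probabilities, then threshold."""
    rf_proba = rf_model.predict_proba(input_data)[:, 1]
    lr_proba = lr_model.predict_proba(input_data)[:, 1]

    # Weight each model by its confidence (distance of its probability from 0.5)
    rf_weight = np.abs(rf_proba - 0.5) + 1e-8
    lr_weight = np.abs(lr_proba - 0.5) + 1e-8
    blended = (rf_weight * rf_proba + lr_weight * lr_proba) / (rf_weight + lr_weight)

    return (blended > 0.5).astype(int)


with mlflow.start_run(run_name="Confidence_Ensemble_Function"):
    ensemble_result = mlflow.evaluate(
        ensemble_function, eval_data, targets="target", model_type="classifier"
    )
    print(f"Ensemble Accuracy: {ensemble_result.metrics['accuracy_score']:.3f}")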

Pipeline Development - Enables evaluation of complete data processing pipelines that include preprocessing, feature engineering, and prediction in a single function.

Best Practices​

When using function evaluation, consider these best practices (a combined sketch follows the list):

  • Stateless Functions: Design functions to be stateless when possible to ensure reproducible results
  • Parameter Logging: Always log function parameters and configuration for reproducibility
  • Input Validation: Include input validation and error handling in production functions
  • Performance Monitoring: Track execution time and resource usage for production functions
  • Version Control: Use MLflow's tagging and naming conventions to track function versions
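A minimal sketch that combines several of these practices: a stateless wrapper around the quick-start model that validates its input, with configuration, execution time, and a version tag logged alongside the evaluation (the helper and parameter names are illustrative):

import time

EXPECTED_N_FEATURES = X_train.shape[1]  # illustrative input contract


def validated_prediction_function(input_data):
    """Stateless wrapper: validates the input shape, then delegates to the trained model."""
    features = np.asarray(input_data)
    if features.ndim != 2 or features.shape[1] != EXPECTED_N_FEATURES:
        raise ValueError(
            f"Expected 2D input with {EXPECTED_N_FEATURES} features, got shape {features.shape}"
        )
    return model.predict(features)


with mlflow.start_run(run_name="Validated_Function"):
    # Parameter logging: record the function's configuration for reproducibility
    mlflow.log_params({"expected_n_features": EXPECTED_N_FEATURES, "wrapper": "validated_v1"})

    # Performance monitoring: track how long evaluating the function takes
    start_time = time.time()
    validated_result = mlflow.evaluate(
        validated_prediction_function, eval_data, targets="target", model_type="classifier"
    )
    mlflow.log_metric("evaluation_seconds", time.time() - start_time)

    # Version control: tag the run so this function version is easy to find later
    mlflow.set_tag("prediction_function_version", "v1")

    print(f"Validated Function Accuracy: {validated_result.metrics['accuracy_score']:.3f}")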

Conclusion​

Function evaluation in MLflow provides a lightweight, flexible approach to assessing prediction functions without the overhead of model persistence. This capability is essential for rapid prototyping, testing complex business logic, and evaluating custom algorithms that don't fit traditional model paradigms.

Key advantages of function evaluation include:

  • Rapid Prototyping: Test prediction logic immediately without model logging
  • Business Logic Integration: Evaluate functions that combine ML with business rules
  • Custom Algorithms: Assess non-standard prediction approaches and ensemble methods
  • Development Workflow: Seamlessly integrate with iterative development processes
  • Performance Testing: Benchmark and optimize function performance

Whether you're developing custom ensemble methods, integrating business rules with ML predictions, or rapidly prototyping new approaches, MLflow's function evaluation provides the flexibility and insights needed for effective model development and testing.

The lightweight nature of function evaluation makes it perfect for exploratory analysis, A/B testing different approaches, and validating custom prediction logic before committing to full model deployment workflows.