An mlflow.pyfunc.PyFuncModel instance or a URI pointing to a logged MLflow model.
A Python function that takes string inputs and returns a single string.
An MLflow Deployments endpoint URI.
Set model=None and include the model outputs in the evaluation data, as sketched below.
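For that last option, here is a minimal sketch of evaluating a static dataset: no model is called, and the outputs are read from a column of the DataFrame. The column names below ("outputs", "question", "ground_truth") are illustrative, not required names.

import mlflow
import pandas as pd

# Pre-computed model outputs live directly in the evaluation DataFrame
static_data = pd.DataFrame({
    "question": ["What is MLflow?"],
    "outputs": ["MLflow is an open-source platform for managing the ML lifecycle."],
    "ground_truth": ["MLflow is an open-source platform for the machine learning lifecycle."],
})

results = mlflow.evaluate(
    model=None,                      # nothing to call; outputs are already in the data
    data=static_data,
    predictions="outputs",           # column holding the pre-computed model outputs
    targets="ground_truth",
    model_type="question-answering",
)
print(results.metrics)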
Let's look at an example that evaluates a logged MLflow model:
import mlflow
import openai
import pandas as pd

with mlflow.start_run():
    system_prompt = "Answer the following question concisely."
    logged_model_info = mlflow.openai.log_model(
        model="gpt-3.5-turbo",
        task=openai.chat.completions,
        artifact_path="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
    )

    # Prepare evaluation data
    eval_data = pd.DataFrame({
        "question": ["What is machine learning?", "Explain neural networks."],
        "ground_truth": [
            "Machine learning is a subset of AI that enables systems to learn and improve from experience without explicit programming.",
            "Neural networks are computing systems inspired by biological neural networks, consisting of interconnected nodes that process and transmit information.",
        ],
    })

    # Evaluate the model
    results = mlflow.evaluate(
        logged_model_info.model_uri,
        eval_data,
        targets="ground_truth",
        model_type="question-answering",
    )
    print(f"Evaluation metrics: {results.metrics}")
You can also define a custom LLM-judged metric with make_genai_metric:

from mlflow.metrics.genai import EvaluationExample, make_genai_metric

professionalism = make_genai_metric(
    name="professionalism",
    definition="Measure of formal and appropriate communication style.",
    grading_prompt=(
        "Score the professionalism of the answer on a scale of 0-4:\n"
        "0: Extremely casual or inappropriate\n"
        "1: Casual but respectful\n"
        "2: Moderately formal\n"
        "3: Professional and appropriate\n"
        "4: Highly formal and expertly crafted"
    ),
    examples=[
        EvaluationExample(
            input="What is MLflow?",
            output="MLflow is like your friendly neighborhood toolkit for managing ML projects. It's super cool!",
            score=1,
            justification="The response is casual and uses informal language.",
        ),
        EvaluationExample(
            input="What is MLflow?",
            output="MLflow is an open-source platform for the machine learning lifecycle, including experimentation, reproducibility, and deployment.",
            score=4,
            justification="The response is formal, concise, and professionally worded.",
        ),
    ],
    model="openai:/gpt-3.5-turbo-16k",
    parameters={"temperature": 0.0},
    aggregations=["mean", "variance"],
    greater_is_better=True,
)
# Use the custom metric in evaluation
results = mlflow.evaluate(
    logged_model_info.model_uri,
    eval_data,
    targets="ground_truth",
    model_type="question-answering",
    extra_metrics=[professionalism],
)
# The aggregated professionalism scores (mean and variance) appear in results.metrics
print(f"Evaluation metrics: {results.metrics}")
To compare a metric across runs, you can pull each run's metric history with the MlflowClient, plot the histories, and log the figure as an artifact:

import matplotlib.pyplot as plt
import mlflow
from mlflow.tracking import MlflowClient

def plot_metric_comparison(metric_name, run_ids):
    client = MlflowClient()
    plt.figure(figsize=(10, 6))
    for run_id in run_ids:
        run = client.get_run(run_id)
        metric_values = client.get_metric_history(run_id, metric_name)
        plt.plot(
            [m.step for m in metric_values],
            [m.value for m in metric_values],
            label=run.data.tags.get("mlflow.runName", run_id),
        )
    plt.title(f"Comparison of {metric_name}")
    plt.xlabel("Step")
    plt.ylabel(metric_name)
    plt.legend()

    # Save and log the plot
    plt.savefig(f"{metric_name}_comparison.png")
    mlflow.log_artifact(f"{metric_name}_comparison.png")

# Usage
with mlflow.start_run():
    plot_metric_comparison("answer_relevance", ["run_id_1", "run_id_2", "run_id_3"])