import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
class Evaluator:
    """Score model-generated answers against gold answers in a Q&A dataframe.

    Expects ``df`` to contain the columns:
      - ``answers``: list of acceptable gold answer strings for the row
      - ``is_impossible``: True when the correct response is "I don't know"
      - one text column per model with its generated answer
        (e.g. ``generated_answer``, ``ft_generated_answer``)
    """

    def __init__(self, df):
        self.df = df
        # Per-row outcome labels, filled in by evaluate_model().
        # BUG FIX: pd.Series() with no dtype is deprecated; be explicit.
        self.y_pred = pd.Series(dtype=object)
        # Possible outcomes when a real answer is expected.
        self.labels_answer_expected = ["✅ Answered Correctly", "❎ Skipped", "❌ Wrong Answer"]
        # Possible outcomes when "I don't know" is expected.
        self.labels_idk_expected = ["❌ Hallucination", "✅ I don't know"]

    def _evaluate_answer_expected(self, row, answers_column):
        """Label a row where a real answer is expected.

        Case-insensitive substring match: any gold answer appearing inside
        the generated text counts as correct.
        """
        generated_answer = row[answers_column].lower()
        actual_answers = [ans.lower() for ans in row["answers"]]
        return (
            "✅ Answered Correctly" if any(ans in generated_answer for ans in actual_answers)
            else "❎ Skipped" if generated_answer == "i don't know"
            else "❌ Wrong Answer"
        )

    def _evaluate_idk_expected(self, row, answers_column):
        """Label a row where the model should abstain ("I don't know")."""
        generated_answer = row[answers_column].lower()
        return (
            "❌ Hallucination" if generated_answer != "i don't know"
            else "✅ I don't know"
        )

    def _evaluate_single_row(self, row, answers_column):
        """Dispatch a row to the appropriate scorer based on ``is_impossible``."""
        is_impossible = row["is_impossible"]
        return (
            self._evaluate_answer_expected(row, answers_column) if not is_impossible
            else self._evaluate_idk_expected(row, answers_column)
        )

    def evaluate_model(self, answers_column="generated_answer"):
        """Score every row and return per-scenario percentage breakdowns.

        Parameters
        ----------
        answers_column : str
            Name of the dataframe column holding the model's generated answers.

        Returns
        -------
        tuple[dict, dict]
            ``(answer_expected, idk_expected)`` — each maps an outcome label
            to the percentage of rows in that scenario (rounded to 2 dp;
            labels with no rows are filled with 0).
        """
        # DataFrame.apply(axis=1) already returns a Series; no extra wrap needed.
        self.y_pred = self.df.apply(self._evaluate_single_row, answers_column=answers_column, axis=1)
        freq_series = self.y_pred.value_counts()
        # Row counts per scenario — the denominators for the percentages.
        total_answer_expected = len(self.df[self.df['is_impossible'] == False])
        total_idk_expected = len(self.df[self.df['is_impossible'] == True])
        # Reindexing to the scenario's own labels drops the other scenario's
        # counts and fills missing outcomes with 0.
        freq_answer_expected = (freq_series / total_answer_expected * 100).round(2).reindex(self.labels_answer_expected, fill_value=0)
        freq_idk_expected = (freq_series / total_idk_expected * 100).round(2).reindex(self.labels_idk_expected, fill_value=0)
        return freq_answer_expected.to_dict(), freq_idk_expected.to_dict()

    def print_eval(self):
        """Print side-by-side baseline vs fine-tuned percentage tables."""
        answer_columns = ["generated_answer", "ft_generated_answer"]
        baseline_correctness, baseline_idk = self.evaluate_model(answer_columns[0])
        # BUG FIX: was self.evaluate_model(self.df, answer_columns[1]) — that
        # passed the dataframe as answers_column and raised TypeError.
        ft_correctness, ft_idk = self.evaluate_model(answer_columns[1])
        print("When the model should answer correctly:")
        # BUG FIX: evaluate_model returns plain dicts, which have no .rename;
        # wrap them in named Series so pd.merge can align on the outcome labels.
        eval_df = pd.merge(
            pd.Series(baseline_correctness, name="Baseline"),
            pd.Series(ft_correctness, name="Fine-Tuned"),
            left_index=True,
            right_index=True,
        )
        print(eval_df)
        print("\n\n\nWhen the model should say 'I don't know':")
        eval_df = pd.merge(
            pd.Series(baseline_idk, name="Baseline"),
            pd.Series(ft_idk, name="Fine-Tuned"),
            left_index=True,
            right_index=True,
        )
        print(eval_df)

    def plot_model_comparison(self, answer_columns=["generated_answer", "ft_generated_answer"], scenario="answer_expected", nice_names=["Baseline", "Fine-Tuned"]):
        """Horizontal bar chart comparing outcome percentages across models.

        Parameters
        ----------
        answer_columns : list[str]
            Dataframe columns to evaluate, one per model.
        scenario : str
            Either "answer_expected" or "idk_expected".
        nice_names : list[str]
            Display names for the models, parallel to ``answer_columns``.

        Raises
        ------
        ValueError
            If ``scenario`` is not one of the two recognized values.
        """
        results = []
        for col in answer_columns:
            answer_expected, idk_expected = self.evaluate_model(col)
            if scenario == "answer_expected":
                results.append(answer_expected)
            elif scenario == "idk_expected":
                results.append(idk_expected)
            else:
                raise ValueError("Invalid scenario")
        results_df = pd.DataFrame(results, index=nice_names)
        # Fix column order so bar/legend order is stable across runs.
        if scenario == "answer_expected":
            results_df = results_df.reindex(self.labels_answer_expected, axis=1)
        elif scenario == "idk_expected":
            results_df = results_df.reindex(self.labels_idk_expected, axis=1)
        melted_df = results_df.reset_index().melt(id_vars='index', var_name='Status', value_name='Frequency')
        sns.set_theme(style="whitegrid", palette="icefire")
        g = sns.catplot(data=melted_df, x='Frequency', y='index', hue='Status', kind='bar', height=5, aspect=2)
        # Annotate each bar with its percentage, nudged right of the bar end.
        for p in g.ax.patches:
            g.ax.annotate(f"{p.get_width():.0f}%", (p.get_width() + 5, p.get_y() + p.get_height() / 2),
                          textcoords="offset points",
                          xytext=(0, 0),
                          ha='center', va='center')
        plt.ylabel("Model")
        plt.xlabel("Percentage")
        plt.xlim(0, 100)
        plt.tight_layout()
        plt.title(scenario.replace("_", " ").title())
        plt.show()
# Compare the results by merging into one dataframe
# NOTE(review): `df` is not defined in this chunk — presumably built earlier
# in the file (expects `answers`, `is_impossible`, and generated-answer
# columns); confirm against the preceding cells/lines.
evaluator = Evaluator(df)
# Example usage, left commented out so importing/running this file has no
# side effects beyond constructing the evaluator:
# evaluator.evaluate_model(answers_column="ft_generated_answer")
# evaluator.plot_model_comparison(["generated_answer", "ft_generated_answer"], scenario="answer_expected", nice_names=["Baseline", "Fine-Tuned"])