import pandas as pd
from tqdm import tqdm

def execute_unit_tests(input_df, output_list, system_prompt):
    """Run the unit tests for each question in input_df and append the results to output_list."""
    for _, row in tqdm(input_df.iterrows(), total=len(input_df)):
        # Generate a response for the question and check that it matches the expected schema
        model_response = get_response(system_prompt, row['question'])
        format_valid = test_valid_schema(model_response)
        try:
            test_query = LLMResponse.model_validate_json(model_response)
            # Avoid logging since we're executing many rows at once
            sql_valid = test_llm_sql(test_query, should_log=False)
        except Exception:
            sql_valid = False
        output_list.append((row['question'], model_response, format_valid, sql_valid))

def evaluate_row(row):
    """Categorize a unit test result.
    Returns a failure label when the format or SQL check fails; otherwise the SQL is correct."""
if row['format'] is False:
return 'Format incorrect'
elif row['sql'] is False:
return 'SQL incorrect'
else:
return 'SQL correct'

def test_system_prompt(test_df, system_prompt):
    """Run the unit tests for a system prompt and return a dataframe of scored results."""
    # Execute unit tests and capture results
    results = []
    execute_unit_tests(
        input_df=test_df,
        output_list=results,
        system_prompt=system_prompt
    )
    results_df = pd.DataFrame(results, columns=['question', 'response', 'format', 'sql'])
# Use `apply` to calculate the geval score and unit test evaluation
# for each generated response
results_df['evaluation_score'] = results_df.apply(
lambda x: get_geval_score(
RELEVANCY_SCORE_CRITERIA,
RELEVANCY_SCORE_STEPS,
x['question'],
x['response'],
'relevancy'
),
axis=1
)
    results_df['unit_test_evaluation'] = results_df.apply(evaluate_row, axis=1)
return results_df
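
# A minimal usage sketch, assuming `test_df` (a dataframe with a 'question' column)
# and `system_prompt` were defined earlier in the notebook; the variable names below
# are illustrative only:
#
#   results_df = test_system_prompt(test_df, system_prompt)
#   results_df['unit_test_evaluation'].value_counts()   # breakdown of format/SQL failures vs. correct
#   results_df['evaluation_score'].mean()               # average relevancy score from get_geval_score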