Spaces:
Runtime error
Runtime error
File size: 4,031 Bytes
234bc89 b558422 969c59e b7dfde8 b558422 9bd8edd b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 969c59e b558422 97950bf 969c59e b558422 969c59e b558422 9bd8edd b558422 07c333a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import json
import pandas as pd
import yaml
from sklearn.metrics import cohen_kappa_score
import numpy as np
from datasets import load_dataset
from .envs import TOKEN
# Display type for each leaderboard column, in column order.
# NOTE(review): presumably lines up with the six columns returned by
# load_leaderboard() (rank, HTML model link, then four numeric metrics) and is
# consumed by the UI table component -- confirm against the caller.
TYPES = ["number", "html", "number", "number", "number", "number"]
def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON/JSONL file and return its contents as a list of dictionaries.

    The file is first treated as JSONL (one JSON value per non-blank line).
    If any line is not valid JSON on its own, the whole file is parsed as a
    single JSON document instead and returned as-is.

    Parameters:
        file_path (str): The path to the JSON or JSONL file.
    Returns:
        list[dict]: The contents of the file as a list of dictionaries.
    Raises:
        json.JSONDecodeError: If the content is neither valid JSONL nor a
            valid JSON document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(file_path, encoding="utf-8") as f:
        text = f.read()

    try:
        # Skip blank lines: a trailing empty line must not break JSONL parsing.
        records = [json.loads(line) for line in text.split("\n") if line.strip()]
    except json.JSONDecodeError:
        # Not line-delimited (e.g. pretty-printed JSON): parse the whole file.
        return json.loads(text)

    # A compact JSON array written on one line is a whole document, not a
    # single JSONL record; return it directly instead of wrapping it again.
    if len(records) == 1 and isinstance(records[0], list):
        return records[0]
    return records
def pairwise_compare(
    evaluator1_responses: list[dict],
    evaluator2_responses: list[dict],
) -> tuple[float, float]:
    """
    Measure how closely two evaluators agree on pairwise winners.

    Args:
        evaluator1_responses: The responses from the first evaluator; each
            item must carry a "winner" entry.
        evaluator2_responses: The responses from the second evaluator, in the
            same order and of the same length as the first.
    Returns:
        tuple[float, float]: The fraction of items on which both evaluators
            chose the same winner, and the Cohen's kappa score between the
            two sets of winners.
    """
    assert len(evaluator1_responses) == len(evaluator2_responses)

    # Pull the "winner" labels out of both response lists, first evaluator first.
    first_winners, second_winners = (
        np.array([item["winner"] for item in responses])
        for responses in (evaluator1_responses, evaluator2_responses)
    )

    matches = first_winners == second_winners
    return matches.mean().item(), cohen_kappa_score(first_winners, second_winners)
def pairwise_meta_eval(
    human_responses: list[dict], model_dir: str, model_dir_swap: str
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against human judgements.

    Args:
        human_responses: The responses from the human evaluator.
        model_dir: Path of the JSON/JSONL file containing the model responses
            (despite the name, it is read as a file, not a directory).
        model_dir_swap: Path of the JSON/JSONL file containing the model
            responses with swapped inputs.
    Returns:
        tuple[float, float, float, float]: In order:
            - accuracy against the human responses, averaged over the
              original and swapped runs;
            - agreement against the human responses, averaged the same way;
            - accuracy between the original and swapped model runs;
            - agreement between the original and swapped model runs.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)

    # Human vs. model, once per input ordering.
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(human_responses, model_responses_swap)

    # Model vs. itself: how much the verdict changes when the inputs are swapped.
    models_acc, models_agr = pairwise_compare(model_responses, model_responses_swap)

    return (acc + swap_acc) / 2, (agr + swap_agr) / 2, models_acc, models_agr
def load_leaderboard() -> pd.DataFrame:
    """Build the leaderboard table from the models file and stored predictions.

    Reads the model list from ./data/models.yaml, scores each model's
    prediction files against the human pairwise evaluation data, and returns
    a frame sorted by "Agreement" (descending), rounded to three decimals,
    with a leading " " column holding the 1-based position.
    """
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)

    dataset = load_dataset("salesforce/instrusum", "human_eval_pairwise", token=TOKEN)
    human_responses = list(dataset["data"])

    header = ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]
    rows = []
    for model in models:
        fdir = model["fdir"]
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses,
            f"./predictions/{fdir}.jsonl",
            f"./predictions/{fdir}_swap.jsonl",
        )
        link = model['url']
        model_name = model['name']
        # The model cell is rendered as an HTML link to the model's page.
        anchor = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
        rows.append([anchor, acc, agr, models_acc, models_agr])

    df = pd.DataFrame(rows, columns=header)
    df = df.sort_values(by="Agreement", ascending=False).round(decimals=3)
    df = df.reset_index(drop=True)
    # Put the position column first; the index was just reset, so it lines up.
    df.insert(0, ' ', pd.Series(range(1, len(df) + 1)))
    return df
|