import json

import numpy as np
import pandas as pd
import yaml
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score

from .envs import TOKEN

# Column datatypes for the rendered leaderboard, in display order: the rank column and the
# four metric columns are numeric, while "Model" is rendered as an HTML link (presumably
# consumed by the UI layer that displays the DataFrame).
TYPES = ["number", "html", "number", "number", "number", "number"]


def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON/JSONL file and return its contents as a list of dictionaries.

    Parameters:
        file_path (str): The path to the JSON file.

    Returns:
        list[dict]: The contents of the JSON file as a list of dictionaries.
    """
    try:
        # Try JSON Lines first: one JSON object per line.
        with open(file_path) as f:
            data = [json.loads(x) for x in f]
        return data
    except json.decoder.JSONDecodeError:
        # Fall back to a regular (possibly multi-line) JSON file.
        with open(file_path) as f:
            data = json.load(f)
        return data
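
# Example usage (hypothetical path; the file may be JSON Lines or plain JSON):
#   records = read_json("./predictions/example_model.jsonl")
#   print(len(records))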


def pairwise_compare(
    evaluator1_responses: list[dict],
    evaluator2_responses: list[dict],
) -> tuple[float, float]:
    """
    Compare the verdicts of two pairwise evaluators.

    Args:
        evaluator1_responses: The responses from the first evaluator.
        evaluator2_responses: The responses from the second evaluator.
    Returns:
        tuple[float, float]: The fraction of items on which the two evaluators pick
            the same winner, and Cohen's kappa between their verdicts.
    """

    assert len(evaluator1_responses) == len(evaluator2_responses)
    evaluator1_winners = np.array([response["winner"] for response in evaluator1_responses])
    evaluator2_winners = np.array([response["winner"] for response in evaluator2_responses])
    acc = (evaluator1_winners == evaluator2_winners).mean().item()
    agreement = cohen_kappa_score(evaluator1_winners, evaluator2_winners)
    return acc, agreement
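
# Worked example (toy data): the evaluators agree on 2 of 3 items, so the accuracy
# is 2/3; Cohen's kappa additionally corrects for chance agreement.
#   e1 = [{"winner": "A"}, {"winner": "B"}, {"winner": "A"}]
#   e2 = [{"winner": "A"}, {"winner": "B"}, {"winner": "B"}]
#   acc, kappa = pairwise_compare(e1, e2)  # acc == 2/3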


def pairwise_meta_eval(
    human_responses: list[dict], model_dir: str, model_dir_swap: str
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against the human judgments.

    Args:
        human_responses: The responses from the human evaluator.
        model_dir: Path to the JSONL file with the model's responses.
        model_dir_swap: Path to the JSONL file with the model's responses on the
            swapped (order-reversed) inputs.

    Returns:
        tuple[float, float, float, float]: Accuracy and agreement with the human
            judgments (averaged over the original and swapped orderings), followed by
            the model's self-accuracy and self-agreement between the two orderings.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(
        human_responses,
        model_responses_swap,
    )
    acc = (acc + swap_acc) / 2
    agr = (agr + swap_agr) / 2
    models_acc, models_agr = pairwise_compare(
        model_responses,
        model_responses_swap,
    )
    return acc, agr, models_acc, models_agr
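
# Example usage (hypothetical prediction files, mirroring how load_leaderboard
# calls this function):
#   acc, agr, self_acc, self_agr = pairwise_meta_eval(
#       human_responses,
#       "./predictions/example_model.jsonl",
#       "./predictions/example_model_swap.jsonl",
#   )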


def load_leaderboard() -> pd.DataFrame:
    """Build the leaderboard by scoring every model listed in ./data/models.yaml against the human pairwise judgments."""
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)
    human_responses = load_dataset("salesforce/instrusum", "human_eval_pairwise", token=TOKEN)["data"]
    human_responses = list(human_responses)

    predictions = {k: [] for k in ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]}

    for model in models:
        fdir = model["fdir"]
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
        )
        # Render the model name as an HTML link to its page, styled for the leaderboard UI.
        link = model["url"]
        model_name = model["name"]
        output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
        predictions["Model"].append(output)
        predictions["Accuracy"].append(acc)
        predictions["Agreement"].append(agr)
        predictions["Self-Accuracy"].append(models_acc)
        predictions["Self-Agreement"].append(models_agr)
    df = pd.DataFrame(predictions).sort_values(by="Agreement", ascending=False).round(decimals=3)
    df.reset_index(drop=True, inplace=True)
    # Prepend a 1-based rank column, displayed with an empty header.
    df[' '] = pd.Series(range(1, len(df) + 1))
    columns = [' '] + [col for col in df.columns if col != ' ']
    df = df[columns]
    return df
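

# Minimal manual check (a sketch, not part of the module's API): build the leaderboard
# and print it. This assumes ./data/models.yaml and the ./predictions/*.jsonl files it
# references exist locally, and that TOKEN grants access to the salesforce/instrusum
# dataset on the Hugging Face Hub.
if __name__ == "__main__":
    leaderboard = load_leaderboard()
    print(leaderboard.to_string(index=False))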