import json

import numpy as np
import pandas as pd
import yaml
from datasets import load_dataset
from sklearn.metrics import cohen_kappa_score

from .envs import TOKEN

# Column datatypes for the rendered leaderboard, in display order: the rank column and the
# four metric columns are numeric, while "Model" is rendered as an HTML link (presumably
# consumed by the UI layer that displays the DataFrame).
TYPES = ["number", "html", "number", "number", "number", "number"]


def read_json(file_path: str) -> list[dict]:
    """
    Read a JSON/JSONL file and return its contents as a list of dictionaries.

    Parameters:
        file_path (str): The path to the JSON file.

    Returns:
        list[dict]: The contents of the JSON file as a list of dictionaries.
    """
    try:
        # Try JSON Lines first: one JSON object per line.
        with open(file_path) as f:
            data = [json.loads(x) for x in f]
        return data
    except json.decoder.JSONDecodeError:
        # Fall back to a regular (possibly multi-line) JSON file.
        with open(file_path) as f:
            data = json.load(f)
        return data
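
# Example usage (hypothetical path; the file may be JSON Lines or plain JSON):
#   records = read_json("./predictions/example_model.jsonl")
#   print(len(records))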


def pairwise_compare(
    evaluator1_responses: list[dict],
    evaluator2_responses: list[dict],
) -> tuple[float, float]:
    """
    Compare the verdicts of two pairwise evaluators.

    Args:
        evaluator1_responses: The responses from the first evaluator.
        evaluator2_responses: The responses from the second evaluator.
    Returns:
        tuple[float, float]: The fraction of items on which the two evaluators pick
            the same winner, and Cohen's kappa between their verdicts.
    """

    assert len(evaluator1_responses) == len(evaluator2_responses)
    evaluator1_winners = np.array([response["winner"] for response in evaluator1_responses])
    evaluator2_winners = np.array([response["winner"] for response in evaluator2_responses])
    acc = (evaluator1_winners == evaluator2_winners).mean().item()
    agreement = cohen_kappa_score(evaluator1_winners, evaluator2_winners)
    return acc, agreement
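
# Worked example (toy data): the evaluators agree on 2 of 3 items, so the accuracy
# is 2/3; Cohen's kappa additionally corrects for chance agreement.
#   e1 = [{"winner": "A"}, {"winner": "B"}, {"winner": "A"}]
#   e2 = [{"winner": "A"}, {"winner": "B"}, {"winner": "B"}]
#   acc, kappa = pairwise_compare(e1, e2)  # acc == 2/3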


def pairwise_meta_eval(
    human_responses: list[dict], model_dir: str, model_dir_swap: str
) -> tuple[float, float, float, float]:
    """
    Evaluate a pairwise evaluator against the human judgments.

    Args:
        human_responses: The responses from the human evaluator.
        model_dir: Path to the JSONL file with the model's responses.
        model_dir_swap: Path to the JSONL file with the model's responses on the
            swapped (order-reversed) inputs.

    Returns:
        tuple[float, float, float, float]: Accuracy and agreement with the human
            judgments (averaged over the original and swapped orderings), followed by
            the model's self-accuracy and self-agreement between the two orderings.
    """
    model_responses = read_json(model_dir)
    model_responses_swap = read_json(model_dir_swap)
    acc, agr = pairwise_compare(human_responses, model_responses)
    swap_acc, swap_agr = pairwise_compare(
        human_responses,
        model_responses_swap,
    )
    acc = (acc + swap_acc) / 2
    agr = (agr + swap_agr) / 2
    models_acc, models_agr = pairwise_compare(
        model_responses,
        model_responses_swap,
    )
    return acc, agr, models_acc, models_agr
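
# Example usage (hypothetical prediction files, mirroring how load_leaderboard
# calls this function):
#   acc, agr, self_acc, self_agr = pairwise_meta_eval(
#       human_responses,
#       "./predictions/example_model.jsonl",
#       "./predictions/example_model_swap.jsonl",
#   )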


def load_leaderboard() -> pd.DataFrame:
    """Build the leaderboard by scoring every model listed in ./data/models.yaml against the human pairwise judgments."""
    with open("./data/models.yaml") as fp:
        models = yaml.safe_load(fp)
    human_responses = load_dataset("salesforce/instrusum", "human_eval_pairwise", token=TOKEN)["data"]
    human_responses = list(human_responses)

    predictions = {k: [] for k in ["Model", "Accuracy", "Agreement", "Self-Accuracy", "Self-Agreement"]}

    for model in models:
        fdir = model["fdir"]
        acc, agr, models_acc, models_agr = pairwise_meta_eval(
            human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
        )
        # Render the model name as an HTML link to its page, styled for the leaderboard UI.
        link = model["url"]
        model_name = model["name"]
        output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
        predictions["Model"].append(output)
        predictions["Accuracy"].append(acc)
        predictions["Agreement"].append(agr)
        predictions["Self-Accuracy"].append(models_acc)
        predictions["Self-Agreement"].append(models_agr)
    df = pd.DataFrame(predictions).sort_values(by="Agreement", ascending=False).round(decimals=3)
    df.reset_index(drop=True, inplace=True)
    # Prepend a 1-based rank column, displayed with an empty header.
    df[' '] = pd.Series(range(1, len(df) + 1))
    columns = [' '] + [col for col in df.columns if col != ' ']
    df = df[columns]
    return df
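

# Minimal manual check (a sketch, not part of the module's API): build the leaderboard
# and print it. This assumes ./data/models.yaml and the ./predictions/*.jsonl files it
# references exist locally, and that TOKEN grants access to the salesforce/instrusum
# dataset on the Hugging Face Hub.
if __name__ == "__main__":
    leaderboard = load_leaderboard()
    print(leaderboard.to_string(index=False))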