# polish_eq-bench / script.py
# djstrong's picture
# generate static page leaderboard
# 235501b
import json
import re
from html import escape

import pandas as pd
# Load the CSV file.
# Rows are split by hand with a capped number of splits, so any commas that
# appear inside the last column stay part of that column.
leaderboard_df = []
with open("benchmark_results.csv", "r", encoding="utf-8") as f:
    header = [h.strip() for h in f.readline().strip().split(",")]
    # One split fewer than the number of header columns yields at most
    # len(header) fields per row.
    max_splits = len(header) - 1
    for line in f:
        leaderboard_df.append(line.strip().split(",", max_splits))
# Load metadata (the context manager closes the file handle once parsed).
with open("metadata.json", encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)
# Additionally register every entry under the part of its key that precedes
# the first comma. Iterate over a snapshot of the items because the dict is
# extended inside the loop.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v
# Create DataFrame from the raw rows, using the CSV header as column names.
leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
# Filter and process DataFrame: keep only rows whose benchmark version is one
# of the two accepted values, and only the columns used further down.
# .copy() makes the result an independent frame, so the column assignments
# that follow do not raise SettingWithCopyWarning.
is_accepted_version = leaderboard_df["Benchmark Version"].isin(
    ["eq-bench_v2_pl", "eq-bench_pl"]
)
leaderboard_df = leaderboard_df.loc[
    is_accepted_version,
    ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"],
].copy()
def parse_parseable(x):
    """Return the parseable-question count for one result row.

    Args:
        x: A mapping (e.g. a DataFrame row) with the keys
            "Num Questions Parseable" and "Error".

    Returns:
        The value of "Num Questions Parseable" unchanged, unless it is the
        marker 'FAILED'. In that case the integer part of the count is taken
        from an error text that starts with
        "<N>.0 questions were parseable" and returned as a string. If the
        error text is missing or does not start with that phrase, "0" is
        returned instead of raising.
    """
    if x["Num Questions Parseable"] != 'FAILED':
        return x["Num Questions Parseable"]
    # `or ""` guards against a missing (None) error value.
    m = re.match(r'(\d+)\.0 questions were parseable', x["Error"] or "")
    if m is None:
        # The run failed for a reason that does not report a count.
        return "0"
    return m.group(1)
# Run parse_parseable over every row (only the two columns it reads are
# passed in) and store the result back into the count column.
parseable_inputs = leaderboard_df[["Num Questions Parseable", "Error"]]
leaderboard_df["Num Questions Parseable"] = parseable_inputs.apply(
    lambda result_row: parse_parseable(result_row),
    axis=1,
)

# Denominator used when the parseable counts are converted to percentages.
NUMBER_OF_QUESTIONS = 171.0
def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator`` expressed as a percentage."""
    ratio = numerator / denominator
    return ratio * 100
# Convert each parseable count (still text at this point) into a percentage
# of NUMBER_OF_QUESTIONS.
parseable_counts = leaderboard_df["Num Questions Parseable"]
leaderboard_df["Num Questions Parseable"] = parseable_counts.apply(
    lambda count: fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
)
def get_params(model_name):
    """Look up ``model_name`` in the module-level ``metadata`` mapping.

    Returns the stored value when the name is present. Otherwise the name is
    printed, so missing entries show up in the script output, and None is
    returned.
    """
    if model_name not in metadata:
        print(model_name)
        return None
    return metadata[model_name]
# Attach the metadata value for every model (None when the model is unknown).
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

# Convert the score text to floats; 'FAILED' and any other non-numeric text
# becomes NaN. pd.to_numeric is used rather than .replace('FAILED', None)
# because the meaning of a None replacement value differs between pandas
# versions (older releases fill from the previous row instead).
raw_scores = pd.to_numeric(leaderboard_df["Benchmark Score"], errors="coerce").astype(float)

# Scale each score by the fraction of questions that were parseable.
parseable_fraction = leaderboard_df["Num Questions Parseable"].astype(float) / 100
leaderboard_df["Benchmark Score"] = raw_scores * parseable_fraction

# Negative scores are clamped to zero.
leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

# Best score first; ties are ordered by the parseable percentage.
leaderboard_df = leaderboard_df.sort_values(
    by=["Benchmark Score", "Num Questions Parseable"],
    ascending=[False, False],
)
leaderboard_df = leaderboard_df.rename(
    columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    }
)
# Generate HTML with DataTables.
# This is the static head of the page: styles, a small jQuery "colorize"
# plugin that shades cells between three theme colours, the DataTables setup
# (semicolon-separated OR search on the first column, NaN-safe score sorting,
# per-column colour ranges) and the table header. Table rows and the closing
# tags are appended to this string afterwards.
# In colorize, a zero value spread (all values in a column equal) is mapped
# to ratio 0 so the cell gets the mid colour instead of an invalid NaN colour.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Leaderboard</title>
<link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
<style>
body {
font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
margin: 0;
padding: 20px;
color: #333;
background-color: #fff;
}
.numeric-cell {
text-align: right;
padding: 8px !important;
}
</style>
<script>
(function($) {
$.fn.colorize = function(oOptions) {
var settings = $.extend({
parse: function(e) {
return parseFloat(e.html());
},
min: undefined,
max: undefined,
readable: true,
themes: {
"default": {
color_min: "#C80000",
color_mid: "#FFFFFF",
color_max: "#10A54A"
}
},
theme: "default",
center: undefined,
percent: false
}, oOptions);
function getColor(color1, color2, ratio) {
var hex = function(x) {
x = x.toString(16);
return (x.length == 1) ? '0' + x : x;
}
color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
}
function getContrastYIQ(hexcolor) {
var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
var r = parseInt(hex.substr(0,2),16);
var g = parseInt(hex.substr(2,2),16);
var b = parseInt(hex.substr(4,2),16);
var yiq = ((r*299)+(g*587)+(b*114))/1000;
return (yiq >= 128) ? 'black' : 'white';
}
var min = settings.min;
var max = settings.max;
if (min === undefined || max === undefined) {
min = Infinity;
max = -Infinity;
this.each(function() {
var value = parseFloat(settings.parse($(this)));
if (!isNaN(value) && isFinite(value)) {
min = Math.min(min, value);
max = Math.max(max, value);
}
});
}
var center = settings.center !== undefined ? settings.center : (max + min) / 2;
var adj = Math.max(Math.abs(max - center), Math.abs(center - min));
this.each(function() {
var value = parseFloat(settings.parse($(this)));
if (isNaN(value) || !isFinite(value)) return;
// When every value is identical the spread is 0; use the mid colour instead of dividing by zero
var ratio = (adj === 0) ? 0 : (value - center) / adj;
var color1, color2;
if (value < center) {
ratio = Math.abs(ratio);
if (ratio > 1) ratio = 1;
color1 = settings.themes[settings.theme].color_min;
color2 = settings.themes[settings.theme].color_mid;
} else {
ratio = Math.abs(ratio);
if (ratio > 1) ratio = 1;
color1 = settings.themes[settings.theme].color_max;
color2 = settings.themes[settings.theme].color_mid;
}
var color = getColor(color1, color2, ratio);
$(this).css('background-color', color);
if (settings.readable)
$(this).css('color', getContrastYIQ(color));
});
return this;
};
}(jQuery));
$(document).ready(function() {
// Add custom filtering function
$.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
var searchValue = $('.dataTables_filter input').val();
if (!searchValue) return true;
// Split search terms by semicolon and trim whitespace
var searchTerms = searchValue.split(';').map(term => term.trim().toLowerCase());
var modelName = data[0].toLowerCase(); // Model name is in first column
// Return true if ANY search terms are found in the model name (OR logic)
return searchTerms.some(term => modelName.includes(term));
});
// Custom sorting function for benchmark scores
$.fn.dataTable.ext.type.order['score-pre'] = function(data) {
var score = parseFloat(data);
return isNaN(score) ? -Infinity : score;
};
// Get min/max values for each numeric column before initializing DataTables
var columnRanges = {
1: { min: Infinity, max: -Infinity }, // Params
2: { min: Infinity, max: -Infinity }, // Benchmark Score
3: { min: Infinity, max: -Infinity } // Percentage Questions Parseable
};
$('#leaderboard tbody td').each(function() {
var columnIdx = $(this).index();
if (columnIdx in columnRanges) {
var value = parseFloat($(this).text());
if (!isNaN(value) && isFinite(value)) {
columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
}
}
});
var table = $('#leaderboard').DataTable({
"order": [[2, "desc"]], // Sort by Benchmark Score by default
"pageLength": 20, // Show 20 results per page
"lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
"columnDefs": [
{
"targets": [1],
"className": "numeric-cell"
},
{
"type": "score",
"targets": [2], // Apply custom sorting to Benchmark Score column
"className": "numeric-cell"
},
{
"targets": [3],
"className": "numeric-cell"
}
],
"drawCallback": function() {
// Apply colorization with pre-calculated ranges
$("#leaderboard tbody td:nth-child(2)").colorize({
parse: function(e) { return parseFloat($(e).text()); },
min: columnRanges[1].min,
max: columnRanges[1].max,
themes: {
"default": {
color_min: "#10A54A", // Green for smaller models
color_mid: "#FFD700", // Gold/yellow for medium models
color_max: "#C80000" // Red for larger models
}
}
});
$("#leaderboard tbody td:nth-child(3)").colorize({
parse: function(e) { return parseFloat($(e).text()); },
min: columnRanges[2].min,
max: columnRanges[2].max,
themes: {
"default": {
color_min: "#C80000", // Red for lower scores
color_mid: "#FFD700", // Gold/yellow for medium scores
color_max: "#10A54A" // Green for higher scores
}
}
});
$("#leaderboard tbody td:nth-child(4)").colorize({
parse: function(e) { return parseFloat($(e).text()); },
min: columnRanges[3].min,
max: columnRanges[3].max,
themes: {
"default": {
color_min: "#C80000", // Red for lower percentages
color_mid: "#FFD700", // Gold/yellow for medium percentages
color_max: "#10A54A" // Green for higher percentages
}
}
});
},
// Override the default search behavior
"search": {
"smart": false
},
// Update search on input change
"initComplete": function() {
var table = this.api();
$('.dataTables_filter input')
.off() // Remove default binding
.on('input', function() {
table.draw();
});
}
});
});
</script>
</head>
<body>
<h1>Leaderboard</h1>
<table id="leaderboard" class="display" style="width:100%">
<thead>
<tr>
<th>Model</th>
<th>Params</th>
<th>Benchmark Score</th>
<th>Percentage Questions Parseable</th>
<th>Error</th>
</tr>
</thead>
<tbody>
"""
# Add rows to the HTML table.
# The text cells (model, params, error) are HTML-escaped so characters such as
# '<' or '&' in a model name or error message cannot break the table markup.
# All rows are joined in one step instead of growing the page string per row.
html += "".join(
    f"""
<tr>
<td>{escape(str(row['Model']))}</td>
<td>{escape(str(row['Params']))}</td>
<td>{row['Benchmark Score']:.2f}</td>
<td>{row['Percentage Questions Parseable']:.2f}</td>
<td>{escape(str(row['Error']))}</td>
</tr>
"""
    for _, row in leaderboard_df.iterrows()
)
# Close the HTML tags
html += """
</tbody>
</table>
</body>
</html>
"""
# Save the HTML to a file. The page declares <meta charset="UTF-8">, so the
# file is written as UTF-8 explicitly rather than in the platform default
# encoding.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)
print("HTML leaderboard generated and saved as leaderboard.html")