Spaces:
Running
Running
import json
import re
from html import escape

import pandas as pd
# Load the CSV file.
# The rows are collected as plain lists first; `leaderboard_df` only becomes a
# real DataFrame further down, once the header is known.
leaderboard_df = []
with open("benchmark_results.csv", "r", encoding="utf-8") as f:
    header = [h.strip() for h in f.readline().strip().split(",")]
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline) instead of emitting
            # a bogus one-field row.
            continue
        # maxsplit=13 yields at most 14 fields, so any commas inside the
        # final field are kept as part of that field rather than split off.
        leaderboard_df.append(line.split(",", 13))
# Load metadata (used later to fill the "Params" column).
# A context manager guarantees the file handle is closed after parsing.
with open('metadata.json', encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

# Register every entry under the part of its key that precedes the first
# comma as well, so lookups by that bare prefix succeed. Iterate over a
# snapshot because the dict is being extended inside the loop.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v
# Create DataFrame
leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)

# Filter and process DataFrame: keep only rows whose benchmark version is
# "eq-bench_v2_pl" or "eq-bench_pl", and only the columns the leaderboard needs.
is_wanted_version = leaderboard_df["Benchmark Version"].isin(
    ["eq-bench_v2_pl", "eq-bench_pl"]
)
# .copy() makes the result an independent frame, so the column assignments
# performed later do not operate on a slice of the full table
# (which would trigger pandas' SettingWithCopyWarning).
leaderboard_df = leaderboard_df.loc[
    is_wanted_version,
    ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"],
].copy()
def parse_parseable(x):
    """Return the number of parseable questions for one leaderboard row.

    Args:
        x: Mapping-like row providing the "Num Questions Parseable" and
            "Error" entries.

    Returns:
        The "Num Questions Parseable" value unchanged when it is not the
        literal 'FAILED'. For failed rows, the integer part (as a string) of
        the "<N>.0 questions were parseable" phrase found in the error text,
        or "0" when the error text is missing or does not contain that phrase.
    """
    parseable = x["Num Questions Parseable"]
    if parseable != 'FAILED':
        return parseable

    # str() guards against a missing (None/NaN) error cell, and the
    # no-match case falls back to "0" instead of raising AttributeError on
    # `None.group(1)`. re.search also accepts the phrase when it is not at
    # the very start of the message.
    m = re.search(r'(\d+)\.0 questions were parseable', str(x["Error"]))
    return m.group(1) if m else "0"
# Resolve the parseable-question count row by row; parse_parseable needs both
# the count column and the error text.
parseable_inputs = leaderboard_df[["Num Questions Parseable", "Error"]]
leaderboard_df["Num Questions Parseable"] = parseable_inputs.apply(parse_parseable, axis=1)

# Denominator used when converting parseable counts into percentages.
NUMBER_OF_QUESTIONS = 171.0
def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Express ``numerator / denominator`` as a percentage.

    Args:
        numerator: The part.
        denominator: The whole; must be non-zero.

    Returns:
        The quotient multiplied by 100.

    Raises:
        ZeroDivisionError: If ``denominator`` is zero.
    """
    ratio = numerator / denominator
    return ratio * 100
def _count_to_percentage(raw_count):
    """Convert one parseable-question count into a percentage of NUMBER_OF_QUESTIONS."""
    return fraction_to_percentage(float(raw_count), NUMBER_OF_QUESTIONS)


# Replace the raw counts with percentages of the full question set.
parseable_counts = leaderboard_df["Num Questions Parseable"]
leaderboard_df["Num Questions Parseable"] = parseable_counts.apply(_count_to_percentage)
def get_params(model_name):
    """Return the metadata entry stored for ``model_name``.

    When the model is not present in the module-level ``metadata`` mapping,
    its name is printed (so missing entries are visible in the output) and
    None is returned.
    """
    if model_name not in metadata:
        print(model_name)
        return None
    return metadata[model_name]
# Attach the metadata entry for every model (None when unknown).
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

# Turn the score column into floats; 'FAILED' (or any other non-numeric cell)
# becomes NaN. pd.to_numeric is used instead of `.replace('FAILED', None)`
# because the meaning of a None replacement value differs between pandas
# versions (older releases forward-fill from the previous row instead of
# inserting a null).
leaderboard_df["Benchmark Score"] = pd.to_numeric(
    leaderboard_df["Benchmark Score"], errors="coerce"
).astype(float)

# Scale each score by the fraction of questions that were parseable
# (the column holds a percentage at this point, hence the division by 100).
parseable_fraction = leaderboard_df["Num Questions Parseable"].astype(float) / 100
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"] * parseable_fraction

# Clamp negative scores to zero.
leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

# Best score first; ties broken by the higher parseable percentage.
leaderboard_df = leaderboard_df.sort_values(
    by=["Benchmark Score", "Num Questions Parseable"],
    ascending=[False, False],
)
leaderboard_df = leaderboard_df.rename(
    columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    }
)
# Generate HTML with DataTables.
# This is a plain (non-f) string, so the JavaScript braces need no escaping.
# It ends right after the opening <tbody>; the table rows and the closing
# tags are appended to `html` afterwards.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Leaderboard</title>
    <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
    <style>
        body {
            font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
            margin: 0;
            padding: 20px;
            color: #333;
            background-color: #fff;
        }
        .numeric-cell {
            text-align: right;
            padding: 8px !important;
        }
    </style>
    <script>
        (function($) {
            $.fn.colorize = function(oOptions) {
                var settings = $.extend({
                    parse: function(e) {
                        return parseFloat(e.html());
                    },
                    min: undefined,
                    max: undefined,
                    readable: true,
                    themes: {
                        "default": {
                            color_min: "#C80000",
                            color_mid: "#FFFFFF",
                            color_max: "#10A54A"
                        }
                    },
                    theme: "default",
                    center: undefined,
                    percent: false
                }, oOptions);

                // Blend color1 into color2; ratio 1 gives color1, ratio 0 gives color2.
                function getColor(color1, color2, ratio) {
                    var hex = function(x) {
                        x = x.toString(16);
                        return (x.length == 1) ? '0' + x : x;
                    }
                    color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
                    color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
                    var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
                    var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
                    var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
                    return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
                }

                // Pick black or white text depending on the background brightness.
                function getContrastYIQ(hexcolor) {
                    var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
                    var r = parseInt(hex.substr(0,2),16);
                    var g = parseInt(hex.substr(2,2),16);
                    var b = parseInt(hex.substr(4,2),16);
                    var yiq = ((r*299)+(g*587)+(b*114))/1000;
                    return (yiq >= 128) ? 'black' : 'white';
                }

                var min = settings.min;
                var max = settings.max;
                if (min === undefined || max === undefined) {
                    min = Infinity;
                    max = -Infinity;
                    this.each(function() {
                        var value = parseFloat(settings.parse($(this)));
                        if (!isNaN(value) && isFinite(value)) {
                            min = Math.min(min, value);
                            max = Math.max(max, value);
                        }
                    });
                }

                var center = settings.center !== undefined ? settings.center : (max + min) / 2;
                var adj = Math.max(Math.abs(max - center), Math.abs(center - min));

                this.each(function() {
                    var value = parseFloat(settings.parse($(this)));
                    if (isNaN(value) || !isFinite(value)) return;

                    // When every value is identical adj is 0; fall back to the
                    // mid colour instead of dividing by zero (which produced an
                    // invalid colour string and white, unreadable text).
                    var ratio = (adj === 0) ? 0 : (value - center) / adj;
                    var color1, color2;
                    if (value < center) {
                        ratio = Math.abs(ratio);
                        if (ratio > 1) ratio = 1;
                        color1 = settings.themes[settings.theme].color_min;
                        color2 = settings.themes[settings.theme].color_mid;
                    } else {
                        ratio = Math.abs(ratio);
                        if (ratio > 1) ratio = 1;
                        color1 = settings.themes[settings.theme].color_max;
                        color2 = settings.themes[settings.theme].color_mid;
                    }
                    var color = getColor(color1, color2, ratio);
                    $(this).css('background-color', color);
                    if (settings.readable)
                        $(this).css('color', getContrastYIQ(color));
                });
                return this;
            };
        }(jQuery));

        $(document).ready(function() {
            // Add custom filtering function
            $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
                var searchValue = $('.dataTables_filter input').val();
                if (!searchValue) return true;

                // Split search terms by semicolon, trim whitespace and drop
                // empty terms (an empty term would match every row, e.g.
                // while the user has just typed a trailing ';').
                var searchTerms = searchValue.split(';')
                    .map(term => term.trim().toLowerCase())
                    .filter(term => term.length > 0);
                if (searchTerms.length === 0) return true;

                var modelName = data[0].toLowerCase(); // Model name is in first column

                // Return true if ANY search terms are found in the model name (OR logic)
                return searchTerms.some(term => modelName.includes(term));
            });

            // Custom sorting function for benchmark scores
            $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
                var score = parseFloat(data);
                return isNaN(score) ? -Infinity : score;
            };

            // Get min/max values for each numeric column before initializing DataTables
            var columnRanges = {
                1: { min: Infinity, max: -Infinity }, // Params
                2: { min: Infinity, max: -Infinity }, // Benchmark Score
                3: { min: Infinity, max: -Infinity }  // Percentage Questions Parseable
            };

            $('#leaderboard tbody td').each(function() {
                var columnIdx = $(this).index();
                if (columnIdx in columnRanges) {
                    var value = parseFloat($(this).text());
                    if (!isNaN(value) && isFinite(value)) {
                        columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
                        columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
                    }
                }
            });

            var table = $('#leaderboard').DataTable({
                "order": [[2, "desc"]], // Sort by Benchmark Score by default
                "pageLength": 20, // Show 20 results per page
                "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
                "columnDefs": [
                    {
                        "targets": [1],
                        "className": "numeric-cell"
                    },
                    {
                        "type": "score",
                        "targets": [2], // Apply custom sorting to Benchmark Score column
                        "className": "numeric-cell"
                    },
                    {
                        "targets": [3],
                        "className": "numeric-cell"
                    }
                ],
                "drawCallback": function() {
                    // Apply colorization with pre-calculated ranges
                    $("#leaderboard tbody td:nth-child(2)").colorize({
                        parse: function(e) { return parseFloat($(e).text()); },
                        min: columnRanges[1].min,
                        max: columnRanges[1].max,
                        themes: {
                            "default": {
                                color_min: "#10A54A", // Green for smaller models
                                color_mid: "#FFD700", // Gold/yellow for medium models
                                color_max: "#C80000"  // Red for larger models
                            }
                        }
                    });
                    $("#leaderboard tbody td:nth-child(3)").colorize({
                        parse: function(e) { return parseFloat($(e).text()); },
                        min: columnRanges[2].min,
                        max: columnRanges[2].max,
                        themes: {
                            "default": {
                                color_min: "#C80000", // Red for lower scores
                                color_mid: "#FFD700", // Gold/yellow for medium scores
                                color_max: "#10A54A"  // Green for higher scores
                            }
                        }
                    });
                    $("#leaderboard tbody td:nth-child(4)").colorize({
                        parse: function(e) { return parseFloat($(e).text()); },
                        min: columnRanges[3].min,
                        max: columnRanges[3].max,
                        themes: {
                            "default": {
                                color_min: "#C80000", // Red for lower percentages
                                color_mid: "#FFD700", // Gold/yellow for medium percentages
                                color_max: "#10A54A"  // Green for higher percentages
                            }
                        }
                    });
                },
                // Override the default search behavior
                "search": {
                    "smart": false
                },
                // Update search on input change
                "initComplete": function() {
                    var table = this.api();
                    $('.dataTables_filter input')
                        .off() // Remove default binding
                        .on('input', function() {
                            table.draw();
                        });
                }
            });
        });
    </script>
</head>
<body>
    <h1>Leaderboard</h1>
    <table id="leaderboard" class="display" style="width:100%">
        <thead>
            <tr>
                <th>Model</th>
                <th>Params</th>
                <th>Benchmark Score</th>
                <th>Percentage Questions Parseable</th>
                <th>Error</th>
            </tr>
        </thead>
        <tbody>
"""
# Add rows to the HTML table.
# Model names, params and error messages originate from external files, so
# they are HTML-escaped before being placed into the markup; otherwise a '<'
# or '&' in an error message would corrupt the table. The fragments are
# collected in a list and joined once instead of growing the string per row.
table_rows = []
for _, row in leaderboard_df.iterrows():
    table_rows.append(f"""
            <tr>
                <td>{escape(str(row['Model']))}</td>
                <td>{escape(str(row['Params']))}</td>
                <td>{row['Benchmark Score']:.2f}</td>
                <td>{row['Percentage Questions Parseable']:.2f}</td>
                <td>{escape(str(row['Error']))}</td>
            </tr>
""")
html += "".join(table_rows)
# Close the HTML tags
html += """
        </tbody>
    </table>
</body>
</html>
"""

# Save the HTML to a file. The page declares <meta charset="UTF-8">, so it is
# written as UTF-8 explicitly rather than in the platform's default encoding.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)

print("HTML leaderboard generated and saved as leaderboard.html")