Spaces:
Running
Running
generate static page leaderboard
Browse files
script.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Load the CSV file.
# Rows are split by hand with at most 13 cuts, so a row yields at most 14
# fields and any commas after the 13th stay inside the last field.
# NOTE(review): this presumes the header has 14 columns with the free-text
# "Error" column last - confirm against benchmark_results.csv.
rows = []
with open("benchmark_results.csv", "r") as f:
    header = [h.strip() for h in f.readline().strip().split(",")]
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of turning them into empty rows.
            continue
        rows.append(line.split(",", 13))

# Load metadata (the context manager closes the file handle).
with open('metadata.json') as metadata_file:
    metadata = json.load(metadata_file)
# Register every entry a second time under the part of its key that precedes
# the first comma, so lookups by that prefix alone also succeed.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v

# Create DataFrame
leaderboard_df = pd.DataFrame(rows, columns=header)

# Keep only the two Polish benchmark versions and the columns used below.
# .copy() makes the result an independent frame so that the column
# assignments later in the script do not write into a slice.
is_polish_benchmark = leaderboard_df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])
leaderboard_df = leaderboard_df.loc[
    is_polish_benchmark,
    ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"],
].copy()
|
25 |
+
|
26 |
+
def parse_parseable(x):
    """Return the number of parseable questions for one leaderboard row.

    Args:
        x: A row-like mapping with "Num Questions Parseable" and "Error"
            entries.

    Returns:
        The "Num Questions Parseable" value unchanged, unless it is the
        string 'FAILED'. For a failed row the count is read from an error
        message that starts with "<N>.0 questions were parseable" and the
        digits <N> are returned as a string. If the error is not a string
        or does not start with that phrase, "0" is returned.
    """
    if x["Num Questions Parseable"] != 'FAILED':
        return x["Num Questions Parseable"]

    error_text = x["Error"]
    if not isinstance(error_text, str):
        # Missing error cell (e.g. None): nothing to recover a count from.
        return "0"

    m = re.match(r'(\d+)\.0 questions were parseable', error_text)
    # A failure with some other error message carries no parseable count.
    return m.group(1) if m else "0"
|
31 |
+
|
32 |
+
# Run parse_parseable over every row, handing it just the two columns it
# reads, and store the result back as the parseable count.
parseable_inputs = leaderboard_df[["Num Questions Parseable", "Error"]]
leaderboard_df["Num Questions Parseable"] = parseable_inputs.apply(parse_parseable, axis=1)

# Denominator used when parseable counts are turned into percentages.
NUMBER_OF_QUESTIONS = 171.0
|
36 |
+
|
37 |
+
def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Express ``numerator / denominator`` as a percentage.

    Raises:
        ZeroDivisionError: If ``denominator`` is zero.
    """
    ratio = numerator / denominator
    return ratio * 100
|
39 |
+
|
40 |
+
# Turn each parseable count into a percentage of NUMBER_OF_QUESTIONS.
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
    lambda count: fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
)
|
41 |
+
|
42 |
+
def get_params(model_name):
    """Look up ``model_name`` in the module-level ``metadata`` mapping.

    Returns the stored value when the name is a key of ``metadata``.
    Otherwise the name is printed to stdout and ``None`` is returned.
    """
    if model_name not in metadata:
        # Report the unknown model so it can be added to the metadata.
        print(model_name)
        return None
    return metadata[model_name]
|
48 |
+
|
49 |
+
# Attach the metadata entry for each model (None when the model is unknown).
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

# Convert scores to floats; 'FAILED' (or any other non-numeric text) becomes
# NaN. pd.to_numeric is used instead of Series.replace('FAILED', None)
# because replace() with value=None is version-dependent: older pandas
# releases treat it as a pad-fill and copy the previous row's score into
# the failed row.
raw_score = pd.to_numeric(leaderboard_df["Benchmark Score"], errors="coerce")

# Weight each score by the fraction of questions that were parseable.
parseable_fraction = leaderboard_df["Num Questions Parseable"].astype(float) / 100
leaderboard_df["Benchmark Score"] = raw_score * parseable_fraction

# Negative scores are floored at zero.
leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

# Best score first; ties broken by the parseable percentage.
leaderboard_df = leaderboard_df.sort_values(
    by=["Benchmark Score", "Num Questions Parseable"],
    ascending=[False, False],
)
leaderboard_df = leaderboard_df.rename(
    columns={
        "Model Path": "Model",
        "Num Questions Parseable": "Percentage Questions Parseable",
    }
)
|
55 |
+
|
56 |
+
# Generate HTML with DataTables.
# This is the static head of the page: styles, the jQuery "colorize" plugin
# that shades numeric cells, and the DataTables setup. The string ends inside
# an open <tbody>; the table rows and the closing tags are appended later.
# It is a plain (non-f) string, so the braces in the CSS/JS are literal.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Leaderboard</title>
    <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
    <style>
        body {
            font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
            margin: 0;
            padding: 20px;
            color: #333;
            background-color: #fff;
        }
        .numeric-cell {
            text-align: right;
            padding: 8px !important;
        }
    </style>
    <script>
    (function($) {
        $.fn.colorize = function(oOptions) {
            var settings = $.extend({
                parse: function(e) {
                    return parseFloat(e.html());
                },
                min: undefined,
                max: undefined,
                readable: true,
                themes: {
                    "default": {
                        color_min: "#C80000",
                        color_mid: "#FFFFFF",
                        color_max: "#10A54A"
                    }
                },
                theme: "default",
                center: undefined,
                percent: false
            }, oOptions);

            function getColor(color1, color2, ratio) {
                var hex = function(x) {
                    x = x.toString(16);
                    return (x.length == 1) ? '0' + x : x;
                }
                color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
                color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
                var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
                var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
                var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
                return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
            }

            function getContrastYIQ(hexcolor) {
                var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
                var r = parseInt(hex.substr(0,2),16);
                var g = parseInt(hex.substr(2,2),16);
                var b = parseInt(hex.substr(4,2),16);
                var yiq = ((r*299)+(g*587)+(b*114))/1000;
                return (yiq >= 128) ? 'black' : 'white';
            }

            var min = settings.min;
            var max = settings.max;
            if (min === undefined || max === undefined) {
                min = Infinity;
                max = -Infinity;
                this.each(function() {
                    var value = parseFloat(settings.parse($(this)));
                    if (!isNaN(value) && isFinite(value)) {
                        min = Math.min(min, value);
                        max = Math.max(max, value);
                    }
                });
            }

            var center = settings.center !== undefined ? settings.center : (max + min) / 2;
            var adj = Math.max(Math.abs(max - center), Math.abs(center - min));

            this.each(function() {
                var value = parseFloat(settings.parse($(this)));
                if (isNaN(value) || !isFinite(value)) return;

                // When every value in the column is the same, adj is 0; use the
                // mid colour instead of dividing by zero (which yields NaN colours).
                var ratio = adj > 0 ? (value - center) / adj : 0;
                var color1, color2;

                if (value < center) {
                    ratio = Math.abs(ratio);
                    if (ratio > 1) ratio = 1;
                    color1 = settings.themes[settings.theme].color_min;
                    color2 = settings.themes[settings.theme].color_mid;
                } else {
                    ratio = Math.abs(ratio);
                    if (ratio > 1) ratio = 1;
                    color1 = settings.themes[settings.theme].color_max;
                    color2 = settings.themes[settings.theme].color_mid;
                }
                var color = getColor(color1, color2, ratio);
                $(this).css('background-color', color);
                if (settings.readable)
                    $(this).css('color', getContrastYIQ(color));
            });

            return this;
        };
    }(jQuery));

    $(document).ready(function() {
        // Add custom filtering function
        $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
            var searchValue = $('.dataTables_filter input').val();
            if (!searchValue) return true;

            // Split search terms by semicolon, trim whitespace and drop empty
            // terms (an empty term would match every row, e.g. after a trailing ';')
            var searchTerms = searchValue.split(';')
                .map(term => term.trim().toLowerCase())
                .filter(term => term.length > 0);
            if (searchTerms.length === 0) return true;
            var modelName = data[0].toLowerCase(); // Model name is in first column

            // Return true if ANY search terms are found in the model name (OR logic)
            return searchTerms.some(term => modelName.includes(term));
        });

        // Custom sorting function for benchmark scores
        $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
            var score = parseFloat(data);
            return isNaN(score) ? -Infinity : score;
        };

        // Get min/max values for each numeric column before initializing DataTables
        var columnRanges = {
            1: { min: Infinity, max: -Infinity }, // Params
            2: { min: Infinity, max: -Infinity }, // Benchmark Score
            3: { min: Infinity, max: -Infinity }  // Percentage Questions Parseable
        };

        $('#leaderboard tbody td').each(function() {
            var columnIdx = $(this).index();
            if (columnIdx in columnRanges) {
                var value = parseFloat($(this).text());
                if (!isNaN(value) && isFinite(value)) {
                    columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
                    columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
                }
            }
        });

        var table = $('#leaderboard').DataTable({
            "order": [[2, "desc"]], // Sort by Benchmark Score by default
            "pageLength": 20, // Show 20 results per page
            "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
            "columnDefs": [
                {
                    "targets": [1],
                    "className": "numeric-cell"
                },
                {
                    "type": "score",
                    "targets": [2], // Apply custom sorting to Benchmark Score column
                    "className": "numeric-cell"
                },
                {
                    "targets": [3],
                    "className": "numeric-cell"
                }
            ],
            "drawCallback": function() {
                // Apply colorization with pre-calculated ranges
                $("#leaderboard tbody td:nth-child(2)").colorize({
                    parse: function(e) { return parseFloat($(e).text()); },
                    min: columnRanges[1].min,
                    max: columnRanges[1].max,
                    themes: {
                        "default": {
                            color_min: "#10A54A", // Green for smaller models
                            color_mid: "#FFD700", // Gold/yellow for medium models
                            color_max: "#C80000"  // Red for larger models
                        }
                    }
                });
                $("#leaderboard tbody td:nth-child(3)").colorize({
                    parse: function(e) { return parseFloat($(e).text()); },
                    min: columnRanges[2].min,
                    max: columnRanges[2].max,
                    themes: {
                        "default": {
                            color_min: "#C80000", // Red for lower scores
                            color_mid: "#FFD700", // Gold/yellow for medium scores
                            color_max: "#10A54A"  // Green for higher scores
                        }
                    }
                });
                $("#leaderboard tbody td:nth-child(4)").colorize({
                    parse: function(e) { return parseFloat($(e).text()); },
                    min: columnRanges[3].min,
                    max: columnRanges[3].max,
                    themes: {
                        "default": {
                            color_min: "#C80000", // Red for lower percentages
                            color_mid: "#FFD700", // Gold/yellow for medium percentages
                            color_max: "#10A54A"  // Green for higher percentages
                        }
                    }
                });
            },
            // Override the default search behavior
            "search": {
                "smart": false
            },

            // Update search on input change
            "initComplete": function() {
                var table = this.api();
                $('.dataTables_filter input')
                    .off() // Remove default binding
                    .on('input', function() {
                        table.draw();
                    });
            }
        });
    });
    </script>
</head>
<body>
    <h1>Leaderboard</h1>
    <table id="leaderboard" class="display" style="width:100%">
        <thead>
            <tr>
                <th>Model</th>
                <th>Params</th>
                <th>Benchmark Score</th>
                <th>Percentage Questions Parseable</th>
                <th>Error</th>
            </tr>
        </thead>
        <tbody>
"""
|
297 |
+
|
298 |
+
# Add rows to the HTML table.
# Text cells are HTML-escaped: model names and error messages come straight
# from the input files and may contain characters such as "<" or "&" that
# would otherwise be parsed as markup.
# The import is local and aliased because the name `html` is already bound
# to the page string in this script.
from html import escape as _escape_html

row_chunks = []
for _, row in leaderboard_df.iterrows():
    model_cell = _escape_html(str(row['Model']))
    params_cell = _escape_html(str(row['Params']))
    error_cell = _escape_html(str(row['Error']))
    row_chunks.append(f"""
            <tr>
                <td>{model_cell}</td>
                <td>{params_cell}</td>
                <td>{row['Benchmark Score']:.2f}</td>
                <td>{row['Percentage Questions Parseable']:.2f}</td>
                <td>{error_cell}</td>
            </tr>
    """)
# Join once instead of growing the page string row by row.
html += "".join(row_chunks)

# Close the HTML tags
html += """
        </tbody>
    </table>
</body>
</html>
"""

# Save the HTML to a file. The page declares <meta charset="UTF-8">, so the
# file is written as UTF-8 regardless of the platform's default encoding.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)

print("HTML leaderboard generated and saved as leaderboard.html")
|