djstrong committed on
Commit
235501b
·
1 Parent(s): a3a884e

generate static page leaderboard

Browse files
Files changed (1) hide show
  1. script.py +322 -0
script.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ import re
4
+
5
# Load the CSV file
# Rows are split by hand with a capped split count (13 splits -> at most 14
# fields), so everything after the 13th comma stays in one field.
# NOTE(review): presumably the trailing column is free text that may itself
# contain commas -- confirm against the CSV producer.
leaderboard_df = []
with open("benchmark_results.csv", "r", encoding="utf-8") as f:
    header = [h.strip() for h in f.readline().strip().split(",")]
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray newline at the end of the file)
            # would otherwise be stored as a bogus one-field row.
            continue
        leaderboard_df.append(line.split(",", 13))
12
+
13
# Load metadata
# A context manager closes the file handle deterministically; the original
# json.load(open(...)) left it to the garbage collector.
with open('metadata.json', encoding="utf-8") as metadata_file:
    metadata = json.load(metadata_file)

# Also register every entry under the part of its key that precedes the first
# comma, so a lookup by that prefix alone succeeds. Iterate over a snapshot
# because the loop adds keys to the dict it is walking.
for k, v in list(metadata.items()):
    metadata[k.split(",")[0]] = v
17
+
18
# Create DataFrame
leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)

# Filter and process DataFrame
# Keep only rows of the two wanted benchmark versions and the four columns the
# rest of the script uses. Selecting rows and columns in one .loc call and
# taking an explicit copy gives an independent frame, so the column
# assignments further down do not trigger SettingWithCopyWarning.
leaderboard_df = leaderboard_df.loc[
    leaderboard_df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"]),
    ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"],
].copy()
25
+
26
def parse_parseable(x):
    """Return the parseable-question count for one leaderboard row.

    Args:
        x: Row-like mapping with "Num Questions Parseable" and "Error" entries.

    Returns:
        The "Num Questions Parseable" value unchanged, unless it is the string
        'FAILED'. For a failed row the count is recovered from an "Error" text
        containing "<N>.0 questions were parseable" and returned as the digit
        string "<N>". If the error text is missing or holds no such count,
        "0" is returned instead of raising.
    """
    parseable = x["Num Questions Parseable"]
    if parseable != 'FAILED':
        return parseable

    error = x["Error"]
    if not isinstance(error, str):
        # Missing error cell (None/NaN): nothing to recover the count from.
        return "0"
    m = re.search(r'(\d+)\.0 questions were parseable', error)
    # An unrelated error message yields no match; calling .group() on None
    # would raise AttributeError.
    return m.group(1) if m else "0"
31
+
32
# Run parse_parseable over each row's (count, error) pair and store the result
# back into the count column.
leaderboard_df["Num Questions Parseable"] = leaderboard_df[
    ["Num Questions Parseable", "Error"]
].apply(parse_parseable, axis=1)

# Denominator used when the parseable count is converted to a percentage.
NUMBER_OF_QUESTIONS = 171.0
36
+
37
def fraction_to_percentage(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator`` scaled to a percentage.

    The division happens first and the quotient is then multiplied by 100.
    """
    ratio = numerator / denominator
    return ratio * 100
39
+
40
# Convert each parseable count to a percentage of NUMBER_OF_QUESTIONS.
leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
    lambda count: fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
)
41
+
42
def get_params(model_name):
    """Look up the metadata entry stored for ``model_name``.

    Returns the value from the module-level ``metadata`` dict. When the model
    has no entry, its name is printed and ``None`` is returned.
    """
    try:
        return metadata[model_name]
    except KeyError:
        print(model_name)
        return None
48
+
49
# Attach the metadata entry of each model.
leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

# Blank out 'FAILED' scores (they become NaN) before converting to float.
# Series.replace('FAILED', None) is avoided on purpose: on pandas < 1.4 an
# explicit value=None is ignored and the cell is forward-filled from the
# previous row, giving failed runs another model's score.
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].mask(
    leaderboard_df["Benchmark Score"] == 'FAILED'
)

# Scale each score by the fraction of questions that were parseable.
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * (
    leaderboard_df["Num Questions Parseable"].astype(float) / 100
)

# Negative scores are floored at zero; NaN scores stay NaN.
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].clip(lower=0)

# Best score first, ties broken by the parseable percentage.
leaderboard_df = leaderboard_df.sort_values(
    by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False]
)
leaderboard_df = leaderboard_df.rename(
    columns={"Model Path": "Model", "Num Questions Parseable": "Percentage Questions Parseable"}
)
55
+
56
+ # Generate HTML with DataTables
57
+ html = """
58
+ <!DOCTYPE html>
59
+ <html lang="en">
60
+ <head>
61
+ <meta charset="UTF-8">
62
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
63
+ <title>Leaderboard</title>
64
+ <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
65
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
66
+ <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
67
+ <style>
68
+ body {
69
+ font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
70
+ margin: 0;
71
+ padding: 20px;
72
+ color: #333;
73
+ background-color: #fff;
74
+ }
75
+ .numeric-cell {
76
+ text-align: right;
77
+ padding: 8px !important;
78
+ }
79
+ </style>
80
+ <script>
81
+ (function($) {
82
+ $.fn.colorize = function(oOptions) {
83
+ var settings = $.extend({
84
+ parse: function(e) {
85
+ return parseFloat(e.html());
86
+ },
87
+ min: undefined,
88
+ max: undefined,
89
+ readable: true,
90
+ themes: {
91
+ "default": {
92
+ color_min: "#C80000",
93
+ color_mid: "#FFFFFF",
94
+ color_max: "#10A54A"
95
+ }
96
+ },
97
+ theme: "default",
98
+ center: undefined,
99
+ percent: false
100
+ }, oOptions);
101
+
102
// Blend two hex colours channel by channel and return "#RRGGBB".
// ratio = 1 yields color1, ratio = 0 yields color2.
function getColor(color1, color2, ratio) {
    // When every value in a column is identical the caller divides 0 by 0 and
    // passes NaN; unguarded, that produced the invalid colour "#NANNANNAN".
    if (isNaN(ratio)) ratio = 0;
    // Keep the blend between the two endpoints so each channel stays in 0..255.
    ratio = Math.min(1, Math.max(0, ratio));

    // Two-digit hexadecimal representation of one channel.
    var hex = function(x) {
        x = x.toString(16);
        return (x.length == 1) ? '0' + x : x;
    };
    color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1;
    color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2;
    var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
    var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
    var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
    return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
}
114
+
115
// Pick 'black' or 'white' text for the given background colour, based on the
// YIQ luma of its red, green and blue channels.
function getContrastYIQ(hexcolor) {
    var digits = hexcolor;
    if (digits.charAt(0) == "#") {
        digits = digits.slice(1);
    }
    var red = parseInt(digits.slice(0, 2), 16);
    var green = parseInt(digits.slice(2, 4), 16);
    var blue = parseInt(digits.slice(4, 6), 16);
    var luma = ((red * 299) + (green * 587) + (blue * 114)) / 1000;
    if (luma >= 128) {
        return 'black';
    }
    return 'white';
}
123
+
124
+ var min = settings.min;
125
+ var max = settings.max;
126
+ if (min === undefined || max === undefined) {
127
+ min = Infinity;
128
+ max = -Infinity;
129
+ this.each(function() {
130
+ var value = parseFloat(settings.parse($(this)));
131
+ if (!isNaN(value) && isFinite(value)) {
132
+ min = Math.min(min, value);
133
+ max = Math.max(max, value);
134
+ }
135
+ });
136
+ }
137
+
138
+ var center = settings.center !== undefined ? settings.center : (max + min) / 2;
139
+ var adj = Math.max(Math.abs(max - center), Math.abs(center - min));
140
+
141
+ this.each(function() {
142
+ var value = parseFloat(settings.parse($(this)));
143
+ if (isNaN(value) || !isFinite(value)) return;
144
+
145
+ var ratio = (value - center) / adj;
146
+ var color1, color2;
147
+
148
+ if (value < center) {
149
+ ratio = Math.abs(ratio);
150
+ if (ratio > 1) ratio = 1;
151
+ color1 = settings.themes[settings.theme].color_min;
152
+ color2 = settings.themes[settings.theme].color_mid;
153
+ } else {
154
+ ratio = Math.abs(ratio);
155
+ if (ratio > 1) ratio = 1;
156
+ color1 = settings.themes[settings.theme].color_max;
157
+ color2 = settings.themes[settings.theme].color_mid;
158
+ }
159
+ var color = getColor(color1, color2, ratio);
160
+ $(this).css('background-color', color);
161
+ if (settings.readable)
162
+ $(this).css('color', getContrastYIQ(color));
163
+ });
164
+
165
+ return this;
166
+ };
167
+ }(jQuery));
168
+
169
+ $(document).ready(function() {
170
+ // Add custom filtering function
171
+ $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
172
+ var searchValue = $('.dataTables_filter input').val();
173
+ if (!searchValue) return true;
174
+
175
+ // Split search terms by semicolon and trim whitespace
176
+ var searchTerms = searchValue.split(';').map(term => term.trim().toLowerCase());
177
+ var modelName = data[0].toLowerCase(); // Model name is in first column
178
+
179
+ // Return true if ANY search terms are found in the model name (OR logic)
180
+ return searchTerms.some(term => modelName.includes(term));
181
+ });
182
+
183
+ // Custom sorting function for benchmark scores
184
+ $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
185
+ var score = parseFloat(data);
186
+ return isNaN(score) ? -Infinity : score;
187
+ };
188
+
189
+ // Get min/max values for each numeric column before initializing DataTables
190
+ var columnRanges = {
191
+ 1: { min: Infinity, max: -Infinity }, // Params
192
+ 2: { min: Infinity, max: -Infinity }, // Benchmark Score
193
+ 3: { min: Infinity, max: -Infinity } // Percentage Questions Parseable
194
+ };
195
+
196
+ $('#leaderboard tbody td').each(function() {
197
+ var columnIdx = $(this).index();
198
+ if (columnIdx in columnRanges) {
199
+ var value = parseFloat($(this).text());
200
+ if (!isNaN(value) && isFinite(value)) {
201
+ columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
202
+ columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
203
+ }
204
+ }
205
+ });
206
+
207
+ var table = $('#leaderboard').DataTable({
208
+ "order": [[2, "desc"]], // Sort by Benchmark Score by default
209
+ "pageLength": 20, // Show 20 results per page
210
+ "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]], // Update length menu options
211
+ "columnDefs": [
212
+ {
213
+ "targets": [1],
214
+ "className": "numeric-cell"
215
+ },
216
+ {
217
+ "type": "score",
218
+ "targets": [2], // Apply custom sorting to Benchmark Score column
219
+ "className": "numeric-cell"
220
+ },
221
+ {
222
+ "targets": [3],
223
+ "className": "numeric-cell"
224
+ }
225
+ ],
226
+ "drawCallback": function() {
227
+ // Apply colorization with pre-calculated ranges
228
+ $("#leaderboard tbody td:nth-child(2)").colorize({
229
+ parse: function(e) { return parseFloat($(e).text()); },
230
+ min: columnRanges[1].min,
231
+ max: columnRanges[1].max,
232
+ themes: {
233
+ "default": {
234
+ color_min: "#10A54A", // Green for smaller models
235
+ color_mid: "#FFD700", // Gold/yellow for medium models
236
+ color_max: "#C80000" // Red for larger models
237
+ }
238
+ }
239
+ });
240
+ $("#leaderboard tbody td:nth-child(3)").colorize({
241
+ parse: function(e) { return parseFloat($(e).text()); },
242
+ min: columnRanges[2].min,
243
+ max: columnRanges[2].max,
244
+ themes: {
245
+ "default": {
246
+ color_min: "#C80000", // Red for lower scores
247
+ color_mid: "#FFD700", // Gold/yellow for medium scores
248
+ color_max: "#10A54A" // Green for higher scores
249
+ }
250
+ }
251
+ });
252
+ $("#leaderboard tbody td:nth-child(4)").colorize({
253
+ parse: function(e) { return parseFloat($(e).text()); },
254
+ min: columnRanges[3].min,
255
+ max: columnRanges[3].max,
256
+ themes: {
257
+ "default": {
258
+ color_min: "#C80000", // Red for lower percentages
259
+ color_mid: "#FFD700", // Gold/yellow for medium percentages
260
+ color_max: "#10A54A" // Green for higher percentages
261
+ }
262
+ }
263
+ });
264
+ },
265
+ // Override the default search behavior
266
+ "search": {
267
+ "smart": false
268
+ },
269
+
270
+ // Update search on input change
271
+ "initComplete": function() {
272
+ var table = this.api();
273
+ $('.dataTables_filter input')
274
+ .off() // Remove default binding
275
+ .on('input', function() {
276
+ table.draw();
277
+ });
278
+ }
279
+ });
280
+ });
281
+ </script>
282
+ </head>
283
+ <body>
284
+ <h1>Leaderboard</h1>
285
+ <table id="leaderboard" class="display" style="width:100%">
286
+ <thead>
287
+ <tr>
288
+ <th>Model</th>
289
+ <th>Params</th>
290
+ <th>Benchmark Score</th>
291
+ <th>Percentage Questions Parseable</th>
292
+ <th>Error</th>
293
+ </tr>
294
+ </thead>
295
+ <tbody>
296
+ """
297
+
298
# Add rows to the HTML table
# Imported under an alias because the name ``html`` is already taken by the
# page string being built.
from html import escape as _escape_html

# Text cells come straight from the CSV/metadata files, so they are escaped:
# a "<" or "&" in a model name or error message would otherwise corrupt the
# table markup. Rows are collected and joined once instead of growing the page
# string on every iteration.
row_chunks = []
for _, row in leaderboard_df.iterrows():
    row_chunks.append(f"""
<tr>
<td>{_escape_html(str(row['Model']))}</td>
<td>{_escape_html(str(row['Params']))}</td>
<td>{row['Benchmark Score']:.2f}</td>
<td>{row['Percentage Questions Parseable']:.2f}</td>
<td>{_escape_html(str(row['Error']))}</td>
</tr>
""")
html += "".join(row_chunks)
309
+
310
# Close the HTML tags
html += """
</tbody>
</table>
</body>
</html>
"""

# Save the HTML to a file
# The page declares <meta charset="UTF-8">, so it is written as UTF-8
# explicitly rather than with the platform's default encoding.
with open("leaderboard.html", "w", encoding="utf-8") as file:
    file.write(html)

print("HTML leaderboard generated and saved as leaderboard.html")