Spaces:

speakleash
/

polish_eq-bench

Running

App Files Files Community

djstrong committed 5 days ago

Commit

235501b

1 Parent(s): a3a884e

generate static page leaderboard

Browse files

Files changed (1) hide show

script.py +322 -0

script.py ADDED Viewed

	@@ -0,0 +1,322 @@

+import pandas as pd
+import json
+import re
+# Load the CSV file. Rows are split by hand: only the first 13 commas separate
+# fields, so any further commas stay inside the final column's text.
+with open("benchmark_results.csv", "r") as f:
+    header = [h.strip() for h in f.readline().strip().split(",")]
+    leaderboard_df = [line.strip().split(",", 13) for line in f]
+# Load metadata. The context manager closes the file handle deterministically;
+# UTF-8 is stated explicitly because it is the standard encoding for JSON.
+with open('metadata.json', encoding="utf-8") as metadata_file:
+    metadata = json.load(metadata_file)
+# Alias each entry under the part of its key that precedes the first comma.
+# Iterate over a snapshot because the dict is modified inside the loop.
+for k, v in list(metadata.items()):
+    metadata[k.split(",")[0]] = v
+# Create DataFrame
+leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
+# Filter and process DataFrame: keep only the two benchmark versions of interest.
+leaderboard_df = leaderboard_df[
+    leaderboard_df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])
+]
+# .copy() makes the column subset an independent frame, so the column
+# assignments performed later do not write into a slice of the filtered frame.
+leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]].copy()
+def parse_parseable(x):
+    """Return the parseable-question count recorded for one result row.
+
+    Args:
+        x: Row-like mapping with "Num Questions Parseable" and "Error" entries.
+
+    Returns:
+        The "Num Questions Parseable" value unchanged, unless it is the literal
+        'FAILED'. In that case the digits of the count are returned as a
+        string, taken from an "Error" text that starts with
+        "<N>.0 questions were parseable".
+
+    Raises:
+        ValueError: If the row is marked 'FAILED' but its "Error" text does
+            not start with a parseable-question count.
+    """
+    if x["Num Questions Parseable"] != 'FAILED':
+        return x["Num Questions Parseable"]
+    error_text = x["Error"]
+    # re.match returns None on a miss and rejects non-string input, so both
+    # cases are reported explicitly instead of surfacing as an opaque
+    # AttributeError/TypeError from deep inside DataFrame.apply.
+    m = None
+    if isinstance(error_text, str):
+        m = re.match(r'(\d+)\.0 questions were parseable', error_text)
+    if m is None:
+        raise ValueError(
+            f"Row is marked FAILED but its error text holds no parseable-question count: {error_text!r}"
+        )
+    return m.group(1)
+# Resolve the parseable-question count of every row through parse_parseable,
+# which needs both the count column and the error column.
+leaderboard_df["Num Questions Parseable"] = leaderboard_df[
+    ["Num Questions Parseable", "Error"]
+].apply(parse_parseable, axis=1)
+# Denominator used when converting parseable-question counts to percentages.
+NUMBER_OF_QUESTIONS = 171.0
+def fraction_to_percentage(numerator: float, denominator: float) -> float:
+    """Express ``numerator / denominator`` as a percentage.
+
+    Args:
+        numerator: Part of the whole.
+        denominator: The whole; a zero value raises ZeroDivisionError.
+
+    Returns:
+        The quotient multiplied by 100.
+    """
+    ratio = numerator / denominator
+    return ratio * 100
+# Convert each parseable-question count into a percentage of NUMBER_OF_QUESTIONS.
+leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].apply(
+    lambda count: fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
+)
+def get_params(model_name):
+    """Look up the metadata entry stored for a model.
+
+    Args:
+        model_name: Key to look up in the module-level ``metadata`` dict.
+
+    Returns:
+        The stored metadata value, or None when the model has no entry; in
+        the latter case the model name is also printed.
+    """
+    if model_name not in metadata:
+        # Echo the unknown model so missing metadata shows up in the output.
+        print(model_name)
+        return None
+    return metadata[model_name]
+# Attach the metadata value of every model (None when the model is unknown).
+leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)
+# Mark failed runs as missing with an explicit NaN. Passing None as the
+# replacement value is version-dependent in pandas: older releases treat it as
+# "no value given" and pad/forward-fill from the previous row instead.
+leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', float("nan"))
+# Scale the raw score by the fraction of questions that were parseable.
+leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * (
+    leaderboard_df["Num Questions Parseable"].astype(float) / 100
+)
+# Negative scores are clamped to zero.
+leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0
+# Best score first; ties are broken by the parseable percentage.
+leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False])
+leaderboard_df = leaderboard_df.rename(columns={"Model Path": "Model", "Num Questions Parseable": "Percentage Questions Parseable"})
+# Generate HTML with DataTables.
+# `html` holds the static head of the page (styles, the jQuery `colorize`
+# plugin and the DataTables setup) up to and including the opening <tbody>;
+# the table rows and the closing tags are appended to it afterwards.
+# Notes on the embedded script:
+#   * the search box accepts several terms separated by ';' (OR logic, matched
+#     against the first column); empty terms are ignored so a trailing ';'
+#     does not match every row,
+#   * `colorize` falls back to the mid color when all values of a column are
+#     equal, instead of dividing by a zero range.
+html = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Leaderboard</title>
+    <link rel="stylesheet" href="https://cdn.datatables.net/1.11.5/css/jquery.dataTables.min.css">
+    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
+    <script src="https://cdn.datatables.net/1.11.5/js/jquery.dataTables.min.js"></script>
+    <style>
+        body {
+            font: 90%/1.45em "Helvetica Neue", HelveticaNeue, Verdana, Arial, Helvetica, sans-serif;
+            margin: 0;
+            padding: 20px;
+            color: #333;
+            background-color: #fff;
+        }
+        .numeric-cell {
+            text-align: right;
+            padding: 8px !important;
+        }
+    </style>
+    <script>
+        (function($) {
+            $.fn.colorize = function(oOptions) {
+                var settings = $.extend({
+                    parse: function(e) {
+                        return parseFloat(e.html());
+                    },
+                    min: undefined,
+                    max: undefined,
+                    readable: true,
+                    themes: {
+                        "default": {
+                            color_min: "#C80000",
+                            color_mid: "#FFFFFF",
+                            color_max: "#10A54A"
+                        }
+                    },
+                    theme: "default",
+                    center: undefined,
+                    percent: false
+                }, oOptions);
+                function getColor(color1, color2, ratio) {
+                    var hex = function(x) {
+                        x = x.toString(16);
+                        return (x.length == 1) ? '0' + x : x;
+                    }
+                    color1 = (color1.charAt(0) == "#") ? color1.slice(1) : color1
+                    color2 = (color2.charAt(0) == "#") ? color2.slice(1) : color2
+                    var r = Math.ceil(parseInt(color1.substring(0,2), 16) * ratio + parseInt(color2.substring(0,2), 16) * (1-ratio));
+                    var g = Math.ceil(parseInt(color1.substring(2,4), 16) * ratio + parseInt(color2.substring(2,4), 16) * (1-ratio));
+                    var b = Math.ceil(parseInt(color1.substring(4,6), 16) * ratio + parseInt(color2.substring(4,6), 16) * (1-ratio));
+                    return "#" + (hex(r) + hex(g) + hex(b)).toUpperCase();
+                }
+                function getContrastYIQ(hexcolor) {
+                    var hex = (hexcolor.charAt(0) == "#") ? hexcolor.slice(1) : hexcolor;
+                    var r = parseInt(hex.substr(0,2),16);
+                    var g = parseInt(hex.substr(2,2),16);
+                    var b = parseInt(hex.substr(4,2),16);
+                    var yiq = ((r*299)+(g*587)+(b*114))/1000;
+                    return (yiq >= 128) ? 'black' : 'white';
+                }
+                var min = settings.min;
+                var max = settings.max;
+                if (min === undefined || max === undefined) {
+                    min = Infinity;
+                    max = -Infinity;
+                    this.each(function() {
+                        var value = parseFloat(settings.parse($(this)));
+                        if (!isNaN(value) && isFinite(value)) {
+                            min = Math.min(min, value);
+                            max = Math.max(max, value);
+                        }
+                    });
+                }
+                var center = settings.center !== undefined ? settings.center : (max + min) / 2;
+                var adj = Math.max(Math.abs(max - center), Math.abs(center - min));
+                this.each(function() {
+                    var value = parseFloat(settings.parse($(this)));
+                    if (isNaN(value) || !isFinite(value)) return;
+                    // adj is 0 when every value is identical; use the mid color instead of dividing by zero
+                    var ratio = (adj === 0) ? 0 : (value - center) / adj;
+                    var color1, color2;
+                    if (value < center) {
+                        ratio = Math.abs(ratio);
+                        if (ratio > 1) ratio = 1;
+                        color1 = settings.themes[settings.theme].color_min;
+                        color2 = settings.themes[settings.theme].color_mid;
+                    } else {
+                        ratio = Math.abs(ratio);
+                        if (ratio > 1) ratio = 1;
+                        color1 = settings.themes[settings.theme].color_max;
+                        color2 = settings.themes[settings.theme].color_mid;
+                    }
+                    var color = getColor(color1, color2, ratio);
+                    $(this).css('background-color', color);
+                    if (settings.readable)
+                        $(this).css('color', getContrastYIQ(color));
+                });
+                return this;
+            };
+        }(jQuery));
+        $(document).ready(function() {
+            // Add custom filtering function
+            $.fn.dataTable.ext.search.push(function(settings, data, dataIndex) {
+                var searchValue = $('.dataTables_filter input').val();
+                if (!searchValue) return true;
+                // Split search terms by semicolon, trim whitespace and drop empty terms
+                // (an empty term would match every row, e.g. after a trailing ';')
+                var searchTerms = searchValue.split(';')
+                    .map(term => term.trim().toLowerCase())
+                    .filter(term => term.length > 0);
+                if (searchTerms.length === 0) return true;
+                var modelName = data[0].toLowerCase(); // Model name is in first column
+                // Return true if ANY search terms are found in the model name (OR logic)
+                return searchTerms.some(term => modelName.includes(term));
+            });
+            // Custom sorting function for benchmark scores
+            $.fn.dataTable.ext.type.order['score-pre'] = function(data) {
+                var score = parseFloat(data);
+                return isNaN(score) ? -Infinity : score;
+            };
+            // Get min/max values for each numeric column before initializing DataTables
+            var columnRanges = {
+                1: { min: Infinity, max: -Infinity },  // Params
+                2: { min: Infinity, max: -Infinity },  // Benchmark Score
+                3: { min: Infinity, max: -Infinity }   // Percentage Questions Parseable
+            };
+            $('#leaderboard tbody td').each(function() {
+                var columnIdx = $(this).index();
+                if (columnIdx in columnRanges) {
+                    var value = parseFloat($(this).text());
+                    if (!isNaN(value) && isFinite(value)) {
+                        columnRanges[columnIdx].min = Math.min(columnRanges[columnIdx].min, value);
+                        columnRanges[columnIdx].max = Math.max(columnRanges[columnIdx].max, value);
+                    }
+                }
+            });
+            var table = $('#leaderboard').DataTable({
+                "order": [[2, "desc"]],  // Sort by Benchmark Score by default
+                "pageLength": 20,  // Show 20 results per page
+                "lengthMenu": [[10, 20, 50, 100, -1], [10, 20, 50, 100, "All"]],  // Update length menu options
+                "columnDefs": [
+                    {
+                        "targets": [1],
+                        "className": "numeric-cell"
+                    },
+                    {
+                        "type": "score",
+                        "targets": [2],  // Apply custom sorting to Benchmark Score column
+                        "className": "numeric-cell"
+                    },
+                    {
+                        "targets": [3],
+                        "className": "numeric-cell"
+                    }
+                ],
+                "drawCallback": function() {
+                    // Apply colorization with pre-calculated ranges
+                    $("#leaderboard tbody td:nth-child(2)").colorize({
+                        parse: function(e) { return parseFloat($(e).text()); },
+                        min: columnRanges[1].min,
+                        max: columnRanges[1].max,
+                        themes: {
+                            "default": {
+                                color_min: "#10A54A",    // Green for smaller models
+                                color_mid: "#FFD700",    // Gold/yellow for medium models
+                                color_max: "#C80000"     // Red for larger models
+                            }
+                        }
+                    });
+                    $("#leaderboard tbody td:nth-child(3)").colorize({
+                        parse: function(e) { return parseFloat($(e).text()); },
+                        min: columnRanges[2].min,
+                        max: columnRanges[2].max,
+                        themes: {
+                            "default": {
+                                color_min: "#C80000",    // Red for lower scores
+                                color_mid: "#FFD700",    // Gold/yellow for medium scores
+                                color_max: "#10A54A"     // Green for higher scores
+                            }
+                        }
+                    });
+                    $("#leaderboard tbody td:nth-child(4)").colorize({
+                        parse: function(e) { return parseFloat($(e).text()); },
+                        min: columnRanges[3].min,
+                        max: columnRanges[3].max,
+                        themes: {
+                            "default": {
+                                color_min: "#C80000",    // Red for lower percentages
+                                color_mid: "#FFD700",    // Gold/yellow for medium percentages
+                                color_max: "#10A54A"     // Green for higher percentages
+                            }
+                        }
+                    });
+                },
+                // Override the default search behavior
+                "search": {
+                    "smart": false
+                },
+                // Update search on input change
+                "initComplete": function() {
+                    var table = this.api();
+                    $('.dataTables_filter input')
+                        .off() // Remove default binding
+                        .on('input', function() {
+                            table.draw();
+                        });
+                }
+            });
+        });
+    </script>
+</head>
+<body>
+    <h1>Leaderboard</h1>
+    <table id="leaderboard" class="display" style="width:100%">
+        <thead>
+            <tr>
+                <th>Model</th>
+                <th>Params</th>
+                <th>Benchmark Score</th>
+                <th>Percentage Questions Parseable</th>
+                <th>Error</th>
+            </tr>
+        </thead>
+        <tbody>
+"""
+# Add rows to the HTML table.
+# Only the function is imported: the name `html` is already bound to the page
+# string, so importing the module itself would clobber it.
+from html import escape
+# Text cells come straight from the CSV/metadata files, so they are escaped to
+# keep characters such as '<' or '&' from being interpreted as markup.
+# Rows are collected in a list and joined once instead of growing the page
+# string on every iteration.
+row_chunks = []
+for _, row in leaderboard_df.iterrows():
+    row_chunks.append(f"""
+            <tr>
+                <td>{escape(str(row['Model']))}</td>
+                <td>{escape(str(row['Params']))}</td>
+                <td>{row['Benchmark Score']:.2f}</td>
+                <td>{row['Percentage Questions Parseable']:.2f}</td>
+                <td>{escape(str(row['Error']))}</td>
+            </tr>
+    """)
+html += "".join(row_chunks)
+# Close the HTML tags
+html += """
+        </tbody>
+    </table>
+</body>
+</html>
+"""
+# Save the HTML to a file. The page declares <meta charset="UTF-8">, so the
+# file is written as UTF-8 regardless of the platform's default encoding.
+with open("leaderboard.html", "w", encoding="utf-8") as file:
+    file.write(html)
+print("HTML leaderboard generated and saved as leaderboard.html")