lewtun HF staff committed on
Commit
8ea545e
·
1 Parent(s): 599688f

Merge evals

Browse files
Files changed (2) hide show
  1. app.py +15 -3
  2. debug.ipynb +458 -110
app.py CHANGED
@@ -11,7 +11,7 @@ Evaluation of H4 and community models across a diverse range of benchmarks from
11
  """
12
 
13
 
14
- def get_leaderboard_df():
15
  filepaths = list(Path("eval_results").rglob("*.json"))
16
 
17
  # Parse filepaths to get unique models
@@ -66,11 +66,17 @@ def get_leaderboard_df():
66
  df = df.reset_index().rename(columns={"index": "Model"}).round(2)
67
  # Strip off date from model name
68
  df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
 
 
 
 
 
 
69
  return df
70
 
71
 
72
- def refresh():
73
- return get_leaderboard_df()
74
 
75
 
76
  # Function to update the table based on search query
@@ -94,11 +100,17 @@ with demo:
94
  gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
95
  with gr.Row():
96
  search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
 
 
 
 
97
  with gr.Group():
 
98
  leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
99
  with gr.Row():
100
  refresh_button = gr.Button("Refresh")
101
 
 
102
  search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
103
  refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
104
 
 
11
  """
12
 
13
 
14
+ def get_leaderboard_df(merge_values: bool = False):
15
  filepaths = list(Path("eval_results").rglob("*.json"))
16
 
17
  # Parse filepaths to get unique models
 
66
  df = df.reset_index().rename(columns={"index": "Model"}).round(2)
67
  # Strip off date from model name
68
  df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
69
+
70
+ if merge_values:
71
+ merged_df = df.drop(["Date", "Average"], axis=1).groupby("Model").max().reset_index()
72
+ merged_df.insert(loc=0, column="Average", value=merged_df.mean(axis=1, numeric_only=True))
73
+ merged_df = merged_df.sort_values(by=["Average"], ascending=False).round(2)
74
+ df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
75
  return df
76
 
77
 
78
+ def refresh(merge_values: bool = False):
79
+ return get_leaderboard_df(merge_values)
80
 
81
 
82
  # Function to update the table based on search query
 
100
  gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
101
  with gr.Row():
102
  search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
103
+ merge_values = gr.Checkbox(
104
+ label="Merge evals",
105
+ info="Merge evals for the same model. If there are duplicates, we display the largest one.",
106
+ )
107
  with gr.Group():
108
+ leaderboard_df = get_leaderboard_df()
109
  leaderboard_table = gr.Dataframe(value=leaderboard_df, wrap=True, height=1000)
110
  with gr.Row():
111
  refresh_button = gr.Button("Refresh")
112
 
113
+ merge_values.change(refresh, inputs=[merge_values], outputs=[leaderboard_table])
114
  search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
115
  refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
116
 
debug.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -15,7 +15,7 @@
15
  },
16
  {
17
  "cell_type": "code",
18
- "execution_count": 51,
19
  "metadata": {},
20
  "outputs": [],
21
  "source": [
@@ -44,18 +44,34 @@
44
  " data = json.load(file)\n",
45
  " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n",
46
  " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
47
- " if task == \"truthfulqa\":\n",
48
  " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
 
 
 
 
 
 
 
 
 
49
  " else:\n",
50
- " first_metric_key = next(iter(data[\"results\"][first_result_key])) # gets the first key in the first result\n",
 
 
51
  " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n",
52
  " df.loc[model_revision, task] = value\n",
53
- " \n",
 
 
 
54
  " # Drop rows where every entry is NaN\n",
55
  " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n",
56
  " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
 
 
57
  " df = df.sort_values(by=[\"Average\"], ascending=False)\n",
58
- " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(3)\n",
59
  " # Strip off date from model name\n",
60
  " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n",
61
  " return df"
@@ -63,7 +79,7 @@
63
  },
64
  {
65
  "cell_type": "code",
66
- "execution_count": 52,
67
  "metadata": {},
68
  "outputs": [],
69
  "source": [
@@ -72,7 +88,7 @@
72
  },
73
  {
74
  "cell_type": "code",
75
- "execution_count": 53,
76
  "metadata": {},
77
  "outputs": [
78
  {
@@ -111,68 +127,68 @@
111
  " <tbody>\n",
112
  " <tr>\n",
113
  " <th>0</th>\n",
114
- " <td>NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main</td>\n",
115
- " <td>2024-03-02</td>\n",
116
- " <td>0.617</td>\n",
117
- " <td>0.553</td>\n",
118
- " <td>0.477</td>\n",
119
- " <td>0.785</td>\n",
120
- " <td>0.622</td>\n",
121
- " <td>0.51</td>\n",
122
- " <td>0.677</td>\n",
123
- " <td>0.698</td>\n",
124
  " </tr>\n",
125
  " <tr>\n",
126
  " <th>1</th>\n",
127
- " <td>NousResearch_Nous-Hermes-2-Yi-34B_main</td>\n",
128
- " <td>2024-03-04</td>\n",
129
- " <td>0.604</td>\n",
 
 
130
  " <td>NaN</td>\n",
131
- " <td>0.439</td>\n",
132
- " <td>0.806</td>\n",
 
133
  " <td>NaN</td>\n",
134
- " <td>0.48</td>\n",
135
- " <td>0.640</td>\n",
136
- " <td>0.654</td>\n",
137
  " </tr>\n",
138
  " <tr>\n",
139
  " <th>2</th>\n",
140
- " <td>mistralai_Mixtral-8x7B-Instruct-v0.1_main</td>\n",
141
  " <td>2024-03-02</td>\n",
142
- " <td>0.603</td>\n",
143
- " <td>0.497</td>\n",
144
- " <td>0.554</td>\n",
145
- " <td>0.736</td>\n",
146
- " <td>0.599</td>\n",
147
- " <td>0.43</td>\n",
148
- " <td>0.709</td>\n",
149
- " <td>0.698</td>\n",
150
  " </tr>\n",
151
  " <tr>\n",
152
  " <th>3</th>\n",
153
- " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
154
- " <td>2024-03-04</td>\n",
155
- " <td>0.603</td>\n",
156
- " <td>NaN</td>\n",
157
- " <td>0.395</td>\n",
158
- " <td>0.792</td>\n",
159
- " <td>NaN</td>\n",
160
- " <td>NaN</td>\n",
161
- " <td>NaN</td>\n",
162
- " <td>0.622</td>\n",
163
  " </tr>\n",
164
  " <tr>\n",
165
  " <th>4</th>\n",
166
  " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
167
- " <td>2024-03-05</td>\n",
168
- " <td>0.585</td>\n",
169
- " <td>0.505</td>\n",
 
 
170
  " <td>NaN</td>\n",
171
  " <td>NaN</td>\n",
172
- " <td>0.761</td>\n",
173
- " <td>0.42</td>\n",
174
- " <td>0.654</td>\n",
175
  " <td>NaN</td>\n",
 
176
  " </tr>\n",
177
  " <tr>\n",
178
  " <th>...</th>\n",
@@ -191,11 +207,11 @@
191
  " <th>269</th>\n",
192
  " <td>HuggingFaceH4_starcoder2-15b-ift_v18.0</td>\n",
193
  " <td>2024-03-10</td>\n",
194
- " <td>0.089</td>\n",
195
- " <td>0.170</td>\n",
196
  " <td>NaN</td>\n",
197
  " <td>NaN</td>\n",
198
- " <td>0.008</td>\n",
199
  " <td>NaN</td>\n",
200
  " <td>NaN</td>\n",
201
  " <td>NaN</td>\n",
@@ -204,11 +220,11 @@
204
  " <th>270</th>\n",
205
  " <td>HuggingFaceH4_mistral-7b-ift_v49.0</td>\n",
206
  " <td>2024-03-07</td>\n",
207
- " <td>0.086</td>\n",
208
- " <td>0.172</td>\n",
209
  " <td>NaN</td>\n",
210
  " <td>NaN</td>\n",
211
- " <td>0.000</td>\n",
212
  " <td>NaN</td>\n",
213
  " <td>NaN</td>\n",
214
  " <td>NaN</td>\n",
@@ -217,8 +233,8 @@
217
  " <th>271</th>\n",
218
  " <td>HuggingFaceH4_starchat-beta_main</td>\n",
219
  " <td>2024-03-12</td>\n",
220
- " <td>0.079</td>\n",
221
- " <td>0.079</td>\n",
222
  " <td>NaN</td>\n",
223
  " <td>NaN</td>\n",
224
  " <td>NaN</td>\n",
@@ -230,11 +246,11 @@
230
  " <th>272</th>\n",
231
  " <td>HuggingFaceH4_starcoder2-15b-ift_v7.0</td>\n",
232
  " <td>2024-03-10</td>\n",
233
- " <td>0.070</td>\n",
234
- " <td>0.107</td>\n",
235
  " <td>NaN</td>\n",
236
  " <td>NaN</td>\n",
237
- " <td>0.032</td>\n",
238
  " <td>NaN</td>\n",
239
  " <td>NaN</td>\n",
240
  " <td>NaN</td>\n",
@@ -243,11 +259,11 @@
243
  " <th>273</th>\n",
244
  " <td>HuggingFaceH4_zephyr-7b-beta-ift_v1.1</td>\n",
245
  " <td>2024-03-13</td>\n",
246
- " <td>0.043</td>\n",
247
- " <td>0.087</td>\n",
248
  " <td>NaN</td>\n",
249
  " <td>NaN</td>\n",
250
- " <td>0.000</td>\n",
251
  " <td>NaN</td>\n",
252
  " <td>NaN</td>\n",
253
  " <td>NaN</td>\n",
@@ -259,35 +275,35 @@
259
  ],
260
  "text/plain": [
261
  " Model Date Average \\\n",
262
- "0 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 0.617 \n",
263
- "1 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 0.604 \n",
264
- "2 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 0.603 \n",
265
- "3 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 0.603 \n",
266
- "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 0.585 \n",
267
  ".. ... ... ... \n",
268
- "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 0.089 \n",
269
- "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 0.086 \n",
270
- "271 HuggingFaceH4_starchat-beta_main 2024-03-12 0.079 \n",
271
- "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 0.070 \n",
272
- "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 0.043 \n",
273
  "\n",
274
- " Ifeval Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
275
- "0 0.553 0.477 0.785 0.622 0.51 0.677 0.698 \n",
276
- "1 NaN 0.439 0.806 NaN 0.48 0.640 0.654 \n",
277
- "2 0.497 0.554 0.736 0.599 0.43 0.709 0.698 \n",
278
- "3 NaN 0.395 0.792 NaN NaN NaN 0.622 \n",
279
- "4 0.505 NaN NaN 0.761 0.42 0.654 NaN \n",
280
- ".. ... ... ... ... ... ... ... \n",
281
- "269 0.170 NaN NaN 0.008 NaN NaN NaN \n",
282
- "270 0.172 NaN NaN 0.000 NaN NaN NaN \n",
283
- "271 0.079 NaN NaN NaN NaN NaN NaN \n",
284
- "272 0.107 NaN NaN 0.032 NaN NaN NaN \n",
285
- "273 0.087 NaN NaN 0.000 NaN NaN NaN \n",
286
  "\n",
287
  "[274 rows x 10 columns]"
288
  ]
289
  },
290
- "execution_count": 53,
291
  "metadata": {},
292
  "output_type": "execute_result"
293
  }
@@ -298,7 +314,7 @@
298
  },
299
  {
300
  "cell_type": "code",
301
- "execution_count": 32,
302
  "metadata": {},
303
  "outputs": [
304
  {
@@ -323,7 +339,6 @@
323
  " <tr style=\"text-align: right;\">\n",
324
  " <th></th>\n",
325
  " <th>Model</th>\n",
326
- " <th>Average</th>\n",
327
  " <th>Ifeval</th>\n",
328
  " <th>Truthfulqa</th>\n",
329
  " <th>Winogrande</th>\n",
@@ -335,50 +350,383 @@
335
  " </thead>\n",
336
  " <tbody>\n",
337
  " <tr>\n",
338
- " <th>50</th>\n",
339
- " <td>HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08</td>\n",
340
- " <td>0.49</td>\n",
341
- " <td>0.418</td>\n",
342
- " <td>0.359</td>\n",
343
- " <td>0.672</td>\n",
344
- " <td>0.453</td>\n",
345
- " <td>0.33</td>\n",
346
- " <td>0.656</td>\n",
347
- " <td>0.545</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  " </tr>\n",
349
  " <tr>\n",
350
- " <th>532</th>\n",
351
- " <td>HuggingFaceH4_mistral-7b-ift_v48.56</td>\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  " <td>NaN</td>\n",
353
  " <td>NaN</td>\n",
354
  " <td>NaN</td>\n",
 
 
 
 
 
 
355
  " <td>NaN</td>\n",
356
  " <td>NaN</td>\n",
 
357
  " <td>NaN</td>\n",
358
  " <td>NaN</td>\n",
359
  " <td>NaN</td>\n",
360
  " </tr>\n",
361
  " </tbody>\n",
362
  "</table>\n",
 
363
  "</div>"
364
  ],
365
  "text/plain": [
366
- " Model Average Ifeval \\\n",
367
- "50 HuggingFaceH4_mistral-7b-ift_v48.56_2024-03-08 0.49 0.418 \n",
368
- "532 HuggingFaceH4_mistral-7b-ift_v48.56 NaN NaN \n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  "\n",
370
- " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
371
- "50 0.359 0.672 0.453 0.33 0.656 0.545 \n",
372
- "532 NaN NaN NaN NaN NaN NaN "
373
  ]
374
  },
375
- "execution_count": 32,
376
  "metadata": {},
377
  "output_type": "execute_result"
378
  }
379
  ],
380
  "source": [
381
- "df[df['Model'].str.contains(\"HuggingFaceH4_mistral-7b-ift_v48.56\")]"
382
  ]
383
  },
384
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
15
  },
16
  {
17
  "cell_type": "code",
18
+ "execution_count": 3,
19
  "metadata": {},
20
  "outputs": [],
21
  "source": [
 
44
  " data = json.load(file)\n",
45
  " first_result_key = next(iter(data[\"results\"])) # gets the first key in 'results'\n",
46
  " # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
47
+ " if task.lower() == \"truthfulqa\":\n",
48
  " value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
49
+ " # IFEval has several metrics but we report just the prompt-loose-acc one\n",
50
+ " elif task.lower() == \"ifeval\":\n",
51
+ " value = data[\"results\"][first_result_key][\"prompt_level_loose_acc\"]\n",
52
+ " # MMLU has several metrics but we report just the average one\n",
53
+ " elif task.lower() == \"mmlu\":\n",
54
+ " value = data[\"results\"][\"lighteval|mmlu:_average|5\"][\"acc\"]\n",
55
+ " # HellaSwag and ARC report acc_norm\n",
56
+ " elif task.lower() in [\"hellaswag\", \"arc\"]:\n",
57
+ " value = data[\"results\"][first_result_key][\"acc_norm\"]\n",
58
  " else:\n",
59
+ " first_metric_key = next(\n",
60
+ " iter(data[\"results\"][first_result_key])\n",
61
+ " ) # gets the first key in the first result\n",
62
  " value = data[\"results\"][first_result_key][first_metric_key] # gets the value of the first metric\n",
63
  " df.loc[model_revision, task] = value\n",
64
+ "\n",
65
+ " # Put IFEval in first column\n",
66
+ " ifeval_col = df.pop(\"Ifeval\")\n",
67
+ " df.insert(1, \"Ifeval\", ifeval_col)\n",
68
  " # Drop rows where every entry is NaN\n",
69
  " df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n",
70
  " df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
71
+ " # Convert all values to percentage\n",
72
+ " df[df.select_dtypes(include=[\"number\"]).columns] *= 100.0\n",
73
  " df = df.sort_values(by=[\"Average\"], ascending=False)\n",
74
+ " df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(2)\n",
75
  " # Strip off date from model name\n",
76
  " df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n",
77
  " return df"
 
79
  },
80
  {
81
  "cell_type": "code",
82
+ "execution_count": 4,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
 
88
  },
89
  {
90
  "cell_type": "code",
91
+ "execution_count": 5,
92
  "metadata": {},
93
  "outputs": [
94
  {
 
127
  " <tbody>\n",
128
  " <tr>\n",
129
  " <th>0</th>\n",
130
+ " <td>NousResearch_Nous-Hermes-2-Yi-34B_main</td>\n",
131
+ " <td>2024-03-04</td>\n",
132
+ " <td>74.01</td>\n",
133
+ " <td>NaN</td>\n",
134
+ " <td>61.44</td>\n",
135
+ " <td>80.58</td>\n",
136
+ " <td>NaN</td>\n",
137
+ " <td>76.24</td>\n",
138
+ " <td>83.79</td>\n",
139
+ " <td>68.00</td>\n",
140
  " </tr>\n",
141
  " <tr>\n",
142
  " <th>1</th>\n",
143
+ " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
144
+ " <td>2024-03-05</td>\n",
145
+ " <td>71.62</td>\n",
146
+ " <td>55.27</td>\n",
147
+ " <td>NaN</td>\n",
148
  " <td>NaN</td>\n",
149
+ " <td>76.12</td>\n",
150
+ " <td>71.18</td>\n",
151
+ " <td>83.94</td>\n",
152
  " <td>NaN</td>\n",
 
 
 
153
  " </tr>\n",
154
  " <tr>\n",
155
  " <th>2</th>\n",
156
+ " <td>NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main</td>\n",
157
  " <td>2024-03-02</td>\n",
158
+ " <td>70.43</td>\n",
159
+ " <td>59.33</td>\n",
160
+ " <td>64.76</td>\n",
161
+ " <td>78.53</td>\n",
162
+ " <td>62.17</td>\n",
163
+ " <td>71.96</td>\n",
164
+ " <td>85.42</td>\n",
165
+ " <td>70.82</td>\n",
166
  " </tr>\n",
167
  " <tr>\n",
168
  " <th>3</th>\n",
169
+ " <td>mistralai_Mixtral-8x7B-Instruct-v0.1_main</td>\n",
170
+ " <td>2024-03-02</td>\n",
171
+ " <td>69.80</td>\n",
172
+ " <td>55.08</td>\n",
173
+ " <td>70.79</td>\n",
174
+ " <td>73.56</td>\n",
175
+ " <td>59.89</td>\n",
176
+ " <td>70.60</td>\n",
177
+ " <td>86.68</td>\n",
178
+ " <td>72.01</td>\n",
179
  " </tr>\n",
180
  " <tr>\n",
181
  " <th>4</th>\n",
182
  " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
183
+ " <td>2024-03-04</td>\n",
184
+ " <td>67.03</td>\n",
185
+ " <td>NaN</td>\n",
186
+ " <td>57.78</td>\n",
187
+ " <td>79.16</td>\n",
188
  " <td>NaN</td>\n",
189
  " <td>NaN</td>\n",
 
 
 
190
  " <td>NaN</td>\n",
191
+ " <td>64.16</td>\n",
192
  " </tr>\n",
193
  " <tr>\n",
194
  " <th>...</th>\n",
 
207
  " <th>269</th>\n",
208
  " <td>HuggingFaceH4_starcoder2-15b-ift_v18.0</td>\n",
209
  " <td>2024-03-10</td>\n",
210
+ " <td>11.23</td>\n",
211
+ " <td>21.63</td>\n",
212
  " <td>NaN</td>\n",
213
  " <td>NaN</td>\n",
214
+ " <td>0.83</td>\n",
215
  " <td>NaN</td>\n",
216
  " <td>NaN</td>\n",
217
  " <td>NaN</td>\n",
 
220
  " <th>270</th>\n",
221
  " <td>HuggingFaceH4_mistral-7b-ift_v49.0</td>\n",
222
  " <td>2024-03-07</td>\n",
223
+ " <td>10.07</td>\n",
224
+ " <td>20.15</td>\n",
225
  " <td>NaN</td>\n",
226
  " <td>NaN</td>\n",
227
+ " <td>0.00</td>\n",
228
  " <td>NaN</td>\n",
229
  " <td>NaN</td>\n",
230
  " <td>NaN</td>\n",
 
233
  " <th>271</th>\n",
234
  " <td>HuggingFaceH4_starchat-beta_main</td>\n",
235
  " <td>2024-03-12</td>\n",
236
+ " <td>8.13</td>\n",
237
+ " <td>8.13</td>\n",
238
  " <td>NaN</td>\n",
239
  " <td>NaN</td>\n",
240
  " <td>NaN</td>\n",
 
246
  " <th>272</th>\n",
247
  " <td>HuggingFaceH4_starcoder2-15b-ift_v7.0</td>\n",
248
  " <td>2024-03-10</td>\n",
249
+ " <td>7.88</td>\n",
250
+ " <td>12.57</td>\n",
251
  " <td>NaN</td>\n",
252
  " <td>NaN</td>\n",
253
+ " <td>3.18</td>\n",
254
  " <td>NaN</td>\n",
255
  " <td>NaN</td>\n",
256
  " <td>NaN</td>\n",
 
259
  " <th>273</th>\n",
260
  " <td>HuggingFaceH4_zephyr-7b-beta-ift_v1.1</td>\n",
261
  " <td>2024-03-13</td>\n",
262
+ " <td>4.71</td>\n",
263
+ " <td>9.43</td>\n",
264
  " <td>NaN</td>\n",
265
  " <td>NaN</td>\n",
266
+ " <td>0.00</td>\n",
267
  " <td>NaN</td>\n",
268
  " <td>NaN</td>\n",
269
  " <td>NaN</td>\n",
 
275
  ],
276
  "text/plain": [
277
  " Model Date Average \\\n",
278
+ "0 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 74.01 \n",
279
+ "1 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 71.62 \n",
280
+ "2 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 70.43 \n",
281
+ "3 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 69.80 \n",
282
+ "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 67.03 \n",
283
  ".. ... ... ... \n",
284
+ "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 11.23 \n",
285
+ "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 10.07 \n",
286
+ "271 HuggingFaceH4_starchat-beta_main 2024-03-12 8.13 \n",
287
+ "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 7.88 \n",
288
+ "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 4.71 \n",
289
  "\n",
290
+ " Ifeval Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
291
+ "0 NaN 61.44 80.58 NaN 76.24 83.79 68.00 \n",
292
+ "1 55.27 NaN NaN 76.12 71.18 83.94 NaN \n",
293
+ "2 59.33 64.76 78.53 62.17 71.96 85.42 70.82 \n",
294
+ "3 55.08 70.79 73.56 59.89 70.60 86.68 72.01 \n",
295
+ "4 NaN 57.78 79.16 NaN NaN NaN 64.16 \n",
296
+ ".. ... ... ... ... ... ... ... \n",
297
+ "269 21.63 NaN NaN 0.83 NaN NaN NaN \n",
298
+ "270 20.15 NaN NaN 0.00 NaN NaN NaN \n",
299
+ "271 8.13 NaN NaN NaN NaN NaN NaN \n",
300
+ "272 12.57 NaN NaN 3.18 NaN NaN NaN \n",
301
+ "273 9.43 NaN NaN 0.00 NaN NaN NaN \n",
302
  "\n",
303
  "[274 rows x 10 columns]"
304
  ]
305
  },
306
+ "execution_count": 5,
307
  "metadata": {},
308
  "output_type": "execute_result"
309
  }
 
314
  },
315
  {
316
  "cell_type": "code",
317
+ "execution_count": 14,
318
  "metadata": {},
319
  "outputs": [
320
  {
 
339
  " <tr style=\"text-align: right;\">\n",
340
  " <th></th>\n",
341
  " <th>Model</th>\n",
 
342
  " <th>Ifeval</th>\n",
343
  " <th>Truthfulqa</th>\n",
344
  " <th>Winogrande</th>\n",
 
350
  " </thead>\n",
351
  " <tbody>\n",
352
  " <tr>\n",
353
+ " <th>0</th>\n",
354
+ " <td>HuggingFaceH4_mistral-7b-ift_v41.0</td>\n",
355
+ " <td>44.36</td>\n",
356
+ " <td>49.35</td>\n",
357
+ " <td>72.93</td>\n",
358
+ " <td>37.30</td>\n",
359
+ " <td>60.82</td>\n",
360
+ " <td>79.70</td>\n",
361
+ " <td>58.36</td>\n",
362
+ " </tr>\n",
363
+ " <tr>\n",
364
+ " <th>1</th>\n",
365
+ " <td>HuggingFaceH4_mistral-7b-ift_v41.1</td>\n",
366
+ " <td>47.32</td>\n",
367
+ " <td>47.89</td>\n",
368
+ " <td>72.69</td>\n",
369
+ " <td>36.32</td>\n",
370
+ " <td>60.34</td>\n",
371
+ " <td>79.57</td>\n",
372
+ " <td>57.51</td>\n",
373
+ " </tr>\n",
374
+ " <tr>\n",
375
+ " <th>2</th>\n",
376
+ " <td>HuggingFaceH4_mistral-7b-ift_v41.10</td>\n",
377
+ " <td>32.72</td>\n",
378
+ " <td>51.05</td>\n",
379
+ " <td>72.45</td>\n",
380
+ " <td>25.93</td>\n",
381
+ " <td>59.75</td>\n",
382
+ " <td>81.92</td>\n",
383
+ " <td>59.22</td>\n",
384
+ " </tr>\n",
385
+ " <tr>\n",
386
+ " <th>3</th>\n",
387
+ " <td>HuggingFaceH4_mistral-7b-ift_v41.11</td>\n",
388
+ " <td>37.89</td>\n",
389
+ " <td>51.05</td>\n",
390
+ " <td>64.56</td>\n",
391
+ " <td>17.59</td>\n",
392
+ " <td>57.60</td>\n",
393
+ " <td>77.65</td>\n",
394
+ " <td>55.89</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <th>4</th>\n",
398
+ " <td>HuggingFaceH4_mistral-7b-ift_v41.12</td>\n",
399
+ " <td>37.89</td>\n",
400
+ " <td>45.94</td>\n",
401
+ " <td>63.30</td>\n",
402
+ " <td>21.15</td>\n",
403
+ " <td>58.50</td>\n",
404
+ " <td>74.94</td>\n",
405
+ " <td>52.73</td>\n",
406
  " </tr>\n",
407
  " <tr>\n",
408
+ " <th>...</th>\n",
409
+ " <td>...</td>\n",
410
+ " <td>...</td>\n",
411
+ " <td>...</td>\n",
412
+ " <td>...</td>\n",
413
+ " <td>...</td>\n",
414
+ " <td>...</td>\n",
415
+ " <td>...</td>\n",
416
+ " <td>...</td>\n",
417
+ " </tr>\n",
418
+ " <tr>\n",
419
+ " <th>258</th>\n",
420
+ " <td>mistralai_Mistral-7B-Instruct-v0.2_main</td>\n",
421
+ " <td>53.97</td>\n",
422
+ " <td>70.68</td>\n",
423
+ " <td>68.82</td>\n",
424
+ " <td>38.13</td>\n",
425
+ " <td>59.43</td>\n",
426
+ " <td>83.45</td>\n",
427
+ " <td>65.70</td>\n",
428
+ " </tr>\n",
429
+ " <tr>\n",
430
+ " <th>259</th>\n",
431
+ " <td>mistralai_Mixtral-8x7B-Instruct-v0.1_main</td>\n",
432
+ " <td>55.08</td>\n",
433
+ " <td>70.79</td>\n",
434
+ " <td>73.56</td>\n",
435
+ " <td>59.89</td>\n",
436
+ " <td>70.60</td>\n",
437
+ " <td>86.68</td>\n",
438
+ " <td>72.01</td>\n",
439
+ " </tr>\n",
440
+ " <tr>\n",
441
+ " <th>260</th>\n",
442
+ " <td>openchat_openchat-3.5-0106_main</td>\n",
443
+ " <td>54.71</td>\n",
444
+ " <td>57.55</td>\n",
445
+ " <td>72.53</td>\n",
446
+ " <td>66.19</td>\n",
447
+ " <td>63.72</td>\n",
448
+ " <td>80.10</td>\n",
449
+ " <td>61.01</td>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>261</th>\n",
453
+ " <td>stabilityai_stablelm-zephyr-3b_main</td>\n",
454
+ " <td>34.75</td>\n",
455
+ " <td>46.19</td>\n",
456
+ " <td>58.41</td>\n",
457
+ " <td>40.18</td>\n",
458
+ " <td>45.18</td>\n",
459
+ " <td>71.57</td>\n",
460
+ " <td>45.82</td>\n",
461
+ " </tr>\n",
462
+ " <tr>\n",
463
+ " <th>262</th>\n",
464
+ " <td>teknium_OpenHermes-2.5-Mistral-7B_main</td>\n",
465
+ " <td>52.68</td>\n",
466
+ " <td>58.62</td>\n",
467
+ " <td>72.14</td>\n",
468
+ " <td>54.06</td>\n",
469
+ " <td>63.01</td>\n",
470
+ " <td>82.34</td>\n",
471
+ " <td>62.97</td>\n",
472
+ " </tr>\n",
473
+ " </tbody>\n",
474
+ "</table>\n",
475
+ "<p>263 rows × 8 columns</p>\n",
476
+ "</div>"
477
+ ],
478
+ "text/plain": [
479
+ " Model Ifeval Truthfulqa \\\n",
480
+ "0 HuggingFaceH4_mistral-7b-ift_v41.0 44.36 49.35 \n",
481
+ "1 HuggingFaceH4_mistral-7b-ift_v41.1 47.32 47.89 \n",
482
+ "2 HuggingFaceH4_mistral-7b-ift_v41.10 32.72 51.05 \n",
483
+ "3 HuggingFaceH4_mistral-7b-ift_v41.11 37.89 51.05 \n",
484
+ "4 HuggingFaceH4_mistral-7b-ift_v41.12 37.89 45.94 \n",
485
+ ".. ... ... ... \n",
486
+ "258 mistralai_Mistral-7B-Instruct-v0.2_main 53.97 70.68 \n",
487
+ "259 mistralai_Mixtral-8x7B-Instruct-v0.1_main 55.08 70.79 \n",
488
+ "260 openchat_openchat-3.5-0106_main 54.71 57.55 \n",
489
+ "261 stabilityai_stablelm-zephyr-3b_main 34.75 46.19 \n",
490
+ "262 teknium_OpenHermes-2.5-Mistral-7B_main 52.68 58.62 \n",
491
+ "\n",
492
+ " Winogrande Gsm8k Mmlu Hellaswag Arc \n",
493
+ "0 72.93 37.30 60.82 79.70 58.36 \n",
494
+ "1 72.69 36.32 60.34 79.57 57.51 \n",
495
+ "2 72.45 25.93 59.75 81.92 59.22 \n",
496
+ "3 64.56 17.59 57.60 77.65 55.89 \n",
497
+ "4 63.30 21.15 58.50 74.94 52.73 \n",
498
+ ".. ... ... ... ... ... \n",
499
+ "258 68.82 38.13 59.43 83.45 65.70 \n",
500
+ "259 73.56 59.89 70.60 86.68 72.01 \n",
501
+ "260 72.53 66.19 63.72 80.10 61.01 \n",
502
+ "261 58.41 40.18 45.18 71.57 45.82 \n",
503
+ "262 72.14 54.06 63.01 82.34 62.97 \n",
504
+ "\n",
505
+ "[263 rows x 8 columns]"
506
+ ]
507
+ },
508
+ "execution_count": 14,
509
+ "metadata": {},
510
+ "output_type": "execute_result"
511
+ }
512
+ ],
513
+ "source": [
514
+ "new_df = df.drop([\"Date\", \"Average\"], axis=1).groupby(\"Model\").max().reset_index()\n",
515
+ "new_df"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 16,
521
+ "metadata": {},
522
+ "outputs": [
523
+ {
524
+ "data": {
525
+ "text/html": [
526
+ "<div>\n",
527
+ "<style scoped>\n",
528
+ " .dataframe tbody tr th:only-of-type {\n",
529
+ " vertical-align: middle;\n",
530
+ " }\n",
531
+ "\n",
532
+ " .dataframe tbody tr th {\n",
533
+ " vertical-align: top;\n",
534
+ " }\n",
535
+ "\n",
536
+ " .dataframe thead th {\n",
537
+ " text-align: right;\n",
538
+ " }\n",
539
+ "</style>\n",
540
+ "<table border=\"1\" class=\"dataframe\">\n",
541
+ " <thead>\n",
542
+ " <tr style=\"text-align: right;\">\n",
543
+ " <th></th>\n",
544
+ " <th>Model</th>\n",
545
+ " <th>Date</th>\n",
546
+ " <th>Ifeval</th>\n",
547
+ " <th>Truthfulqa</th>\n",
548
+ " <th>Winogrande</th>\n",
549
+ " <th>Gsm8k</th>\n",
550
+ " <th>Mmlu</th>\n",
551
+ " <th>Hellaswag</th>\n",
552
+ " <th>Arc</th>\n",
553
+ " </tr>\n",
554
+ " </thead>\n",
555
+ " <tbody>\n",
556
+ " <tr>\n",
557
+ " <th>0</th>\n",
558
+ " <td>NousResearch_Nous-Hermes-2-Yi-34B_main</td>\n",
559
+ " <td>2024-03-04</td>\n",
560
+ " <td>39.00</td>\n",
561
+ " <td>61.44</td>\n",
562
+ " <td>80.58</td>\n",
563
+ " <td>67.93</td>\n",
564
+ " <td>76.24</td>\n",
565
+ " <td>83.79</td>\n",
566
+ " <td>68.00</td>\n",
567
+ " </tr>\n",
568
+ " <tr>\n",
569
+ " <th>1</th>\n",
570
+ " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
571
+ " <td>2024-03-05</td>\n",
572
+ " <td>55.27</td>\n",
573
+ " <td>57.78</td>\n",
574
+ " <td>79.16</td>\n",
575
+ " <td>76.12</td>\n",
576
+ " <td>71.18</td>\n",
577
+ " <td>83.94</td>\n",
578
+ " <td>64.16</td>\n",
579
+ " </tr>\n",
580
+ " <tr>\n",
581
+ " <th>2</th>\n",
582
+ " <td>NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main</td>\n",
583
+ " <td>2024-03-02</td>\n",
584
+ " <td>59.33</td>\n",
585
+ " <td>64.76</td>\n",
586
+ " <td>78.53</td>\n",
587
+ " <td>62.17</td>\n",
588
+ " <td>71.96</td>\n",
589
+ " <td>85.42</td>\n",
590
+ " <td>70.82</td>\n",
591
+ " </tr>\n",
592
+ " <tr>\n",
593
+ " <th>3</th>\n",
594
+ " <td>mistralai_Mixtral-8x7B-Instruct-v0.1_main</td>\n",
595
+ " <td>2024-03-02</td>\n",
596
+ " <td>55.08</td>\n",
597
+ " <td>70.79</td>\n",
598
+ " <td>73.56</td>\n",
599
+ " <td>59.89</td>\n",
600
+ " <td>70.60</td>\n",
601
+ " <td>86.68</td>\n",
602
+ " <td>72.01</td>\n",
603
+ " </tr>\n",
604
+ " <tr>\n",
605
+ " <th>4</th>\n",
606
+ " <td>deepseek-ai_deepseek-llm-67b-chat_main</td>\n",
607
+ " <td>2024-03-04</td>\n",
608
+ " <td>55.27</td>\n",
609
+ " <td>57.78</td>\n",
610
+ " <td>79.16</td>\n",
611
+ " <td>76.12</td>\n",
612
+ " <td>71.18</td>\n",
613
+ " <td>83.94</td>\n",
614
+ " <td>64.16</td>\n",
615
+ " </tr>\n",
616
+ " <tr>\n",
617
+ " <th>...</th>\n",
618
+ " <td>...</td>\n",
619
+ " <td>...</td>\n",
620
+ " <td>...</td>\n",
621
+ " <td>...</td>\n",
622
+ " <td>...</td>\n",
623
+ " <td>...</td>\n",
624
+ " <td>...</td>\n",
625
+ " <td>...</td>\n",
626
+ " <td>...</td>\n",
627
+ " </tr>\n",
628
+ " <tr>\n",
629
+ " <th>269</th>\n",
630
+ " <td>HuggingFaceH4_starcoder2-15b-ift_v18.0</td>\n",
631
+ " <td>2024-03-10</td>\n",
632
+ " <td>21.63</td>\n",
633
+ " <td>NaN</td>\n",
634
+ " <td>NaN</td>\n",
635
+ " <td>0.83</td>\n",
636
+ " <td>NaN</td>\n",
637
+ " <td>NaN</td>\n",
638
+ " <td>NaN</td>\n",
639
+ " </tr>\n",
640
+ " <tr>\n",
641
+ " <th>270</th>\n",
642
+ " <td>HuggingFaceH4_mistral-7b-ift_v49.0</td>\n",
643
+ " <td>2024-03-07</td>\n",
644
+ " <td>20.15</td>\n",
645
+ " <td>NaN</td>\n",
646
+ " <td>NaN</td>\n",
647
+ " <td>0.00</td>\n",
648
+ " <td>NaN</td>\n",
649
+ " <td>NaN</td>\n",
650
+ " <td>NaN</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <th>271</th>\n",
654
+ " <td>HuggingFaceH4_starchat-beta_main</td>\n",
655
+ " <td>2024-03-12</td>\n",
656
+ " <td>8.13</td>\n",
657
+ " <td>NaN</td>\n",
658
+ " <td>NaN</td>\n",
659
+ " <td>NaN</td>\n",
660
+ " <td>NaN</td>\n",
661
+ " <td>NaN</td>\n",
662
+ " <td>NaN</td>\n",
663
+ " </tr>\n",
664
+ " <tr>\n",
665
+ " <th>272</th>\n",
666
+ " <td>HuggingFaceH4_starcoder2-15b-ift_v7.0</td>\n",
667
+ " <td>2024-03-10</td>\n",
668
+ " <td>12.57</td>\n",
669
+ " <td>NaN</td>\n",
670
+ " <td>NaN</td>\n",
671
+ " <td>3.18</td>\n",
672
  " <td>NaN</td>\n",
673
  " <td>NaN</td>\n",
674
  " <td>NaN</td>\n",
675
+ " </tr>\n",
676
+ " <tr>\n",
677
+ " <th>273</th>\n",
678
+ " <td>HuggingFaceH4_zephyr-7b-beta-ift_v1.1</td>\n",
679
+ " <td>2024-03-13</td>\n",
680
+ " <td>9.43</td>\n",
681
  " <td>NaN</td>\n",
682
  " <td>NaN</td>\n",
683
+ " <td>0.00</td>\n",
684
  " <td>NaN</td>\n",
685
  " <td>NaN</td>\n",
686
  " <td>NaN</td>\n",
687
  " </tr>\n",
688
  " </tbody>\n",
689
  "</table>\n",
690
+ "<p>274 rows × 9 columns</p>\n",
691
  "</div>"
692
  ],
693
  "text/plain": [
694
+ " Model Date Ifeval \\\n",
695
+ "0 NousResearch_Nous-Hermes-2-Yi-34B_main 2024-03-04 39.00 \n",
696
+ "1 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-05 55.27 \n",
697
+ "2 NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main 2024-03-02 59.33 \n",
698
+ "3 mistralai_Mixtral-8x7B-Instruct-v0.1_main 2024-03-02 55.08 \n",
699
+ "4 deepseek-ai_deepseek-llm-67b-chat_main 2024-03-04 55.27 \n",
700
+ ".. ... ... ... \n",
701
+ "269 HuggingFaceH4_starcoder2-15b-ift_v18.0 2024-03-10 21.63 \n",
702
+ "270 HuggingFaceH4_mistral-7b-ift_v49.0 2024-03-07 20.15 \n",
703
+ "271 HuggingFaceH4_starchat-beta_main 2024-03-12 8.13 \n",
704
+ "272 HuggingFaceH4_starcoder2-15b-ift_v7.0 2024-03-10 12.57 \n",
705
+ "273 HuggingFaceH4_zephyr-7b-beta-ift_v1.1 2024-03-13 9.43 \n",
706
+ "\n",
707
+ " Truthfulqa Winogrande Gsm8k Mmlu Hellaswag Arc \n",
708
+ "0 61.44 80.58 67.93 76.24 83.79 68.00 \n",
709
+ "1 57.78 79.16 76.12 71.18 83.94 64.16 \n",
710
+ "2 64.76 78.53 62.17 71.96 85.42 70.82 \n",
711
+ "3 70.79 73.56 59.89 70.60 86.68 72.01 \n",
712
+ "4 57.78 79.16 76.12 71.18 83.94 64.16 \n",
713
+ ".. ... ... ... ... ... ... \n",
714
+ "269 NaN NaN 0.83 NaN NaN NaN \n",
715
+ "270 NaN NaN 0.00 NaN NaN NaN \n",
716
+ "271 NaN NaN NaN NaN NaN NaN \n",
717
+ "272 NaN NaN 3.18 NaN NaN NaN \n",
718
+ "273 NaN NaN 0.00 NaN NaN NaN \n",
719
  "\n",
720
+ "[274 rows x 9 columns]"
 
 
721
  ]
722
  },
723
+ "execution_count": 16,
724
  "metadata": {},
725
  "output_type": "execute_result"
726
  }
727
  ],
728
  "source": [
729
+ "df[[\"Model\", \"Date\"]].merge(new_df, on=\"Model\", how=\"left\")"
730
  ]
731
  },
732
  {