Spaces:
Running
Running
| import pandas as pd | |
| import gradio as gr | |
| from collections import defaultdict | |
def parse_excel(file_path):
    """Parse a benchmark Excel workbook into nested score tables.

    Each sheet named ``<task>_<lang>`` (lang in {'en', 'zh'}) holds one model
    per row: the first column is the model name, an optional ``URL`` column
    links to the model's page, and every remaining column is a dataset score.
    Sheets not following that naming scheme are skipped.

    Args:
        file_path: Path to the ``.xlsx`` workbook.

    Returns:
        Tuple of:
        - task_data: {task: {lang: {model: {dataset: score}}}}
        - all_models: sorted list of every model name seen across sheets
        - all_datasets: {task: set of (dataset, lang) pairs}
        - model_urls: {model: url} for models that provide a URL
    """
    xls = pd.ExcelFile(file_path)
    task_data = defaultdict(lambda: defaultdict(dict))
    all_models = set()
    all_datasets = defaultdict(set)
    model_urls = {}  # model name -> homepage URL

    for sheet_name in xls.sheet_names:
        # Only '<task>_<lang>' sheets are scored.
        if '_' not in sheet_name:
            continue
        task_name, lang = sheet_name.rsplit('_', 1)
        if lang not in ('en', 'zh'):
            continue

        df = xls.parse(sheet_name)
        has_url = 'URL' in df.columns
        urls = df['URL'].tolist() if has_url else [None] * len(df)
        models = df.iloc[:, 0].tolist()
        # Every column after the model column is a dataset; 'URL' (when
        # present) is metadata, not a score.  The comprehension already
        # excludes it, so no separate has_url branch is needed here.
        datasets = [col for col in df.columns[1:] if col != 'URL']

        for model, url in zip(models, urls):
            if url and pd.notnull(url):
                model_urls[model] = url
        all_models.update(models)
        all_datasets[task_name].update((d, lang) for d in datasets)

        for _, row in df.iterrows():
            model = row.iloc[0]
            scores = row[datasets].tolist() if datasets else []
            task_data[task_name][lang][model] = dict(zip(datasets, scores))
    return task_data, sorted(all_models), dict(all_datasets), model_urls
def calculate_averages(task_data, all_models):
    """Aggregate per-dataset scores into mean scores.

    Args:
        task_data: {task: {lang: {model: {dataset: score}}}} as produced
            by ``parse_excel``.
        all_models: iterable of model names to aggregate.

    Returns:
        Tuple of:
        - overall: {lang: {model: mean over every dataset in that language}}
        - task means: {task: {lang: {model: mean over that task's datasets}}}
    """
    per_lang = defaultdict(lambda: defaultdict(list))
    per_task = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    # Collect every raw score under both groupings in a single pass.
    for task_name, lang_map in task_data.items():
        for lang, model_map in lang_map.items():
            for model_name in all_models:
                if model_name not in model_map:
                    continue
                values = list(model_map[model_name].values())
                per_lang[lang][model_name] += values
                per_task[task_name][lang][model_name] += values

    def _mean(values):
        # Empty score lists average to 0.0 rather than raising.
        return sum(values) / len(values) if values else 0.0

    overall = {
        lang: {name: _mean(vals) for name, vals in scores.items()}
        for lang, scores in per_lang.items()
    }

    task_means = defaultdict(dict)
    for task_name, lang_map in per_task.items():
        for lang, model_map in lang_map.items():
            task_means[task_name][lang] = {
                name: _mean(vals) for name, vals in model_map.items()
            }
    return overall, task_means
def filter_models(search_term):
    """Return models whose name contains *search_term* (case-insensitive).

    An empty/None term returns the full module-level ``all_models`` list.
    """
    if not search_term:
        return all_models
    needle = search_term.lower()
    return [name for name in all_models if needle in name.lower()]
def create_lang_view(lang, models):
    """Build the ranked overview DataFrame for one language ('en' or 'zh').

    Columns: linkified model name, the overall average for *lang*, then one
    per-task average column. Models with all-zero scores are hidden and the
    rest are sorted by the overall average, best first. Returns a one-cell
    "Status" frame when nothing remains.
    """
    # Render each model as an HTML anchor when its URL is known.
    rendered = []
    for name in models:
        url = model_urls.get(name)
        rendered.append(f'<a href="{url}" target="_blank">{name}</a>' if url else name)

    overall_col = f"Overall ({lang.upper()})"
    table = {
        "Model": rendered,
        overall_col: [round(overall_avg[lang].get(name, 0), 3) for name in models],
    }
    for task_name in sorted(task_avg):
        lang_scores = task_avg[task_name].get(lang, {})
        table[task_name] = [round(lang_scores.get(name, 0), 3) for name in models]

    frame = pd.DataFrame(table)
    if not frame.empty:
        score_cols = frame.columns[frame.columns != "Model"]
        # Drop models with no data at all (every numeric cell zero), then rank.
        frame = frame[~(frame[score_cols] == 0).all(axis=1)]
        frame = frame.sort_values(by=overall_col, ascending=False)
        frame.reset_index(drop=True, inplace=True)
    if frame.empty:
        return pd.DataFrame({"Status": [f"No {lang.upper()} data matching criteria..."]})
    return frame
def create_overall_view(search_term=None):
    """Return the (English, Chinese) overview tables for matching models."""
    matching = filter_models(search_term)
    return create_lang_view('en', matching), create_lang_view('zh', matching)
def create_task_view(task_name, search_term=None):
    """Build [English, Chinese] leaderboard tables for a single task.

    Each table lists the models matching *search_term* with a per-model
    average ("Avg.") followed by one column per dataset; all-zero rows are
    hidden and the rest sorted by "Avg.". A one-cell "Status" frame stands
    in for a language with no data.
    """
    lang_tables = task_data.get(task_name, {})
    visible_models = filter_models(search_term)
    # Linkified model names: anchor tag when a URL is known, plain text otherwise.
    rendered = []
    for name in visible_models:
        url = model_urls.get(name)
        rendered.append(f'<a href="{url}" target="_blank">{name}</a>' if url else name)

    results = []
    for lang in ('en', 'zh'):
        per_model = lang_tables.get(lang, {})
        dataset_cols = []
        if per_model:
            # Dataset columns are taken from the first model's score dict.
            first_model = next(iter(per_model))
            dataset_cols = sorted(per_model[first_model].keys())

        rows = []
        for name, cell in zip(visible_models, rendered):
            entry = {"Model": cell}
            if name in per_model:
                raw = [per_model[name].get(ds, 0.0) for ds in dataset_cols]
                for ds, value in zip(dataset_cols, raw):
                    entry[ds] = round(value, 3)
                entry["Avg."] = round(sum(raw) / len(raw) if raw else 0.0, 3)
            else:
                # Model has no scores in this language: fill with zeros.
                entry.update({ds: 0.0 for ds in dataset_cols})
                entry["Avg."] = 0.0
            rows.append(entry)

        frame = pd.DataFrame(rows, columns=["Model", "Avg."] + dataset_cols)
        if dataset_cols:
            frame = frame[["Model", "Avg."] + dataset_cols]
            value_cols = frame.columns[frame.columns != "Model"]
            if not value_cols.empty:
                # Hide all-zero models, then rank by the task average.
                frame = frame[~(frame[value_cols] == 0).all(axis=1)]
                frame = frame.sort_values(by="Avg.", ascending=False)
                frame.reset_index(drop=True, inplace=True)
        else:
            frame = pd.DataFrame({"Status": ["There is no data for this language.."]})
        results.append(frame)
    return results
# Load benchmark scores once at import time; these module-level globals are
# read directly by the view functions above.
task_data, all_models, all_datasets, model_urls = parse_excel('benchmark.xlsx')
overall_avg, task_avg = calculate_averages(task_data, all_models)
# --- Gradio UI -------------------------------------------------------------
# One shared search box filters every tab; each tab holds an English and a
# Chinese table that refresh on search changes and on initial page load.
with gr.Blocks(title="Benchmark Leaderboard", css=""".search-box {margin-bottom: 20px}
.gradio-container {max-width: 100% !important}
.dataframe {width: 100% !important}""") as demo:
    gr.Markdown("# π° FinMTEB Benchmark Leaderboard")
    gr.Markdown("**Finance** Massive Text Embedding Benchmark (FinMTEB), an embedding benchmark consists of 64 financial domain-specific text datasets, across English and Chinese, spanning seven different tasks.")
    gr.Markdown("---")
    gr.Markdown("π If you feel our work helpful, please cite the following paper: [FinMTEB: Finance Massive Text Embedding Benchmark](https://arxiv.org/abs/2502.10990)")
    gr.Markdown("Github: [FinMTEB](https://github.com/yixuantt/FinMTEB/blob/main/README.md)")
    # Shared model-name filter, wired to every tab's tables below.
    search = gr.Textbox(
        placeholder="π Enter the model name...",
        label="model_search",
        show_label=False,
        elem_classes=["search-box"]
    )
    with gr.Tabs() as main_tabs:
        with gr.Tab("π Overview"):
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### English Datasets")
                en_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### Chinese Datasets")
                zh_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            # Re-render both overview tables whenever the search term changes.
            search.change(
                create_overall_view,
                inputs=search,
                outputs=[en_table, zh_table]
            )
            # Populate the tables on initial page load (no search term).
            demo.load(
                lambda: create_overall_view(),
                outputs=[en_table, zh_table]
            )
        # One tab per task parsed from the workbook.
        for task_name in task_data:
            with gr.Tab(task_name):
                with gr.Column():
                    gr.Markdown("### English Datasets")
                    en_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                with gr.Column():
                    gr.Markdown("### Chinese Datasets")
                    zh_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                # tn=task_name default binds the current loop value, avoiding
                # the late-binding-closure pitfall inside this loop.
                search.change(
                    lambda term, tn=task_name: create_task_view(tn, term),
                    inputs=search,
                    outputs=[en_display, zh_display]
                )
                demo.load(
                    lambda tn=task_name: create_task_view(tn),
                    outputs=[en_display, zh_display]
                )
        with gr.Tab("π¬ Submit"):
            gr.Markdown("---")
            gr.Markdown("For the results report, please send the results to **[email protected]**")
            gr.Markdown("π Thanks for your contribution!")
if __name__ == "__main__":
    demo.launch()