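"""Gradio app for the Ginkgo Antibody Developability Benchmark leaderboard.

Builds the About, Leaderboard, Submit and FAQ tabs, accepts CSV submissions,
and refreshes results from the local cache written by utils.fetch_hf_results().
"""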
import os
import pandas as pd
import gradio as gr
from gradio.themes.utils import sizes
from gradio_leaderboard import Leaderboard
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INSTRUCTIONS
from constants import (
    ASSAY_RENAME,
SEQUENCES_FILE_DICT,
LEADERBOARD_DISPLAY_COLUMNS,
ABOUT_TAB_NAME,
FAQ_TAB_NAME,
TERMS_URL,
LEADERBOARD_COLUMNS_RENAME,
LEADERBOARD_COLUMNS_RENAME_LIST,
SUBMIT_TAB_NAME,
SLACK_URL,
)
from submit import make_submission
from utils import fetch_hf_results, show_output_box, periodic_data_fetch


def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
"""
Format the dataframe for display on the leaderboard. The dataframe comes from utils.fetch_hf_results().
"""
    df = df_results[df_results["assay"].isin(ASSAY_RENAME.keys())].copy()
if assay is not None:
df = df[df["assay"] == assay]
df = df[LEADERBOARD_DISPLAY_COLUMNS]
df = df.sort_values(by="spearman", ascending=False)
    # After sorting, explain why Heldout Test Set rows have no score yet
    # (alternatively, this could be a note below the leaderboard: "Results for the
    # Heldout Test Set are only evaluated at competition close")
# Convert spearman column to string to avoid dtype incompatibility when assigning text
df["spearman"] = df["spearman"].astype(str)
df.loc[
(df["dataset"] == "Heldout Test Set") & (df["spearman"] == "nan"), "spearman"
] = "N/A, evaluated at competition close"
# Finally, rename columns for readability
df = df.rename(columns=LEADERBOARD_COLUMNS_RENAME)
    return df


def get_leaderboard_object(assay: str | None = None):
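    """Build the gradio_leaderboard Leaderboard component, optionally restricted to a single assay."""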
filter_columns = ["dataset"]
if assay is None:
filter_columns.append("property")
    # Bug workaround: search_columns can't be left empty, otherwise it errors with
    # "Column None not found in headers"
    # Note(Lood): it would be nice to make clear that the search column searches on the model name
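    # Read the locally cached results snapshot (written by fetch_hf_results in utils.py)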
current_dataframe = pd.read_csv("debug-current-results.csv")
lb = Leaderboard(
value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
datatype=["str", "str", "str", "number", "str"],
select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
["model", "property", "spearman", "dataset", "user"]
),
search_columns=["Model Name"],
filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
every=15,
render=True,
)
    return lb


def refresh_overall_leaderboard():
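    """Reload the cached results CSV and return the formatted overall leaderboard table."""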
if not os.path.exists("debug-current-results.csv"):
        fetch_hf_results()  # Note: this may race with the periodic fetch_hf_results() background task
current_dataframe = pd.read_csv("debug-current-results.csv")
    return format_leaderboard_table(df_results=current_dataframe)


# Initialize global dataframe
fetch_hf_results()
current_dataframe = pd.read_csv("debug-current-results.csv")


# Make font size bigger using gradio theme
with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
timer = gr.Timer(3) # Run every 3 seconds when page is focused
## Header
with gr.Row():
with gr.Column(scale=6): # bigger text area
gr.Markdown(
f"""
            ## Welcome to the Ginkgo Antibody Developability Benchmark!

            Participants can submit their model to the leaderboards by simply uploading a CSV file (see the "✉️ Submit" tab).
            You can **predict any or all of the 5 properties**, and you can filter the main leaderboard by property.
            See more details in the "{ABOUT_TAB_NAME}" tab.

            Submissions close on 1 November 2025.
            """
)
with gr.Column(scale=2): # smaller side column for logo
gr.Image(
value="./assets/competition_logo.jpg",
show_label=False,
show_download_button=False,
show_share_button=False,
show_fullscreen_button=False,
width="25vw", # Take up the width of the column (2/8 = 1/4)
)
with gr.Tabs(elem_classes="tab-buttons"):
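        # About tab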
with gr.TabItem(ABOUT_TAB_NAME, elem_id="abdev-benchmark-tab-table"):
gr.Markdown(ABOUT_INTRO)
gr.Image(
value="./assets/prediction_explainer_cv.png",
show_label=False,
show_download_button=False,
show_share_button=False,
show_fullscreen_button=False,
width="30vw",
)
gr.Markdown(ABOUT_TEXT)
# Sequence download buttons
gr.Markdown(
"""### ๐Ÿ“ฅ Download Sequences
The GDPa1 dataset (with assay data and sequences) is available on Hugging Face [here](https://huggingface.co/datasets/ginkgo-datapoints/GDPa1),
but we provide this and the private test set for convenience."""
)
with gr.Row():
with gr.Column():
download_button_cv_about = gr.DownloadButton(
label="๐Ÿ“ฅ Download GDPa1 sequences",
value=SEQUENCES_FILE_DICT["GDPa1_cross_validation"],
variant="secondary",
)
with gr.Column():
download_button_test_about = gr.DownloadButton(
label="๐Ÿ“ฅ Download Private Test Set sequences",
value=SEQUENCES_FILE_DICT["Heldout Test Set"],
variant="secondary",
)
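        # Leaderboard tab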
with gr.TabItem(
"๐Ÿ† Leaderboard", elem_id="abdev-benchmark-tab-table"
) as leaderboard_tab:
gr.Markdown(
"""
# Overall Leaderboard (filter below by property)
Each property has its own prize, and participants can submit models for any combination of properties.
**Note**: It is *easy to overfit* the public GDPa1 dataset, which results in artificially high Spearman correlations.
We would suggest training using cross-validation to give a better indication of the model's performance on the eventual private test set.
"""
)
lb = get_leaderboard_object()
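            # Refresh the leaderboard table on a timer and once on initial page load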
timer.tick(fn=refresh_overall_leaderboard, outputs=lb)
demo.load(fn=refresh_overall_leaderboard, outputs=lb)
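        # Submit tab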
with gr.TabItem(SUBMIT_TAB_NAME, elem_id="boundary-benchmark-tab-table"):
gr.Markdown(SUBMIT_INSTRUCTIONS)
with gr.Row():
with gr.Column():
username_input = gr.Textbox(
label="Username",
placeholder="Enter your Hugging Face username",
info="This will be used to identify valid submissions, and to update your results if you submit again.",
)
anonymous_checkbox = gr.Checkbox(
label="Anonymous",
value=False,
info="If checked, your username will be replaced with an anonymous hash on the leaderboard.",
)
model_name_input = gr.Textbox(
label="Model Name",
placeholder="Enter your model name (e.g., 'MyProteinLM-v1')",
info="This will be displayed on the leaderboard.",
)
model_description_input = gr.Textbox(
label="Model Description (optional)",
placeholder="Brief description of your model and approach",
info="Describe your model, training data, or methodology.",
lines=3,
)
registration_code = gr.Textbox(
label="Registration Code",
placeholder="Enter your registration code",
info="If you did not receive a registration code, please sign up on the <a href='https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition'>Competition Registration page</a> or email <a href='mailto:[email protected]'>[email protected]</a>.",
)
with gr.Column():
gr.Markdown("### Upload Both Submission Files")
# GDPa1 Cross-validation file
gr.Markdown("**GDPa1 Cross-Validation Predictions:**")
download_button_cv = gr.DownloadButton(
label="๐Ÿ“ฅ Download GDPa1 sequences",
value=SEQUENCES_FILE_DICT["GDPa1_cross_validation"],
variant="secondary",
)
submission_file_cv = gr.File(label="GDPa1 Cross-Validation CSV")
# Test set file
gr.Markdown("**Private Test Set Predictions:**")
download_button_test = gr.DownloadButton(
label="๐Ÿ“ฅ Download Private Test Set sequences",
value=SEQUENCES_FILE_DICT["Heldout Test Set"],
variant="secondary",
)
submission_file_test = gr.File(label="Private Test Set CSV")
submit_btn = gr.Button("Evaluate")
message = gr.Textbox(label="Status", lines=3, visible=False)
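            # Run the submission pipeline, then reveal the hidden status box with the result message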
submit_btn.click(
make_submission,
inputs=[
submission_file_cv,
submission_file_test,
username_input,
model_name_input,
model_description_input,
anonymous_checkbox,
registration_code,
],
outputs=[message],
).then(
fn=show_output_box,
inputs=[message],
outputs=[message],
)
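        # FAQ tab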
with gr.Tab(FAQ_TAB_NAME):
gr.Markdown("# Frequently Asked Questions")
for i, (question, answer) in enumerate(FAQS.items()):
                # Ideally the questions would be bold, but the Accordion label doesn't render Markdown
question = f"{i+1}. {question}"
with gr.Accordion(question, open=False):
gr.Markdown(f"*{answer}*") # Italics for answers
# Footnote
gr.Markdown(
f"""
<div style="text-align: center; font-size: 14px; color: gray; margin-top: 2em;">
๐Ÿ“ฌ For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or discuss on the <a href="{SLACK_URL}">Slack community</a> co-hosted by Bits in Bio.<br>
Visit the <a href="https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition">Competition Registration page</a> to sign up for updates and to register, and see Terms <a href="{TERMS_URL}">here</a>.
</div>
""",
elem_id="contact-footer",
    )


if __name__ == "__main__":
demo.launch(ssr_mode=False, app_kwargs={"lifespan": periodic_data_fetch})