| import evaluate |
| import gradio as gr |
| import os |
|
|
|
|
| |
def _load_metric_card(readme_path):
    """Return the README text starting at its "# Metric Card " heading.

    Everything before the first occurrence of the heading is dropped. If the
    heading is not present, the whole file content is returned unchanged
    instead of raising ``IndexError``.

    Args:
        readme_path: Path of the README file to read.

    Returns:
        The Markdown text to show in the documentation tab.

    Raises:
        OSError: If the README cannot be opened or read.
    """
    marker = "# Metric Card "
    # Explicit encoding so the result does not depend on the host locale.
    with open(readme_path, "r", encoding="utf-8") as f:
        content = f.read()
    _, found, tail = content.partition(marker)
    return marker + tail if found else content


def _format_predicates(example):
    """Return the Markdown snippet naming an example's two predicates."""
    return f"""
**Positive Examples**: `{example["pos_pred"]}`
**Negative Examples**: `{example["neg_pred"]}`
"""


def _evaluate_single(module, prediction, references, pos_pred, neg_pred):
    """Evaluate one candidate hypothesis and format the scores for display.

    Args:
        module: Object exposing ``compute(predictions=..., references=...)``
            whose result is a mapping with ``accuracy``, ``partial_score``,
            ``syntax_score`` and, optionally, ``detailed_results``.
        prediction: Candidate hypothesis text.
        references: Validation program text.
        pos_pred: Name of the predicate marking positive examples.
        neg_pred: Name of the predicate marking negative examples.

    Returns:
        A 4-tuple ``(accuracy, partial score, syntax score, error message)``
        of strings. When an input is missing, or the evaluation itself
        raises, the three score strings are empty and the last element
        carries the explanation.
    """
    required_inputs = (
        (prediction, "Please provide a candidate hypothesis to evaluate."),
        (references, "Please provide a validation program."),
        (pos_pred, "Please specify the positive predicate name."),
        (neg_pred, "Please specify the negative predicate name."),
    )
    for value, message in required_inputs:
        if not value or not value.strip():
            return "", "", "", message

    reference = {
        "validation_program": references.strip(),
        "evaluation_config": {
            # Strip here as well: the inputs were validated in stripped form,
            # so surrounding whitespace must not reach the metric.
            "positive_predicate": pos_pred.strip(),
            "negative_predicate": neg_pred.strip(),
        },
    }

    try:
        results = module.compute(
            predictions=[prediction.strip()],
            references=[reference],
        )
    except Exception as exc:  # UI boundary: report the failure, do not crash
        return "", "", "", f"Evaluation failed: {exc}"

    # "detailed_results" may be missing or empty; treat both as "no error".
    detailed = results.get("detailed_results") or []
    error_msg = (detailed[0].get("error") or "") if detailed else ""

    return (
        f"Accuracy score: {results['accuracy']:.4f}",
        f"Partial score: {results['partial_score']:.4f}",
        f"Syntax score: {results['syntax_score']:.4f}",
        error_msg,
    )


def create_interface(module):
    """Build the Gradio demo for the Symbolic Judge metric.

    The demo has an "Evaluation" tab (inputs, scores, selectable examples and
    a citation) and a "Documentation" tab showing the metric card read from
    the ``README.md`` located next to this file.

    Args:
        module: Metric object whose ``compute`` method is called with a single
            prediction and a single reference each time "Evaluate" is clicked.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """

    def evaluate_fn(prediction, references, pos_pred, neg_pred):
        # Bind the metric module so Gradio can call this with the four inputs.
        return _evaluate_single(module, prediction, references, pos_pred, neg_pred)

    readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
    readme_content = _load_metric_card(readme_path)

    example_train = {
        "description": "Basic Train Problem",
        "validation": """eastbound(train0).
has_car(train0, car0_1).
car_num(car0_1, 1).
car_color(car0_1, white).
car_len(car0_1, short).
has_wall(car0_1, full).

westbound(train1).
has_car(train1, car1_1).
car_num(car1_1, 1).
car_color(car1_1, yellow).
car_len(car1_1, short).
has_wall(car1_1, full).
""",
        "rule": "eastbound(Train):- has_car(Train, Car1), car_color(Car1, white).",
        "pos_pred": "eastbound",
        "neg_pred": "westbound",
    }

    example_family = {
        "description": "Family Relationships",
        "validation": """% Custom problem
parent(john, mary).
parent(john, bob).
parent(alice, bob).
parent(susan, alice).

% Examples
grandparent(susan, bob).
not_grandparent(john, alice).""",
        "rule": "grandparent(X, Y) :- parent(X, Z), parent(Z, Y).",
        "pos_pred": "grandparent",
        "neg_pred": "not_grandparent",
    }

    # Radio label -> example data; insertion order is the display order.
    examples_by_name = {
        "Train Problem": example_train,
        "Family Relationships": example_family,
    }

    def pick_example(selection):
        # Unknown or empty selections fall back to the train example.
        return examples_by_name.get(selection, example_train)

    with gr.Blocks(title="Symbolic Judge") as demo:
        with gr.Tab("Evaluation"):
            gr.Markdown("# Symbolic Judge: Verifiable Rewards for Scalable Logical Reasoning")
            gr.Markdown("""
Verifiable Rewards for Scalable Logical Reasoning (**SLR**) provides verifiable rewards via logic programm execution.
It deterministically evaluates candidate hypotheses by executing them against the validation program and verifying all positive examples ($E^+$) are entailed and all negative examples ($E^-$) are not entailed .
Evaluations performed are fully verifiable and grounded in formal logic, ensuring an automatic, transparent, and reproducible standard for evaluation and reward in both supervised and reinforcement learning settings.
How it Works:
- Input: A candidate hypothesis (logic rule) and an executable validation program containing background knowledge and examples.
- Execution: The candidate rule is executed against the validation program using a Prolog interpreter.
- Correctness Criteria: The rule is considered correct if it entails all positive examples and rejects all negative examples.
- Metrics: We provide a range of evaluation metrics (detailed below).
- Usage: see **Documentation tab** for details on how to use Verifiable Rewards for Scalable Logical Reasoning in your own projects.
**Note:** A local Prolog interpreter is required to execute validation programs.
""")

            with gr.Row():
                # Left column: everything the user provides.
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### Model Output")
                        prediction_input = gr.Textbox(
                            label="Candidate Hypothesis to be evaluated(predicted rule by the model)",
                            placeholder="eastbound(T) :- has_car(T, C), short(C), open(C).",
                            lines=5,
                        )

                    with gr.Group():
                        gr.Markdown("### Validation Program")

                        references_input = gr.Textbox(
                            label="The validation program contains background knowledge and examples for testing",
                            placeholder="% Background knowledge\ncar(car_1). car(car_2).\nlong(car_2). short(car_1).\nopen(car_1). closed(car_2).\n\n% Examples\neastbound(train_1).\nwestbound(train_2).\n\n% Train configurations\nhas_car(train_1, car_1).\nhas_car(train_2, car_2).",
                            lines=12,
                        )

                        with gr.Row():
                            pos_pred_input = gr.Textbox(
                                label="Positive Validation Examples",
                                value="eastbound",
                                placeholder="eastbound",
                                info="The predicate name identifying positive examples in the validation program",
                            )
                            neg_pred_input = gr.Textbox(
                                label="Negative Validation Examples",
                                value="westbound",
                                placeholder="westbound",
                                info="The predicate name identifying negative examples in the validation program",
                            )

                    eval_button = gr.Button("Evaluate", variant="primary")

                # Right column: scores and error details.
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### Evaluation Metrics")
                        with gr.Group():
                            accuracy_output = gr.Textbox(
                                label="Overall Accuracy",
                                info="Proportion of hypotheses that solve the tasks",
                                container=True,
                            )
                            partial_score_output = gr.Textbox(
                                label="Partial Score",
                                info="Proportion of examples that are correctly classified in the tasks",
                                container=True,
                            )
                            syntax_score_output = gr.Textbox(
                                label="Syntax Score",
                                info="Proportion of syntactically valid hypothesis",
                                container=True,
                            )
                            error_output = gr.Textbox(
                                label="Syntax Details",
                                info="Error messages for syntax errors or execution failures",
                                container=True,
                            )
                    gr.Markdown("Note: This interface evaluates a single hypothesis at a time. Use Python API for batch processing")

            with gr.Accordion("Example Logical Reasoning Tasks", open=True):
                example_radio = gr.Radio(
                    list(examples_by_name),
                    label="Select an example",
                    value="Train Problem",
                )

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Selected Example Preview")
                        example_description = gr.Markdown("**Description**: " + example_train["description"])
                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### Candidate Hypothesis")
                                example_rule = gr.Code(value=example_train["rule"])
                            with gr.Column():
                                gr.Markdown("#### Validation Program")
                                example_validation = gr.Code(value=example_train["validation"])

                        with gr.Row():
                            with gr.Column():
                                gr.Markdown("#### Validation Examples")
                                example_predicates = gr.Markdown(_format_predicates(example_train))

                load_button = gr.Button("Load Selected Example", variant="secondary")

            gr.Markdown("### Citation")

            gr.Markdown("""
If you use Symbolic Judge in your work, please cite:
```
@misc{anonymous2025slr,
title={Verifiable Rewards for Scalable Logical Reasoning},
author={Anonymous},
year={2025},
}
```
""")

            def update_example_preview(selection):
                # Refresh the read-only preview widgets for the chosen example.
                chosen = pick_example(selection)
                return (
                    "**Description**: " + chosen["description"],
                    chosen["rule"],
                    chosen["validation"],
                    _format_predicates(chosen),
                )

            example_radio.change(
                fn=update_example_preview,
                inputs=[example_radio],
                outputs=[example_description, example_rule, example_validation, example_predicates],
            )

            def load_selected_example(selection):
                # Copy the chosen example into the four editable input fields.
                chosen = pick_example(selection)
                return (
                    chosen["rule"],
                    chosen["validation"],
                    chosen["pos_pred"],
                    chosen["neg_pred"],
                )

            load_button.click(
                fn=load_selected_example,
                inputs=[example_radio],
                outputs=[prediction_input, references_input, pos_pred_input, neg_pred_input],
            )

            eval_button.click(
                fn=evaluate_fn,
                inputs=[prediction_input, references_input, pos_pred_input, neg_pred_input],
                outputs=[accuracy_output, partial_score_output, syntax_score_output, error_output],
            )

        with gr.Tab("Documentation"):
            gr.Markdown(readme_content)

    return demo
|
|
| |
# Load the metric from the Hugging Face Hub, build the demo around it, and
# start serving. This runs as soon as the file is executed.
module = evaluate.load("LG-Anonym/VerifiableRewardsForScalableLogicalReasoning")
interface = create_interface(module)
interface.launch()