Spaces:
Running
Running
Commit
·
2488d19
1
Parent(s):
bbb587f
Formatting and legibility changes
Browse files
app.py
CHANGED
|
@@ -80,17 +80,23 @@ def de_tokenize_er(pairs):
|
|
| 80 |
with gr.Blocks() as frontend:
|
| 81 |
with gr.Row():
|
| 82 |
with gr.Column(scale=3):
|
| 83 |
-
gr.Markdown("# 🐇 Tokenizaminer\n
|
|
|
|
|
|
|
| 84 |
with gr.Group():
|
| 85 |
-
input_checkpoint = gr.Dropdown(
|
| 86 |
btn_load_tokenizer = gr.Button(value="Load Tokenizer")
|
| 87 |
with gr.Row():
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
with gr.Row():
|
| 90 |
btn_tokenize = gr.Button(value="Tokenize!")
|
| 91 |
btn_random_seq = gr.Button(value="Randomize!")
|
| 92 |
with gr.Row():
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
with gr.Row():
|
| 95 |
btn_decode = gr.Button(value="Decode")
|
| 96 |
with gr.Row():
|
|
@@ -101,6 +107,7 @@ with gr.Blocks() as frontend:
|
|
| 101 |
output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
|
| 102 |
with gr.Column(scale=1):
|
| 103 |
with gr.Group():
|
|
|
|
| 104 |
output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
|
| 105 |
output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
|
| 106 |
output_vocab = gr.Code(label="Vocabulary")
|
|
|
|
| 80 |
with gr.Blocks() as frontend:
|
| 81 |
with gr.Row():
|
| 82 |
with gr.Column(scale=3):
|
| 83 |
+
gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
|
| 84 |
+
with gr.Row():
|
| 85 |
+
gr.Markdown("\n#### 1. Load Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
|
| 86 |
with gr.Group():
|
| 87 |
+
input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
|
| 88 |
btn_load_tokenizer = gr.Button(value="Load Tokenizer")
|
| 89 |
with gr.Row():
|
| 90 |
+
gr.Markdown("\n#### 2. Sequence & Tokenize")
|
| 91 |
+
with gr.Row():
|
| 92 |
+
input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
|
| 93 |
with gr.Row():
|
| 94 |
btn_tokenize = gr.Button(value="Tokenize!")
|
| 95 |
btn_random_seq = gr.Button(value="Randomize!")
|
| 96 |
with gr.Row():
|
| 97 |
+
gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
|
| 98 |
+
with gr.Row():
|
| 99 |
+
token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
|
| 100 |
with gr.Row():
|
| 101 |
btn_decode = gr.Button(value="Decode")
|
| 102 |
with gr.Row():
|
|
|
|
| 107 |
output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
|
| 108 |
with gr.Column(scale=1):
|
| 109 |
with gr.Group():
|
| 110 |
+
gr.Markdown("\n#### Tokenizer Data")
|
| 111 |
output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
|
| 112 |
output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
|
| 113 |
output_vocab = gr.Code(label="Vocabulary")
|