Commit: ce49ae8
Parent(s): 2488d19
Refactored how the model gets loaded
Removed the model load button and added queueing to the Tokenize button for better user experience.
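In short: the separate "Load Tokenizer" button is gone; tokenize_er now receives the checkpoint, calls load_tokenizer itself (reusing the tokenizer when the checkpoint has not changed), and the Tokenize click event runs through Gradio's queue. Below is a minimal, hypothetical sketch of that pattern with illustrative names, assuming current gradio and transformers APIs; it is not the Space's actual code.

import gradio as gr
from transformers import AutoTokenizer

tokenizer = None  # cached module-level tokenizer, reused across clicks

def tokenize(checkpoint, text):
    global tokenizer
    # Lazy-load: fetch the tokenizer on first use or when the checkpoint changes,
    # replacing the old explicit "Load Tokenizer" button.
    if tokenizer is None or tokenizer.name_or_path != checkpoint:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    return [[tok, i] for tok, i in zip(tokens, ids)]

with gr.Blocks() as demo:
    checkpoint = gr.Dropdown(choices=["gpt2", "bert-base-uncased"], value="gpt2", allow_custom_value=True)
    text = gr.TextArea(lines=3)
    table = gr.DataFrame(headers=["Token", "Vocabulary ID"], col_count=(2, "fixed"))
    btn = gr.Button("Tokenize!")
    # queue=True routes the potentially slow download/tokenize call through Gradio's queue
    # so the UI stays responsive while it runs.
    btn.click(fn=tokenize, inputs=[checkpoint, text], outputs=[table], queue=True)

demo.queue().launch()

With this wiring the first Tokenize click pays the tokenizer download cost and later clicks reuse the cached object, which is what makes a dedicated load button unnecessary.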
app.py CHANGED

@@ -30,23 +30,26 @@ sequence = randomize_sequence
 def load_tokenizer(checkpoint):
     if not "tokenizer" in globals():
         global tokenizer
-        tokenizer = None
-    try:
         tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    try:
+        if checkpoint == tokenizer.name_or_path:
+            gr.Info(f"Tokenizer already loaded '{checkpoint}'")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
         vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
         unk = next(iter(vocab))
         vocab.pop(unk)
         vocab_sorted = "\n".join(vocab)
         vocab_size = len(vocab)
-        gr.Info(f"Tokenizer
-        #return checkpoint, vocab_size, vocab
+        gr.Info(f"Tokenizer vocab size: {vocab_size}")
         return vocab_size, unk, vocab_sorted
     except Exception as error:
         gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
         gr.Warning(f"{error}")
         return None, None, None
 
-def tokenize_er(sequence):
+def tokenize_er(checkpoint, sequence):
+    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
     try:
         tokens = tokenizer.tokenize(sequence)
         ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -54,10 +57,10 @@ def tokenize_er(sequence):
         if len(tokens) == len(ids):
             for i in range(len(ids)):
                 token_id_pair.append([tokens[i],ids[i]])
-            return token_id_pair
+            return token_id_pair, vocab_size, unk, vocab_sorted
     except NameError:
-        gr.Warning("
-        return [[None, None]]
+        gr.Warning("Select Tokenizer before sequencing.")
+        return [[None, None]], None, None, None
 
 def de_tokenize_er(pairs):
     try:
@@ -80,25 +83,25 @@ def de_tokenize_er(pairs):
 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
+            gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the vocabulary can take a few seconds.")
             with gr.Row():
-                gr.Markdown("\n#### 1.
+                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
             with gr.Group():
-                input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
-                btn_load_tokenizer = gr.Button(value="Load Tokenizer")
+                input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
             with gr.Row():
                 gr.Markdown("\n#### 2. Sequence & Tokenize")
             with gr.Row():
-                input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
+                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
             with gr.Row():
                 btn_tokenize = gr.Button(value="Tokenize!")
                 btn_random_seq = gr.Button(value="Randomize!")
             with gr.Row():
                 gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
             with gr.Row():
-                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
+                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","Vocabulary ID"], value=[[None,0]], type="array", datatype=["str", "number"], height=400, interactive=True)
             with gr.Row():
                 btn_decode = gr.Button(value="Decode")
+                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
             with gr.Row():
                 with gr.Column():
                     output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
@@ -107,13 +110,12 @@ with gr.Blocks() as frontend:
                     output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
-                gr.Markdown("\n#### Tokenizer Data")
+                gr.Markdown("\n#### 🎲 Tokenizer Data")
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
-                output_vocab = gr.Code(label="Vocabulary")
+                output_vocab = gr.Code(label="Vocabulary IDs")
 
-
-    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
+    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count,output_unknown_token, output_vocab], queue=True)
     btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
     btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
 