Commit: ce49ae8
Parent(s): 2488d19
Refactored how the model gets loaded
Removed the model load button and added queueing to the Tokenize button for better user experience.
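In short: the separate "Load Tokenizer" button is gone; tokenize_er now receives the checkpoint, calls load_tokenizer itself (reusing the tokenizer when the checkpoint has not changed), and the Tokenize click event runs through Gradio's queue. Below is a minimal, hypothetical sketch of that pattern with illustrative names, assuming current gradio and transformers APIs; it is not the Space's actual code.

import gradio as gr
from transformers import AutoTokenizer

tokenizer = None  # cached module-level tokenizer, reused across clicks

def tokenize(checkpoint, text):
    global tokenizer
    # Lazy-load: fetch the tokenizer on first use or when the checkpoint changes,
    # replacing the old explicit "Load Tokenizer" button.
    if tokenizer is None or tokenizer.name_or_path != checkpoint:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    return [[tok, i] for tok, i in zip(tokens, ids)]

with gr.Blocks() as demo:
    checkpoint = gr.Dropdown(choices=["gpt2", "bert-base-uncased"], value="gpt2", allow_custom_value=True)
    text = gr.TextArea(lines=3)
    table = gr.DataFrame(headers=["Token", "Vocabulary ID"], col_count=(2, "fixed"))
    btn = gr.Button("Tokenize!")
    # queue=True routes the potentially slow download/tokenize call through Gradio's queue
    # so the UI stays responsive while it runs.
    btn.click(fn=tokenize, inputs=[checkpoint, text], outputs=[table], queue=True)

demo.queue().launch()

With this wiring the first Tokenize click pays the tokenizer download cost and later clicks reuse the cached object, which is what makes a dedicated load button unnecessary.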
app.py CHANGED

@@ -30,23 +30,26 @@ sequence = randomize_sequence
 def load_tokenizer(checkpoint):
     if not "tokenizer" in globals():
         global tokenizer
-        tokenizer = None
-    try:
         tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    try:
+        if checkpoint == tokenizer.name_or_path:
+            gr.Info(f"Tokenizer already loaded '{checkpoint}'")
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
         vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
         unk = next(iter(vocab))
         vocab.pop(unk)
         vocab_sorted = "\n".join(vocab)
         vocab_size = len(vocab)
-        gr.Info(f"Tokenizer
-        #return checkpoint, vocab_size, vocab
+        gr.Info(f"Tokenizer vocab size: {vocab_size}")
         return vocab_size, unk, vocab_sorted
     except Exception as error:
         gr.Warning(f"An unexpected error occurred while loading the Tokenizer.")
         gr.Warning(f"{error}")
         return None, None, None
 
-def tokenize_er(sequence):
+def tokenize_er(checkpoint, sequence):
+    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
     try:
         tokens = tokenizer.tokenize(sequence)
         ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -54,10 +57,10 @@ def tokenize_er(sequence):
         if len(tokens) == len(ids):
             for i in range(len(ids)):
                 token_id_pair.append([tokens[i],ids[i]])
-            return token_id_pair
+            return token_id_pair, vocab_size, unk, vocab_sorted
     except NameError:
-        gr.Warning("
-        return [[None, None]]
+        gr.Warning("Select Tokenizer before sequencing.")
+        return [[None, None]], None, None, None
 
 def de_tokenize_er(pairs):
     try:
@@ -80,25 +83,25 @@ def de_tokenize_er(pairs):
 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
+            gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the vocabulary can take a few seconds.")
             with gr.Row():
-                gr.Markdown("\n#### 1.
+                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
             with gr.Group():
-                input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
-                btn_load_tokenizer = gr.Button(value="Load Tokenizer")
+                input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
             with gr.Row():
                 gr.Markdown("\n#### 2. Sequence & Tokenize")
             with gr.Row():
-                input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
+                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
             with gr.Row():
                 btn_tokenize = gr.Button(value="Tokenize!")
                 btn_random_seq = gr.Button(value="Randomize!")
             with gr.Row():
                 gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
             with gr.Row():
-                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
+                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","Vocabulary ID"], value=[[None,0]], type="array", datatype=["str", "number"], height=400, interactive=True)
             with gr.Row():
                 btn_decode = gr.Button(value="Decode")
+                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
             with gr.Row():
                 with gr.Column():
                     output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
@@ -107,13 +110,12 @@ with gr.Blocks() as frontend:
                     output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
-                gr.Markdown("\n#### Tokenizer Data")
+                gr.Markdown("\n#### 🎲 Tokenizer Data")
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
-                output_vocab = gr.Code(label="Vocabulary")
+                output_vocab = gr.Code(label="Vocabulary IDs")
 
-
-    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
+    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count,output_unknown_token, output_vocab], queue=True)
     btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
     btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids,output_decoded_tokens, output_decoded_ids])
 