david-thrower committed · verified
Commit 6436528 · Parent(s): 4402f0f

Update app.py

Added a new CPU-optimized GGUF version of the model and added handling for uploaded files.

Files changed (1)
  1. app.py +181 -64
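
For orientation, a minimal sketch of the inference path this commit switches to. The repo id, filename, and API calls are taken from the diff below; the prompt and token budget are illustrative placeholders, not part of the commit:

# Minimal sketch (illustrative): load the CPU-friendly GGUF build and run one chat turn.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="unsloth/SmolLM3-3B-GGUF",    # GGUF repo used in the diff
    filename="SmolLM3-3B-Q4_K_M.gguf",    # 4-bit K-quant file
    n_ctx=10_000,                          # context window (MAX_TOKENS in app.py)
)
result = llm.create_chat_completion(
    messages=[{"role": "system", "content": "/no_think"},
              {"role": "user", "content": "Hello!"}],
    max_tokens=256,
)
print(result["choices"][0]["message"]["content"])

Q4_K_M is a 4-bit llama.cpp quantization, which is what makes the 3B model practical to serve on CPU.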
app.py CHANGED
@@ -1,7 +1,7 @@

- import gc
+ # import gc

- import gradio as gr
+ # import gradio as gr
  # import torch
  # from transformers import AutoTokenizer, AutoModelForCausalLM #, HqqConfig

@@ -27,24 +27,24 @@ import gradio as gr

  #########

- import torch
- from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
+ # import torch
+ # from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+ # from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig

- # quant_config = Float8WeightOnlyConfig()
- quant_config = Float8DynamicActivationFloat8WeightConfig()
- quantization_config = TorchAoConfig(quant_type=quant_config)
+ # # quant_config = Float8WeightOnlyConfig()
+ # quant_config = Float8DynamicActivationFloat8WeightConfig()
+ # quantization_config = TorchAoConfig(quant_type=quant_config)

- MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
+ # MODEL_ID = "HuggingFaceTB/SmolLM3-3B"

- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID,
-     torch_dtype="auto",
-     device_map="auto",
-     quantization_config=quantization_config)
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     MODEL_ID,
+ #     torch_dtype="auto",
+ #     device_map="auto",
+ #     quantization_config=quantization_config)

- gc.collect()
+ # gc.collect()


  #########

@@ -91,7 +91,7 @@ gc.collect()
  # del(model)

  # Run garbage collection again to release memory from quantizer objects
- gc.collect()
+ # gc.collect()

  # # Step 5: Load the quantized ONNX model for inference
  # print("Loading quantized ONNX model for inference...")

@@ -99,7 +99,7 @@ gc.collect()
  # print("Loading model was succcessful. Garbage collecting.")

  # Garbage collection again after final loading
- gc.collect()
+ # gc.collect()

  #########

@@ -134,79 +134,196 @@ gc.collect()
  # Helpers
  # -------------------------------------------------

+ # def build_messages(history, enable_thinking: bool):
+ #     """Convert Gradio history to the chat template."""
+ #     messages = []
+ #     for h in history:
+ #         messages.append({"role": h["role"], "content": h["content"]})
+ #     # Add system instruction for mode
+ #     system_flag = "/think" if enable_thinking else "/no_think"
+ #     messages.insert(0, {"role": "system", "content": system_flag})
+ #     return messages
+
+ # def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
+ #     """Generate a streaming response."""
+ #     messages = build_messages(history, enable_thinking)
+ #     text = tokenizer.apply_chat_template(
+ #         messages,
+ #         tokenize=False,
+ #         add_generation_prompt=True,
+ #         # xml_tools=TOOLS
+ #     )
+ #     inputs = tokenizer(text, return_tensors="pt")
+ #     gc.collect()
+ #     with torch.inference_mode():
+ #         streamer = model.generate(
+ #             **inputs,
+ #             max_new_tokens=max_new_tokens,
+ #             do_sample=True,
+ #             temperature=temperature,
+ #             top_p=top_p,
+ #             top_k=top_k,
+ #             repetition_penalty=repetition_penalty,
+ #             pad_token_id=tokenizer.eos_token_id,
+ #             streamer=None # we'll yield manually
+ #         )
+ #     gc.collect()
+ #     output_ids = streamer[0][len(inputs.input_ids[0]):]
+ #     response = tokenizer.decode(output_ids, skip_special_tokens=True)
+ #     if isinstance(response, str):
+ #         response = response.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;")
+ #     elif isinstance(response,list):
+ #         response = [paper.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;") for paper in response]
+ #     else:
+ #         raise ValueError("Tokenizer response seems malformed; Not a string, nor a list?!?!")
+
+ #     # streaming char-by-char
+ #     history.append({"role": "assistant", "content": ""})
+ #     for ch in response:
+ #         history[-1]["content"] += ch
+ #         yield history
+
+ # # -------------------------------------------------
+ # # Blocks UI
+ # # -------------------------------------------------
+ # with gr.Blocks(title="SmolLM3-3B Chat") as demo:
+ #     gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
+ #     with gr.Row():
+ #         enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
+ #         temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
+ #         top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
+ #         top_k = gr.Slider(1,40,value=20,label="Top_k")
+ #         repetition_penalty = gr.Slider(1.0,1.4,value=1.1,label="Repetition_Penalty")
+ #         max_new_tokens = gr.Slider(1000,32768,value=32768,label="Max_New_Tokens")
+ #     chatbot = gr.Chatbot(type="messages")
+ #     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
+ #     clear = gr.Button("Clear")
+
+ #     def user_fn(user_msg, history):
+ #         return "", history + [{"role": "user", "content": user_msg}]
+
+ #     msg.submit(
+ #         user_fn, [msg, chatbot], [msg, chatbot], queue=False
+ #     ).then(
+ #         chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
+ #     )
+ #     clear.click(lambda: None, None, chatbot, queue=False)
+
+ # demo.queue().launch()
+
+ import gc
+ from pathlib import Path
+ from llama_cpp import Llama
+ import gradio as gr
+
+ from pypdf import PdfReader
+ import pandas as pd
+ from docx import Document
+
+ MAX_TOKENS = 10_000
+
+ llm = Llama.from_pretrained(
+     repo_id="unsloth/SmolLM3-3B-GGUF",
+     filename="SmolLM3-3B-Q4_K_M.gguf",
+     n_ctx=MAX_TOKENS,
+ )
+ gc.collect()
+
+ # ---------- helpers ----------
+
+
+ def read_file(p: Path) -> str:
+     try:
+         suffix = p.suffix.lower()
+         if suffix == ".pdf":
+             with p.open("rb") as f:
+                 reader = PdfReader(f)
+                 return "\n".join(page.extract_text() or "" for page in reader.pages)
+         elif suffix in (".xlsx", ".xls"):
+             sheets = pd.read_excel(p, sheet_name=None)
+             text = ""
+             for sheet_name, df in sheets.items():
+                 text += df.to_string()
+             return text
+         elif suffix == ".docx":
+             with p.open("rb") as f:
+                 doc = Document(f)
+                 return "\n".join(para.text for para in doc.paragraphs)
+         else:
+             return p.read_text(encoding="utf-8", errors="ignore")
+     except Exception:
+         return "[could not read file]"
+
+
+
  def build_messages(history, enable_thinking: bool):
-     """Convert Gradio history to the chat template."""
      messages = []
      for h in history:
          messages.append({"role": h["role"], "content": h["content"]})
-     # Add system instruction for mode
      system_flag = "/think" if enable_thinking else "/no_think"
      messages.insert(0, {"role": "system", "content": system_flag})
      return messages

- def chat_fn(history, enable_thinking, temperature, top_p, top_k, repetition_penalty, max_new_tokens):
-     """Generate a streaming response."""
+ def chat_fn(history, enable_thinking, temperature, top_p, top_k,
+             repetition_penalty, max_new_tokens):
      messages = build_messages(history, enable_thinking)
-     text = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True,
-         # xml_tools=TOOLS
+
+     response = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repeat_penalty=repetition_penalty
      )
-     inputs = tokenizer(text, return_tensors="pt")
-     gc.collect()
-     with torch.inference_mode():
-         streamer = model.generate(
-             **inputs,
-             max_new_tokens=max_new_tokens,
-             do_sample=True,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             pad_token_id=tokenizer.eos_token_id,
-             streamer=None # we'll yield manually
-         )
-     gc.collect()
-     output_ids = streamer[0][len(inputs.input_ids[0]):]
-     response = tokenizer.decode(output_ids, skip_special_tokens=True)
-     if isinstance(response, str):
-         response = response.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;")
-     elif isinstance(response,list):
-         response = [paper.replace('<think>',"# &lt;think&gt;").replace('</think>',"&lt;/think&gt;") for paper in response]
+     response_text = response['choices'][0]['message']['content']
+     if isinstance(response_text, str):
+         response = response_text.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;")
+     elif isinstance(response_text, list):
+         response = [t.replace('<think>', "# &lt;think&gt;").replace('</think>', "&lt;/think&gt;") for t in response_text]
      else:
-         raise ValueError("Tokenizer response seems malformed; Not a string, nor a list?!?!")
+         raise ValueError("Malformed response from tokenizer")

-     # streaming char-by-char
      history.append({"role": "assistant", "content": ""})
      for ch in response:
          history[-1]["content"] += ch
          yield history

- # -------------------------------------------------
- # Blocks UI
- # -------------------------------------------------
+ # ---------- UI ----------
  with gr.Blocks(title="SmolLM3-3B Chat") as demo:
      gr.Markdown("## 🤖 SmolLM3-3B Chatbot (Streaming)")
      with gr.Row():
          enable_think = gr.Checkbox(label="Enable Extended Thinking (/think)", value=True)
          temperature = gr.Slider(0.0, 1.0, value=0.6, label="Temperature")
          top_p = gr.Slider(0.0, 1.0, value=0.95, label="Top-p")
-         top_k = gr.Slider(1,40,value=20,label="Top_k")
-         repetition_penalty = gr.Slider(1.0,1.4,value=1.1,label="Repetition_Penalty")
-         max_new_tokens = gr.Slider(1000,32768,value=32768,label="Max_New_Tokens")
+         top_k = gr.Slider(1, 40, value=20, label="Top-k")
+         repetition_penalty = gr.Slider(1.0, 1.4, value=1.1, label="Repetition Penalty")
+         max_new_tokens = gr.Slider(1000, MAX_TOKENS, value=MAX_TOKENS, label="Max New Tokens")
+
      chatbot = gr.Chatbot(type="messages")
-     msg = gr.Textbox(placeholder="Type your message here…", lines=1)
+     with gr.Row():
+         msg = gr.Textbox(placeholder="Type your message here…", lines=1, scale=8)
+         send_btn = gr.Button("Send", scale=1)
+     file_uploader = gr.File(label="Attach file(s)", file_count="multiple", file_types=None)
+
      clear = gr.Button("Clear")

-     def user_fn(user_msg, history):
-         return "", history + [{"role": "user", "content": user_msg}]
+     def user_fn(user_msg, history, files):
+         if files:
+             file_contents = "\n\n".join(read_file(Path(fp)) for fp in files)
+             user_msg += f"\n\n# FILE CONTENT:\n\n{file_contents}"
+         return "", history + [{"role": "user", "content": user_msg}], None # clear file_uploader
+
+     # Submit on button click or Enter key
+     for trigger in (msg.submit, send_btn.click):
+         trigger(
+             user_fn, [msg, chatbot, file_uploader], [msg, chatbot, file_uploader], queue=False
+         ).then(
+             chat_fn,
+             [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens],
+             chatbot
+         )

-     msg.submit(
-         user_fn, [msg, chatbot], [msg, chatbot], queue=False
-     ).then(
-         chat_fn, [chatbot, enable_think, temperature, top_p, top_k, repetition_penalty, max_new_tokens], chatbot
-     )
      clear.click(lambda: None, None, chatbot, queue=False)

  demo.queue().launch()
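
A note on dependencies implied by the new imports: the Space now needs llama-cpp-python, pypdf, pandas, and python-docx alongside gradio. A plausible requirements.txt for this revision is sketched below; the package names are the standard PyPI ones and are an assumption on my part, since the commit only touches app.py:

gradio
llama-cpp-python
huggingface_hub   # needed by Llama.from_pretrained to download the GGUF file
pypdf
pandas
openpyxl          # Excel engine used by pandas.read_excel for .xlsx uploads
python-docx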