Stefan Ivchenko committed on
Commit 5c4cc72 · 1 Parent(s): db33913

DIE DIE DIE

Files changed (1)
  1. app.py +4 -16
app.py CHANGED
@@ -1,36 +1,24 @@
 import gradio as gr
-from huggingface_hub import hf_hub_download
 from ctransformers import AutoModelForCausalLM
 
-# Download your GGUF from HF Hub
-model_path = hf_hub_download(
-    repo_id="StefanCoder1/Scalable-tuned-GGUF",
-    filename="model-f16.gguf",
-    # token=True,  # uncomment + set HF_TOKEN secret if the repo is private
-)
-
-# Load model with ctransformers
 llm = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    model_type="llama",  # adjust if needed (e.g. "llama", "mistral", etc.)
-    gpu_layers=0,  # CPU only; tweak if you later have GPU
+    "StefanCoder1/Scalable-tuned-GGUF",  # HF repo id
+    model_file="model-f16.gguf",  # exact GGUF filename
+    model_type="llama",  # Llama architecture
+    gpu_layers=0,  # CPU only
 )
 
 def respond(message, history):
-    # history: list of (user, assistant) pairs
     prompt = ""
     for user_msg, assistant_msg in (history or []):
         prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
     prompt += f"User: {message}\nAssistant:"
 
-    # Generate response
     reply = llm(
         prompt,
         max_new_tokens=256,
         temperature=0.7,
     )
-
-    # ctransformers returns a string directly
     return reply
 
 chat = gr.ChatInterface(
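
For reference, a minimal smoke test of the new loading path introduced by this commit. This is a sketch, not part of the commit: it assumes the ctransformers package is installed and that model-f16.gguf exists in the StefanCoder1/Scalable-tuned-GGUF repo (both taken from the diff above); the test prompt is illustrative.

# Sketch: confirm ctransformers can pull the GGUF straight from the Hub,
# exactly as app.py now does, without the hf_hub_download step.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "StefanCoder1/Scalable-tuned-GGUF",  # HF repo id (from the commit)
    model_file="model-f16.gguf",         # GGUF filename inside the repo
    model_type="llama",
    gpu_layers=0,                        # CPU only
)

# Quick generation check with the same prompt format respond() builds.
print(llm("User: Hello\nAssistant:", max_new_tokens=32))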