Stefan Ivchenko committed on
Commit db33913 · 1 Parent(s): 1441524
Files changed (2)
  1. app.py +14 -13
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,36 +1,37 @@
 import gradio as gr
 from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
+from ctransformers import AutoModelForCausalLM

-# Download GGUF from your HF model repo
+# Download your GGUF from HF Hub
 model_path = hf_hub_download(
     repo_id="StefanCoder1/Scalable-tuned-GGUF",
     filename="model-f16.gguf",
-    # token=True  # uncomment if repo is private and HF_TOKEN is set
+    # token=True,  # uncomment + set HF_TOKEN secret if the repo is private
 )

-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,
-    n_threads=4,  # adjust to Space hardware
+# Load model with ctransformers
+llm = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    model_type="llama",  # adjust if needed (e.g. "llama", "mistral", etc.)
+    gpu_layers=0,  # CPU only; tweak if you later have GPU
 )

 def respond(message, history):
-    # history is a list of (user, assistant) tuples
+    # history: list of (user, assistant) pairs
     prompt = ""
     for user_msg, assistant_msg in (history or []):
         prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
     prompt += f"User: {message}\nAssistant:"

-    output = llm(
+    # Generate response
+    reply = llm(
         prompt,
-        max_tokens=256,
+        max_new_tokens=256,
         temperature=0.7,
-        stop=["User:", "Assistant:"],
     )

-    reply = output["choices"][0]["text"]
-    return reply  # ChatInterface will append (message, reply) to history
+    # ctransformers returns a string directly
+    return reply

 chat = gr.ChatInterface(
     respond,
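The hunk ends mid-call, so the closing arguments of gr.ChatInterface and the launch step are not part of this diff. For context, a minimal sketch of how the tail of a Gradio Space script like this typically looks; the title value and the launch() call here are assumptions, not content from this commit:

# Hypothetical tail of app.py -- not shown in the diff above
chat = gr.ChatInterface(
    respond,
    title="Scalable-tuned GGUF chat",  # assumed title; the real arguments are cut off
)

if __name__ == "__main__":
    chat.launch()  # Gradio serves the app; a Space runs this entry point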
requirements.txt CHANGED
@@ -1,3 +1,2 @@
--extra-index-url https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-llama-cpp-python
+ctransformers
 huggingface_hub
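Since this swap drops the prebuilt llama-cpp-python cuBLAS wheel index in favor of ctransformers (which ships precompiled binaries, so no compiler is needed on the Space), a quick local smoke test can confirm the new dependency loads the same GGUF before redeploying. A minimal sketch, assuming the same public repo and filename used in app.py:

from huggingface_hub import hf_hub_download
from ctransformers import AutoModelForCausalLM

# Fetch the GGUF exactly as app.py does
path = hf_hub_download(
    repo_id="StefanCoder1/Scalable-tuned-GGUF",
    filename="model-f16.gguf",
)

# Load on CPU, mirroring the Space's settings
llm = AutoModelForCausalLM.from_pretrained(path, model_type="llama", gpu_layers=0)

# ctransformers returns the generated text as a plain string
print(llm("User: Hello!\nAssistant:", max_new_tokens=32, temperature=0.7))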