Stefan Ivchenko committed on
Commit db33913 · 1 Parent(s): 1441524
Files changed (2)
  1. app.py +14 -13
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,36 +1,37 @@
 import gradio as gr
 from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
+from ctransformers import AutoModelForCausalLM

-# Download GGUF from your HF model repo
+# Download your GGUF from HF Hub
 model_path = hf_hub_download(
     repo_id="StefanCoder1/Scalable-tuned-GGUF",
     filename="model-f16.gguf",
-    # token=True  # uncomment if repo is private and HF_TOKEN is set
+    # token=True,  # uncomment + set HF_TOKEN secret if the repo is private
 )

-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,
-    n_threads=4,  # adjust to Space hardware
+# Load model with ctransformers
+llm = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    model_type="llama",  # adjust if needed (e.g. "llama", "mistral", etc.)
+    gpu_layers=0,  # CPU only; tweak if you later have GPU
 )

 def respond(message, history):
-    # history is a list of (user, assistant) tuples
+    # history: list of (user, assistant) pairs
     prompt = ""
     for user_msg, assistant_msg in (history or []):
         prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
     prompt += f"User: {message}\nAssistant:"

-    output = llm(
+    # Generate response
+    reply = llm(
         prompt,
-        max_tokens=256,
+        max_new_tokens=256,
         temperature=0.7,
-        stop=["User:", "Assistant:"],
     )

-    reply = output["choices"][0]["text"]
-    return reply  # ChatInterface will append (message, reply) to history
+    # ctransformers returns a string directly
+    return reply

 chat = gr.ChatInterface(
     respond,
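The hunk ends mid-call, so the closing arguments of gr.ChatInterface and the launch step are not part of this diff. For context, a minimal sketch of how the tail of a Gradio Space script like this typically looks; the title value and the launch() call here are assumptions, not content from this commit:

# Hypothetical tail of app.py -- not shown in the diff above
chat = gr.ChatInterface(
    respond,
    title="Scalable-tuned GGUF chat",  # assumed title; the real arguments are cut off
)

if __name__ == "__main__":
    chat.launch()  # Gradio serves the app; a Space runs this entry point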
requirements.txt CHANGED
@@ -1,3 +1,2 @@
--extra-index-url https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
-llama-cpp-python
+ctransformers
 huggingface_hub
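Since this swap drops the prebuilt llama-cpp-python cuBLAS wheel index in favor of ctransformers (which ships precompiled binaries, so no compiler is needed on the Space), a quick local smoke test can confirm the new dependency loads the same GGUF before redeploying. A minimal sketch, assuming the same public repo and filename used in app.py:

from huggingface_hub import hf_hub_download
from ctransformers import AutoModelForCausalLM

# Fetch the GGUF exactly as app.py does
path = hf_hub_download(
    repo_id="StefanCoder1/Scalable-tuned-GGUF",
    filename="model-f16.gguf",
)

# Load on CPU, mirroring the Space's settings
llm = AutoModelForCausalLM.from_pretrained(path, model_type="llama", gpu_layers=0)

# ctransformers returns the generated text as a plain string
print(llm("User: Hello!\nAssistant:", max_new_tokens=32, temperature=0.7))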