0-Parth-D committed on
Commit
2eb3831
·
1 Parent(s): 4267652

Set up GitHub Actions deployment to Hugging Face

Browse files
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# GitHub Actions workflow: mirror this repository to the Hugging Face Space
# parthtamu/rag-code-assistant on every push to main.
name: Deploy to Hugging Face Space

on:
  push:
    branches: [main]

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Full history is required so the force-push below carries every commit.
          fetch-depth: 0

      - name: Push to Hugging Face
        env:
          # HF_TOKEN is a repository secret; Actions masks it in logs.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          git push --force https://parthtamu:$HF_TOKEN@huggingface.co/spaces/parthtamu/rag-code-assistant main
Dockerfile CHANGED
@@ -2,17 +2,39 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
 
 
 
 
 
 
 
 
5
  RUN python -m venv /opt/venv
6
  ENV PATH="/opt/venv/bin:$PATH"
7
 
 
8
  COPY requirements.txt .
9
-
10
  RUN pip install --no-cache-dir --upgrade pip \
11
  && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
12
  && pip install --no-cache-dir -r requirements.txt
13
 
 
 
 
 
 
 
 
 
 
14
  RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
15
 
 
16
  COPY src/rag_code_assistant/agent.py .
17
 
18
- CMD ["python", "agent.py"]
 
 
 
 
 
2
 
3
WORKDIR /app

# C++ toolchain needed to compile the pybind11 fast_tokenizer extension below.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    git \
    && rm -rf /var/lib/apt/lists/*

# Isolate Python packages in a virtual environment.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install --no-cache-dir -r requirements.txt

# Copy the entire fast_tokenizer directory (setup.py, CMakeLists.txt, cpp sources)
# and build/install the C++ extension into the venv.
COPY src/fast_tokenizer/ ./src/fast_tokenizer/
RUN cd src/fast_tokenizer && pip install .

# Pre-download the embedding model so container startup needs no network fetch.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# Application code.
COPY src/rag_code_assistant/agent.py .

# Hugging Face Spaces serves the app on port 7860.
EXPOSE 7860

# Start the FastAPI app via uvicorn.
CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt CHANGED
@@ -6,3 +6,11 @@ langchain-ollama
6
  langchain-text-splitters
7
  langchain-chroma
8
  sentence-transformers
 
 
 
 
 
 
 
 
 
6
  langchain-text-splitters
7
  langchain-chroma
8
  sentence-transformers
9
+ python-dotenv
10
+
11
+ fastapi
12
+ uvicorn
13
+ python-multipart
14
+ pinecone-client
15
+ langchain-pinecone
16
+ pypdf
src/fast_tokenizer/src/tokenizer.cpp CHANGED
@@ -22,8 +22,16 @@ std::vector<std::string> tokenize(std::string s) {
22
  return result;
23
  }
24
 
 
 
 
 
 
 
 
25
  /* pybind11 bindings */
26
  PYBIND11_MODULE(fast_tokenizer, m) {
27
  m.doc() = "Fast C++ tokenizer plugin for RAG Code Assistant";
28
  m.def("tokenize", &tokenize, "A function that splits a string by whitespace and lowercases it");
 
29
  }
 
22
  return result;
23
  }
24
 
25
+ // Inside your C++ code
26
+ size_t count_tokens(const std::string& text) {
27
+ // Assuming tokenize() is your existing function that returns std::vector<std::string>
28
+ std::vector<std::string> tokens = tokenize(text);
29
+ return tokens.size();
30
+ }
31
+
32
/* pybind11 bindings — exposes the C++ helpers as the `fast_tokenizer`
 * Python module imported by agent.py and ingest.py. */
PYBIND11_MODULE(fast_tokenizer, m) {
    m.doc() = "Fast C++ tokenizer plugin for RAG Code Assistant";
    /* tokenize(str) -> list[str] */
    m.def("tokenize", &tokenize, "A function that splits a string by whitespace and lowercases it");
    /* count_tokens(str) -> int; wraps tokenize() */
    m.def("count_tokens", &count_tokens, "Returns the number of tokens in the text");
}
src/rag_code_assistant/agent.py CHANGED
@@ -1,33 +1,51 @@
1
- from langchain_chroma import Chroma
 
 
 
 
 
 
 
 
 
 
2
  from langchain_ollama import ChatOllama
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_core.tools.retriever import create_retriever_tool
5
  from langchain.agents import create_agent
6
  from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk
7
- import os
 
 
 
 
 
 
8
 
9
  def load_vectorstore():
10
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
11
- return Chroma(
12
- persist_directory="chroma_db",
13
- embedding_function=embeddings,
14
- collection_name="rag_code_assistant"
15
  )
16
 
17
  def load_llm():
18
- # Make sure to use the dedicated ollama import to avoid JSON parsing errors
19
- base_url = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
20
- return ChatOllama(model="llama3.1", temperature=0.1, base_url=base_url)
 
 
21
 
22
  def load_retriever(vectorstore):
 
23
  return vectorstore.as_retriever(
24
  search_type="mmr",
25
  search_kwargs={"k": 4, "fetch_k": 20}
26
  )
27
 
28
  def load_retriever_tool(retriever):
29
- # This built-in tool automatically accepts a "query" argument from the LLM,
30
- # searches the DB, and returns the raw text context back to the LLM.
31
  return create_retriever_tool(
32
  retriever,
33
  "rag_retriever",
@@ -35,6 +53,7 @@ def load_retriever_tool(retriever):
35
  )
36
 
37
  def load_agent(tools, llm):
 
38
  system_prompt = (
39
  "You are an expert all in one assistant. Follow these rules strictly:\n\n"
40
  "1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
@@ -54,53 +73,132 @@ def load_agent(tools, llm):
54
  system_prompt=system_prompt,
55
  )
56
 
57
- if __name__ == "__main__":
58
- # --- INITIALIZATION ---
59
- vectorstore = load_vectorstore()
60
- llm = load_llm()
61
- retriever = load_retriever(vectorstore)
62
 
63
- retriever_tool = load_retriever_tool(retriever)
64
- tools = [retriever_tool]
65
 
66
- agent = load_agent(tools, llm)
67
 
68
- # --- CONTINUOUS CHAT LOOP ---
69
- print("\n" + "="*50)
70
- print("🐍 Python Coding Assistant Initialized")
71
- print("Type 'quit', 'exit', or 'q' to end the conversation.")
72
- print("="*50 + "\n")
 
 
73
 
74
- chat_history = []
 
 
 
 
 
 
 
 
 
75
 
76
- while True:
77
- user_input = input("You: ")
78
- print("=== YOUR QUESTION ===")
79
- print(user_input)
80
 
81
- if user_input.lower() in ['quit', 'exit', 'q']:
82
- print("\nGoodbye!")
83
- break
 
 
 
 
 
 
84
 
85
- chat_history.append(HumanMessage(content=user_input))
86
-
87
- print("Thinking...\n")
88
- print("=== ASSISTANT'S ANSWER ===")
89
-
90
  try:
91
- full_response = ""
92
  for chunk, metadata in agent.stream(
93
  {"messages": chat_history},
94
  stream_mode="messages",
95
  ):
96
  if isinstance(chunk, AIMessageChunk) and chunk.content:
97
- print(chunk.content, end="", flush=True)
98
- full_response += chunk.content
 
 
 
 
99
 
100
- print("\n" + "="*50 + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- chat_history.append(AIMessage(content=full_response))
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- except Exception as e:
105
- print(f"\n[Error]: {e}")
106
- chat_history.pop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uvicorn
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+ from fastapi import FastAPI, UploadFile, File
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import StreamingResponse
9
+ from pydantic import BaseModel
10
+
11
+ from langchain_pinecone import PineconeVectorStore # Changed from Chroma
12
  from langchain_ollama import ChatOllama
13
  from langchain_huggingface import HuggingFaceEmbeddings
14
  from langchain_core.tools.retriever import create_retriever_tool
15
  from langchain.agents import create_agent
16
  from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk
17
+
18
+ from langchain_community.document_loaders import PyPDFLoader
19
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
20
+
21
+ import fast_tokenizer
22
+ from pathlib import Path
23
+ from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
24
 
25
def load_vectorstore():
    """Connect to the 'rag-agent' Pinecone index using MiniLM embeddings.

    Requires the PINECONE_API_KEY environment variable (raises KeyError if unset).
    """
    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return PineconeVectorStore(
        index_name="rag-agent",
        embedding=embedding_model,
        pinecone_api_key=os.environ["PINECONE_API_KEY"],
    )
32
 
33
def load_llm():
    """Build the llama3.1 chat model via Ollama.

    Requires OLLAMA_BASE_URL in the environment (raises KeyError if unset).
    """
    chat_model = ChatOllama(
        model="llama3.1",
        temperature=0.1,  # near-deterministic answers
        base_url=os.environ["OLLAMA_BASE_URL"],
    )
    return chat_model
39
 
40
def load_retriever(vectorstore):
    """Wrap the vector store in an MMR retriever: 4 diverse results from a 20-candidate pool."""
    mmr_config = {"k": 4, "fetch_k": 20}
    return vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs=mmr_config,
    )
46
 
47
  def load_retriever_tool(retriever):
48
+ # Kept exactly as you wrote it
 
49
  return create_retriever_tool(
50
  retriever,
51
  "rag_retriever",
 
53
  )
54
 
55
  def load_agent(tools, llm):
56
+ # Kept exactly as you wrote it
57
  system_prompt = (
58
  "You are an expert all in one assistant. Follow these rules strictly:\n\n"
59
  "1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
 
73
  system_prompt=system_prompt,
74
  )
75
 
 
 
 
 
 
76
 
77
# --- FASTAPI SETUP & GLOBAL INITIALIZATION ---

app = FastAPI(title="Python RAG Agent API")

# Allow any browser origin to call this API (the UI is hosted elsewhere).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Build the vector store, LLM, retriever tool and agent once at import time
# so every request reuses the same instances.
vectorstore = load_vectorstore()
llm = load_llm()
retriever = load_retriever(vectorstore)
retriever_tool = load_retriever_tool(retriever)
tools = [retriever_tool]
agent = load_agent(tools, llm)


# --- API ENDPOINTS ---
99
 
100
class ChatRequest(BaseModel):
    """JSON body for POST /chat."""
    message: str              # the new user message
    history: list[dict] = []  # Allows UI to send previous messages ({"role", "content"} dicts)
 
103
 
104
+ @app.post("/chat")
105
+ async def chat_endpoint(request: ChatRequest):
106
+ # 1. Build the chat history array from the UI's request
107
+ chat_history = []
108
+ for msg in request.history:
109
+ if msg["role"] == "user":
110
+ chat_history.append(HumanMessage(content=msg["content"]))
111
+ else:
112
+ chat_history.append(AIMessage(content=msg["content"]))
113
 
114
+ chat_history.append(HumanMessage(content=request.message))
115
+
116
+ # 2. Wrap your exact original streaming logic in a generator function
117
+ async def generate_stream():
 
118
  try:
 
119
  for chunk, metadata in agent.stream(
120
  {"messages": chat_history},
121
  stream_mode="messages",
122
  ):
123
  if isinstance(chunk, AIMessageChunk) and chunk.content:
124
+ yield chunk.content
125
+ except Exception as e:
126
+ yield f"\n[Error]: {e}"
127
+
128
+ # 3. Stream the output to the Vercel frontend
129
+ return StreamingResponse(generate_stream(), media_type="text/event-stream")
130
 
131
+
132
def custom_token_length(text):
    """Token count for the text splitter, backed by the C++ fast_tokenizer module."""
    return len(fast_tokenizer.tokenize(text))
136
+
137
+ @app.post("/upload")
138
+ async def upload_document(file: UploadFile = File(...)):
139
+ """Accepts PDF, HTML, MD, and TXT files and uploads them to Pinecone using fast_tokenizer."""
140
+
141
+ ext = Path(file.filename).suffix.lower()
142
+
143
+ supported_extensions = [".pdf", ".html", ".htm", ".md", ".txt"]
144
+ if ext not in supported_extensions:
145
+ return {"error": f"Unsupported file type. Please upload one of: {', '.join(supported_extensions)}"}
146
+
147
+ temp_file_path = f"temp_{file.filename}"
148
+ with open(temp_file_path, "wb") as f:
149
+ f.write(await file.read())
150
+
151
+ try:
152
+ if ext == ".pdf":
153
+ loader = PyPDFLoader(temp_file_path)
154
+ docs = loader.load()
155
 
156
+ elif ext in [".html", ".htm"]:
157
+ try:
158
+ loader = UnstructuredHTMLLoader(temp_file_path)
159
+ docs = loader.load()
160
+ except Exception as e:
161
+ print(f"Warning: UnstructuredHTMLLoader failed, trying BSHTMLLoader: {e}")
162
+ loader = BSHTMLLoader(temp_file_path)
163
+ docs = loader.load()
164
+
165
+ elif ext == ".md":
166
+ loader = UnstructuredMarkdownLoader(temp_file_path)
167
+ docs = loader.load()
168
 
169
+ elif ext == ".txt":
170
+ loader = TextLoader(temp_file_path)
171
+ docs = loader.load()
172
+
173
+ # 2. Re-implement your exact RecursiveCharacterTextSplitter settings
174
+ text_splitter = RecursiveCharacterTextSplitter(
175
+ chunk_size=350, # Max 350 tokens per chunk
176
+ chunk_overlap=50, # Overlap of 50 tokens
177
+ length_function=custom_token_length # Tells LangChain to use your C++ tool
178
+ )
179
+
180
+ splits = text_splitter.split_documents(docs)
181
+
182
+ # 3. Upload the perfectly tokenized chunks to Pinecone
183
+ vectorstore.add_documents(splits)
184
+
185
+ return {
186
+ "status": "success",
187
+ "message": f"Successfully processed {file.filename} into {len(splits)} chunks and uploaded to Pinecone."
188
+ }
189
+
190
+ except Exception as e:
191
+ return {"error": f"Failed to process file: {str(e)}"}
192
+
193
+ finally:
194
+ # Clean up temp file
195
+ if os.path.exists(temp_file_path):
196
+ os.remove(temp_file_path)
197
+
198
+
199
if __name__ == "__main__":
    # Local dev entry point; Hugging Face Spaces requires port 7860.
    # (In the container, uvicorn is launched directly by the Dockerfile CMD.)
    banner = "=" * 50
    print("\n" + banner)
    print("🐍 Python RAG API Initialized on Port 7860")
    print(banner + "\n")
    uvicorn.run("agent:app", host="0.0.0.0", port=7860, reload=True)
src/rag_code_assistant/ingest.py CHANGED
@@ -1,13 +1,19 @@
 
 
 
 
 
1
  from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
- from langchain_community.embeddings import HuggingFaceEmbeddings
4
- from langchain_community.vectorstores import Chroma
5
  from pathlib import Path
6
  import fast_tokenizer
7
 
8
  base_dir = Path("./docs")
9
  paths = list(base_dir.rglob("*"))
10
 
 
11
  def load_docs(paths):
12
  all_docs = []
13
  for p in paths:
@@ -43,9 +49,16 @@ def load_docs(paths):
43
 
44
  return all_docs
45
 
 
 
46
  def custom_token_length(text):
47
- tokens = fast_tokenizer.tokenize(text)
48
- return len(tokens)
 
 
 
 
 
49
 
50
  def split_docs(docs):
51
  splitter = RecursiveCharacterTextSplitter(
@@ -55,18 +68,28 @@ def split_docs(docs):
55
  )
56
  return splitter.split_documents(docs)
57
 
 
58
  def store_docs(texts):
 
 
59
  model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
60
- vectorstore = Chroma.from_documents(
 
61
  documents=texts,
62
  embedding=model,
63
- persist_directory="chroma_db",
64
- collection_name="rag_code_assistant"
65
  )
66
  return vectorstore
67
 
68
- docs = load_docs(paths)
69
- texts = split_docs(docs)
70
- vectorstore = store_docs(texts)
71
 
72
- print("Documents Loaded: ", len(docs))
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ load_dotenv()
5
+
6
  from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain_pinecone import PineconeVectorStore
10
  from pathlib import Path
11
  import fast_tokenizer
12
 
13
  base_dir = Path("./docs")
14
  paths = list(base_dir.rglob("*"))
15
 
16
+
17
  def load_docs(paths):
18
  all_docs = []
19
  for p in paths:
 
49
 
50
  return all_docs
51
 
52
+
53
# Temporary Python fallback for local Windows ingestion
def custom_token_length(text):
    """Approximate token count: roughly 1 token per 4 characters of English text.

    Avoids requiring the compiled C++ fast_tokenizer on Windows.
    """
    # Strip anything that does not survive a UTF-8 round-trip before measuring.
    sanitized = text.encode('utf-8', 'ignore').decode('utf-8')
    return len(sanitized) // 4
61
+
62
 
63
  def split_docs(docs):
64
  splitter = RecursiveCharacterTextSplitter(
 
68
  )
69
  return splitter.split_documents(docs)
70
 
71
+
72
  def store_docs(texts):
73
+ print("Embedding documents and uploading to Pinecone... (This may take a minute)")
74
+
75
  model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
76
+
77
+ vectorstore = PineconeVectorStore.from_documents(
78
  documents=texts,
79
  embedding=model,
80
+ index_name="rag-agent",
81
+ pinecone_api_key=os.environ["PINECONE_API_KEY"],
82
  )
83
  return vectorstore
84
 
 
 
 
85
 
86
if __name__ == "__main__":
    # Ingestion pipeline: load raw docs -> split into chunks -> embed & upload.
    docs = load_docs(paths)
    texts = split_docs(docs)
    vectorstore = store_docs(texts)

    divider = "=" * 50
    print(divider)
    print("✅ SUCCESS!")
    print(f"Documents Loaded: {len(docs)}")
    print(f"Total Chunks Uploaded to Pinecone: {len(texts)}")
    print(divider)