Spaces:
Sleeping
Sleeping
0-Parth-D commited on
Commit ·
2eb3831
1
Parent(s): 4267652
Set up GitHub Actions deployment to Hugging Face
Browse files- .github/workflows/deploy.yml +20 -0
- Dockerfile +24 -2
- requirements.txt +8 -0
- src/fast_tokenizer/src/tokenizer.cpp +8 -0
- src/rag_code_assistant/agent.py +143 -45
- src/rag_code_assistant/ingest.py +34 -11
.github/workflows/deploy.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pushes the main branch of this repository to the Hugging Face Space on every push to main.
name: Deploy to Hugging Face Space

on:
  push:
    branches:
      - main

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # 0 = fetch the complete history instead of a shallow clone.
          fetch-depth: 0

      - name: Push to Hugging Face
        env:
          # Access token is read from the repository secret named HF_TOKEN.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          git push --force https://parthtamu:$HF_TOKEN@huggingface.co/spaces/parthtamu/rag-code-assistant main
|
Dockerfile
CHANGED
|
@@ -2,17 +2,39 @@ FROM python:3.10-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
RUN python -m venv /opt/venv
|
| 6 |
ENV PATH="/opt/venv/bin:$PATH"
|
| 7 |
|
|
|
|
| 8 |
COPY requirements.txt .
|
| 9 |
-
|
| 10 |
RUN pip install --no-cache-dir --upgrade pip \
|
| 11 |
&& pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
|
| 12 |
&& pip install --no-cache-dir -r requirements.txt
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
|
| 15 |
|
|
|
|
| 16 |
COPY src/rag_code_assistant/agent.py .
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app

# Compiler toolchain needed to build the pybind11/CMake-based fast_tokenizer extension.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential cmake git \
    && rm -rf /var/lib/apt/lists/*

# Dedicated virtual environment; putting it first on PATH makes python/pip resolve to it.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Dependencies are installed before any source is copied so this layer is reused
# until requirements.txt changes. torch comes from the CPU-only wheel index.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install --no-cache-dir -r requirements.txt

# Copy the whole fast_tokenizer source tree, then build and install it with pip.
COPY src/fast_tokenizer/ ./src/fast_tokenizer/
RUN cd src/fast_tokenizer && pip install .

# Instantiate the embedding model once at build time so its files are downloaded into the image.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# Application module served by uvicorn below.
COPY src/rag_code_assistant/agent.py .

# Port the API listens on.
EXPOSE 7860

CMD ["uvicorn", "agent:app", "--host", "0.0.0.0", "--port", "7860"]
|
requirements.txt
CHANGED
|
@@ -6,3 +6,11 @@ langchain-ollama
|
|
| 6 |
langchain-text-splitters
|
| 7 |
langchain-chroma
|
| 8 |
sentence-transformers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
langchain-text-splitters
|
| 7 |
langchain-chroma
|
| 8 |
sentence-transformers
|
| 9 |
+
python-dotenv
|
| 10 |
+
|
| 11 |
+
fastapi
|
| 12 |
+
uvicorn
|
| 13 |
+
python-multipart
|
| 14 |
+
pinecone-client
|
| 15 |
+
langchain-pinecone
|
| 16 |
+
pypdf
|
src/fast_tokenizer/src/tokenizer.cpp
CHANGED
|
@@ -22,8 +22,16 @@ std::vector<std::string> tokenize(std::string s) {
|
|
| 22 |
return result;
|
| 23 |
}
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
/* pybind11 bindings */
|
| 26 |
PYBIND11_MODULE(fast_tokenizer, m) {
|
| 27 |
m.doc() = "Fast C++ tokenizer plugin for RAG Code Assistant";
|
| 28 |
m.def("tokenize", &tokenize, "A function that splits a string by whitespace and lowercases it");
|
|
|
|
| 29 |
}
|
|
|
|
| 22 |
return result;
|
| 23 |
}
|
| 24 |
|
| 25 |
+
// Inside your C++ code
|
| 26 |
+
size_t count_tokens(const std::string& text) {
|
| 27 |
+
// Assuming tokenize() is your existing function that returns std::vector<std::string>
|
| 28 |
+
std::vector<std::string> tokens = tokenize(text);
|
| 29 |
+
return tokens.size();
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
/* pybind11 bindings */
|
| 33 |
PYBIND11_MODULE(fast_tokenizer, m) {
|
| 34 |
m.doc() = "Fast C++ tokenizer plugin for RAG Code Assistant";
|
| 35 |
m.def("tokenize", &tokenize, "A function that splits a string by whitespace and lowercases it");
|
| 36 |
+
m.def("count_tokens", &count_tokens, "Returns the number of tokens in the text");
|
| 37 |
}
|
src/rag_code_assistant/agent.py
CHANGED
|
@@ -1,33 +1,51 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from langchain_ollama import ChatOllama
|
| 3 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 4 |
from langchain_core.tools.retriever import create_retriever_tool
|
| 5 |
from langchain.agents import create_agent
|
| 6 |
from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def load_vectorstore():
|
| 10 |
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 11 |
-
return
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
)
|
| 16 |
|
| 17 |
def load_llm():
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
|
| 22 |
def load_retriever(vectorstore):
|
|
|
|
| 23 |
return vectorstore.as_retriever(
|
| 24 |
search_type="mmr",
|
| 25 |
search_kwargs={"k": 4, "fetch_k": 20}
|
| 26 |
)
|
| 27 |
|
| 28 |
def load_retriever_tool(retriever):
|
| 29 |
-
#
|
| 30 |
-
# searches the DB, and returns the raw text context back to the LLM.
|
| 31 |
return create_retriever_tool(
|
| 32 |
retriever,
|
| 33 |
"rag_retriever",
|
|
@@ -35,6 +53,7 @@ def load_retriever_tool(retriever):
|
|
| 35 |
)
|
| 36 |
|
| 37 |
def load_agent(tools, llm):
|
|
|
|
| 38 |
system_prompt = (
|
| 39 |
"You are an expert all in one assistant. Follow these rules strictly:\n\n"
|
| 40 |
"1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
|
|
@@ -54,53 +73,132 @@ def load_agent(tools, llm):
|
|
| 54 |
system_prompt=system_prompt,
|
| 55 |
)
|
| 56 |
|
| 57 |
-
if __name__ == "__main__":
|
| 58 |
-
# --- INITIALIZATION ---
|
| 59 |
-
vectorstore = load_vectorstore()
|
| 60 |
-
llm = load_llm()
|
| 61 |
-
retriever = load_retriever(vectorstore)
|
| 62 |
|
| 63 |
-
|
| 64 |
-
tools = [retriever_tool]
|
| 65 |
|
| 66 |
-
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
print(user_input)
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
try:
|
| 91 |
-
full_response = ""
|
| 92 |
for chunk, metadata in agent.stream(
|
| 93 |
{"messages": chat_history},
|
| 94 |
stream_mode="messages",
|
| 95 |
):
|
| 96 |
if isinstance(chunk, AIMessageChunk) and chunk.content:
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uvicorn
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
from fastapi import FastAPI, UploadFile, File
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from fastapi.responses import StreamingResponse
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
+
from langchain_pinecone import PineconeVectorStore # Changed from Chroma
|
| 12 |
from langchain_ollama import ChatOllama
|
| 13 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 14 |
from langchain_core.tools.retriever import create_retriever_tool
|
| 15 |
from langchain.agents import create_agent
|
| 16 |
from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk
|
| 17 |
+
|
| 18 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 19 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 20 |
+
|
| 21 |
+
import fast_tokenizer
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
|
| 24 |
|
| 25 |
def load_vectorstore():
    """Create the Pinecone vector store backed by the "rag-agent" index.

    Texts are embedded with the "all-MiniLM-L6-v2" Hugging Face model.

    Raises:
        KeyError: if the PINECONE_API_KEY environment variable is not set.
    """
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    api_key = os.environ["PINECONE_API_KEY"]
    store = PineconeVectorStore(
        index_name="rag-agent",
        embedding=embedder,
        pinecone_api_key=api_key,
    )
    return store
|
| 32 |
|
| 33 |
def load_llm():
    """Create the ChatOllama client for the "llama3.1" model at temperature 0.1.

    Raises:
        KeyError: if the OLLAMA_BASE_URL environment variable is not set.
    """
    ollama_url = os.environ["OLLAMA_BASE_URL"]
    chat_model = ChatOllama(model="llama3.1", temperature=0.1, base_url=ollama_url)
    return chat_model
|
| 39 |
|
| 40 |
def load_retriever(vectorstore):
    """Return a retriever over *vectorstore* using MMR search with k=4 and fetch_k=20."""
    mmr_settings = {"k": 4, "fetch_k": 20}
    return vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_settings)
|
| 46 |
|
| 47 |
def load_retriever_tool(retriever):
|
| 48 |
+
# Kept exactly as you wrote it
|
|
|
|
| 49 |
return create_retriever_tool(
|
| 50 |
retriever,
|
| 51 |
"rag_retriever",
|
|
|
|
| 53 |
)
|
| 54 |
|
| 55 |
def load_agent(tools, llm):
|
| 56 |
+
# Kept exactly as you wrote it
|
| 57 |
system_prompt = (
|
| 58 |
"You are an expert all in one assistant. Follow these rules strictly:\n\n"
|
| 59 |
"1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
|
|
|
|
| 73 |
system_prompt=system_prompt,
|
| 74 |
)
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
# --- FASTAPI SETUP & GLOBAL INITIALIZATION ---
|
|
|
|
| 78 |
|
| 79 |
+
app = FastAPI(title="Python RAG Agent API")

# Any origin may call the API. Credentials stay disabled because a wildcard origin
# must not be combined with credentialed requests; nothing in this module reads
# cookies or auth headers, so callers lose nothing.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Build the agent pipeline once at import time so every request reuses the same
# vector store, LLM client, and agent instead of constructing them per call.
vectorstore = load_vectorstore()
llm = load_llm()
retriever = load_retriever(vectorstore)
retriever_tool = load_retriever_tool(retriever)
tools = [retriever_tool]
agent = load_agent(tools, llm)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# --- API ENDPOINTS ---
|
| 99 |
|
| 100 |
+
class ChatRequest(BaseModel):
    """Request body accepted by POST /chat."""

    # The new user message to answer.
    message: str
    # Earlier turns sent by the UI; each dict is read through its "role" and
    # "content" keys. Omitted means no prior conversation.
    history: list[dict] = []
|
|
|
|
| 103 |
|
| 104 |
+
@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    """Answer ``request.message`` with the agent and stream the reply text.

    The prior turns in ``request.history`` are converted to LangChain messages
    (role "user" becomes a HumanMessage, anything else an AIMessage) and the new
    message is appended as the final HumanMessage.

    Returns:
        A StreamingResponse that yields each non-empty AI message chunk as it is
        produced. If the agent raises, a final "[Error]: ..." chunk is emitted
        instead of aborting the response.
    """
    chat_history = [
        HumanMessage(content=msg["content"])
        if msg["role"] == "user"
        else AIMessage(content=msg["content"])
        for msg in request.history
    ]
    chat_history.append(HumanMessage(content=request.message))

    def generate_stream():
        # Deliberately a plain (sync) generator: agent.stream() is a blocking
        # iterator, and iterating it inside an async generator would stall the
        # event loop for the whole generation. StreamingResponse runs sync
        # iterators in a worker thread, so other requests keep being served.
        try:
            for chunk, _metadata in agent.stream(
                {"messages": chat_history},
                stream_mode="messages",
            ):
                if isinstance(chunk, AIMessageChunk) and chunk.content:
                    yield chunk.content
        except Exception as e:
            # Headers are already sent once streaming starts, so the failure is
            # reported in-band.
            yield f"\n[Error]: {e}"

    return StreamingResponse(generate_stream(), media_type="text/event-stream")
|
| 130 |
|
| 131 |
+
|
| 132 |
+
def custom_token_length(text):
    """Return the number of fast_tokenizer tokens in *text*.

    Used as the ``length_function`` of the text splitter, so it is called many
    times per document. ``count_tokens`` returns only the count, which avoids
    converting every token into a Python string just to take ``len()``.
    """
    return fast_tokenizer.count_tokens(text)
|
| 136 |
+
|
| 137 |
+
@app.post("/upload")
async def upload_document(file: UploadFile = File(...)):
    """Ingest an uploaded PDF, HTML, Markdown, or text file into the vector store.

    The upload is written into a private temporary directory, parsed with the
    loader matching its extension, split into chunks of at most 350 tokens with
    a 50-token overlap (measured by ``custom_token_length``), and added to the
    module-level ``vectorstore``.

    Returns:
        ``{"status": "success", "message": ...}`` when the chunks were stored,
        otherwise ``{"error": ...}`` explaining why the file was rejected or
        could not be processed.
    """
    import tempfile  # function-scope import: the module does not import tempfile

    supported_extensions = [".pdf", ".html", ".htm", ".md", ".txt"]

    # The filename is client-controlled and may be missing. Keep only its final
    # path component so it can never point outside the temporary directory.
    safe_name = Path(file.filename or "").name
    ext = Path(safe_name).suffix.lower()
    if ext not in supported_extensions:
        return {"error": f"Unsupported file type. Please upload one of: {', '.join(supported_extensions)}"}

    try:
        # A fresh directory per request avoids collisions between concurrent
        # uploads of the same name and is removed automatically on exit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_file_path = os.path.join(tmp_dir, f"temp_{safe_name}")
            with open(temp_file_path, "wb") as f:
                f.write(await file.read())

            if ext == ".pdf":
                docs = PyPDFLoader(temp_file_path).load()
            elif ext in (".html", ".htm"):
                try:
                    docs = UnstructuredHTMLLoader(temp_file_path).load()
                except Exception as e:
                    # Fall back to the BeautifulSoup-based loader.
                    print(f"Warning: UnstructuredHTMLLoader failed, trying BSHTMLLoader: {e}")
                    docs = BSHTMLLoader(temp_file_path).load()
            elif ext == ".md":
                docs = UnstructuredMarkdownLoader(temp_file_path).load()
            else:  # ".txt" is the only remaining supported extension
                docs = TextLoader(temp_file_path).load()

            # Keep the stored "source" label stable ("temp_<name>") rather than
            # recording the random temporary directory.
            for doc in docs:
                doc.metadata["source"] = f"temp_{safe_name}"

            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=350,  # max tokens per chunk
                chunk_overlap=50,  # tokens shared between neighbouring chunks
                length_function=custom_token_length,  # measure in tokens, not characters
            )
            splits = text_splitter.split_documents(docs)

            vectorstore.add_documents(splits)

        return {
            "status": "success",
            "message": f"Successfully processed {file.filename} into {len(splits)} chunks and uploaded to Pinecone."
        }

    except Exception as e:
        return {"error": f"Failed to process file: {str(e)}"}
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
if __name__ == "__main__":
    # Script entry point: print a banner, then serve the app on port 7860 with
    # auto-reload enabled.
    banner = "=" * 50
    print("\n" + banner)
    print("🐍 Python RAG API Initialized on Port 7860")
    print(banner + "\n")
    uvicorn.run("agent:app", host="0.0.0.0", port=7860, reload=True)
|
src/rag_code_assistant/ingest.py
CHANGED
|
@@ -1,13 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
|
| 2 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 3 |
-
from
|
| 4 |
-
from
|
| 5 |
from pathlib import Path
|
| 6 |
import fast_tokenizer
|
| 7 |
|
| 8 |
base_dir = Path("./docs")
|
| 9 |
paths = list(base_dir.rglob("*"))
|
| 10 |
|
|
|
|
| 11 |
def load_docs(paths):
|
| 12 |
all_docs = []
|
| 13 |
for p in paths:
|
|
@@ -43,9 +49,16 @@ def load_docs(paths):
|
|
| 43 |
|
| 44 |
return all_docs
|
| 45 |
|
|
|
|
|
|
|
| 46 |
def custom_token_length(text):
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def split_docs(docs):
|
| 51 |
splitter = RecursiveCharacterTextSplitter(
|
|
@@ -55,18 +68,28 @@ def split_docs(docs):
|
|
| 55 |
)
|
| 56 |
return splitter.split_documents(docs)
|
| 57 |
|
|
|
|
| 58 |
def store_docs(texts):
|
|
|
|
|
|
|
| 59 |
model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 60 |
-
|
|
|
|
| 61 |
documents=texts,
|
| 62 |
embedding=model,
|
| 63 |
-
|
| 64 |
-
|
| 65 |
)
|
| 66 |
return vectorstore
|
| 67 |
|
| 68 |
-
docs = load_docs(paths)
|
| 69 |
-
texts = split_docs(docs)
|
| 70 |
-
vectorstore = store_docs(texts)
|
| 71 |
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader
|
| 7 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 9 |
+
from langchain_pinecone import PineconeVectorStore
|
| 10 |
from pathlib import Path
|
| 11 |
import fast_tokenizer
|
| 12 |
|
| 13 |
base_dir = Path("./docs")
|
| 14 |
paths = list(base_dir.rglob("*"))
|
| 15 |
|
| 16 |
+
|
| 17 |
def load_docs(paths):
|
| 18 |
all_docs = []
|
| 19 |
for p in paths:
|
|
|
|
| 49 |
|
| 50 |
return all_docs
|
| 51 |
|
| 52 |
+
|
| 53 |
+
def custom_token_length(text):
    """Estimate the token count of *text* as one token per four characters.

    Pure-Python stand-in for the C++ tokenizer. Characters that cannot be
    encoded as UTF-8 are dropped before counting, and the result is rounded down.
    """
    encoded = text.encode("utf-8", "ignore")
    sanitized = encoded.decode("utf-8")
    approx_tokens, _remainder = divmod(len(sanitized), 4)
    return approx_tokens
|
| 61 |
+
|
| 62 |
|
| 63 |
def split_docs(docs):
|
| 64 |
splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 68 |
)
|
| 69 |
return splitter.split_documents(docs)
|
| 70 |
|
| 71 |
+
|
| 72 |
def store_docs(texts):
    """Embed *texts* and upload them to the "rag-agent" Pinecone index.

    Embeddings come from the "all-MiniLM-L6-v2" Hugging Face model.

    Returns:
        The PineconeVectorStore created by ``from_documents``.

    Raises:
        KeyError: if the PINECONE_API_KEY environment variable is not set.
    """
    print("Embedding documents and uploading to Pinecone... (This may take a minute)")

    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    api_key = os.environ["PINECONE_API_KEY"]
    return PineconeVectorStore.from_documents(
        documents=texts,
        embedding=embedder,
        index_name="rag-agent",
        pinecone_api_key=api_key,
    )
|
| 84 |
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
if __name__ == "__main__":
    # Run the ingestion pipeline: load, split, then embed and upload.
    docs = load_docs(paths)
    texts = split_docs(docs)
    vectorstore = store_docs(texts)

    # Summary report.
    divider = "=" * 50
    report = (
        divider,
        "✅ SUCCESS!",
        f"Documents Loaded: {len(docs)}",
        f"Total Chunks Uploaded to Pinecone: {len(texts)}",
        divider,
    )
    for line in report:
        print(line)
|