Spaces:
Sleeping
Sleeping
| import os | |
| import glob | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain_community.document_loaders import TextLoader, PyPDFLoader | |
| from dotenv import load_dotenv | |
# Everything is resolved relative to the directory that holds this module:
# the source documents live in "data" and the persisted vector DB in "rag_db".
BASE_DIR = os.path.dirname(__file__)
DATA_PATH = os.path.join(BASE_DIR, "data")
DB_DIR = os.path.join(BASE_DIR, "rag_db")

# Load the .env file before reading the key so the lookup below can see it.
# override=True is passed so .env values take precedence over variables that
# are already set in the process environment.
load_dotenv(override=True)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
def load_and_chunk():
    """Load every supported file directly under ``DATA_PATH`` and chunk it.

    ``.txt`` and ``.md`` files are read with ``TextLoader`` and ``.pdf`` files
    with ``PyPDFLoader``. Any other entry (including sub-directories) is
    reported on stdout and skipped.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` over
        all loaded documents, using ``chunk_size=1200`` and
        ``chunk_overlap=150``.
    """
    docs = []

    # 1. Load the documents.
    # glob yields entries in arbitrary order; sorting keeps the resulting
    # chunk order reproducible from one run to the next.
    for file_path in sorted(glob.glob(os.path.join(DATA_PATH, "*"))):
        print(file_path)

        # The "*" pattern also matches directories, which no loader can open
        # (a directory named e.g. "notes.txt" would otherwise reach TextLoader).
        if not os.path.isfile(file_path):
            print(f"Skipping unsupported file: {file_path}")
            continue

        # Compare extensions case-insensitively (".PDF" == ".pdf").
        lowered_path = file_path.lower()
        if lowered_path.endswith((".txt", ".md")):
            loader = TextLoader(file_path)
        elif lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            print(f"Skipping unsupported file: {file_path}")
            continue

        docs.extend(loader.load())

    # 2. Chunk them.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=150,
    )
    return splitter.split_documents(docs)
def init_vectorstore():
    """Return the Chroma vector store, building it on first use.

    If ``DB_DIR`` already contains a non-empty ``chroma.sqlite3`` file, the
    existing store is opened as-is and nothing is re-embedded. Otherwise the
    documents are loaded and chunked via ``load_and_chunk()``, embedded with
    ``OpenAIEmbeddings`` and persisted to ``DB_DIR``.

    Intended to be called once by the caller (e.g. from a main loop) and the
    returned store reused.

    Returns:
        A ``Chroma`` instance. When no chunks could be produced, an empty
        store that is NOT bound to ``DB_DIR`` is returned, so a later call
        can retry ingestion.
    """
    def db_is_empty(db_path):
        # The store counts as "empty" when its sqlite file is missing or
        # has zero length.
        sqlite_file = os.path.join(db_path, "chroma.sqlite3")
        return not os.path.exists(sqlite_file) or os.path.getsize(sqlite_file) == 0

    emb = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

    if not db_is_empty(DB_DIR):
        # Load existing DB (no re-embedding).
        return Chroma(
            persist_directory=DB_DIR,
            embedding_function=emb,
        )

    # First time: create the DB.
    print("first time creating the vec store")
    chunks = load_and_chunk()

    if not chunks:
        # Do not point Chroma at DB_DIR with nothing to store: that would
        # presumably leave a non-empty sqlite file behind, and the check above
        # would then treat the empty store as populated on every later run.
        print("no document chunks were produced; returning an empty vec store")
        return Chroma(embedding_function=emb)

    return Chroma.from_documents(
        documents=chunks,
        embedding=emb,
        persist_directory=DB_DIR,
    )
| # Initialize | |
| # Format documents for the prompt | |
def format_docs(docs):
    """Render retrieved documents as one prompt-ready string.

    Each document becomes a "Source: <source>" header line followed by its
    ``page_content``; documents are separated by a blank line. A document
    whose metadata has no ``source`` key is labelled "Unknown".
    """
    rendered = []
    for item in docs:
        origin = item.metadata.get('source', 'Unknown')
        rendered.append(f"Source: {origin}\n{item.page_content}")
    return "\n\n".join(rendered)
| # Public entry point: this is the function external modules should call. | |
def ask(query: str, retriver_moedel) -> list:
    """Run *query* through the given retriever and return its raw result.

    Simple call for external modules (like the narrator).

    Args:
        query: The text to look up.
        retriver_moedel: Any object exposing ``invoke(query)``; presumably a
            LangChain retriever built from the vector store (confirm against
            callers). The misspelled parameter name is kept so existing
            keyword callers keep working.

    Returns:
        Whatever ``retriver_moedel.invoke(query)`` returns, unchanged. For a
        retriever this is expected to be a list of documents, not a string;
        pass the result to ``format_docs`` if prompt-ready text is needed.
    """
    return retriver_moedel.invoke(query)