# %%
import os
from pathlib import Path
from uuid import uuid4

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# %%
# Connection settings are read from the environment; either may be None.
QDRANT_URL = os.getenv('QDRANT_URL')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')

# %%
FAQ_COLLECTION = "faqs"
BLOGS_COLLECTION = "blogs"
TECHNOLOGY_COLLECTION = "technology"
REVOLUTION_COLLECTION = "revolution"
SUPPORT_COLLECTION = "support"
PRODUCT_COLLECTION = "product"

# %%
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
embedding_model = "intfloat/e5-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Size collections from the model itself instead of a hard-coded number.
# The previous literal (1024) does not match intfloat/e5-base-v2, which
# produces 768-dimensional vectors, so uploads into a freshly created
# collection failed with a dimension mismatch.
EMBEDDING_DIM = len(embeddings.embed_query("dimension probe"))

# %%
data_directory = Path(__file__).parent / "data"
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)


# %%
# Delete Collection
def delete_collection(collection_name):
    """Delete the Qdrant collection ``collection_name`` if it exists.

    Does nothing (and prints nothing) when the collection is absent.
    """
    if client.collection_exists(collection_name):
        client.delete_collection(collection_name)
        print(f"Collection '{collection_name}' deleted.")


# %%
# Create Collection
def create_collection(collection_name, vector_size=None):
    """Create the Qdrant collection ``collection_name`` if it is missing.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the stored vectors. Defaults to
            ``EMBEDDING_DIM``, i.e. the output size of the configured
            embedding model, so the collection always matches the vectors
            that will be uploaded into it.

    An existing collection is left untouched.
    """
    if vector_size is None:
        vector_size = EMBEDDING_DIM
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
        )
        print(f"Created Collection: {collection_name}")


# %%
def load_documents_from_folder(folder_path):
    """Load every ``.txt`` and ``.pdf`` file under ``folder_path`` recursively.

    Text files: the first line is taken as the source reference (a leading
    "Source URL:" label is stripped) and the remaining lines form the
    content. Empty files are reported and skipped; files with no content
    after the first line are skipped silently.

    PDF files: loaded with ``PyPDFLoader``. A PDF that fails to load is
    reported and skipped so one bad file does not abort the whole run.

    Every document gets a ``topic`` metadata entry set to the name of the
    file's parent directory.

    Args:
        folder_path: ``pathlib.Path`` of the directory to scan.

    Returns:
        A list of ``Document`` objects.
    """
    documents = []

    for file_path in folder_path.rglob("*.txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        if not lines:
            print(f"{file_path} is empty")
            continue

        source_url = lines[0].replace("Source URL:", "").strip()
        content = "".join(lines[1:]).strip()
        topic = file_path.parent.name

        if content:
            documents.append(
                Document(
                    page_content=content,
                    metadata={'source': source_url, 'topic': topic},
                )
            )

    for file_path in folder_path.rglob("*.pdf"):
        # Deliberately broad: PDF parsing is best-effort per file.
        try:
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            for doc in docs:
                doc.metadata["topic"] = file_path.parent.name
            documents.extend(docs)
        except Exception as e:
            print(f"Failed to load PDF {file_path}: {e}")

    return documents


# %%
def split_and_upload_to_qdrant(collection_name, documents):
    """Chunk ``documents`` and upload the chunks to ``collection_name``.

    Documents are split with the module-level ``text_splitter``, each chunk
    is given a fresh random UUID as its point id, and the chunks are embedded
    and stored through a ``QdrantVectorStore`` bound to the shared client.

    Args:
        collection_name: Target collection; it must already exist.
        documents: List of ``Document`` objects to split and upload.
    """
    splits = text_splitter.split_documents(documents)
    uuids = [str(uuid4()) for _ in splits]

    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    vector_store.add_documents(documents=splits, ids=uuids)
    print(f"Uploaded {len(splits)} chunks to {collection_name}")


# %%
# One collection per immediate sub-folder of the data directory. Each
# collection is dropped and rebuilt from scratch on every run.
sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]

for topic in sub_folders:
    collection_name = topic.name
    print(f"Processing: {topic.name}")

    delete_collection(collection_name)
    create_collection(collection_name)

    docs = load_documents_from_folder(topic)
    print(f"Loaded {len(docs)} docs from {topic}")

    if docs:
        split_and_upload_to_qdrant(collection_name, docs)
    print('\n')

# %%
# Disabled alternative: ingest every sub-folder into one shared collection.
"""collection_name = 'wellness_docs'
delete_collection(collection_name)
create_collection(collection_name)

sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]

for topic in sub_folders:
    print(f"Processing: {topic.name}")
    docs = load_documents_from_folder(topic)
    print(f"Loaded {len(docs)} docs from {topic}")
    if docs:
        split_and_upload_to_qdrant(collection_name, docs)
    print('\n')"""