vip11017 committed on
Commit
86b6056
·
1 Parent(s): c9cb780

adjusted embedding now to e5-base

Browse files
app/embed_documents.py CHANGED
@@ -6,6 +6,7 @@ from langchain_core.documents import Document
6
  from langchain_qdrant import QdrantVectorStore
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_huggingface import HuggingFaceEmbeddings
 
9
 
10
  import os
11
  from pathlib import Path
@@ -13,6 +14,7 @@ from uuid import uuid4
13
 
14
  # %%
15
  QDRANT_URL = os.getenv('QDRANT_URL')
 
16
 
17
  # %%
18
  FAQ_COLLECTION = "faqs"
@@ -23,12 +25,12 @@ SUPPORT_COLLECTION = "support"
23
  PRODUCT_COLLECTION = "product"
24
 
25
  # %%
26
- client = QdrantClient(url=QDRANT_URL, port=6333)
27
- embedding_model = "intfloat/e5-large-v2"
28
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
29
 
30
  # %%
31
- data_directory = Path("app/data")
32
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
33
 
34
  # %%
@@ -39,6 +41,7 @@ def delete_collection(collection_name):
39
  print(f"Collection '{collection_name}' deleted.")
40
 
41
  # %%
 
42
  def create_collection(collection_name):
43
  if not client.collection_exists(collection_name):
44
  client.create_collection(
@@ -70,6 +73,17 @@ def load_documents_from_folder(folder_path):
70
  'topic': topic}
71
  )
72
  documents.append(doc)
 
 
 
 
 
 
 
 
 
 
 
73
  return documents
74
 
75
  # %%
@@ -105,7 +119,7 @@ for topic in sub_folders:
105
  print('\n')
106
 
107
  # %%
108
- collection_name = 'wellness_docs'
109
  delete_collection(collection_name)
110
  create_collection(collection_name)
111
 
@@ -118,13 +132,6 @@ for topic in sub_folders:
118
  if docs:
119
  split_and_upload_to_qdrant(collection_name, docs)
120
 
121
- print('\n')
122
-
123
- # %%
124
- print(client.get_collections())
125
-
126
-
127
- # %%
128
-
129
 
130
 
 
6
  from langchain_qdrant import QdrantVectorStore
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain_community.document_loaders import PyPDFLoader
10
 
11
  import os
12
  from pathlib import Path
 
14
 
15
  # %%
16
  QDRANT_URL = os.getenv('QDRANT_URL')
17
+ QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
18
 
19
  # %%
20
  FAQ_COLLECTION = "faqs"
 
25
  PRODUCT_COLLECTION = "product"
26
 
27
  # %%
28
+ client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
29
+ embedding_model = "intfloat/e5-base-v2"
30
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
31
 
32
  # %%
33
+ data_directory = Path(__file__).parent / "data"
34
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
35
 
36
  # %%
 
41
  print(f"Collection '{collection_name}' deleted.")
42
 
43
  # %%
44
+ #Create Collection
45
  def create_collection(collection_name):
46
  if not client.collection_exists(collection_name):
47
  client.create_collection(
 
73
  'topic': topic}
74
  )
75
  documents.append(doc)
76
+
77
+ for file_path in folder_path.rglob("*.pdf"):
78
+ try:
79
+ loader = PyPDFLoader(file_path)
80
+ docs = loader.load()
81
+ for doc in docs:
82
+ doc.metadata["topic"] = file_path.parent.name
83
+ documents.extend(docs)
84
+ except Exception as e:
85
+ print(f"Failed to load PDF {file_path}: {e}")
86
+
87
  return documents
88
 
89
  # %%
 
119
  print('\n')
120
 
121
  # %%
122
+ """collection_name = 'wellness_docs'
123
  delete_collection(collection_name)
124
  create_collection(collection_name)
125
 
 
132
  if docs:
133
  split_and_upload_to_qdrant(collection_name, docs)
134
 
135
+ print('\n')"""
 
 
 
 
 
 
 
136
 
137
 
app/notebooks/embed_documents.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  "name": "stderr",
11
  "output_type": "stream",
12
  "text": [
13
- "/Users/vishalpatel/Documents/Internship/Auro/chatbot/auro_chatbot_backend/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
  " from .autonotebook import tqdm as notebook_tqdm\n"
15
  ]
16
  }
@@ -64,7 +64,7 @@
64
  "outputs": [],
65
  "source": [
66
  "client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)\n",
67
- "embedding_model = \"intfloat/e5-large-v2\"\n",
68
  "embeddings = HuggingFaceEmbeddings(model_name=embedding_model)"
69
  ]
70
  },
@@ -105,7 +105,7 @@
105
  " if not client.collection_exists(collection_name):\n",
106
  " client.create_collection(\n",
107
  " collection_name=collection_name,\n",
108
- " vectors_config=VectorParams(size=1024, distance=Distance.COSINE),\n",
109
  " )\n",
110
  " print(f\"Created Collection: {collection_name}\")"
111
  ]
@@ -187,57 +187,44 @@
187
  "Processing: blogs\n",
188
  "Collection 'blogs' deleted.\n",
189
  "Created Collection: blogs\n",
190
- "Loaded 105 docs from ../data/blogs\n",
191
  "Uploaded 1045 chunks to blogs\n",
192
  "\n",
193
  "\n",
194
- "Processing: technology\n",
195
- "Collection 'technology' deleted.\n",
196
- "Created Collection: technology\n"
197
- ]
198
- },
199
- {
200
- "name": "stderr",
201
- "output_type": "stream",
202
- "text": [
203
- "Ignoring wrong pointing object 6 0 (offset 0)\n"
204
- ]
205
- },
206
- {
207
- "name": "stdout",
208
- "output_type": "stream",
209
- "text": [
210
- "Loaded 3 docs from ../data/technology\n",
211
- "Uploaded 11 chunks to technology\n",
212
- "\n",
213
- "\n",
214
- "Processing: revolution\n",
215
- "Collection 'revolution' deleted.\n",
216
- "Created Collection: revolution\n",
217
- "Loaded 274 docs from ../data/revolution\n",
218
- "Uploaded 1415 chunks to revolution\n",
219
  "\n",
220
  "\n",
221
  "Processing: product\n",
222
  "Collection 'product' deleted.\n",
223
  "Created Collection: product\n",
224
- "Loaded 19 docs from ../data/product\n",
225
  "Uploaded 132 chunks to product\n",
226
  "\n",
227
  "\n",
228
- "Processing: faqs\n",
229
- "Collection 'faqs' deleted.\n",
230
- "Created Collection: faqs\n",
231
- "Loaded 1 docs from ../data/faqs\n",
232
- "Uploaded 14 chunks to faqs\n",
233
  "\n",
234
  "\n",
235
  "Processing: support\n",
236
  "Collection 'support' deleted.\n",
237
  "Created Collection: support\n",
238
- "Loaded 2 docs from ../data/support\n",
239
  "Uploaded 15 chunks to support\n",
240
  "\n",
 
 
 
 
 
 
 
241
  "\n"
242
  ]
243
  }
 
10
  "name": "stderr",
11
  "output_type": "stream",
12
  "text": [
13
+ "c:\\Users\\vip11\\Documents\\Projects\\Auro_Chatbot\\auro_chatbot_backend\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
  " from .autonotebook import tqdm as notebook_tqdm\n"
15
  ]
16
  }
 
64
  "outputs": [],
65
  "source": [
66
  "client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)\n",
67
+ "embedding_model = \"intfloat/e5-base-v2\"\n",
68
  "embeddings = HuggingFaceEmbeddings(model_name=embedding_model)"
69
  ]
70
  },
 
105
  " if not client.collection_exists(collection_name):\n",
106
  " client.create_collection(\n",
107
  " collection_name=collection_name,\n",
108
+ " vectors_config=VectorParams(size=768, distance=Distance.COSINE),\n",
109
  " )\n",
110
  " print(f\"Created Collection: {collection_name}\")"
111
  ]
 
187
  "Processing: blogs\n",
188
  "Collection 'blogs' deleted.\n",
189
  "Created Collection: blogs\n",
190
+ "Loaded 105 docs from ..\\data\\blogs\n",
191
  "Uploaded 1045 chunks to blogs\n",
192
  "\n",
193
  "\n",
194
+ "Processing: faqs\n",
195
+ "Collection 'faqs' deleted.\n",
196
+ "Created Collection: faqs\n",
197
+ "Loaded 1 docs from ..\\data\\faqs\n",
198
+ "Uploaded 14 chunks to faqs\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  "\n",
200
  "\n",
201
  "Processing: product\n",
202
  "Collection 'product' deleted.\n",
203
  "Created Collection: product\n",
204
+ "Loaded 19 docs from ..\\data\\product\n",
205
  "Uploaded 132 chunks to product\n",
206
  "\n",
207
  "\n",
208
+ "Processing: revolution\n",
209
+ "Collection 'revolution' deleted.\n",
210
+ "Created Collection: revolution\n",
211
+ "Loaded 1 docs from ..\\data\\revolution\n",
212
+ "Uploaded 32 chunks to revolution\n",
213
  "\n",
214
  "\n",
215
  "Processing: support\n",
216
  "Collection 'support' deleted.\n",
217
  "Created Collection: support\n",
218
+ "Loaded 2 docs from ..\\data\\support\n",
219
  "Uploaded 15 chunks to support\n",
220
  "\n",
221
+ "\n",
222
+ "Processing: technology\n",
223
+ "Collection 'technology' deleted.\n",
224
+ "Created Collection: technology\n",
225
+ "Loaded 1 docs from ..\\data\\technology\n",
226
+ "Uploaded 8 chunks to technology\n",
227
+ "\n",
228
  "\n"
229
  ]
230
  }
app/notebooks/embed_documents.py DELETED
@@ -1,137 +0,0 @@
1
- # %%
2
- from qdrant_client import QdrantClient
3
- from qdrant_client.models import VectorParams, Distance
4
-
5
- from langchain_core.documents import Document
6
- from langchain_qdrant import QdrantVectorStore
7
- from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- from langchain_huggingface import HuggingFaceEmbeddings
9
- from langchain_community.document_loaders import PyPDFLoader
10
-
11
- import os
12
- from pathlib import Path
13
- from uuid import uuid4
14
-
15
- # %%
16
- QDRANT_URL = os.getenv('QDRANT_URL')
17
- QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
18
-
19
- # %%
20
- FAQ_COLLECTION = "faqs"
21
- BLOGS_COLLECTION = "blogs"
22
- TECHNOLOGY_COLLECTION = "technology"
23
- REVOLUTION_COLLECTION = "revolution"
24
- SUPPORT_COLLECTION = "support"
25
- PRODUCT_COLLECTION = "product"
26
-
27
- # %%
28
- client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
29
- embedding_model = "intfloat/e5-large-v2"
30
- embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
31
-
32
- # %%
33
- data_directory = Path("../data")
34
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
35
-
36
- # %%
37
- #Delete Collection
38
- def delete_collection(collection_name):
39
- if client.collection_exists(collection_name):
40
- client.delete_collection(collection_name)
41
- print(f"Collection '{collection_name}' deleted.")
42
-
43
- # %%
44
- #Create Collection
45
- def create_collection(collection_name):
46
- if not client.collection_exists(collection_name):
47
- client.create_collection(
48
- collection_name=collection_name,
49
- vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
50
- )
51
- print(f"Created Collection: {collection_name}")
52
-
53
- # %%
54
- def load_documents_from_folder(folder_path):
55
- documents = []
56
-
57
- for file_path in folder_path.rglob("*.txt"):
58
- with open(file_path, 'r', encoding='utf-8') as f:
59
- lines = f.readlines()
60
-
61
- if not lines:
62
- print(f"{file_path} is empty")
63
- continue
64
-
65
- source_url = lines[0].replace("Source URL:","").strip()
66
- content = "".join(lines[1:]).strip()
67
- topic = file_path.parent.name
68
-
69
- if content:
70
- doc = Document(
71
- page_content=content,
72
- metadata={'source': source_url,
73
- 'topic': topic}
74
- )
75
- documents.append(doc)
76
-
77
- for file_path in folder_path.rglob("*.pdf"):
78
- try:
79
- loader = PyPDFLoader(file_path)
80
- docs = loader.load()
81
- for doc in docs:
82
- doc.metadata["topic"] = file_path.parent.name
83
- documents.extend(docs)
84
- except Exception as e:
85
- print(f"Failed to load PDF {file_path}: {e}")
86
-
87
- return documents
88
-
89
- # %%
90
- def split_and_upload_to_qdrant(collection_name, documents):
91
- splits = text_splitter.split_documents(documents)
92
- uuids = [str(uuid4()) for _ in range(len(splits))]
93
-
94
- vector_store = QdrantVectorStore(
95
- client=client,
96
- collection_name=collection_name,
97
- embedding=embeddings
98
- )
99
-
100
- vector_store.add_documents(documents=splits, ids=uuids)
101
- print(f"Uploaded {len(splits)} chunks to {collection_name}")
102
-
103
- # %%
104
- sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]
105
-
106
- for topic in sub_folders:
107
- collection_name = topic.name
108
- print(f"Processing: {topic.name}")
109
-
110
- delete_collection(collection_name)
111
- create_collection(collection_name)
112
-
113
- docs = load_documents_from_folder(topic)
114
- print(f"Loaded {len(docs)} docs from {topic}")
115
-
116
- if docs:
117
- split_and_upload_to_qdrant(collection_name, docs)
118
-
119
- print('\n')
120
-
121
- # %%
122
- """collection_name = 'wellness_docs'
123
- delete_collection(collection_name)
124
- create_collection(collection_name)
125
-
126
- sub_folders = [sub_folder for sub_folder in data_directory.iterdir() if sub_folder.is_dir()]
127
- for topic in sub_folders:
128
- print(f"Processing: {topic.name}")
129
- docs = load_documents_from_folder(topic)
130
- print(f"Loaded {len(docs)} docs from {topic}")
131
-
132
- if docs:
133
- split_and_upload_to_qdrant(collection_name, docs)
134
-
135
- print('\n')"""
136
-
137
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/notebooks/rag_original.py CHANGED
@@ -23,7 +23,7 @@ console = Console()
23
 
24
  client = QdrantClient(url="localhost", port=6333)
25
  COLLECTION_NAME = "wellness_docs"
26
- embedding_model = "intfloat/e5-large-v2"
27
 
28
 
29
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
 
23
 
24
  client = QdrantClient(url="localhost", port=6333)
25
  COLLECTION_NAME = "wellness_docs"
26
+ embedding_model = "intfloat/e5-base-v2"
27
 
28
 
29
  embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
app/rag.py CHANGED
@@ -27,7 +27,7 @@ session_histories: dict[str, list] = {}
27
  LLM_MODEL = "mistral-medium-latest"
28
  OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
29
  COLLECTION_NAME = "wellness_docs"
30
- EMBEDDING_MODEL = "intfloat/e5-large-v2"
31
  QDRANT_URL = os.getenv('QDRANT_URL')
32
  QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
33
  SUPABASE_URL = os.getenv('SUPABASE_URL')
 
27
  LLM_MODEL = "mistral-medium-latest"
28
  OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
29
  COLLECTION_NAME = "wellness_docs"
30
+ EMBEDDING_MODEL = "intfloat/e5-base-v2"
31
  QDRANT_URL = os.getenv('QDRANT_URL')
32
  QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
33
  SUPABASE_URL = os.getenv('SUPABASE_URL')