vip11017 committed on
Commit
314d904
·
1 Parent(s): a394e40

embed documents to include csv

Browse files
Files changed (2) hide show
  1. app/notebooks/embed_documents.ipynb +34 -32
  2. app/rag.py +2 -0
app/notebooks/embed_documents.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  "name": "stderr",
11
  "output_type": "stream",
12
  "text": [
13
- "/Users/vishalpatel/Documents/Internship/Auro/chatbot/auro_chatbot_backend/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
  " from .autonotebook import tqdm as notebook_tqdm\n"
15
  ]
16
  }
@@ -23,7 +23,7 @@
23
  "from langchain_qdrant import QdrantVectorStore\n",
24
  "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
25
  "from langchain_huggingface import HuggingFaceEmbeddings\n",
26
- "from langchain_community.document_loaders import PyPDFLoader\n",
27
  "\n",
28
  "import os\n",
29
  "from pathlib import Path\n",
@@ -150,6 +150,16 @@
150
  " except Exception as e:\n",
151
  " print(f\"Failed to load PDF {file_path}: {e}\")\n",
152
  "\n",
 
 
 
 
 
 
 
 
 
 
153
  " return documents"
154
  ]
155
  },
@@ -187,52 +197,44 @@
187
  "Processing: blogs\n",
188
  "Collection 'auro_blogs' deleted.\n",
189
  "Created Collection: auro_blogs\n",
190
- "Loaded 105 docs from ../data/blogs\n",
191
  "Uploaded 1045 chunks to auro_blogs\n",
192
  "\n",
193
  "\n",
194
- "Processing: technology\n"
195
- ]
196
- },
197
- {
198
- "name": "stderr",
199
- "output_type": "stream",
200
- "text": [
201
- "Ignoring wrong pointing object 6 0 (offset 0)\n"
202
- ]
203
- },
204
- {
205
- "name": "stdout",
206
- "output_type": "stream",
207
- "text": [
208
- "Created Collection: auro_technology\n",
209
- "Loaded 3 docs from ../data/technology\n",
210
- "Uploaded 11 chunks to auro_technology\n",
211
- "\n",
212
- "\n",
213
- "Processing: revolution\n",
214
- "Created Collection: auro_revolution\n",
215
- "Loaded 274 docs from ../data/revolution\n",
216
- "Uploaded 1415 chunks to auro_revolution\n",
217
  "\n",
218
  "\n",
219
  "Processing: product\n",
 
220
  "Created Collection: auro_product\n",
221
- "Loaded 19 docs from ../data/product\n",
222
  "Uploaded 132 chunks to auro_product\n",
223
  "\n",
224
  "\n",
225
- "Processing: faqs\n",
226
- "Created Collection: auro_faqs\n",
227
- "Loaded 1 docs from ../data/faqs\n",
228
- "Uploaded 14 chunks to auro_faqs\n",
 
229
  "\n",
230
  "\n",
231
  "Processing: support\n",
 
232
  "Created Collection: auro_support\n",
233
- "Loaded 2 docs from ../data/support\n",
234
  "Uploaded 15 chunks to auro_support\n",
235
  "\n",
 
 
 
 
 
 
 
236
  "\n"
237
  ]
238
  }
 
10
  "name": "stderr",
11
  "output_type": "stream",
12
  "text": [
13
+ "c:\\Users\\vip11\\Documents\\Projects\\Auro_Chatbot\\auro_chatbot_backend\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
14
  " from .autonotebook import tqdm as notebook_tqdm\n"
15
  ]
16
  }
 
23
  "from langchain_qdrant import QdrantVectorStore\n",
24
  "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
25
  "from langchain_huggingface import HuggingFaceEmbeddings\n",
26
+ "from langchain_community.document_loaders import PyPDFLoader, CSVLoader\n",
27
  "\n",
28
  "import os\n",
29
  "from pathlib import Path\n",
 
150
  " except Exception as e:\n",
151
  " print(f\"Failed to load PDF {file_path}: {e}\")\n",
152
  "\n",
153
+ " for file_path in folder_path.rglob(\"*.csv\"):\n",
154
+ " try:\n",
155
+ " loader = CSVLoader(file_path, encoding='utf-8')\n",
156
+ " docs = loader.load()\n",
157
+ " for doc in docs:\n",
158
+ " doc.metadata[\"topic\"] = file_path.parent.name\n",
159
+ " documents.extend(docs)\n",
160
+ " except Exception as e:\n",
161
+ " print(f\"Failed to load PDF {file_path}: {e}\")\n",
162
+ "\n",
163
  " return documents"
164
  ]
165
  },
 
197
  "Processing: blogs\n",
198
  "Collection 'auro_blogs' deleted.\n",
199
  "Created Collection: auro_blogs\n",
200
+ "Loaded 105 docs from ..\\data\\blogs\n",
201
  "Uploaded 1045 chunks to auro_blogs\n",
202
  "\n",
203
  "\n",
204
+ "Processing: faqs\n",
205
+ "Collection 'auro_faqs' deleted.\n",
206
+ "Created Collection: auro_faqs\n",
207
+ "Loaded 108 docs from ..\\data\\faqs\n",
208
+ "Uploaded 150 chunks to auro_faqs\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  "\n",
210
  "\n",
211
  "Processing: product\n",
212
+ "Collection 'auro_product' deleted.\n",
213
  "Created Collection: auro_product\n",
214
+ "Loaded 19 docs from ..\\data\\product\n",
215
  "Uploaded 132 chunks to auro_product\n",
216
  "\n",
217
  "\n",
218
+ "Processing: revolution\n",
219
+ "Collection 'auro_revolution' deleted.\n",
220
+ "Created Collection: auro_revolution\n",
221
+ "Loaded 1 docs from ..\\data\\revolution\n",
222
+ "Uploaded 32 chunks to auro_revolution\n",
223
  "\n",
224
  "\n",
225
  "Processing: support\n",
226
+ "Collection 'auro_support' deleted.\n",
227
  "Created Collection: auro_support\n",
228
+ "Loaded 2 docs from ..\\data\\support\n",
229
  "Uploaded 15 chunks to auro_support\n",
230
  "\n",
231
+ "\n",
232
+ "Processing: technology\n",
233
+ "Collection 'auro_technology' deleted.\n",
234
+ "Created Collection: auro_technology\n",
235
+ "Loaded 1 docs from ..\\data\\technology\n",
236
+ "Uploaded 8 chunks to auro_technology\n",
237
+ "\n",
238
  "\n"
239
  ]
240
  }
app/rag.py CHANGED
@@ -432,5 +432,7 @@ async def get_response(query: str, name, email, config) -> dict:
432
  latency_ms= latency_ms,
433
  metadata=metadata
434
  )
 
 
435
  return result
436
  # %%
 
432
  latency_ms= latency_ms,
433
  metadata=metadata
434
  )
435
+
436
+
437
  return result
438
  # %%