Spaces:
Running
Running
embed documents to include csv
Browse files
- app/notebooks/embed_documents.ipynb +34 -32
- app/rag.py +2 -0
app/notebooks/embed_documents.ipynb
CHANGED
|
@@ -10,7 +10,7 @@
|
|
| 10 |
"name": "stderr",
|
| 11 |
"output_type": "stream",
|
| 12 |
"text": [
|
| 13 |
-
"
|
| 14 |
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 15 |
]
|
| 16 |
}
|
|
@@ -23,7 +23,7 @@
|
|
| 23 |
"from langchain_qdrant import QdrantVectorStore\n",
|
| 24 |
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
| 25 |
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
| 26 |
-
"from langchain_community.document_loaders import PyPDFLoader\n",
|
| 27 |
"\n",
|
| 28 |
"import os\n",
|
| 29 |
"from pathlib import Path\n",
|
|
@@ -150,6 +150,16 @@
|
|
| 150 |
" except Exception as e:\n",
|
| 151 |
" print(f\"Failed to load PDF {file_path}: {e}\")\n",
|
| 152 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
" return documents"
|
| 154 |
]
|
| 155 |
},
|
|
@@ -187,52 +197,44 @@
|
|
| 187 |
"Processing: blogs\n",
|
| 188 |
"Collection 'auro_blogs' deleted.\n",
|
| 189 |
"Created Collection: auro_blogs\n",
|
| 190 |
-
"Loaded 105 docs from
|
| 191 |
"Uploaded 1045 chunks to auro_blogs\n",
|
| 192 |
"\n",
|
| 193 |
"\n",
|
| 194 |
-
"Processing:
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
"output_type": "stream",
|
| 200 |
-
"text": [
|
| 201 |
-
"Ignoring wrong pointing object 6 0 (offset 0)\n"
|
| 202 |
-
]
|
| 203 |
-
},
|
| 204 |
-
{
|
| 205 |
-
"name": "stdout",
|
| 206 |
-
"output_type": "stream",
|
| 207 |
-
"text": [
|
| 208 |
-
"Created Collection: auro_technology\n",
|
| 209 |
-
"Loaded 3 docs from ../data/technology\n",
|
| 210 |
-
"Uploaded 11 chunks to auro_technology\n",
|
| 211 |
-
"\n",
|
| 212 |
-
"\n",
|
| 213 |
-
"Processing: revolution\n",
|
| 214 |
-
"Created Collection: auro_revolution\n",
|
| 215 |
-
"Loaded 274 docs from ../data/revolution\n",
|
| 216 |
-
"Uploaded 1415 chunks to auro_revolution\n",
|
| 217 |
"\n",
|
| 218 |
"\n",
|
| 219 |
"Processing: product\n",
|
|
|
|
| 220 |
"Created Collection: auro_product\n",
|
| 221 |
-
"Loaded 19 docs from
|
| 222 |
"Uploaded 132 chunks to auro_product\n",
|
| 223 |
"\n",
|
| 224 |
"\n",
|
| 225 |
-
"Processing:
|
| 226 |
-
"
|
| 227 |
-
"
|
| 228 |
-
"
|
|
|
|
| 229 |
"\n",
|
| 230 |
"\n",
|
| 231 |
"Processing: support\n",
|
|
|
|
| 232 |
"Created Collection: auro_support\n",
|
| 233 |
-
"Loaded 2 docs from
|
| 234 |
"Uploaded 15 chunks to auro_support\n",
|
| 235 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
"\n"
|
| 237 |
]
|
| 238 |
}
|
|
|
|
| 10 |
"name": "stderr",
|
| 11 |
"output_type": "stream",
|
| 12 |
"text": [
|
| 13 |
+
"c:\\Users\\vip11\\Documents\\Projects\\Auro_Chatbot\\auro_chatbot_backend\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 14 |
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 15 |
]
|
| 16 |
}
|
|
|
|
| 23 |
"from langchain_qdrant import QdrantVectorStore\n",
|
| 24 |
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
|
| 25 |
"from langchain_huggingface import HuggingFaceEmbeddings\n",
|
| 26 |
+
"from langchain_community.document_loaders import PyPDFLoader, CSVLoader\n",
|
| 27 |
"\n",
|
| 28 |
"import os\n",
|
| 29 |
"from pathlib import Path\n",
|
|
|
|
| 150 |
" except Exception as e:\n",
|
| 151 |
" print(f\"Failed to load PDF {file_path}: {e}\")\n",
|
| 152 |
"\n",
|
| 153 |
+
" for file_path in folder_path.rglob(\"*.csv\"):\n",
|
| 154 |
+
" try:\n",
|
| 155 |
+
" loader = CSVLoader(file_path, encoding='utf-8')\n",
|
| 156 |
+
" docs = loader.load()\n",
|
| 157 |
+
" for doc in docs:\n",
|
| 158 |
+
" doc.metadata[\"topic\"] = file_path.parent.name\n",
|
| 159 |
+
" documents.extend(docs)\n",
|
| 160 |
+
" except Exception as e:\n",
|
| 161 |
+
" print(f\"Failed to load PDF {file_path}: {e}\")\n",
|
| 162 |
+
"\n",
|
| 163 |
" return documents"
|
| 164 |
]
|
| 165 |
},
|
|
|
|
| 197 |
"Processing: blogs\n",
|
| 198 |
"Collection 'auro_blogs' deleted.\n",
|
| 199 |
"Created Collection: auro_blogs\n",
|
| 200 |
+
"Loaded 105 docs from ..\\data\\blogs\n",
|
| 201 |
"Uploaded 1045 chunks to auro_blogs\n",
|
| 202 |
"\n",
|
| 203 |
"\n",
|
| 204 |
+
"Processing: faqs\n",
|
| 205 |
+
"Collection 'auro_faqs' deleted.\n",
|
| 206 |
+
"Created Collection: auro_faqs\n",
|
| 207 |
+
"Loaded 108 docs from ..\\data\\faqs\n",
|
| 208 |
+
"Uploaded 150 chunks to auro_faqs\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
"\n",
|
| 210 |
"\n",
|
| 211 |
"Processing: product\n",
|
| 212 |
+
"Collection 'auro_product' deleted.\n",
|
| 213 |
"Created Collection: auro_product\n",
|
| 214 |
+
"Loaded 19 docs from ..\\data\\product\n",
|
| 215 |
"Uploaded 132 chunks to auro_product\n",
|
| 216 |
"\n",
|
| 217 |
"\n",
|
| 218 |
+
"Processing: revolution\n",
|
| 219 |
+
"Collection 'auro_revolution' deleted.\n",
|
| 220 |
+
"Created Collection: auro_revolution\n",
|
| 221 |
+
"Loaded 1 docs from ..\\data\\revolution\n",
|
| 222 |
+
"Uploaded 32 chunks to auro_revolution\n",
|
| 223 |
"\n",
|
| 224 |
"\n",
|
| 225 |
"Processing: support\n",
|
| 226 |
+
"Collection 'auro_support' deleted.\n",
|
| 227 |
"Created Collection: auro_support\n",
|
| 228 |
+
"Loaded 2 docs from ..\\data\\support\n",
|
| 229 |
"Uploaded 15 chunks to auro_support\n",
|
| 230 |
"\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"Processing: technology\n",
|
| 233 |
+
"Collection 'auro_technology' deleted.\n",
|
| 234 |
+
"Created Collection: auro_technology\n",
|
| 235 |
+
"Loaded 1 docs from ..\\data\\technology\n",
|
| 236 |
+
"Uploaded 8 chunks to auro_technology\n",
|
| 237 |
+
"\n",
|
| 238 |
"\n"
|
| 239 |
]
|
| 240 |
}
|
app/rag.py
CHANGED
|
@@ -432,5 +432,7 @@ async def get_response(query: str, name, email, config) -> dict:
|
|
| 432 |
latency_ms= latency_ms,
|
| 433 |
metadata=metadata
|
| 434 |
)
|
|
|
|
|
|
|
| 435 |
return result
|
| 436 |
# %%
|
|
|
|
| 432 |
latency_ms= latency_ms,
|
| 433 |
metadata=metadata
|
| 434 |
)
|
| 435 |
+
|
| 436 |
+
|
| 437 |
return result
|
| 438 |
# %%
|