import os
import asyncio

import streamlit as st

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
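
# Hugging Face credentials, read from Streamlit secrets (.streamlit/secrets.toml).
# HUGGINGFACEHUB_API_TOKEN / HF_TOKEN are the environment variables the
# huggingface_hub and langchain-huggingface clients look for; the st.secrets
# keys are this app's own names, so adjust them to match your secrets file.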
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HUGGINGFACEHUB_API_KEY"]
os.environ["HF_TOKEN"] = st.secrets["HF_TOKEN"]


async def run_pipeline(url: str, query: str):
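    """Crawl `url`, split the page into chunks, embed and index them in Chroma,
    then answer `query` with Llama 3.1 using the most similar chunks as context.
    """

    # Crawl the page with a headless browser; crawl4ai returns the content as markdown.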
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
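
    # Wrap the crawled markdown in a LangChain Document and split it into
    # overlapping ~1000-character chunks for embedding.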
    doc = Document(page_content=result.markdown.raw_markdown)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents([doc])
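
    # Embed the chunks with a small sentence-embedding model (runs locally via
    # sentence-transformers) and index them in an in-memory Chroma collection.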
    emb = HuggingFaceEmbeddings(model_name="avsolatorio/GIST-small-Embedding-v0")
    cb = Chroma(embedding_function=emb)
    cb.add_documents(chunks)
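
    # Retrieve the chunks most similar to the question.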
    docs = cb.similarity_search(query, k=3)
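
    # Llama 3.1 8B Instruct served via Hugging Face Inference Providers (routed
    # through the "nebius" provider) and wrapped as a LangChain chat model.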
    llama_model = HuggingFaceEndpoint(
        repo_id="meta-llama/Llama-3.1-8B-Instruct",
        provider="nebius",
        temperature=0.7,
        max_new_tokens=300,
        task="conversational",
    )
    llama = ChatHuggingFace(llm=llama_model)
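
    # Stuff all retrieved chunks into a single prompt and ask the model.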
    context = "\n\n".join(d.page_content for d in docs)
    response = llama.invoke(
        f"Context: {context}\n\nQuestion: {query}"
    )
    return response.content
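

# --- Streamlit UI ---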
st.title("Ask Any Website with Llama 3")
st.write("Enter a URL and your question; this app crawls the site and answers with Llama 3!")

url = st.text_input("Website URL", placeholder="https://www.example.com")
query = st.text_input("Your Question", placeholder="What is this website about?")
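
# Crawl the site and generate an answer when the button is pressed.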
if st.button("Crawl & Answer"):
    if not url.strip() or not query.strip():
        st.warning("Please enter both a URL and a question.")
    else:
        with st.spinner("Crawling website and generating answer..."):
            result = asyncio.run(run_pipeline(url, query))
        st.success(f"**Answer:** {result}")